In [6]:
!pip install pyspark

In [7]:
import numpy as np
import pandas as pd
import pyspark
import os
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from gensim import corpora, models
from pprint import pprint
from tqdm import tqdm

import nltk
nltk.download('wordnet')

In [8]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then Porter-stem the lemma.

    Returns the stemmed string.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    porter = PorterStemmer()
    return porter.stem(lemma)

def preprocess(text):
    """Tokenize ``text`` and return the lemmatized, stemmed tokens that are kept.

    Tokens come from gensim's ``simple_preprocess``. A token is kept only when it
    is not in gensim's STOPWORDS and is longer than three characters.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if word not in stopwords and len(word) > 3
    ]

def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table holding the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : topic model
        Must support ``ldamodel[corpus]`` (yielding, per document, a sequence of
        ``(topic_id, weight)`` pairs) and ``ldamodel.show_topic(topic_id)``
        (yielding ``(word, weight)`` pairs).
    corpus : iterable
        Documents in whatever form ``ldamodel[...]`` accepts.
    texts : sequence
        Original texts; appended as the last column (labelled ``0``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` (int), ``Perc_Contribution`` (weight rounded to
        4 decimals) and ``Topic_Keywords`` (comma-separated topic words), followed
        by the texts column. Documents for which the model yields no topics
        contribute no row, so the texts column can be longer than the topic columns.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    records = []
    keywords_by_topic = {}  # show_topic() is invariant per topic; compute it once.

    for row in tqdm(ldamodel[corpus]):
        row = list(row)
        if len(row) == 0:
            continue
        # max() returns the first pair with the highest weight, i.e. the same pair a
        # stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_id = int(topic_num)
        if topic_id not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_id] = ", ".join([word for word, prop in wp])
        records.append((topic_id, round(prop_topic, 4), keywords_by_topic[topic_id]))

    # Build the frame in one step: DataFrame.append no longer exists in pandas 2.x and
    # growing a frame row by row is quadratic. Passing `columns` also keeps an empty
    # corpus from failing on the column assignment.
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

In [9]:
from pyspark.sql import SQLContext
from pyspark.sql.functions import isnan, when, count, col, translate
from pyspark.sql.types import IntegerType
from pyspark.sql.types import FloatType
from pyspark.sql import functions as F
from pyspark import SparkConf
from pyspark.context import SparkContext
# Obtain a SparkContext built from a default SparkConf (getOrCreate reuses one if it is already running).
sc = SparkContext.getOrCreate(SparkConf())
# SQLContext wraps the context; the cells below use sqlContext.read to load CSV files.
sqlContext = SQLContext(sc)

In [None]:
# Load the aggregated query CSV with a header row, letting Spark infer the column types.
df_spark = sqlContext.read.csv("../input/kisan-query-analysis-dataset/query_agg.csv", header=True, inferSchema = True)
# Display the number of rows loaded.
df_spark.count()

In [None]:
# Print the column names and the types Spark inferred for them.
df_spark.printSchema()

In [None]:
# Preview the first five rows.
df_spark.show(5)

In [None]:
# Count the rows whose DistrictName is exactly "AMRITSAR".
df_spark.filter((col("DistrictName")=="AMRITSAR")).count()

Crop counts: highest and lowest

In [None]:
# Five most frequent Crop values with their row counts, highest first.
df_spark.groupby('Crop').count().orderBy('count', ascending=False).head(5)

In [None]:
# Same ranking, extended to the first 298 Crop values.
# NOTE(review): 298 is presumably the number of distinct Crop values — confirm.
df_spark.groupby('Crop').count().orderBy('count', ascending=False).head(298)

In [None]:
# Replace the Category values '418', '0' and '-1' with 'Others'.
# A whole-value match is required here: `translate` substitutes character by character
# ('4'->'O', '1'->'t', '8'->'h', ...), which turns '418' into 'Oth' and also rewrites
# those characters inside every other Category value.
df_spark = df_spark.withColumn(
    'Category',
    F.when(F.col('Category').isin('418', '0', '-1'), 'Others').otherwise(F.col('Category')),
)
# Any Crop value that casts cleanly to an integer is replaced with 'Others'.
df_spark=df_spark.withColumn("Crop", F.when(F.col("Crop").cast("int").isNotNull(), "Others").otherwise(F.col("Crop")))



In [None]:
# Null Crop values become "Others"; every other value is kept unchanged.
crop_col = F.col("Crop")
df_spark = df_spark.withColumn(
    "Crop",
    F.when(crop_col.isNull(), "Others").otherwise(crop_col),
)
# Show Crop frequencies, most frequent first.
crop_counts = df_spark.groupby('Crop').count()
crop_counts.orderBy('count', ascending=False).show()

In [None]:
# (state, position) pairs: for each state, take the count found at this 0-based position
# among the state's five most frequent Crop values.
state_positions = [
    ("PUNJAB", 2),
    ("HARYANA", 1),
    ("RAJASTHAN", 1),
    ("HIMACHAL PRADESH", 4),
    ("UTTARAKHAND", 1),
]
values = []
for state_name, position in state_positions:
    top_crops = (
        df_spark.filter(df_spark["StateName"] == state_name)
        .groupby('Crop')
        .count()
        .orderBy('count', ascending=False)
        .head(5)
    )
    values.append(top_crops[position]['count'])


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# One label per entry of `values`, in the order the states were queried.
# 'HIMACHAL PRADESH' is spelled as in the StateName filter used to compute its value.
names=['PUNJAB','HARYANA','RAJASTHAN','HIMACHAL PRADESH','UTTARAKHAND']
plt.figure(figsize=(15,15))
plt.pie(values,labels=names)
plt.show()

In [None]:
# Replace the Sector values '256' and '825' with 'Others'.
# A whole-value match is required here: `translate` substitutes character by character,
# so it would rewrite the individual digits instead of replacing the value.
df_spark = df_spark.withColumn(
    'Sector',
    F.when(F.col('Sector').isin('256', '825'), 'Others').otherwise(F.col('Sector')),
)


In [None]:
# Run the aggregation once and reuse the collected rows for both the counts and the labels.
top_sectors = df_spark.groupBy('Sector').count().orderBy('count', ascending=False).head(5)
sector_values = [row['count'] for row in top_sectors]
# Labels come from the same rows as the counts, so each slice is named after the
# sector it actually measures (str() keeps a null Sector displayable).
sector = [str(row['Sector']) for row in top_sectors]
plt.figure(figsize=(15,15))
plt.pie(sector_values,labels=sector)
plt.show()

Issue with plotting `Category` and `StateName`

In [None]:
# Convert single columns to pandas DataFrames for plotting (toPandas collects the data to the driver).
data_category=df_spark.select('Category').toPandas()
data_state=df_spark.select('StateName').toPandas()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(15,15))
# Pass the column explicitly as `x`; seaborn's categorical plots expect the data vector
# by keyword and do not treat a bare positional argument as the x variable.
sns.countplot(x=data_category["Category"])
# Rotate the labels after plotting so the rotation applies to the category ticks.
plt.xticks(rotation=90)

In [1]:
!pip install pyarrow

In [10]:
# Load Dominant_Topic_for_Queries.csv as a Spark DataFrame and print its row count.
df_dominant_topic = sqlContext.read.csv("../input/kisandocument/Dominant_Topic_for_Queries.csv", header=True, inferSchema = True)
print(df_dominant_topic.count())

# Load the aggregated query CSV as a Spark DataFrame and print its row count.
data = sqlContext.read.csv("../input/kisan-query-analysis-dataset/query_agg.csv", header=True, inferSchema = True)
print(data.count())

In [11]:
# Keep only the rows that have a non-null Text value.
df_dominant_topic = df_dominant_topic.where(col("Text").isNotNull())

Using pandas

In [5]:
# Read the same query CSV with pandas. This rebinds `data`, which an earlier cell
# assigned to a Spark DataFrame.
data = pd.read_csv('../input/kisan-query-analysis-dataset/query_agg.csv')
data.head()

In [None]:
# Drop rows without a query text. .copy() makes the filtered result an independent
# frame, so the column assignment below modifies it directly.
data = data[data['QueryText'].notna()].copy()
# Assign the filled column back instead of calling fillna(inplace=True) on a column
# selection, which can act on a temporary object and leave `data` unchanged.
data['KccAns'] = data['KccAns'].fillna("Nan")

In [None]:
# Column names of `data` as a plain Python list.
columns = data.columns.tolist()

In [None]:
# Apply preprocess() to every QueryText value; each entry becomes a list of tokens.
pre_processed_query_text = data['QueryText'].map(preprocess)

In [12]:
# Build the gensim Dictionary from the tokenized queries.
dictionary_qt = gensim.corpora.Dictionary(pre_processed_query_text)
# Convert every tokenized query to its bag-of-words representation.
bow_corpus_qt = [dictionary_qt.doc2bow(doc) for doc in pre_processed_query_text]
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same corpus.
tfidf_qt = models.TfidfModel(bow_corpus_qt)
corpus_qt_tfidf = tfidf_qt[bow_corpus_qt]



In [None]:
# Train an 8-topic LSI model on the TF-IDF corpus, using dictionary_qt to map ids to words.
lsi_model_qt_tfidf = gensim.models.LsiModel(corpus_qt_tfidf, num_topics=8, id2word=dictionary_qt,)

In [14]:
import pickle

In [None]:
# Persist the trained LSI model; the context manager closes the file even if pickling fails.
with open("lsi_model_tfidf", "wb") as model_file:
    pickle.dump(lsi_model_qt_tfidf, model_file)

In [40]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# SECURITY: pickle.load can execute arbitrary code while loading; only use files you
# produced yourself or otherwise trust.
# NOTE(review): the dictionary is read from "kisantfidf" while the other TF-IDF artifacts
# come from "kisan-tfidf" — confirm both dataset names are intended.
dictionary_qt = _load_pickle("../input/kisantfidf/dictionary")
tfidf_qt = _load_pickle("../input/kisan-tfidf/tfidf")
lsi_model_qt_tfidf = _load_pickle("../input/kisan-lsi/lsi_model_tfidf")
# corpus_qt_tfidf = pickle.load(open("models\\corpus_tfidf", "rb"))
corp = _load_pickle("../input/kisan-tfidf/corp_tfidf")

In [48]:
#df_dominant_topic = sqlContext.read.csv("../input/kisandocument/Dominant_Topic_for_Queries.csv", header=True, inferSchema = True)
import pandas as pd
# Read the query CSV into a pandas DataFrame bound to `data`.
data=pd.read_csv("../input/kisan-query-analysis-dataset/query_agg.csv")

In [49]:
# Show the head of df_dominant_topic.
df_dominant_topic.head()

In [50]:
def pipeline(tt):
    """Print the LSI topic scores for a query and return up to ten matching answers.

    Relies on notebook-level objects: ``preprocess``, ``dictionary_qt``, ``tfidf_qt``,
    ``lsi_model_qt_tfidf``, ``df_dominant_topic`` and ``data``.

    Parameters
    ----------
    tt : str or list-like of str
        Query text. It is wrapped in a ``pd.Series``; topic scores are printed for
        the first entry only.

    Returns
    -------
    list
        At most ten values taken from column position 6 of ``data``, for rows whose
        column position 5 is one of the selected ``Text`` values.
        NOTE(review): positions 5 and 6 are presumably QueryText and KccAns — confirm.
    """
    queries = pd.Series(tt)
    processed = queries.map(preprocess)
    bow_corpus = [dictionary_qt.doc2bow(doc) for doc in processed]
    corpus_tfidf = tfidf_qt[bow_corpus]

    # NOTE(review): the model is queried with the raw bag-of-words vector here but with
    # TF-IDF vectors below — confirm which one is intended.
    topic_scores = sorted(lsi_model_qt_tfidf[bow_corpus[0]], key=lambda tup: -1*tup[1])
    for topic_id, score in topic_scores:
        print("\nScore: {}\t \nTopic: {}".format(score, lsi_model_qt_tfidf.print_topic(topic_id, 8)))

    with_text = df_dominant_topic[df_dominant_topic["Text"].isnull()!=True]

    # NOTE(review): the similarity index is built from the query itself, so top[0][0] is
    # the query's similarity to itself rather than a topic id — confirm the intended lookup.
    similarity_index = gensim.similarities.MatrixSimilarity(lsi_model_qt_tfidf[corpus_tfidf])
    top = similarity_index[lsi_model_qt_tfidf[corpus_tfidf]]
    pos_doc = with_text[with_text["Dominant_Topic"] == top[0][0]][:10]
    candidate_texts = pos_doc["Text"].tolist()

    # Collect answers in a local list: relying on a notebook-level `fin_ans` raises
    # NameError on a fresh kernel and otherwise leaks results between calls.
    fin_ans = []
    for record in data.values:
        if record[5] in candidate_texts:
            fin_ans.append(record[6])
            if len(fin_ans) == 10:
                # Only the first ten matches are returned, so stop scanning early.
                break
    return fin_ans[:10]
    

In [51]:
# Example: run the retrieval pipeline for the query "weather" and display the returned answers.
aa = "weather"
fin = pipeline(aa)
fin