## Installs

In [14]:
!pip install pyspark

## Imports

In [15]:
import numpy as np
import pandas as pd
import pyspark
import os
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from gensim import corpora, models
from pprint import pprint
from tqdm import tqdm

import nltk
nltk.download('wordnet')

## Functions

In [16]:
def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with the WordNet
    lemmatizer, and the resulting lemma is then run through the Porter stemmer.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    porter = PorterStemmer()
    return porter.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and return the normalized tokens worth keeping.

    Tokens are produced by gensim's ``simple_preprocess``. A token is kept only
    when it is longer than three characters and is not in gensim's STOPWORDS;
    every kept token is passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table holding the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel
        Trained topic model. It must support ``ldamodel[corpus]`` (yielding,
        per document, a list of ``(topic_id, probability)`` pairs) and
        ``ldamodel.show_topic(topic_id)`` (returning ``(word, weight)`` pairs).
    corpus
        Documents to score, in the representation ``ldamodel`` expects.
    texts
        Original texts in the same order as ``corpus``. Any sequence or
        ``pandas.Series`` is accepted; an existing index is ignored so that
        texts are attached by position, not by label.

    Returns
    -------
    pandas.DataFrame
        One row per document, positionally aligned with ``corpus``, with the
        columns ``Dominant_Topic``, ``Perc_Contribution`` and
        ``Topic_Keywords`` followed by the text column.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Keyword strings depend only on the topic, so compute each one once.
    keywords_by_topic = {}
    records = []

    total = len(corpus) if hasattr(corpus, '__len__') else None
    for doc_topics in tqdm(ldamodel[corpus], total=total):
        if not doc_topics:
            # Keep a placeholder row so later rows stay aligned with `texts`.
            records.append((float('nan'), float('nan'), ''))
            continue

        # max() returns the first maximal pair, matching a stable
        # descending sort followed by taking element 0.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append(
            (topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame once: DataFrame.append no longer exists in pandas 2.x and
    # growing a frame row by row is quadratic. Passing `columns` also makes an
    # empty corpus yield an empty, correctly labelled frame.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output. concat(axis=1) aligns on the
    # index, so drop whatever index `texts` carries (e.g. the labels of a
    # sampled Series) to attach texts by position.
    contents = pd.Series(texts).reset_index(drop=True)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

# Data

## Loading

In [17]:
# Load the aggregated query CSV into a pandas DataFrame. The path is relative
# to the notebook's working directory.
data = pd.read_csv('../input/kisan-query-analysis-dataset/query_agg.csv')

In [18]:
# spark = SparkSession.builder.getOrCreate()

In [19]:
# df = (spark.read.format("csv").options(header="true").load("../input/kisan-query-analysis-dataset/query_agg.csv"))

In [20]:
data.head()

## Pre-processing

In [21]:
# Keep only the rows that have a query text. `.copy()` makes the result an
# independent frame, so the column assignment below is not a chained assignment
# on a slice (SettingWithCopyWarning, and a silent no-op under copy-on-write).
# The original index labels are kept, so labels no longer equal row positions.
data = data[data['QueryText'].notna()].copy()
# Replace missing answers with the literal string "Nan" by reassigning the
# column instead of calling fillna(inplace=True) on a column selection.
data['KccAns'] = data['KccAns'].fillna("Nan")

In [22]:
columns = data.columns.tolist()

In [23]:
# doc_sample = documents[documents['index'] == 4310].values[0][0]
# Take the first row whose Sector is "AGRICULTURE" and read the cell in column
# position 5 (presumably the QueryText column; TODO confirm against the CSV
# header). `.iloc[0, 5]` reads that single cell instead of converting the whole
# filtered frame to an object array with `.values`.
doc_sample = data[data['Sector'] == "AGRICULTURE"].iloc[0, 5]

print('original document: ')
# Split on single spaces only, so consecutive spaces yield empty strings.
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

In [24]:
# dd = data.sample(10000)

In [25]:
# Run `preprocess` on every query text. The result is a Series of token lists
# that carries the same index as `data`.
pre_processed_query_text = data['QueryText'].map(preprocess)
# pre_processed_kcc_ans  = data['KccAns'].map(preprocess)

In [26]:
print(pre_processed_query_text[0:5])
# print(pre_processed_kcc_ans[0:5])

In [27]:
data['QueryText'].iloc[4]

## BoW on the data

In [28]:
# Build the gensim Dictionary (token <-> integer id mapping) from the
# preprocessed query token lists.
dictionary_qt = gensim.corpora.Dictionary(pre_processed_query_text)
# dictionary_ka = gensim.corpora.Dictionary(pre_processed_kcc_ans)

In [29]:
# Peek at the first eleven (id, token) pairs of the dictionary.
c = 0
for c, (token_id, token) in enumerate(dictionary_qt.iteritems(), start=1):
    print(token_id, token)
    if c > 10:
        break
# print()
# c = 0
# for k,v in dictionary_ka.iteritems():
#     print(k, v)
#     c += 1
#     if c > 10:
#         break

In [30]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 15
# documents or in more than 50% of the documents, then keep at most the 100000
# most frequent remaining tokens. Token ids are reassigned by this call, so any
# BoW corpus must be built after it.
dictionary_qt.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# dictionary_ka.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [31]:
data.iloc[5]

In [32]:
# Convert every tokenized query to bag-of-words form: a list of
# (token_id, count) pairs, one list per document, in the row order of `data`.
bow_corpus_qt = [dictionary_qt.doc2bow(doc) for doc in pre_processed_query_text]
# Display the bag-of-words vector of the document at position 5.
bow_corpus_qt[5]

In [33]:
# bow_corpus_ka = [dictionary_ka.doc2bow(doc) for doc in pre_processed_kcc_ans]
# bow_corpus_ka[5]

In [34]:
# NOTE: despite the `_5` suffix, this is the document at position 201.
bow_doc_qt_5 = bow_corpus_qt[201]
# Each entry is a (token_id, count) pair; look the token text up in the dictionary.
for token_id, count in bow_doc_qt_5:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary_qt[token_id], count))

In [35]:
# bow_doc_ka_5 = bow_corpus_ka[201]
# for i in range(len(bow_doc_ka_5)):
#     print("Word {} (\"{}\") appears {} time.".format(bow_doc_ka_5[i][0],dictionary_ka[bow_doc_ka_5[i][0]],bow_doc_ka_5[i][1]))

## TF-IDF on the BoW

In [36]:
# Fit a TF-IDF weighting model on the bag-of-words corpus.
tfidf_qt = models.TfidfModel(bow_corpus_qt)
# Apply the fitted weighting to the same corpus.
corpus_qt_tfidf = tfidf_qt[bow_corpus_qt]

# Show the TF-IDF vector of the document at position 201.
pprint(corpus_qt_tfidf[201])

In [37]:
# tfidf_ka = models.TfidfModel(bow_corpus_ka)
# corpus_ka_tfidf = tfidf_ka[bow_corpus_ka]

# pprint(corpus_ka_tfidf[201])

## LDA with BoW data

In [38]:
# Fit an 8-topic LDA model on the raw bag-of-words corpus (2 passes, 2 worker
# processes). `random_state` is fixed so re-runs start from the same
# initialisation; with several workers the result may still vary slightly
# between runs.
lda_model_qt = gensim.models.LdaMulticore(bow_corpus_qt, num_topics=8, id2word=dictionary_qt, passes=2, workers=2, random_state=42)

In [39]:
# lda_model_ka = gensim.models.LdaMulticore(bow_corpus_ka, num_topics=8, id2word=dictionary_ka, passes=2, workers=2)

In [40]:
# Print every topic of the BoW-trained model; -1 asks print_topics for all topics.
for idx, topic in lda_model_qt.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
# print()
# for idx, topic in lda_model_ka.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))

## LDA with TF-IDF data

In [41]:
# Fit an 8-topic LDA model on the TF-IDF weighted corpus (2 passes, 4 worker
# processes). `random_state` is fixed so re-runs start from the same
# initialisation; with several workers the result may still vary slightly
# between runs.
lda_model_qt_tfidf = gensim.models.LdaMulticore(corpus_qt_tfidf, num_topics=8, id2word=dictionary_qt, passes=2, workers=4, random_state=42)
# Print every topic of the TF-IDF-trained model.
for idx, topic in lda_model_qt_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))
    
# print()
# lda_model_ka_tfidf = gensim.models.LdaMulticore(corpus_ka_tfidf, num_topics=8, id2word=dictionary_ka, passes=2, workers=4)
# for idx, topic in lda_model_ka_tfidf.print_topics(-1):
#     print('Topic: {} Word: {}'.format(idx, topic))

In [42]:
pprint(pre_processed_query_text.iloc[201])
# pprint(pre_processed_kcc_ans.iloc[201])

In [43]:
data['QueryText'].iloc[789]

In [44]:
# bow_corpus_ka[789]

In [45]:
# for index, score in sorted(lda_model_ka[bow_corpus_ka[789]], key=lambda tup: -1*tup[1]):
#     print("\nScore: {}\t \nTopic: {}".format(score, lda_model_ka.print_topic(index, 10)))

In [147]:
import pickle
# dump : put the data of the object in a file
# pickle.dump(lda_model_qt_tfidf, open("lda_model_tfidf", "wb"))
# pickle.dump(dictionary_qt, open("dictionary", "wb"))
# pickle.dump(tfidf_qt, open("tfidf", "wb"))
# pickle.dump(corpus_qt_tfidf, open("corpus_tfidf", "wb"))
# corp
# NOTE: `corp` is built in the "Document retrieval" section further down, so
# that section must have been executed before this cell (this cell was run
# out of order; on a fresh top-to-bottom run it raises NameError).
# The `with` block guarantees the file is flushed and closed after the dump.
with open("corp_tfidf", "wb") as corp_file:
    pickle.dump(corp, corp_file)

## Document retrieval for a given topic

In [77]:
corpus_qt_tfidf

In [None]:
# Materialize the TF-IDF corpus as a list (one entry per document, in corpus
# order) so documents can be picked by position below. This holds every
# document's vector in memory at once.
aa = list(corpus_qt_tfidf)

In [124]:
# Draw a random sample of query texts. The seed makes the sample reproducible,
# and capping the size at len(data) avoids a ValueError on smaller frames.
# The sampled Series keeps the index labels of `data`.
dd = data["QueryText"].sample(min(100000, len(data)), random_state=42)

In [125]:
# Index labels of the sampled rows. These are labels inherited from `data`,
# not row positions.
indd = dd.index.values.tolist()

In [137]:
# `indd` holds index *labels* of `data`, whereas `aa` is ordered by row
# *position* (it was built by iterating over `data` in order). After rows with
# a missing QueryText were dropped, labels and positions no longer coincide,
# so translate labels to positions before indexing into `aa`.
positions = data.index.get_indexer(indd)
# get_indexer returns -1 for a label it cannot find; fail loudly rather than
# silently picking the last document.
assert (positions >= 0).all(), "sampled label missing from data.index"
corp = [aa[pos] for pos in positions]

In [128]:
len(data)

In [127]:
max(indd)

In [79]:
dd.head()

In [138]:
# `dd` still carries the index labels of `data`. Drop them so each text is
# attached to the topic row at the same position instead of being aligned by
# label, which would pad the table with NaN rows.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model_qt_tfidf, corpus=corp, texts=dd.reset_index(drop=True)
)

# Format: expose the row number as a column and give every column a final name.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show the first ten rows that have a text.
df_dominant_topic[df_dominant_topic['Text'].notna()].head(10)

In [140]:
len(df_dominant_topic[df_dominant_topic['Text'].notna()])

In [144]:
df_dominant_topic.to_csv("Dominant_Topic_for_Queries.csv", index=False)

In [148]:
dd.to_csv('Doc.csv', index=False)

In [None]:
# df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model_ka_tfidf, corpus=corpus_ka_tfidf, texts=dd['KccAns'])

# # Format
# df_dominant_topic = df_topic_sents_keywords.reset_index()
# df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# # Show
# df_dominant_topic[df_dominant_topic['Text'].notna()].head(10)

# Trial

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# SECURITY: unpickling can execute arbitrary code. Only load these files if
# they were produced by this notebook or come from a trusted source.
dictionary_qt = _load_pickle("../input/models-kisan-query/dictionary")
tfidf_qt = _load_pickle("../input/models-kisan-query/tfidf")
lda_model_qt_tfidf = _load_pickle("../input/models-kisan-query/lda_model_tfidf")

In [141]:
def pipeline(tt):
    """Return stored answers for queries that share the dominant topic of ``tt``.

    Steps:
      1. preprocess ``tt`` and convert it to bag-of-words with ``dictionary_qt``;
      2. weight it with ``tfidf_qt`` and infer its topics with
         ``lda_model_qt_tfidf`` (topic scores are printed, best first);
      3. take the first ten texts in ``df_dominant_topic`` whose dominant
         topic equals the query's best topic;
      4. return the answers of the rows in ``data`` whose query text is one of
         those texts.

    Relies on the notebook globals ``dictionary_qt``, ``tfidf_qt``,
    ``lda_model_qt_tfidf``, ``df_dominant_topic`` and ``data``.

    Parameters
    ----------
    tt
        Query text. If several texts are passed (list/Series), only the first
        one is scored.

    Returns
    -------
    list
        At most ten values taken from column position 6 of ``data``
        (presumably the KccAns column; TODO confirm against the CSV header).
    """
    processed = pd.Series(tt).map(preprocess)
    bow_corpus = [dictionary_qt.doc2bow(doc) for doc in processed]

    # The model was trained on TF-IDF weights and df_dominant_topic was built
    # from TF-IDF vectors, so score the TF-IDF vector rather than raw counts.
    query_tfidf = tfidf_qt[bow_corpus[0]]

    # Infer once and reuse the result for both the printout and the lookup, so
    # the printed scores and the chosen topic cannot disagree.
    tops = lda_model_qt_tfidf.get_document_topics(query_tfidf)
    for index, score in sorted(tops, key=lambda tup: -1*tup[1]):
        print("\nScore: {}\t \nTopic: {}".format(score, lda_model_qt_tfidf.print_topic(index, 8)))

    # First topic with the highest probability; (0, 0) if no topic is returned.
    max1 = max(tops, key=lambda tup: tup[1], default=(0, 0))

    labelled = df_dominant_topic[df_dominant_topic["Text"].notna()]
    same_topic = labelled[labelled["Dominant_Topic"] == max1[0]]
    similar_texts = same_topic["Text"].head(10).tolist()

    # Vectorized membership test on column position 5 (presumably QueryText;
    # TODO confirm) instead of looping over every row of data.values.
    matches = data.iloc[:, 5].isin(similar_texts)
    return data.iloc[:, 6][matches].head(10).tolist()

In [145]:
# Use a dedicated name for the query text. The original bound it to `aa`,
# which overwrote the materialized TF-IDF corpus list of the same name and
# broke any re-run of the cell that indexes into that list.
query_text = "rice"
fin = pipeline(query_text)

In [146]:
fin

# End