In [1]:
!pip install pyspark

In [2]:
import os
import pickle
from pprint import pprint

import gensim
import numpy as np
import pandas as pd
import pyspark
import pyspark.pandas as ps
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from pyspark.sql import SparkSession
from tqdm import tqdm

import nltk
# Download the WordNet corpus data used by WordNetLemmatizer in the
# preprocessing helpers of this notebook.
nltk.download('wordnet')

In [3]:
def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with WordNet and the
    resulting lemma is then passed through the Porter stemmer.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return PorterStemmer().stem(lemma)

def preprocess(text):
    """Tokenize ``text`` and return the list of normalized tokens.

    Tokens come from ``gensim.utils.simple_preprocess``; gensim stopwords and
    tokens of three characters or fewer are dropped, and every surviving token
    is run through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in tqdm(enumerate(ldamodel[corpus])):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [4]:
# Load the aggregated query dataset and preview its first rows.
QUERY_CSV_PATH = '../input/kisan-query-analysis-dataset/query_agg.csv'
data = pd.read_csv(QUERY_CSV_PATH)
data.head()

In [5]:
# Keep only rows that have a query text. The explicit .copy() makes `data` an
# independent frame, so the column assignment below is neither a chained
# assignment on a slice nor a silent no-op under pandas Copy-on-Write.
data = data[data['QueryText'].notna()].copy()

# Replace missing answers with the literal placeholder "Nan" (assign the
# result back instead of using inplace=True on a column selection).
data['KccAns'] = data['KccAns'].fillna("Nan")

In [6]:
# Snapshot of the dataset's column labels as a plain Python list.
columns = list(data.columns)

In [15]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed even if
    unpickling fails.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted artifacts.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Pre-computed artifacts: the gensim dictionary, the TF-IDF model and the
# TF-IDF-weighted corpus.
# NOTE(review): the dictionary is read from "kisantfidf" while the other two
# artifacts come from "kisan-tfidf" - confirm both dataset paths are intended.
dictionary_qt = _load_pickle("../input/kisantfidf/dictionary")
tfidf_qt = _load_pickle("../input/kisan-tfidf/tfidf")
corp = _load_pickle("../input/kisan-tfidf/corp_tfidf")

In [16]:
# Fit an 8-topic LSI model on the loaded corpus, using the loaded dictionary
# to map term ids back to words.
lsi_model_qt_tfidf = gensim.models.LsiModel(
    corpus=corp,
    id2word=dictionary_qt,
    num_topics=8,
)

In [17]:
# Print every topic of the fitted LSI model together with its term weights.
topic_template = 'Topic: {} \nWords: {}'
for idx, topic in lsi_model_qt_tfidf.print_topics(-1):
    line = topic_template.format(idx, topic)
    print(line)

In [26]:
def pipeline(tt):
    """Return up to 10 stored answers related to the query ``tt``.

    Steps
    -----
    1. Preprocess the query and convert it to bag-of-words with
       ``dictionary_qt``, then to TF-IDF with ``tfidf_qt``.
    2. Project it into the LSI space of ``lsi_model_qt_tfidf`` and print every
       topic with its score, highest first.
    3. Take the highest-scoring topic, pick the first 10 non-null ``Text``
       entries of ``df_dominant_topic`` whose ``Dominant_Topic`` equals it.
    4. Return the values of column 6 of ``data`` for rows whose column 5 is
       one of those texts (at most 10).

    Parameters
    ----------
    tt : str or list-like of str
        Query text; when several are given only the first one is used.

    Returns
    -------
    list
        At most 10 answers; empty when the query contains no known terms.

    Notes
    -----
    NOTE(review): ``df_dominant_topic`` is not defined in the visible cells;
    it must already exist in the kernel with ``Text`` and ``Dominant_Topic``
    columns - confirm where it is built.
    """
    # NOTE(review): positions 5 and 6 are presumably the QueryText and KccAns
    # columns of `data` - confirm against the CSV schema.
    query_col, answer_col = 5, 6

    processed = pd.Series(tt).map(preprocess)
    bow_corpus = [dictionary_qt.doc2bow(doc) for doc in processed]
    if not bow_corpus or not bow_corpus[0]:
        # Nothing in the query is present in the dictionary: no topic to match.
        return []

    # The LSI model is fitted on the TF-IDF corpus, so the query must be
    # TF-IDF weighted as well (raw BoW counts live on a different scale).
    query_tfidf = tfidf_qt[bow_corpus[0]]

    # LsiModel has no get_document_topics(); indexing the model is the way to
    # obtain the (topic_id, score) projection of a document.
    topic_scores = list(lsi_model_qt_tfidf[query_tfidf])
    for index, score in sorted(topic_scores, key=lambda tup: -1*tup[1]):
        print("\nScore: {}\t \nTopic: {}".format(score, lsi_model_qt_tfidf.print_topic(index, 8)))

    if not topic_scores:
        return []
    # LSI scores are signed, so take the true maximum rather than comparing
    # against a (0, 0) sentinel that would win whenever all scores are negative.
    dominant_topic, _ = max(topic_scores, key=lambda pair: pair[1])

    labelled = df_dominant_topic[df_dominant_topic["Text"].notna()]
    same_topic = labelled[labelled["Dominant_Topic"] == dominant_topic]
    similar_texts = same_topic["Text"].head(10).tolist()

    # Vectorised lookup instead of a Python loop over every row of `data`.
    matches = data.iloc[:, query_col].isin(similar_texts)
    return data.iloc[:, answer_col][matches].head(10).tolist()

In [27]:
# Try the pipeline on a single-word example query; `fin` holds whatever
# `pipeline` returns (a list capped at 10 entries) and is displayed as the
# cell output.
aa = "rice"
fin = pipeline(aa)
fin