In [None]:
import pandas as pd
import numpy as np
import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')
import gensim
from gensim import corpora
import pickle
import pyLDAvis.gensim
import copy

### Tokenize

In [None]:
import spacy
# spacy.load('en')
from spacy.lang.en import English

# Module-level spaCy English language object; tokenize() below calls it on raw text.
# NOTE(review): English() is constructed without loading a trained model, so this is
# presumably used for tokenization only -- confirm.
parser = English()

def tokenize(text):
    """Return the lower-cased tokens of ``text``, skipping whitespace-only tokens.

    The text is run through the module-level spaCy ``parser``; every token
    whose original text is not pure whitespace contributes its lower-cased
    form, in document order.
    """
    return [tok.lower_ for tok in parser(text) if not tok.orth_.isspace()]

### Lemmatize

In [None]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet ``morphy`` base form of ``word``.

    Falls back to ``word`` itself when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

### Part of Speech Filter

In [None]:
def part_of_speech(list_input):
    """Keep only the verbs, nouns, adjectives and adverbs of a token list.

    The tokens are joined into one space-separated string, re-tokenized with
    ``nltk.word_tokenize`` and tagged with ``nltk.pos_tag``; words whose tag
    is in the allowed set are returned in their original order.

    Parameters
    ----------
    list_input : list of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The words whose part-of-speech tag is one of the allowed tags.
        An empty input yields an empty list.
    """
    # nltk.download('averaged_perceptron_tagger')
    if not list_input:
        # Nothing to tag; avoids calling the tagger on an empty document.
        return []

    allowed_tags = {'VB', 'VBZ', 'VBN', 'VBG', 'VBD', 'NN', 'NNS', 'JJ', 'RB'}  # verbs, nouns, adj, adv

    # Tag the whole document once. Tagging inside a per-word loop would redo
    # the work for every prefix of the list and only keep the last result.
    words = nltk.word_tokenize(' '.join(list_input))
    return [word for word, tag in nltk.pos_tag(words) if tag in allowed_tags]

### Prepare Text for LDA

In [None]:
# NLTK's English stopwords plus a few project-specific additions.
# NOTE(review): 'would like' contains a space, so it presumably never equals a
# single token produced by tokenize() -- confirm whether it is still needed.
en_stop = list(nltk.corpus.stopwords.words('english'))
en_stop.extend(['would like', 'would', 'when', 'with'])

def prepare_by_line(text):
    """Turn one raw text entry into a list of cleaned tokens.

    Steps: tokenize, drop tokens of three characters or fewer, drop tokens
    listed in ``en_stop``, lemmatize with ``get_lemma``. When more than one
    token survives, the result is additionally passed through
    ``part_of_speech``; shorter results are returned as they are.
    """
    cleaned = []
    for candidate in tokenize(text):
        if len(candidate) > 3 and candidate not in en_stop:
            cleaned.append(get_lemma(candidate))

    if len(cleaned) > 1:
        cleaned = list(part_of_speech(cleaned))
    return cleaned

def prepare_text_for_lda(df_column):
    """Tokenize every row of a text column and split usable from unusable rows.

    Parameters
    ----------
    df_column : pandas.DataFrame or pandas.Series
        The text to process. For a DataFrame the first column (by position)
        is used; a Series is used directly.

    Returns
    -------
    text_data : list of list of str
        Token lists of the rows that produced more than one token.
    discard_index : list of int
        Positional indices (0-based, relative to ``df_column``) of the rows
        that produced at most one token.
    discard_content : list
        The original cell values of those discarded rows.

    Each discarded row is also printed as ``<position>   <content>``.
    """
    # Pick the column explicitly by position. Indexing a row Series with [0]
    # depends on positional fallback for label-indexed Series, which pandas
    # has deprecated.
    if isinstance(df_column, pd.DataFrame):
        lines = df_column.iloc[:, 0]
    else:
        lines = df_column

    text_data = []
    discard_index = []
    discard_content = []
    for i, line in enumerate(lines):
        tokens = prepare_by_line(line)
        if len(tokens) > 1:
            text_data.append(tokens)
        else:
            discard_index.append(i)
            discard_content.append(line)
            # print the rows that are left out from model data preparation
            print(i, '  ', line)
    return text_data, discard_index, discard_content

### LDA Model

In [None]:
def LDA_fit(text_data, num_topics):
    """Fit an LDA topic model on tokenized documents and prepare its pyLDAvis view.

    Side effects
    ------------
    * Writes ``corpus.pkl``, ``dictionary.gensim`` and ``model.gensim`` to the
      current working directory; the fixed names mean every call overwrites
      the files of the previous call.
    * Prints the top five words of every topic.

    Parameters
    ----------
    text_data : list of list of str
        One token list per document.
    num_topics : int
        Number of topics to fit.

    Returns
    -------
    tuple
        ``(text_data, corpus, ldamodel, lda_display)`` where ``corpus`` is the
        bag-of-words representation of ``text_data``, ``ldamodel`` the fitted
        gensim model and ``lda_display`` the prepared pyLDAvis data.
    """
    # Create Dictionary & Corpus
    dictionary = corpora.Dictionary(text_data)
    # Prune the vocabulary by document frequency (see gensim's filter_extremes
    # for the exact meaning of no_below / no_above).
    dictionary.filter_extremes(no_below=3, no_above=0.5)
    corpus = [dictionary.doc2bow(text) for text in text_data]

    # Persist the artifacts; the context manager closes the pickle file handle.
    with open('corpus.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    dictionary.save('dictionary.gensim')

    # Fit LDA
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=20,
        random_state=234,
    )
    ldamodel.save('model.gensim')
    for topic in ldamodel.print_topics(num_words=5):
        print(topic)

    # LDA Visualization: built from the in-memory objects, which are the same
    # data that was just written to disk, so no reload (and no pickle.load) is
    # needed.
    lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
    return text_data, corpus, ldamodel, lda_display

### Output Dataframe with Topics

In [None]:
def combine_topic(model, corpus, df, filter_id, filter_topic_value):
    """Attach the most probable topic to each modelled row of ``df``.

    Parameters
    ----------
    model : object
        Fitted topic model; ``model[corpus]`` must yield, per document, a list
        of ``(topic_id, probability)`` pairs.
    corpus : list
        Bag-of-words documents, one per row of ``df`` that was kept for
        modelling, in row order.
    df : pandas.DataFrame
        All rows, including those that were discarded before modelling.
    filter_id : list of int
        Positional indices of the discarded rows of ``df``.
    filter_topic_value : object
        Value stored in the ``topic`` column for the discarded rows.

    Returns
    -------
    pandas.DataFrame
        A new frame with a ``topic`` column: the modelled rows first (in their
        original order), then the discarded rows, with a fresh 0..n-1 index.
        ``df`` itself is not modified.
    """
    # Take the topic id stored in the best-scoring pair. The position of that
    # pair within the list is not the topic id, because the list may contain
    # only a subset of the topics (gensim leaves out low-probability ones).
    topic_assigned = [
        max(doc_topics, key=lambda pair: pair[1])[0]
        for doc_topics in model[corpus]
    ]

    # .assign() returns new frames, so neither `df` nor a slice of it is
    # written to.
    modelled_rows = df.drop(index=df.index[filter_id]).assign(topic=topic_assigned)
    discarded_rows = df.iloc[filter_id].assign(topic=filter_topic_value)

    return pd.concat([modelled_rows, discarded_rows]).reset_index(drop=True)

## Get Data and Run Model

In [None]:
df = pd.read_excel('Copy of Verbatims.xlsx')

num_topics = 3

# Each section below models one verbatim column: drop empty rows, tokenize,
# then fit LDA. LDA_fit writes its artifacts to fixed file names, so after
# this cell only the files of the last fitted column remain on disk.
which_column = 'Likes'
df_like = df.loc[:, [which_column]].dropna()
text_data, id_like, content_like = prepare_text_for_lda(df_like)
text_like, corpus_like, model_like, display_like = LDA_fit(text_data, num_topics)

which_column = 'Dislikes'
df_dislike = df.loc[:, [which_column]].dropna()
text_data, id_dislike, content_dislike = prepare_text_for_lda(df_dislike)
text_dislike, corpus_dislike, model_dislike, display_dislike = LDA_fit(text_data, num_topics)

which_column = 'Improvements'
df_imp = df.loc[:, [which_column]].dropna()
text_data, id_imp, content_imp = prepare_text_for_lda(df_imp)
text_imp, corpus_imp, model_imp, display_imp = LDA_fit(text_data, num_topics)

# Create and save one visualization HTML file per column. The file name is
# built from each display's own column name; using `which_column` here would
# name all three files after the last column and overwrite the first two.
for column_name, column_display in (
    ('Likes', display_like),
    ('Dislikes', display_dislike),
    ('Improvements', display_imp),
):
    pyLDAvis.save_html(column_display, 'topic-' + column_name.split()[-1] + '.html')

# pyLDAvis.display(display_like)

In [None]:
# Attach a topic to every row of each column; rows that were discarded during
# text preparation receive the per-column placeholder label instead.
final_like = combine_topic(model_like, corpus_like, df_like, id_like, 'None-Like')
final_dislike = combine_topic(model_dislike, corpus_dislike, df_dislike, id_dislike, 'None-Dislike')
final_imp = combine_topic(model_imp, corpus_imp, df_imp, id_imp, 'None-Imp')

In [None]:
# Write the three result frames to separate sheets of one workbook.
result_sheets = {'like': final_like, 'dislike': final_dislike, 'imp': final_imp}
with pd.ExcelWriter('Topic Modelling-Result.xlsx') as writer:
    for sheet_name, result_frame in result_sheets.items():
        result_frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Print each topic id of the "Likes" model followed by its top-5 word entries
# (show_topics is asked for at most 5 topics).
for topic_id, top_words in model_like.show_topics(formatted=False, num_topics=5, num_words=5):
    print(topic_id)
    print(top_words)

In [None]:
# Print each topic id of the "Dislikes" model followed by its top-5 word entries
# (show_topics is asked for at most 5 topics).
for topic_id, top_words in model_dislike.show_topics(formatted=False, num_topics=5, num_words=5):
    print(topic_id)
    print(top_words)

In [None]:
# Print each topic id of the "Improvements" model followed by its top-5 word
# entries (show_topics is asked for at most 5 topics).
for topic_id, top_words in model_imp.show_topics(formatted=False, num_topics=5, num_words=5):
    print(topic_id)
    print(top_words)