In [1]:
import pandas as pd
import numpy as np
import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')
import gensim
from gensim import corpora
import pickle
import pyLDAvis.gensim
import copy

### Tokenize

In [2]:
import spacy
# spacy.load('en')
from spacy.lang.en import English

# Shared spaCy English language object; tokenize() calls it on each text to
# obtain the token sequence. Built once at module level so it is reused.
parser = English()

def tokenize(text):
    """Return the lower-cased tokens of ``text``.

    The text is run through the module-level ``parser``; tokens whose
    original text consists only of whitespace are dropped.

    Parameters
    ----------
    text : str
        Raw text to split.

    Returns
    -------
    list of str
        Lower-cased token strings, in document order.
    """
    return [tok.lower_ for tok in parser(text) if not tok.orth_.isspace()]

  from .optimizers import Adam, linear_decay
  from collections import Sequence, Sized, Iterable, Callable
  from collections import Sequence, Sized, Iterable, Callable


### Lemmatize

In [3]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``.

    Falls back to ``word`` itself when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

### Part of Speech Filter

In [4]:
def part_of_speech(list_input):
    """Keep only the words tagged as verbs, nouns, adjectives or adverbs.

    The tokens are joined with spaces, re-tokenized with
    ``nltk.word_tokenize`` and tagged with ``nltk.pos_tag``; words whose tag
    is in the allowed set are returned in order.

    The previous version ran the join/tokenize/tag steps inside the loop that
    accumulated the words, so the growing prefix was re-tagged on every
    iteration (quadratic work), and it raised ``UnboundLocalError`` on empty
    input. The text is now tagged once.

    Parameters
    ----------
    list_input : list of str
        Tokens of one document.

    Returns
    -------
    list of str
        Words whose POS tag is one of the selected tags; ``[]`` for empty
        input.
    """
    # nltk.download('averaged_perceptron_tagger')
    if not list_input:
        return []

    # Penn Treebank tags kept: verbs, nouns, adjectives, adverbs.
    pos_list = ['VB', 'VBZ', 'VBN', 'VBG', 'VBD', 'NN', 'NNS', 'JJ', 'RB']

    # NOTE(review): word_tokenize may split a token further than the caller's
    # tokenizer did, so the output words are nltk's tokens.
    words = nltk.word_tokenize(' '.join(list_input))
    return [word for word, tag in nltk.pos_tag(words) if tag in pos_list]

### Prepare Text for LDA

In [5]:
# Stop-word list: NLTK's English stop words plus a few project-specific extras.
# NOTE(review): 'would like' contains a space, while prepare_by_line compares
# one token at a time -- confirm this entry can ever match.
en_stop = list(nltk.corpus.stopwords.words('english'))
en_stop.extend(['would like', 'would', 'when', 'with'])

def prepare_by_line(text):
    """Turn one raw text into the list of tokens used for topic modelling.

    Steps, in order: tokenize, drop tokens of three characters or fewer,
    drop tokens found in ``en_stop``, lemmatize with ``get_lemma``, and --
    only when more than one token is left -- keep the words selected by
    ``part_of_speech``.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    list of str
        Processed tokens (possibly empty).
    """
    lemmas = [
        get_lemma(word)
        for word in tokenize(text)
        if len(word) > 3 and word not in en_stop
    ]
    if len(lemmas) <= 1:
        return lemmas
    return list(part_of_speech(lemmas))

def prepare_text_for_lda(df_column):
    """Tokenize every row of a one-column DataFrame for LDA.

    Each value of the first column is passed to ``prepare_by_line``. Rows
    that yield more than one token are kept; the others are discarded,
    reported on stdout and returned separately.

    Parameters
    ----------
    df_column : pandas.DataFrame
        Frame whose first column holds the raw texts.

    Returns
    -------
    text_data : list of list of str
        Token lists of the kept rows, in row order.
    discard_index : list of int
        Positional (0-based) row numbers of the discarded rows.
    discard_content : list
        Raw values of the discarded rows.
    """
    text_data = []
    discard_index = []
    discard_content = []
    # Read the first column positionally. The previous ``iloc[i][0]`` used an
    # integer key on a label-indexed row Series, which pandas deprecates.
    for i, line in enumerate(df_column.iloc[:, 0]):
        tokens = prepare_by_line(line)
        if len(tokens) > 1:
            text_data.append(tokens)
        else:
            discard_index.append(i)
            discard_content.append(line)
            print(i, '  ', line)
    return text_data, discard_index, discard_content

### LDA Model

In [6]:
def LDA_fit(text_data, num_topics):
    """Fit an LDA topic model on tokenized documents and build its pyLDAvis view.

    Side effects: writes ``corpus.pkl``, ``dictionary.gensim`` and
    ``model.gensim`` to the current directory (overwritten on every call) and
    prints the top five words of each topic.

    Parameters
    ----------
    text_data : list of list of str
        One token list per document.
    num_topics : int
        Number of topics to fit.

    Returns
    -------
    tuple
        ``(text_data, corpus, ldamodel, lda_display)`` where ``corpus`` is the
        bag-of-words form of ``text_data``, ``ldamodel`` the fitted model and
        ``lda_display`` the prepared pyLDAvis data.
    """
    # Create Dictionary & Corpus
    dictionary = corpora.Dictionary(text_data)
    dictionary.filter_extremes(no_below=3, no_above=0.5)
    corpus = [dictionary.doc2bow(text) for text in text_data]

    # Use a context manager so the file handle is closed deterministically.
    with open('corpus.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    dictionary.save('dictionary.gensim')

    # Fit LDA (fixed random_state keeps runs reproducible)
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=20,
                                               random_state=234)
    ldamodel.save('model.gensim')
    for topic in ldamodel.print_topics(num_words=5):
        print(topic)

    # LDA Visualization: built from the in-memory objects. Reloading the files
    # that were just written is unnecessary.
    lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
    return text_data, corpus, ldamodel, lda_display

### Output Dataframe with Topics

In [7]:
def combine_topic(model, corpus, df, filter_id, filter_topic_value):
    """Attach a ``topic`` column to ``df`` and move the filtered rows to the end.

    For every document in ``corpus`` the topic with the highest probability is
    taken from ``model[corpus]``. Those topics are assigned, in order, to the
    rows of ``df`` that are *not* listed in ``filter_id``. The rows listed in
    ``filter_id`` get ``filter_topic_value`` instead and are appended after
    the modelled rows. ``df`` itself is not modified.

    Parameters
    ----------
    model : object
        Indexable with ``corpus``; must yield, per document, a sequence of
        ``(topic_id, probability)`` pairs.
    corpus : object
        Corpus passed to ``model[...]``; must hold one document per kept row.
    df : pandas.DataFrame
        Original rows (kept and discarded).
    filter_id : list of int
        Positional indices of the rows that were excluded from the corpus.
    filter_topic_value : object
        Value written to ``topic`` for the excluded rows.

    Returns
    -------
    pandas.DataFrame
        Kept rows followed by excluded rows, with a fresh 0..n-1 index.
    """
    # Take the topic *id* of the most probable pair. The previous
    # ``np.argmax(row_list, axis=0)[1]`` returned the pair's position in the
    # list, which differs from the id whenever low-probability topics are
    # omitted from the list.
    topic_assigned = [
        max(doc_topics, key=lambda pair: pair[1])[0]
        for doc_topics in model[corpus]
    ]

    positions = list(filter_id)
    # ``assign`` returns new frames, so nothing is written into a slice of
    # ``df`` (avoids SettingWithCopyWarning).
    excluded = df.iloc[positions].assign(topic=filter_topic_value)
    kept = df.drop(df.index[positions]).assign(topic=topic_assigned)

    return pd.concat([kept, excluded]).reset_index(drop=True)

## Get Data and Run Model

In [8]:
# Load the survey workbook; the 'Likes', 'Dislikes' and 'Improvements'
# columns are each modelled separately below.
df = pd.read_excel('Copy of Verbatims.xlsx')

num_topics = 3

# Each visualization is saved right after its model is fitted, while
# which_column still names that column. Saving all three at the end used the
# last value ('Improvements') for every file name, so the three files
# overwrote one another.
which_column = 'Likes'
df_like = df.loc[:, [which_column]].dropna()
text_data, id_like, content_like = prepare_text_for_lda(df_like)
text_like, corpus_like, model_like, display_like = LDA_fit(text_data, num_topics)
pyLDAvis.save_html(display_like, 'topic-'+which_column.split()[-1]+'.html')

which_column = 'Dislikes'
df_dislike = df.loc[:, [which_column]].dropna()
text_data, id_dislike, content_dislike = prepare_text_for_lda(df_dislike)
text_dislike, corpus_dislike, model_dislike, display_dislike = LDA_fit(text_data, num_topics)
pyLDAvis.save_html(display_dislike, 'topic-'+which_column.split()[-1]+'.html')

which_column = 'Improvements'
df_imp = df.loc[:, [which_column]].dropna()
text_data, id_imp, content_imp = prepare_text_for_lda(df_imp)
text_imp, corpus_imp, model_imp, display_imp = LDA_fit(text_data, num_topics)
pyLDAvis.save_html(display_imp, 'topic-'+which_column.split()[-1]+'.html')

# pyLDAvis.display(display_like)

  _XMLParser.__init__(self, html, target, encoding)
  _XMLParser.__init__(self, html, target, encoding)
  _XMLParser.__init__(self, html, target, encoding)
  _XMLParser.__init__(self, html, target, encoding)
  _XMLParser.__init__(self, html, target, encoding)
  _XMLParser.__init__(self, html, target, encoding)
  _XMLParser.__init__(self, html, target, encoding)
  _XMLParser.__init__(self, html, target, encoding)
  _XMLParser.__init__(self, html, target, encoding)


1    none
7    Didn't have a bad odor
20    no dust
32    I like the scent
34    No odors
36    everything
45    Everything
47    i liked everything
50    consistency
58    It was clumping
64    LIGHT
86    low dust
90    No it is new
96    was lightweight
(0, '0.127*"easy" + 0.096*"scoop" + 0.080*"litter" + 0.057*"pour" + 0.053*"odor"')
(1, '0.159*"smell" + 0.096*"scent" + 0.062*"clean" + 0.058*"litter" + 0.051*"easy"')
(2, '0.194*"clump" + 0.150*"well" + 0.135*"dust" + 0.076*"great" + 0.068*"really"')


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


4    Nothing
7    Nothing
12    Nothing
17    no
20    nothing
21    nothing
28    Nothing I didn’t like.
29    Na
31    None.
32    none
33    Nothing
34    Nothing
35    nothing
39    none
41    Tracked out of box
43    nothing
44    None
46    nothing
48    nothing
49    none
51    nothing
52    none
53    none
55    a bit heavy
61    Smell
62    didn't like the scent
65    nothing at all
68    Nothing
69    nothing
75    Nothing
81    Nothing
86    Nothing
90    Nothing
(0, '0.090*"clump" + 0.083*"stick" + 0.065*"litter" + 0.063*"track" + 0.063*"also"')
(1, '0.153*"clump" + 0.108*"dislike" + 0.077*"well" + 0.066*"nothing" + 0.066*"urine"')
(2, '0.227*"scent" + 0.109*"strong" + 0.107*"litter" + 0.107*"really" + 0.074*"little"')


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


4    Nothing
7    No improvement
9    hold in odors better
12    Nothing
19    A bit firmwr and not as lose.
20    none
21    none
29    Perfect as is.
31    not sure
34    None
40    nothing at all
44    nothing
45    None
47    nothing
49    none
50    none
52    no idea
54    unsure
55    clumping
62    I didn’t like the smell.
63    none
69    Nothing
70    nothing
72    Now ne
74    More fragrance
79    would like more scent
81    Clump better
87    Na
90    I would like to be more clumpy
93    none
95    not sure
(0, '0.160*"make" + 0.093*"clump" + 0.067*"little" + 0.066*"fine" + 0.052*"track"')
(1, '0.235*"scent" + 0.100*"smell" + 0.086*"fragrance" + 0.084*"think" + 0.059*"litter"')
(2, '0.161*"le" + 0.086*"need" + 0.086*"improvement" + 0.073*"scoop" + 0.060*"clumping"')


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [9]:
# Attach the dominant topic to every modelled row; rows that were discarded
# during preprocessing get a per-column placeholder label instead.
final_like = combine_topic(model_like, corpus_like, df_like, id_like, 'None-Like')
# Label spelling fixed ('None-Disike' -> 'None-Dislike').
final_dislike = combine_topic(model_dislike, corpus_dislike, df_dislike, id_dislike, 'None-Dislike')
final_imp = combine_topic(model_imp, corpus_imp, df_imp, id_imp, 'None-Imp')

  result = getitem(key)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [10]:
# Write each result frame to its own sheet of a single workbook.
with pd.ExcelWriter('Topic Modelling-Result.xlsx') as writer:
    for sheet_name, frame in (('like', final_like),
                              ('dislike', final_dislike),
                              ('imp', final_imp)):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [11]:
# for i,topic in model_like.show_topics(formatted=False, num_topics=5, num_words=5):
#     a = topic
#     print(str(i))
#     print(topic)

0
[('easy', 0.12668732), ('scoop', 0.09605881), ('litter', 0.079764895), ('pour', 0.056900606), ('odor', 0.05289848)]
1
[('smell', 0.15874365), ('scent', 0.09579791), ('clean', 0.062003337), ('litter', 0.058434166), ('easy', 0.050696924)]
2
[('clump', 0.19439168), ('well', 0.15011688), ('dust', 0.13455021), ('great', 0.07555453), ('really', 0.06760639)]


In [12]:
# for i,topic in model_dislike.show_topics(formatted=False, num_topics=5, num_words=5):
#     a = topic
#     print(str(i))
#     print(topic)

0
[('clump', 0.08990982), ('stick', 0.08333188), ('litter', 0.06454768), ('track', 0.06339767), ('also', 0.06290527)]
1
[('clump', 0.1527611), ('dislike', 0.107528806), ('well', 0.077133484), ('nothing', 0.06595048), ('urine', 0.06581002)]
2
[('scent', 0.22685793), ('strong', 0.108979136), ('litter', 0.107217245), ('really', 0.10696874), ('little', 0.0742732)]


In [13]:
# for i,topic in model_imp.show_topics(formatted=False, num_topics=5, num_words=5):
#     a = topic
#     print(str(i))
#     print(topic)

0
[('make', 0.15967935), ('clump', 0.09318521), ('little', 0.06679565), ('fine', 0.06614576), ('track', 0.05159173)]
1
[('scent', 0.23459512), ('smell', 0.09985224), ('fragrance', 0.085822575), ('think', 0.08398967), ('litter', 0.058729425)]
2
[('le', 0.16142654), ('need', 0.08629921), ('improvement', 0.086147375), ('scoop', 0.07265034), ('clumping', 0.05960957)]
