In [3]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
nltk.download('wordnet')
import matplotlib.pyplot as plt
from gensim.models.coherencemodel import CoherenceModel

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Yang\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def lemmatize_stemming(text, stemmer):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
    
def preprocess(text, stemmer):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    result = []
    for token in word_tokenize(text.lower()):
        eng_stopwords = set(stopwords.words('english'))
        if token not in eng_stopwords and len(token) >= 3:
            result.append(lemmatize_stemming(token, stemmer))
    return result

In [5]:
def lda_model(i, bow_corpus, dictionary):
    lda_model = gensim.models.ldamodel.LdaModel(bow_corpus, 
                                            num_topics=i,
                                            id2word=dictionary, 
                                            passes=2)
    return lda_model

In [6]:
def scores_plot():
    scores = []
    for i in range(1,20):
        lda_model = gensim.models.ldamodel.LdaModel(bow_corpus, 
                                                num_topics=i, 
                                                id2word=dictionary, 
                                                passes=2)
        cm = CoherenceModel(model=lda_model,texts = texts, corpus=bow_corpus, coherence='c_v')
        coherence = cm.get_coherence()
        scores.append(coherence)
    limit=20; start=1; step=1;
    x = range(start, limit, step)
    plt.plot(x, scores)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")

In [7]:
def format_topics_sentences(lda_model, bow_corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(lda_model[bow_corpus]):
        row = row_list[0] if lda_model.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = lda_model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [8]:
def sentence_keyword(final_result):
    con = final_result.groupby("Dominant_Topic")
    new = pd.DataFrame()
    for i, grp in con:
        new = pd.concat([new, grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                                axis=0)
    new.reset_index(drop=True, inplace=True)
    new.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
    new = new.drop(['Topic_Perc_Contrib'], axis=1)
    return new

In [11]:
data = pd.read_csv("review_data_cleaned.csv")
text = data['review']
stemmer = PorterStemmer()
result = []
for i in range(len(text)):
    result.append(preprocess(text[i], stemmer))
dictionary = gensim.corpora.Dictionary(result)
bow_corpus = [dictionary.doc2bow(doc) for doc in result]

In [17]:
# scores_plot() # From the graph, we can see 10 is the best optimal number of topics
lda_model1 = lda_model(10, bow_corpus, dictionary)

In [13]:
# print 10 topics
for idx, topic in lda_model1.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.031*"museum" + 0.017*"place" + 0.016*"ever" + 0.016*"visit" + 0.015*"park" + 0.014*"best" + 0.012*"get" + 0.011*"one" + 0.011*"experi" + 0.011*"like"
Topic: 1 
Words: 0.029*"venu" + 0.018*"great" + 0.016*"see" + 0.015*"time" + 0.014*"place" + 0.013*"concert" + 0.012*"first" + 0.012*"go" + 0.011*"music" + 0.011*"seat"
Topic: 2 
Words: 0.026*"instal" + 0.026*"work" + 0.023*"home" + 0.022*"servic" + 0.019*"great" + 0.015*"system" + 0.015*"job" + 0.013*"profession" + 0.012*"use" + 0.011*"time"
Topic: 3 
Words: 0.046*"show" + 0.023*"see" + 0.021*"perform" + 0.020*"great" + 0.018*"theatr" + 0.018*"theater" + 0.017*"amaz" + 0.016*"product" + 0.013*"play" + 0.012*"talent"
Topic: 4 
Words: 0.033*"place" + 0.026*"movi" + 0.025*"theater" + 0.023*"great" + 0.018*"love" + 0.017*"nice" + 0.015*"food" + 0.014*"good" + 0.014*"come" + 0.011*"like"
Topic: 5 
Words: 0.022*"seat" + 0.021*"theater" + 0.015*"great" + 0.015*"theatr" + 0.011*"drink" + 0.011*"get" + 0.010*"bad" + 0.009*"see"

In [14]:
# Calculate each document sentence dominant topic and contribution
final_result = format_topics_sentences(lda_model1,bow_corpus, text )
final_result.head()

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,review
0,1.0,0.3891,"venu, great, see, time, place, concert, first,...",Clean and big spaces! Chairs are super comfy a...
1,8.0,0.6696,"show, night, go, time, last, ticket, see, get,...",My favorite theater severely disappointed me t...
2,4.0,0.5384,"place, movi, theater, great, love, nice, food,...",Cinemark has made a ton of improvements since ...
3,0.0,0.3492,"museum, place, ever, visit, park, best, get, o...",it's not the best experience you'll ever have ...
4,4.0,0.6198,"place, movi, theater, great, love, nice, food,...",I'm updating my review to two stars. This is ...


In [15]:
# Find the most representative sentence to the topics
sentence_keyword1 = sentence_keyword(final_result)
sentence_keyword1.head()

Unnamed: 0,Topic_Num,Keywords,Text
0,0.0,"museum, place, ever, visit, park, best, get, o...",A fine little museum with super nice people wo...
1,1.0,"venu, great, see, time, place, concert, first,...",Best place in town to see and hear a good even...
2,2.0,"instal, work, home, servic, great, system, job...",Kevin and his crew were awesome !!!!!\nGreat C...
3,3.0,"show, see, perform, great, theatr, theater, am...","I found the theatre quaint, personal and uniqu..."
4,4.0,"place, movi, theater, great, love, nice, food,...",Beautiful place to spend time with family and ...
