# References:

https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [214]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re

In [215]:
df = pd.read_csv('data/movie_data.csv', encoding='utf-8')
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB
None


Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [216]:
df_test = df.iloc[1001:2000]
df = df.iloc[:1000]

# Data Preprocessing

In [217]:
import nltk
from pattern.en import tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

from gensim.parsing.preprocessing import STOPWORDS


def tokenize_text(text):
    tokens = nltk.word_tokenize(text) 
    tokens = [token.strip() for token in tokens]
    return tokens

def pos_tag_text(text):
    
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None

    tagged_text = nltk.pos_tag(nltk.word_tokenize(text))
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in
                         tagged_text]
    return tagged_lower_text


def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in STOPWORDS]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

def remove_html_emoji(text):
    # Remove html
    text = re.sub('<[^>]*', '', text)
    # Remove emoji
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

def lemmatize_text(text):
    
    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
                         else word                     
                         for word, pos_tag in pos_tagged_text]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_tokens

def filter_text(text):
    
    # Tokenize and get pos words
    pos_words = pos_tag_text(text)
    
    processed_words = set()
    
    for word, tag in pos_words:
        if (tag == 'n') or (tag == 'J'):
            if len(word) > 3:
                
                word = wnl.lemmatize(word)
                
                processed_words.add(word)
    
    return list(processed_words)

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  text = (re.sub('[\W]+', ' ', text.lower()) +


In [218]:
# Data cleansing

def preprocessor(text):
    # Remove html emoji
    text = remove_html_emoji(text)
    # Remove stopwords
    text = remove_stopwords(text)
    # Lemmatize text
    text_list = filter_text(text)
    return text_list

# Example
preprocessor(df.loc[0, 'review'][-50:])

['brazil']

In [219]:
processed_docs = df['review'].apply(preprocessor)

processed_docs[:10]

0    [vote, grace, mark, carroll, screenplay, title...
1    [watch, hour, kris, line, motion, holy, nonsen...
2    [try, success, bruckheimer, jersey, script, ch...
3    [watch, people, strutters, time, mirror, movie...
4    [change, film, chorus, line, meaning, premise,...
5    [plot, adult, history, ton, swim, ghost, year,...
6    [liveliness, film, walk, backstage, doll, game...
7    [line, course, tie, family, figuring, step, fr...
8    [film, chavez, president, story, world, people...
9    [place, film, walker, action, janine, mouse, r...
Name: review, dtype: object

# Create Dictionary

In [220]:
import gensim

# Create dictionary

dictionary = gensim.corpora.Dictionary(processed_docs)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 area
1 book
2 brazil
3 carroll
4 case
5 charge
6 christopher
7 class
8 connecticut
9 cover
10 crime


In [221]:
# Filter out tokens that appear in

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100_000)

# NOTE:
# - Less than 15 documents (absolute number) or
# - more than 0.5 documents (fraction of total corpus size, not absolute number)
# - Keep only the first 100_000 most frequent tokens

# Generate Model with Bag of Words

In [222]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Preview Bag of Words

bow_doc_50 = bow_corpus[50]

for i in range(len(bow_doc_50)):
    print(f"Word {bow_doc_50[i][0]} ('{dictionary[bow_doc_50[i][0]]}') appears {bow_doc_50[i][1]}")


Word 35 ('minute') appears 1
Word 81 ('life') appears 1
Word 123 ('video') appears 1
Word 136 ('work') appears 1
Word 234 ('daughter') appears 1
Word 243 ('today') appears 1


In [223]:
lda_bow = gensim.models.LdaMulticore(bow_corpus,
                                     num_topics=10,
                                     id2word=dictionary,
                                     passes=2, workers=2)

# Print topic
for idx, topic in lda_bow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.017*"time" + 0.016*"world" + 0.015*"character" + 0.014*"story" + 0.012*"year" + 0.012*"action" + 0.012*"people" + 0.011*"thing" + 0.011*"woman" + 0.011*"family"
Topic: 1 
Words: 0.025*"time" + 0.016*"story" + 0.015*"character" + 0.013*"plot" + 0.013*"scene" + 0.012*"year" + 0.011*"fact" + 0.009*"thing" + 0.009*"horror" + 0.009*"actor"
Topic: 2 
Words: 0.034*"character" + 0.024*"story" + 0.021*"time" + 0.020*"life" + 0.017*"thing" + 0.016*"people" + 0.015*"actor" + 0.011*"director" + 0.011*"role" + 0.011*"watch"
Topic: 3 
Words: 0.023*"year" + 0.019*"time" + 0.017*"actor" + 0.013*"scene" + 0.013*"kind" + 0.012*"people" + 0.010*"child" + 0.010*"role" + 0.010*"thing" + 0.009*"performance"
Topic: 4 
Words: 0.031*"time" + 0.026*"scene" + 0.016*"effect" + 0.014*"actor" + 0.014*"story" + 0.014*"director" + 0.012*"thing" + 0.012*"plot" + 0.011*"shot" + 0.011*"character"
Topic: 5 
Words: 0.024*"time" + 0.021*"story" + 0.017*"scene" + 0.015*"character" + 0.014*"moment" + 0.013

In [224]:
# Try Model

print(processed_docs[50])
print()
for idx, score in sorted(lda_bow[bow_corpus[50]],
                         key=lambda tup: -1 * tup[1]):
    print(f"Score: {score}\t \nTopic {idx}: \n{lda_bow.print_topic(idx, 10)}\n")

['work', 'workout', 'purchase', 'minute', 'blessing', 'easy', 'video', 'sweat', 'weight', 'exercise', 'today', 'circulation', 'pray', 'daughter', 'born', 'routine', 'channel', 'cable', 'life']

Score: 0.8713967204093933	 
Topic 7: 
0.029*"character" + 0.021*"time" + 0.020*"thing" + 0.019*"story" + 0.018*"scene" + 0.016*"year" + 0.014*"star" + 0.014*"actor" + 0.014*"life" + 0.013*"love"

Score: 0.01429175678640604	 
Topic 9: 
0.022*"time" + 0.021*"life" + 0.020*"people" + 0.019*"work" + 0.016*"character" + 0.016*"story" + 0.014*"look" + 0.013*"comedy" + 0.013*"watch" + 0.011*"performance"

Score: 0.01428973488509655	 
Topic 5: 
0.024*"time" + 0.021*"story" + 0.017*"scene" + 0.015*"character" + 0.014*"moment" + 0.013*"play" + 0.013*"video" + 0.012*"watch" + 0.011*"look" + 0.011*"cast"

Score: 0.014289528131484985	 
Topic 8: 
0.025*"watch" + 0.021*"time" + 0.021*"character" + 0.018*"story" + 0.016*"scene" + 0.014*"action" + 0.013*"thing" + 0.013*"minute" + 0.011*"case" + 0.011*"plot"

Sco

# Generate Model with TF-IDF

In [225]:
from pprint import pprint

tfidf = gensim.models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]

# Preview
for doc in tfidf_corpus:
    pprint(doc)
    break

[(0, 0.23803442652931867),
 (1, 0.15734207500678157),
 (2, 0.16006211377497004),
 (3, 0.20380103752239218),
 (4, 0.24174948604559954),
 (5, 0.19297610351571345),
 (6, 0.17980319458648258),
 (7, 0.23803442652931867),
 (8, 0.21234460410405814),
 (9, 0.1361064882546078),
 (10, 0.1491046976763981),
 (11, 0.24174948604559954),
 (12, 0.16909050652408372),
 (13, 0.1981344879510331),
 (14, 0.1913544908305697),
 (15, 0.1522596485022355),
 (16, 0.1981344879510331),
 (17, 0.17724439806243122),
 (18, 0.14258110830068366),
 (19, 0.23454466453554773),
 (20, 0.1852895767384028),
 (21, 0.06667932243151192),
 (22, 0.24174948604559954),
 (23, 0.24174948604559954),
 (24, 0.18977731152742275),
 (25, 0.21970313316476817),
 (26, 0.09410216857912154)]


In [226]:
lda_tfidf = gensim.models.LdaMulticore(tfidf_corpus, num_topics=10,
                                      id2word=dictionary, passes=2,
                                      workers=4)

# Print topic
for idx, topic in lda_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.010*"lot" + 0.009*"suit" + 0.008*"time" + 0.008*"night" + 0.007*"star" + 0.007*"comedy" + 0.007*"horror" + 0.007*"thing" + 0.007*"woman" + 0.006*"character"
Topic: 1 
Words: 0.010*"book" + 0.009*"change" + 0.009*"story" + 0.009*"love" + 0.008*"night" + 0.008*"family" + 0.007*"actor" + 0.007*"life" + 0.007*"joke" + 0.007*"character"
Topic: 2 
Words: 0.009*"scene" + 0.009*"plot" + 0.009*"minute" + 0.009*"surprise" + 0.008*"life" + 0.008*"effect" + 0.008*"recommend" + 0.008*"story" + 0.008*"problem" + 0.008*"time"
Topic: 3 
Words: 0.010*"story" + 0.010*"watch" + 0.008*"action" + 0.008*"series" + 0.008*"comedy" + 0.008*"thing" + 0.008*"character" + 0.008*"audience" + 0.007*"scene" + 0.007*"year"
Topic: 4 
Words: 0.010*"people" + 0.009*"watch" + 0.009*"plot" + 0.009*"john" + 0.008*"sense" + 0.008*"scene" + 0.008*"character" + 0.007*"time" + 0.007*"reason" + 0.007*"actor"
Topic: 5 
Words: 0.010*"direction" + 0.010*"script" + 0.009*"time" + 0.009*"scene" + 0.009*"parent" + 

In [227]:
# Try Model

print(processed_docs[50])
print()
for idx, score in sorted(lda_tfidf[bow_corpus[50]],
                         key=lambda tup: -1 * tup[1]):
    print(f"Score: {score}\t \nTopic {idx}: \n{lda_tfidf.print_topic(idx, 10)}\n")

['work', 'workout', 'purchase', 'minute', 'blessing', 'easy', 'video', 'sweat', 'weight', 'exercise', 'today', 'circulation', 'pray', 'daughter', 'born', 'routine', 'channel', 'cable', 'life']

Score: 0.8713927865028381	 
Topic 9: 
0.009*"manner" + 0.009*"work" + 0.009*"life" + 0.008*"production" + 0.008*"director" + 0.008*"people" + 0.008*"play" + 0.007*"character" + 0.007*"actor" + 0.007*"chance"

Score: 0.014291995204985142	 
Topic 2: 
0.009*"scene" + 0.009*"plot" + 0.009*"minute" + 0.009*"surprise" + 0.008*"life" + 0.008*"effect" + 0.008*"recommend" + 0.008*"story" + 0.008*"problem" + 0.008*"time"

Score: 0.014291148632764816	 
Topic 7: 
0.010*"watch" + 0.009*"character" + 0.008*"waste" + 0.008*"time" + 0.008*"story" + 0.008*"scene" + 0.008*"line" + 0.008*"look" + 0.008*"reason" + 0.008*"moment"

Score: 0.014290020801126957	 
Topic 8: 
0.011*"year" + 0.010*"actor" + 0.010*"people" + 0.009*"person" + 0.009*"story" + 0.008*"time" + 0.008*"hour" + 0.008*"character" + 0.008*"place" + 0

# Compute Model Perplexity and Coherence Score

In [228]:
from gensim.models import CoherenceModel

# Compute Perplexity
print('\nPerplexity: ', lda_bow.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_bow = CoherenceModel(model=lda_bow, texts=processed_docs.values,
                                     dictionary=dictionary, coherence='c_v')
coherence_bow = coherence_model_bow.get_coherence()
print('\nCoherence Score: ', coherence_bow)


Perplexity:  -6.3012268998269905

Coherence Score:  0.32062085403895685


In [229]:
# Compute Perplexity
print('\nPerplexity: ', lda_tfidf.log_perplexity(tfidf_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_tfidf = CoherenceModel(model=lda_tfidf, texts=processed_docs.values,
                                     dictionary=dictionary, coherence='c_v')
coherence_tfidf = coherence_model_tfidf.get_coherence()
print('\nCoherence Score: ', coherence_tfidf)


Perplexity:  -8.115745296839096

Coherence Score:  0.2823989186474825


# Optimization

In [230]:
def get_optimum_lda(dictionary, corpus, texts, limit,
                    start=2, step=1, get_result=False):
    coherence_values = []
    
    for n in range(start, limit, step):
        lda = gensim.models.LdaMulticore(corpus=corpus,
                                         num_topics=n,
                                         id2word=dictionary)
        
        # Create coherence
        coherence_model = CoherenceModel(model=lda, 
                                         texts=texts,
                                         dictionary=dictionary, 
                                         coherence='c_v')
        coherence_values.append(coherence_model.get_coherence())
#         print(f"Idx {n} {coherence_model.get_coherence()}")
    
    
    opt_num_topics = start + coherence_values.index(max(coherence_values))
    
    lda_opt = gensim.models.LdaMulticore(corpus=corpus,
                                         num_topics=opt_num_topics,
                                         id2word=dictionary)
    
    if get_result:
        print(coherence_values)
    
    return lda_opt



In [231]:
lda_opt_bow = get_optimum_lda(dictionary, bow_corpus,
                              processed_docs.values,
                              10, get_result=True)

# Print topic
for idx, topic in lda_opt_bow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

[0.335416263411897, 0.33292674568743263, 0.3237980748399089, 0.33074832788538644, 0.3207986336690229, 0.3255201381235756, 0.3283921503793936, 0.3270229709566922]
Topic: 0 
Words: 0.025*"time" + 0.017*"character" + 0.017*"story" + 0.017*"scene" + 0.014*"actor" + 0.013*"thing" + 0.012*"life" + 0.011*"plot" + 0.011*"people" + 0.010*"watch"
Topic: 1 
Words: 0.021*"time" + 0.019*"character" + 0.019*"story" + 0.015*"people" + 0.013*"year" + 0.012*"thing" + 0.012*"scene" + 0.011*"director" + 0.011*"life" + 0.010*"watch"


In [232]:
lda_opt_tfidf = get_optimum_lda(dictionary, tfidf_corpus,
                                  processed_docs.values,
                                  10, get_result=True)

# Print topic
for idx, topic in lda_opt_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

[0.31333278371885903, 0.29888059485296486, 0.32294370584400506, 0.2886353863454591, 0.3160182194573718, 0.30392786395057253, 0.30562898273690253, 0.3114187071451975]
Topic: 0 
Words: 0.009*"watch" + 0.009*"scene" + 0.008*"character" + 0.007*"year" + 0.007*"time" + 0.007*"music" + 0.007*"thing" + 0.007*"work" + 0.007*"people" + 0.006*"life"
Topic: 1 
Words: 0.009*"life" + 0.008*"plot" + 0.008*"thing" + 0.008*"time" + 0.008*"story" + 0.007*"actor" + 0.007*"role" + 0.007*"scene" + 0.007*"performance" + 0.007*"watch"
Topic: 2 
Words: 0.009*"family" + 0.008*"horror" + 0.008*"time" + 0.008*"story" + 0.007*"character" + 0.007*"watch" + 0.007*"actor" + 0.007*"effect" + 0.007*"line" + 0.007*"people"
Topic: 3 
Words: 0.009*"story" + 0.009*"people" + 0.008*"actor" + 0.008*"time" + 0.008*"character" + 0.007*"scene" + 0.006*"look" + 0.006*"plot" + 0.006*"world" + 0.006*"thing"


# Labeling Topic

In [233]:
test = df_test['review'].apply(preprocessor)

In [237]:
def predict_topic(text):
    text = preprocessor(text)
    bow_vector = dictionary.doc2bow(text)
    
    result = sorted(lda_opt_tfidf[bow_vector], 
                    key=lambda x: -1*x[1])[0][0]
    
    return result
    
# Extract keywords into a dictionary or list
topics_dict = {}
for topic_num, topic in lda_opt_tfidf.show_topics(num_topics=10,
                                                num_words=10,
                                                formatted=False):
    keywords = [word for word, _ in topic]
    topics_dict[topic_num] = keywords

df_test['topic'] = df_test['review'].apply(predict_topic)
df_test['key_topic'] = df_test['topic'].apply(lambda x: topics_dict[x])

In [238]:
df_test

Unnamed: 0,review,sentiment,topic,key_topic
1001,I'd have little to add to bowlofsoul23's bull'...,0,0,"[watch, scene, character, year, time, music, t..."
1002,We just saw this film previewed before release...,1,1,"[life, plot, thing, time, story, actor, role, ..."
1003,After reading more than my fair share of revie...,1,0,"[watch, scene, character, year, time, music, t..."
1004,"I awake suddenly, aware that I'm drooling onto...",0,2,"[family, horror, time, story, character, watch..."
1005,The 700 Club gives a great perspective on worl...,1,0,"[watch, scene, character, year, time, music, t..."
...,...,...,...,...
1995,I was expecting this to be just like the other...,1,0,"[watch, scene, character, year, time, music, t..."
1996,...........as I was when I saw this movie) I w...,1,0,"[watch, scene, character, year, time, music, t..."
1997,Has anyone found a way to purchase copies of t...,1,0,"[watch, scene, character, year, time, music, t..."
1998,Perhaps once in a generation a film comes alon...,1,0,"[watch, scene, character, year, time, music, t..."


In [239]:
df_test['topic'].value_counts()

0    317
1    316
3    233
2    133
Name: topic, dtype: int64