# References:

https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re

In [118]:
df = pd.read_csv('data/movie_data.csv', encoding='utf-8')
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB
None


Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [119]:
df = df.iloc[:1000]

# Data Preprocessing

In [120]:
import nltk
from pattern.en import tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

from gensim.parsing.preprocessing import STOPWORDS


def tokenize_text(text):
    tokens = nltk.word_tokenize(text) 
    tokens = [token.strip() for token in tokens]
    return tokens

def pos_tag_text(text):
    
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None

    tagged_text = nltk.pos_tag(nltk.word_tokenize(text))
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in
                         tagged_text]
    return tagged_lower_text


def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in STOPWORDS]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

def remove_html_emoji(text):
    # Remove html
    text = re.sub('<[^>]*', '', text)
    # Remove emoji
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

def lemmatize_text(text):
    
    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
                         else word                     
                         for word, pos_tag in pos_tagged_text]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_tokens

def filter_text(text):
    
    # Tokenize and get pos words
    pos_words = pos_tag_text(text)
    
    processed_words = set()
    
    for word, tag in pos_words:
        if (tag == 'n') or (tag == 'J'):
            if len(word) > 3:
                
                word = wnl.lemmatize(word)
                
                processed_words.add(word)
    
    return list(processed_words)

In [121]:
# Data cleansing

def preprocessor(text):
    # Remove html emoji
    text = remove_html_emoji(text)
    # Remove stopwords
    text = remove_stopwords(text)
    # Lemmatize text
    text_list = filter_text(text)
    return text_list

# Example
preprocessor(df.loc[0, 'review'][-50:])

['brazil']

In [122]:
processed_docs = df['review'].apply(preprocessor)

processed_docs[:10]

0    [vote, grace, mark, carroll, screenplay, title...
1    [watch, hour, kris, line, motion, holy, nonsen...
2    [try, success, bruckheimer, jersey, script, ch...
3    [watch, people, strutters, time, mirror, movie...
4    [change, film, chorus, line, meaning, premise,...
5    [plot, adult, history, ton, swim, ghost, year,...
6    [liveliness, film, walk, backstage, doll, game...
7    [line, course, tie, family, figuring, step, fr...
8    [film, chavez, president, story, world, people...
9    [place, film, walker, action, janine, mouse, r...
Name: review, dtype: object

# Create Dictionary

In [124]:
import gensim

# Create dictionary

dictionary = gensim.corpora.Dictionary(processed_docs)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 area
1 book
2 brazil
3 carroll
4 case
5 charge
6 christopher
7 class
8 connecticut
9 cover
10 crime


In [125]:
# Filter out tokens that appear in

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100_000)

# NOTE:
# - Less than 15 documents (absolute number) or
# - more than 0.5 documents (fraction of total corpus size, not absolute number)
# - Keep only the first 100_000 most frequent tokens

# Generate Model with Bag of Words

In [127]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Preview Bag of Words

bow_doc_50 = bow_corpus[50]

for i in range(len(bow_doc_50)):
    print(f"Word {bow_doc_50[i][0]} ('{dictionary[bow_doc_50[i][0]]}') appears {bow_doc_50[i][1]}")


Word 35 ('minute') appears 1
Word 81 ('life') appears 1
Word 123 ('video') appears 1
Word 136 ('work') appears 1
Word 234 ('daughter') appears 1
Word 243 ('today') appears 1


In [128]:
lda_bow = gensim.models.LdaMulticore(bow_corpus,
                                     num_topics=10,
                                     id2word=dictionary,
                                     passes=2, workers=2)

# Print topic
for idx, topic in lda_bow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.026*"story" + 0.023*"time" + 0.023*"character" + 0.018*"watch" + 0.015*"scene" + 0.015*"actor" + 0.013*"thing" + 0.012*"life" + 0.012*"year" + 0.011*"world"
Topic: 1 
Words: 0.020*"horror" + 0.018*"idea" + 0.018*"plot" + 0.015*"year" + 0.014*"thing" + 0.013*"watch" + 0.013*"fan" + 0.013*"performance" + 0.012*"time" + 0.012*"people"
Topic: 2 
Words: 0.027*"time" + 0.022*"role" + 0.018*"people" + 0.018*"plot" + 0.016*"scene" + 0.014*"character" + 0.013*"action" + 0.012*"watch" + 0.012*"cast" + 0.011*"fan"
Topic: 3 
Words: 0.024*"story" + 0.022*"scene" + 0.020*"actor" + 0.019*"time" + 0.016*"director" + 0.016*"character" + 0.015*"people" + 0.014*"life" + 0.012*"year" + 0.012*"thing"
Topic: 4 
Words: 0.024*"time" + 0.022*"director" + 0.021*"character" + 0.020*"story" + 0.018*"scene" + 0.014*"actor" + 0.013*"comedy" + 0.012*"people" + 0.012*"line" + 0.012*"performance"
Topic: 5 
Words: 0.029*"time" + 0.022*"story" + 0.019*"life" + 0.018*"character" + 0.016*"people" + 0.01

In [140]:
# Try Model

print(processed_docs[50])
print()
for idx, score in sorted(lda_bow[bow_corpus[50]],
                         key=lambda tup: -1 * tup[1]):
    print(f"Score: {score}\t \nTopic {idx}: \n{lda_bow.print_topic(idx, 10)}\n")

['work', 'workout', 'purchase', 'minute', 'blessing', 'easy', 'video', 'sweat', 'weight', 'exercise', 'today', 'circulation', 'pray', 'daughter', 'born', 'routine', 'channel', 'cable', 'life']

Score: 0.871400773525238	 
Topic 0: 
0.026*"story" + 0.023*"time" + 0.023*"character" + 0.018*"watch" + 0.015*"scene" + 0.015*"actor" + 0.013*"thing" + 0.012*"life" + 0.012*"year" + 0.011*"world"

Score: 0.014290498569607735	 
Topic 3: 
0.024*"story" + 0.022*"scene" + 0.020*"actor" + 0.019*"time" + 0.016*"director" + 0.016*"character" + 0.015*"people" + 0.014*"life" + 0.012*"year" + 0.012*"thing"

Score: 0.014290282502770424	 
Topic 9: 
0.024*"time" + 0.022*"character" + 0.020*"thing" + 0.016*"scene" + 0.015*"plot" + 0.014*"life" + 0.012*"story" + 0.010*"minute" + 0.009*"watch" + 0.009*"year"

Score: 0.014289828017354012	 
Topic 5: 
0.029*"time" + 0.022*"story" + 0.019*"life" + 0.018*"character" + 0.016*"people" + 0.012*"world" + 0.011*"performance" + 0.011*"place" + 0.011*"scene" + 0.011*"serie

# Generate Model with TF-IDF

In [136]:
from pprint import pprint

tfidf = gensim.models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]

# Preview
for doc in tfidf_corpus:
    pprint(doc)
    break

[(0, 0.23803442652931867),
 (1, 0.15734207500678157),
 (2, 0.16006211377497004),
 (3, 0.20380103752239218),
 (4, 0.24174948604559954),
 (5, 0.19297610351571345),
 (6, 0.17980319458648258),
 (7, 0.23803442652931867),
 (8, 0.21234460410405814),
 (9, 0.1361064882546078),
 (10, 0.1491046976763981),
 (11, 0.24174948604559954),
 (12, 0.16909050652408372),
 (13, 0.1981344879510331),
 (14, 0.1913544908305697),
 (15, 0.1522596485022355),
 (16, 0.1981344879510331),
 (17, 0.17724439806243122),
 (18, 0.14258110830068366),
 (19, 0.23454466453554773),
 (20, 0.1852895767384028),
 (21, 0.06667932243151192),
 (22, 0.24174948604559954),
 (23, 0.24174948604559954),
 (24, 0.18977731152742275),
 (25, 0.21970313316476817),
 (26, 0.09410216857912154)]


In [138]:
lda_tfidf = gensim.models.LdaMulticore(tfidf_corpus, num_topics=10,
                                      id2word=dictionary, passes=2,
                                      workers=4)

# Print topic
for idx, topic in lda_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.010*"character" + 0.009*"relationship" + 0.009*"scene" + 0.008*"watch" + 0.008*"plot" + 0.008*"story" + 0.008*"time" + 0.008*"work" + 0.007*"director" + 0.007*"people"
Topic: 1 
Words: 0.011*"scene" + 0.010*"character" + 0.010*"reason" + 0.010*"plot" + 0.009*"actor" + 0.009*"people" + 0.009*"time" + 0.009*"story" + 0.008*"shot" + 0.008*"woman"
Topic: 2 
Words: 0.010*"time" + 0.009*"story" + 0.009*"performance" + 0.009*"character" + 0.009*"word" + 0.008*"comedy" + 0.008*"day" + 0.007*"situation" + 0.007*"help" + 0.007*"scene"
Topic: 3 
Words: 0.009*"actor" + 0.008*"watch" + 0.008*"life" + 0.008*"character" + 0.008*"time" + 0.008*"soundtrack" + 0.007*"quality" + 0.007*"thing" + 0.007*"love" + 0.007*"people"
Topic: 4 
Words: 0.011*"year" + 0.009*"idea" + 0.009*"effect" + 0.008*"attempt" + 0.008*"fan" + 0.008*"time" + 0.008*"thing" + 0.008*"video" + 0.007*"animation" + 0.007*"story"
Topic: 5 
Words: 0.011*"show" + 0.010*"comedy" + 0.009*"family" + 0.009*"plot" + 0.009*"p

In [142]:
# Try Model

print(processed_docs[50])
print()
for idx, score in sorted(lda_tfidf[bow_corpus[50]],
                         key=lambda tup: -1 * tup[1]):
    print(f"Score: {score}\t \nTopic {idx}: \n{lda_tfidf.print_topic(idx, 10)}\n")

['work', 'workout', 'purchase', 'minute', 'blessing', 'easy', 'video', 'sweat', 'weight', 'exercise', 'today', 'circulation', 'pray', 'daughter', 'born', 'routine', 'channel', 'cable', 'life']

Score: 0.8713915348052979	 
Topic 1: 
0.011*"scene" + 0.010*"character" + 0.010*"reason" + 0.010*"plot" + 0.009*"actor" + 0.009*"people" + 0.009*"time" + 0.009*"story" + 0.008*"shot" + 0.008*"woman"

Score: 0.014291523024439812	 
Topic 4: 
0.011*"year" + 0.009*"idea" + 0.009*"effect" + 0.008*"attempt" + 0.008*"fan" + 0.008*"time" + 0.008*"thing" + 0.008*"video" + 0.007*"animation" + 0.007*"story"

Score: 0.01429105643182993	 
Topic 8: 
0.008*"star" + 0.008*"story" + 0.008*"life" + 0.008*"role" + 0.007*"case" + 0.007*"time" + 0.007*"series" + 0.007*"director" + 0.007*"scene" + 0.006*"watch"

Score: 0.014290460385382175	 
Topic 2: 
0.010*"time" + 0.009*"story" + 0.009*"performance" + 0.009*"character" + 0.009*"word" + 0.008*"comedy" + 0.008*"day" + 0.007*"situation" + 0.007*"help" + 0.007*"scene"


# Compute Model Perplexity and Coherence Score

In [143]:
from gensim.models import CoherenceModel

# Compute Perplexity
print('\nPerplexity: ', lda_bow.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_bow = CoherenceModel(model=lda_bow, texts=processed_docs.values,
                                     dictionary=dictionary, coherence='c_v')
coherence_bow = coherence_model_bow.get_coherence()
print('\nCoherence Score: ', coherence_bow)


Perplexity:  -6.290074677331923

Coherence Score:  0.3334874597832685


In [149]:
# Compute Perplexity
print('\nPerplexity: ', lda_tfidf.log_perplexity(tfidf_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_tfidf = CoherenceModel(model=lda_tfidf, texts=processed_docs.values,
                                     dictionary=dictionary, coherence='c_v')
coherence_tfidf = coherence_model_tfidf.get_coherence()
print('\nCoherence Score: ', coherence_tfidf)


Perplexity:  -8.11301486126665

Coherence Score:  0.28579792812123583


# Optimization

In [167]:
def get_optimum_lda(dictionary, corpus, texts, limit,
                    start=2, step=1, get_result=False):
    coherence_values = []
    
    for n in range(start, limit, step):
        lda = gensim.models.LdaMulticore(corpus=corpus,
                                         num_topics=n,
                                         id2word=dictionary)
        
        # Create coherence
        coherence_model = CoherenceModel(model=lda, 
                                         texts=texts,
                                         dictionary=dictionary, 
                                         coherence='c_v')
        coherence_values.append(coherence_model.get_coherence())
#         print(f"Idx {n} {coherence_model.get_coherence()}")
    
    
    opt_num_topics = start + coherence_values.index(max(coherence_values))
    
    lda_opt = gensim.models.LdaMulticore(corpus=corpus,
                                         num_topics=opt_num_topics,
                                         id2word=dictionary)
    
    if get_result:
        print(coherence_values)
    
    return lda_opt



In [168]:
lda_opt_bow = get_optimum_lda(dictionary, bow_corpus,
                              processed_docs.values,
                              10, get_result=True)

# Print topic
for idx, topic in lda_opt_bow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Idx 2 0.3432437694752679
Idx 3 0.3437149611853181
Idx 4 0.35237707500157195
Idx 5 0.3477098253770904
Idx 6 0.32064933979687854
Idx 7 0.33160943888702954
Idx 8 0.32325571337396364
Idx 9 0.33171903031987365
[0.3432437694752679, 0.3437149611853181, 0.35237707500157195, 0.3477098253770904, 0.32064933979687854, 0.33160943888702954, 0.32325571337396364, 0.33171903031987365]
Topic: 0 
Words: 0.019*"character" + 0.018*"time" + 0.017*"story" + 0.015*"plot" + 0.013*"life" + 0.012*"thing" + 0.011*"people" + 0.011*"actor" + 0.010*"director" + 0.010*"scene"
Topic: 1 
Words: 0.025*"time" + 0.022*"story" + 0.018*"scene" + 0.017*"character" + 0.016*"watch" + 0.014*"year" + 0.011*"people" + 0.011*"thing" + 0.011*"director" + 0.011*"actor"
Topic: 2 
Words: 0.024*"time" + 0.019*"character" + 0.017*"story" + 0.017*"thing" + 0.016*"actor" + 0.015*"people" + 0.014*"life" + 0.010*"place" + 0.010*"point" + 0.010*"performance"
Topic: 3 
Words: 0.025*"time" + 0.020*"scene" + 0.016*"character" + 0.015*"people" +

In [169]:
lda_opt_tfidf = get_optimum_lda(dictionary, tfidf_corpus,
                                  processed_docs.values,
                                  10, get_result=True)

# Print topic
for idx, topic in lda_opt_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Idx 2 0.30157313402188235
Idx 3 0.321723991814657
Idx 4 0.2912925964267198
Idx 5 0.31993718740199706
Idx 6 0.31452561608851454
Idx 7 0.2973603149584484
Idx 8 0.29502960264983324
Idx 9 0.2939563255127682
[0.30157313402188235, 0.321723991814657, 0.2912925964267198, 0.31993718740199706, 0.31452561608851454, 0.2973603149584484, 0.29502960264983324, 0.2939563255127682]
Topic: 0 
Words: 0.009*"people" + 0.008*"story" + 0.008*"time" + 0.008*"life" + 0.008*"year" + 0.008*"music" + 0.007*"thing" + 0.007*"performance" + 0.007*"watch" + 0.007*"piece"
Topic: 1 
Words: 0.009*"time" + 0.008*"actor" + 0.008*"people" + 0.008*"director" + 0.008*"scene" + 0.007*"story" + 0.007*"character" + 0.007*"thing" + 0.007*"performance" + 0.007*"work"
Topic: 2 
Words: 0.009*"character" + 0.008*"plot" + 0.008*"actor" + 0.008*"story" + 0.008*"scene" + 0.008*"watch" + 0.007*"time" + 0.006*"thing" + 0.006*"reason" + 0.006*"idea"
