# Topic Modeling: A Real Example

## Import necessary dependencies and settings

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import nltk, random
from nltk.corpus import movie_reviews
from nltk.stem import PorterStemmer

pd.options.display.max_colwidth = 200
%matplotlib inline

## Sample corpus of text documents

In [3]:
corpus = [' '.join(movie_reviews.words(f)) for f in movie_reviews.fileids()]
labels = [f[:3] for f in movie_reviews.fileids()]

In [4]:
len(corpus)
print(corpus[0][:200])
print(labels[:10])

plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what
['neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg']


In [5]:

corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 
                          'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
corpus_df

Unnamed: 0,Document,Category
0,"plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . ...",neg
1,the happy bastard ' s quick movie review damn that y2k bug . it ' s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding ...,neg
2,"it is movies like these that make a jaded movie viewer thankful for the invention of the timex indiglo watch . based on the late 1960 ' s television show by the same name , the mod squad tells the...",neg
3,""" quest for camelot "" is warner bros . ' first feature - length , fully - animated attempt to steal clout from disney ' s cartoon empire , but the mouse has no reason to be worried . the only othe...",neg
4,"synopsis : a mentally unstable man undergoing psychotherapy saves a boy from a potentially fatal accident and then falls in love with the boy ' s mother , a fledgling restauranteur . unsuccessfull...",neg
...,...,...
1995,"wow ! what a movie . it ' s everything a movie can be : funny , dramatic , interesting , weird , funny , weird and strikingly original . yep that pretty much describes this movie . it starts out l...",pos
1996,"richard gere can be a commanding actor , but he ' s not always in great films . everything comes together here . gere is a big time chicago defense attorney who takes on a seemingly unwinable case...",pos
1997,"glory -- starring matthew broderick , denzel washington , and morgan freeman -- is the true story of the 54th regiment of massachusetts , the first black fighting unit recruited by the north durin...",pos
1998,"steven spielberg ' s second epic film on world war ii is an unquestioned masterpiece of film . spielberg , ever the student on film , has managed to resurrect the war genre by producing one of its...",pos


## Simple text pre-processing

In [20]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
stemmer = PorterStemmer()
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    tokens_pos = nltk.pos_tag(tokens)
    tokens = [w for (w,p) in tokens_pos if p[0] in ['N']]
    # filter stopwords out of document
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [21]:
normalize_corpus(corpus[0])

array('plot teen coupl church parti drink drive accid die girlfriend life nightmar deal movi critiqu mind movi gener idea packag review film mess head highway memento way type film folk snag concept problem movi problem world audienc member idea dream charact other apparit disappear looooot chase scene ton thing mind film clue kind film problem secret minut thing thrill meantim part arrow dig flick way point strang bit sens make film line movi audienc password world sagemil vision minut movi okay peopl scene insight strang movi studio film director decent teen mind movi suit music video edg sens actor part bentley charact beauti neighborhood kudo film charact film stick entertain redund runtim cool explan crazi way horror flick way someon genr kid product year shelv skip joblo nightmar street blair crow crow salvat other echo',
      dtype='<U830')

In [None]:
norm_corpus = normalize_corpus(corpus)

In [23]:
norm_corpus[0]

'plot teen coupl church parti drink drive accid die girlfriend life nightmar deal movi critiqu mind movi gener idea packag review film mess head highway memento way type film folk snag concept problem movi problem world audienc member idea dream charact other apparit disappear looooot chase scene ton thing mind film clue kind film problem secret minut thing thrill meantim part arrow dig flick way point strang bit sens make film line movi audienc password world sagemil vision minut movi okay peopl scene insight strang movi studio film director decent teen mind movi suit music video edg sens actor part bentley charact beauti neighborhood kudo film charact film stick entertain redund runtim cool explan crazi way horror flick way someon genr kid product year shelv skip joblo nightmar street blair crow crow salvat other echo'

## Bag of Words Model

- Bag-of-words model is the simplest way to vectorize texts into numeric representations.
- In short, it is a method to represent a text using its word frequency list.
- The sequential order of words in the text is therefore naively ignored.

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
# get bag of words features in sparse format
cv = CountVectorizer(min_df=50, max_df=0.8)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix

<2000x900 sparse matrix of type '<class 'numpy.int64'>'
	with 138347 stored elements in Compressed Sparse Row format>

In [28]:
# view non-zero feature positions in the sparse matrix
print(cv_matrix)

  (0, 600)	1
  (0, 797)	2
  (0, 178)	1
  (0, 572)	1
  (0, 241)	1
  (0, 2)	1
  (0, 223)	1
  (0, 339)	1
  (0, 456)	1
  (0, 538)	2
  (0, 205)	1
  (0, 505)	3
  (0, 332)	1
  (0, 386)	2
  (0, 654)	1
  (0, 499)	1
  (0, 360)	1
  (0, 872)	4
  (0, 845)	1
  (0, 314)	1
  (0, 160)	1
  (0, 618)	3
  (0, 893)	2
  (0, 48)	2
  (0, 494)	1
  :	:
  (1999, 346)	1
  (1999, 557)	1
  (1999, 730)	1
  (1999, 374)	1
  (1999, 86)	1
  (1999, 599)	1
  (1999, 350)	1
  (1999, 634)	4
  (1999, 564)	2
  (1999, 52)	1
  (1999, 721)	1
  (1999, 623)	1
  (1999, 680)	1
  (1999, 867)	1
  (1999, 694)	1
  (1999, 117)	2
  (1999, 391)	1
  (1999, 44)	1
  (1999, 156)	1
  (1999, 99)	1
  (1999, 838)	1
  (1999, 106)	1
  (1999, 404)	1
  (1999, 174)	1
  (1999, 258)	2


In [29]:
# view dense representation 
# warning might give a memory error if data is too big
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[0, 0, 1, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [30]:
# get all unique words in the corpus
vocab = np.array(cv.get_feature_names())
# show document feature vectors
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,abil,accent,accid,act,action,actor,actress,ad,adam,adapt,...,word,work,worker,world,worth,write,writer,ye,year,york
0,0,0,1,0,0,1,0,0,0,0,...,0,0,0,2,0,0,0,0,1,0
1,0,0,0,1,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0,0,1,0,0,2,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,0,3,0
1996,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1997,1,1,0,0,3,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,3,0,0,1,0,...,1,1,0,2,0,0,0,0,1,0


## Latent Dirichlet Allocation

In [31]:
%%time
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=8, max_iter=1000, random_state=0,
                               max_doc_update_iter=50, learning_method='online',
                               batch_size=500, learning_offset = 50, n_jobs = -1)
dt_matrix = lda.fit_transform(cv_matrix) # document matrix

CPU times: user 32.2 s, sys: 4.74 s, total: 36.9 s
Wall time: 12min 18s


In [32]:
features = pd.DataFrame(dt_matrix, columns = ["T"+str(n) for n in range(1,9)])
features

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8
0,0.174842,0.001488,0.154238,0.001493,0.001489,0.001492,0.234013,0.430945
1,0.261662,0.003289,0.003294,0.003291,0.003291,0.003292,0.718586,0.003294
2,0.381910,0.001471,0.001472,0.037776,0.001471,0.001471,0.001473,0.572956
3,0.001789,0.001786,0.180013,0.001786,0.001796,0.001789,0.151153,0.659889
4,0.564536,0.001008,0.429407,0.001010,0.001009,0.001010,0.001010,0.001010
...,...,...,...,...,...,...,...,...
1995,0.090744,0.001276,0.588942,0.001276,0.001276,0.001277,0.001278,0.313931
1996,0.658206,0.002841,0.324728,0.002845,0.002842,0.002847,0.002844,0.002846
1997,0.197742,0.000856,0.350791,0.000856,0.293872,0.000857,0.154168,0.000858
1998,0.058396,0.001190,0.222323,0.040804,0.267726,0.001191,0.407177,0.001193


## Show topics and their weights

In [None]:
# tt_matrix = lda.components_ # topic matrix
# for topic_weights in tt_matrix:
#     topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
#     topic = sorted(topic, key=lambda x: -x[1])
#     topic = [item for item in topic if item[1] > 0.6]
#     print(topic)
#     print()


In [33]:
topic_terms = lda.components_
top_terms = 20
topic_keywords_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:,:top_terms]
topic_keywords = vocab[topic_keywords_idxs]
topics = [', '.join(w) for w in topic_keywords]
pd.set_option('display.max_colwidth', -1)
topics_df = pd.DataFrame(topics,
                        columns = ['Keywords per Topic'],
                        index = ["Topic"+str(n) for n in range(1,9)])

In [34]:
topics_df

Unnamed: 0,Keywords per Topic
Topic1,"charact, scene, action, time, plot, way, guy, actor, thing, director, perform, stori, script, role, man, murder, thriller, line, end, cop"
Topic2,"synopsi, goofi, tradit, footag, becom, stephen, pilot, write, bland, band, wast, denni, clue, count, cross, offer, bore, epic, start, food"
Topic3,"charact, stori, life, time, scene, man, way, year, perform, peopl, love, world, famili, director, role, relationship, wife, woman, thing, someth"
Topic4,"music, rock, chri, hill, night, sex, flashback, neighbor, audienc, talk, pop, bed, taylor, thought, show, remak, vehicl, use, day, wast"
Topic5,"war, men, battl, angel, ryan, armi, smith, bob, soldier, hank, jason, death, group, command, line, god, messag, enemi, kevin, ii"
Topic6,"joe, killer, york, island, summer, juli, dream, park, bill, legend, station, jennif, victim, kevin, boyfriend, friend, hors, boat, citi, imag"
Topic7,"effect, time, charact, alien, horror, year, star, stori, scene, world, seri, peopl, action, ship, thing, comput, plot, planet, space, earth"
Topic8,"charact, comedi, time, thing, scene, way, year, kid, plot, joke, star, girl, peopl, show, guy, friend, school, stori, perform, humor"


In [35]:
pd.options.display.float_format = '{:,.5f}'.format
pd.set_option('display.max_colwidth', 200)


dt_df = pd.DataFrame(dt_matrix,
                    columns=["Topic"+str(n) for n in range(1,9)])

max_contrib_topics = dt_df.max(axis=0)
dominant_topics = max_contrib_topics.index
contrib_perc = max_contrib_topics.values
document_numbers = [dt_df[dt_df[t]==max_contrib_topics.loc[t]].index[0]
                    for t in dominant_topics]
documents = [norm_corpus[i] for i in document_numbers]

documents_df = pd.DataFrame({'Dominant Topic': dominant_topics,
                            'Contribution%': contrib_perc,
                            'DOCID': document_numbers,
                            'Topic': topics_df['Keywords per Topic'],
                            'Text': documents})

In [36]:
documents_df

Unnamed: 0,Dominant Topic,Contribution%,DOCID,Topic,Text
Topic1,Topic1,0.99232,364,"charact, scene, action, time, plot, way, guy, actor, thing, director, perform, stori, script, role, man, murder, thriller, line, end, cop",god film regardless aw word kicker film film year charm kind disast fact brief time act stori level bad aveng perform rochon michael wong paul sorvino cours claud van presenc rob schneider direct ...
Topic2,Topic2,0.0625,506,"synopsi, goofi, tradit, footag, becom, stephen, pilot, write, bland, band, wast, denni, clue, count, cross, offer, bore, epic, start, food",film word
Topic3,Topic3,0.99431,1711,"charact, stori, life, time, scene, man, way, year, perform, peopl, love, world, famili, director, role, relationship, wife, woman, thing, someth",hereaft writer director egoyan tragedi death tragedi live hurt pain live hereaft death loss grief winter day town columbia school bu children slide highway lake ice sink children die other resid t...
Topic4,Topic4,0.99058,249,"music, rock, chri, hill, night, sex, flashback, neighbor, audienc, talk, pop, bed, taylor, thought, show, remak, vehicl, use, day, wast",synopsi blond psychologist sarah rican cliff neighbor upstair meantim someon flower name newspap column cat comment wast time move molass winter conclus audienc minut film movi star evita rebecca ...
Topic5,Topic5,0.73572,1229,"war, men, battl, angel, ryan, armi, smith, bob, soldier, hank, jason, death, group, command, line, god, messag, enemi, kevin, ii",today war realiti screen priivat ryan spielberg realiti product audienc theatr horror war pleas kid home r rate realiti tom hank capt john miller franc ii rescu home ryan matt damon brother war sp...
Topic6,Topic6,0.98673,85,"joe, killer, york, island, summer, juli, dream, park, bill, legend, station, jennif, victim, kevin, boyfriend, friend, hors, boat, citi, imag",anniversari slay juli jame hewitt friend memori man verg concentr schoolwork night sleep shadow blink strobe light danc club vision man rubber slicker hook hand movi summer juli friend man mountai...
Topic7,Topic7,0.99211,1289,"effect, time, charact, alien, horror, year, star, stori, scene, world, seri, peopl, action, ship, thing, comput, plot, planet, space, earth",note followup art movi film art movi art movi star trek film review dequina copyright michael star trek contact pg anyon seri star trek movi instal seri one pattern star trek sci adventur entri se...
Topic8,Topic8,0.99115,7,"charact, comedi, time, thing, scene, way, year, kid, plot, joke, star, girl, peopl, show, guy, friend, school, stori, perform, humor",movi laugh month mess movi mess man mr hugh grant dork sex prostitut thing bug fact grant sandler jim carrey eye flutter smile pass hand slapstick fistfight deliveri room culmin grant head joan cu...


In [37]:
import pyLDAvis
import pyLDAvis.sklearn
import dill
#import warnings

#warnings.filterwanrings('ignore')
pyLDAvis.enable_notebook()
cv_matrix2 = np.matrix(cv_matrix)
pyLDAvis.sklearn.prepare(lda, cv_matrix2, cv, mds="mmds")

## Clustering documents using topic model features

In [38]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=3, random_state=0)
km.fit_transform(features)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

  and should_run_async(code)


Unnamed: 0,Document,Category,ClusterLabel
0,"plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . ...",neg,1
1,the happy bastard ' s quick movie review damn that y2k bug . it ' s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding ...,neg,1
2,"it is movies like these that make a jaded movie viewer thankful for the invention of the timex indiglo watch . based on the late 1960 ' s television show by the same name , the mod squad tells the...",neg,1
3,""" quest for camelot "" is warner bros . ' first feature - length , fully - animated attempt to steal clout from disney ' s cartoon empire , but the mouse has no reason to be worried . the only othe...",neg,1
4,"synopsis : a mentally unstable man undergoing psychotherapy saves a boy from a potentially fatal accident and then falls in love with the boy ' s mother , a fledgling restauranteur . unsuccessfull...",neg,2
...,...,...,...
1995,"wow ! what a movie . it ' s everything a movie can be : funny , dramatic , interesting , weird , funny , weird and strikingly original . yep that pretty much describes this movie . it starts out l...",pos,0
1996,"richard gere can be a commanding actor , but he ' s not always in great films . everything comes together here . gere is a big time chicago defense attorney who takes on a seemingly unwinable case...",pos,2
1997,"glory -- starring matthew broderick , denzel washington , and morgan freeman -- is the true story of the 54th regiment of massachusetts , the first black fighting unit recruited by the north durin...",pos,0
1998,"steven spielberg ' s second epic film on world war ii is an unquestioned masterpiece of film . spielberg , ever the student on film , has managed to resurrect the war genre by producing one of its...",pos,1
