# Topic Modeling: A Real Example

## Import necessary dependencies and settings

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import nltk, random
from nltk.corpus import movie_reviews
from nltk.stem import PorterStemmer

pd.options.display.max_colwidth = 200
%matplotlib inline

## Sample corpus of text documents

In [3]:
# Load every review once: the text is the review's tokens re-joined with
# single spaces, and the label is the first three characters of its file id.
review_ids = movie_reviews.fileids()
corpus = [' '.join(movie_reviews.words(fid)) for fid in review_ids]
labels = [fid[:3] for fid in review_ids]

In [4]:
# Quick sanity check of the loaded data: number of documents, the start of the
# first review, and the first few labels.
# A bare `len(corpus)` in the middle of a cell is evaluated and thrown away, so
# it is printed explicitly here.
print(len(corpus))
print(corpus[0][:200])
print(labels[:10])

plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what
['neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg']


In [5]:

# Hold the reviews in a NumPy array and pair each one with its label in a
# two-column frame; the column order is fixed explicitly at construction time.
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels},
                         columns=['Document', 'Category'])
corpus_df

Unnamed: 0,Document,Category
0,"plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . ...",neg
1,the happy bastard ' s quick movie review damn that y2k bug . it ' s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding ...,neg
2,"it is movies like these that make a jaded movie viewer thankful for the invention of the timex indiglo watch . based on the late 1960 ' s television show by the same name , the mod squad tells the...",neg
3,""" quest for camelot "" is warner bros . ' first feature - length , fully - animated attempt to steal clout from disney ' s cartoon empire , but the mouse has no reason to be worried . the only othe...",neg
4,"synopsis : a mentally unstable man undergoing psychotherapy saves a boy from a potentially fatal accident and then falls in love with the boy ' s mother , a fledgling restauranteur . unsuccessfull...",neg
...,...,...
1995,"wow ! what a movie . it ' s everything a movie can be : funny , dramatic , interesting , weird , funny , weird and strikingly original . yep that pretty much describes this movie . it starts out l...",pos
1996,"richard gere can be a commanding actor , but he ' s not always in great films . everything comes together here . gere is a big time chicago defense attorney who takes on a seemingly unwinable case...",pos
1997,"glory -- starring matthew broderick , denzel washington , and morgan freeman -- is the true story of the 54th regiment of massachusetts , the first black fighting unit recruited by the north durin...",pos
1998,"steven spielberg ' s second epic film on world war ii is an unquestioned masterpiece of film . spielberg , ever the student on film , has managed to resurrect the war genre by producing one of its...",pos


## Simple text pre-processing

In [6]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
stemmer = PorterStemmer()


def normalize_document(doc):
    """Clean, tokenize, stop-word-filter and stem a single document.

    Steps, in order:
      1. remove every character that is not an ASCII letter or whitespace,
      2. lower-case and strip leading/trailing whitespace,
      3. tokenize with the module-level ``wpt`` tokenizer,
      4. drop tokens found in ``stop_words`` and stem the rest with ``stemmer``.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    # The flags MUST be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so passing `re.I | re.A` positionally silently capped
    # the number of removals and applied no flags at all.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document and stem what is left
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc


# Element-wise wrapper so a whole array of documents can be normalized in one call.
normalize_corpus = np.vectorize(normalize_document)

In [7]:
# Normalize every review in the corpus.
norm_corpus = normalize_corpus(corpus)
# Show only a small preview; echoing the full array prints every normalized
# review in its entirety and buries the rest of the notebook.
norm_corpus[:3]

array(['plot two teen coupl go church parti drink drive get accid one guy die girlfriend continu see life nightmar deal watch movi sorta find critiqu mind fuck movi teen gener touch cool idea present bad packag make review even harder one write sinc gener applaud film attempt break mold mess head lost highway memento good bad way make type film folk snag one correctli seem taken pretti neat concept execut terribl problem movi well main problem simpli jumbl start normal downshift fantasi world audienc member idea go dream charact come back dead other look like dead strang apparit disappear looooot chase scene ton weird thing happen simpli explain person mind tri unravel film everi give clue get kind fed film biggest problem obvious got big secret hide seem want hide complet final five minut make thing entertain thrill even engag meantim realli sad part arrow dig flick like actual figur half way point strang start make littl bit sens still make film entertain guess bottom line movi like 

## Bag of Words Model

- The bag-of-words model is the simplest way to vectorize texts into numeric representations.
- In short, it represents a text by its word-frequency list.
- The sequential order of the words in the text is therefore naively ignored.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Document-frequency window for the vocabulary. Float values are proportions of
# documents: terms rarer than MIN_DOC_FREQ or more common than MAX_DOC_FREQ
# are left out.
MIN_DOC_FREQ = 0.2
MAX_DOC_FREQ = 0.7

cv = CountVectorizer(min_df=MIN_DOC_FREQ, max_df=MAX_DOC_FREQ)
# bag-of-words counts, kept in sparse format
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix

<2000x190 sparse matrix of type '<class 'numpy.int64'>'
	with 123821 stored elements in Compressed Sparse Row format>

In [9]:
# View the non-zero entries of the sparse matrix.
# Each printed line is a (document row, vocabulary column) pair followed by the
# count stored at that position.
print(cv_matrix)

  (0, 134)	1
  (0, 175)	2
  (0, 67)	4
  (0, 64)	3
  (0, 71)	1
  (0, 151)	2
  (0, 98)	1
  (0, 179)	1
  (0, 56)	1
  (0, 113)	3
  (0, 63)	2
  (0, 17)	2
  (0, 45)	3
  (0, 158)	2
  (0, 13)	1
  (0, 75)	1
  (0, 69)	2
  (0, 180)	4
  (0, 152)	3
  (0, 137)	3
  (0, 181)	1
  (0, 161)	2
  (0, 185)	2
  (0, 14)	2
  (0, 28)	2
  :	:
  (1999, 58)	1
  (1999, 76)	1
  (1999, 159)	1
  (1999, 84)	3
  (1999, 165)	1
  (1999, 131)	1
  (1999, 109)	1
  (1999, 156)	1
  (1999, 125)	2
  (1999, 94)	1
  (1999, 78)	1
  (1999, 132)	1
  (1999, 144)	1
  (1999, 106)	3
  (1999, 4)	1
  (1999, 168)	2
  (1999, 70)	1
  (1999, 22)	1
  (1999, 147)	1
  (1999, 34)	3
  (1999, 129)	1
  (1999, 183)	1
  (1999, 79)	1
  (1999, 116)	1
  (1999, 150)	1


In [10]:
# view dense representation
# warning: might give a memory error if data is too big
# Guarded so the cell can be re-run: after the first run `cv_matrix` is already
# a dense ndarray, which has no `.toarray()` method.
if hasattr(cv_matrix, 'toarray'):
    cv_matrix = cv_matrix.toarray()
cv_matrix

array([[0, 0, 1, ..., 1, 0, 0],
       [2, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 3, 0, ..., 0, 0, 2],
       [0, 0, 3, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [11]:
# get all unique words in the corpus
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when it exists and fall back only on older
# releases. `dtype=str` keeps `vocab` a string array on both paths.
get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
vocab = np.array(get_names(), dtype=str)
# show document feature vectors
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,act,action,actor,actual,almost,along,also,although,alway,anoth,...,way,well,without,wonder,work,world,would,year,yet,young
0,0,0,1,2,0,0,1,1,1,0,...,4,1,0,0,0,2,1,1,0,0
1,2,1,0,0,0,0,0,0,0,1,...,0,1,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,1,0,0,1
3,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,0,0,2,1,0,0,1,0,1,3,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,0,0,0,0,0,4,0,0,0,...,0,4,0,3,0,1,0,3,0,0
1996,0,0,1,1,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
1997,0,3,0,1,0,0,1,0,2,0,...,0,2,2,0,0,0,0,0,0,2
1998,0,0,3,0,2,0,2,0,0,0,...,0,0,1,0,1,2,0,1,0,0


## Latent Dirichlet Allocation

In [12]:
%%time
from sklearn.decomposition import LatentDirichletAllocation

# Fit an 8-topic LDA model on the bag-of-words counts using the 'online'
# learning method with batches of 500 documents; random_state is fixed for
# reproducibility and n_jobs=-1 asks for all available cores.
# NOTE(review): max_iter=10000 makes this cell slow (the recorded run took about
# 27 minutes of wall time) — confirm whether that many passes are needed.
lda = LatentDirichletAllocation(n_components=8, max_iter=10000, random_state=0,
                               max_doc_update_iter=50, learning_method='online',
                               batch_size=500, learning_offset = 50, n_jobs = -1)
# Document-topic matrix: one row per document, one column per topic.
dt_matrix = lda.fit_transform(cv_matrix) # document matrix

CPU times: user 10min 58s, sys: 46.2 s, total: 11min 44s
Wall time: 27min 2s


In [13]:
# Label the document-topic columns T1..Tk. k is read from the fitted matrix
# rather than hard-coded as 8, so the cell keeps working if the number of
# topics is changed.
features = pd.DataFrame(dt_matrix,
                        columns=["T" + str(n) for n in range(1, dt_matrix.shape[1] + 1)])
features

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8
0,0.083654,0.001050,0.455340,0.001052,0.282282,0.001052,0.001052,0.174516
1,0.415890,0.002907,0.338549,0.002910,0.002912,0.231007,0.002912,0.002914
2,0.001648,0.001645,0.512473,0.001647,0.267217,0.212076,0.001648,0.001648
3,0.001927,0.001923,0.447879,0.064384,0.141645,0.001927,0.189688,0.150628
4,0.190102,0.001147,0.001149,0.001148,0.001149,0.235257,0.212642,0.357406
...,...,...,...,...,...,...,...,...
1995,0.000798,0.000796,0.341987,0.083066,0.000797,0.000798,0.384830,0.186928
1996,0.002363,0.002358,0.002363,0.301929,0.156067,0.002362,0.002363,0.530194
1997,0.112431,0.001147,0.001149,0.001148,0.189951,0.440785,0.252240,0.001149
1998,0.001605,0.001603,0.001605,0.001604,0.143683,0.688255,0.160040,0.001605


## Show topics and their weights

In [14]:
# tt_matrix = lda.components_ # topic matrix
# for topic_weights in tt_matrix:
#     topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
#     topic = sorted(topic, key=lambda x: -x[1])
#     topic = [item for item in topic if item[1] > 0.6]
#     print(topic)
#     print()


In [15]:
# Topic-term weight matrix of the fitted model (one row per topic).
topic_terms = lda.components_
top_terms = 20
# Rank the vocabulary within each topic by absolute weight, largest first, and
# keep the column indices of the `top_terms` strongest terms.
topic_keywords_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keywords = vocab[topic_keywords_idxs]
topics = [', '.join(w) for w in topic_keywords]
# `None` means "do not truncate cell text". The old spelling `-1` has been
# deprecated since pandas 1.0 in favour of `None`.
pd.set_option('display.max_colwidth', None)
# Row labels Topic1..Topick follow the number of topics actually found instead
# of a hard-coded 8.
topics_df = pd.DataFrame(topics,
                         columns=['Keywords per Topic'],
                         index=["Topic" + str(n) for n in range(1, len(topics) + 1)])

In [16]:
# Display the keyword list of every topic.
topics_df

Unnamed: 0,Keywords per Topic
Topic1,"good, bad, play, guy, get, realli, act, actor, scene, well, littl, kill, great, know, lot, tri, even, also, role, big"
Topic2,"thought, hour, keep, star, let, put, director, rather, head, hard, stori, run, call, long, happen, pictur, attempt, hope, leav, goe"
Topic3,"get, go, thing, even, see, look, plot, know, minut, think, end, bad, would, realli, say, much, watch, want, come, someth"
Topic4,"comedi, funni, laugh, get, big, play, high, star, friend, best, go, seem, enjoy, fun, bit, work, would, back, involv, noth"
Topic5,"world, life, live, peopl, show, year, us, would, first, know, see, even, take, new, use, way, look, go, two, say"
Topic6,"action, scene, star, effect, plot, first, origin, even, sequenc, director, much, new, look, use, play, set, also, cast, gener, would"
Topic7,"love, play, perform, man, life, stori, two, year, work, best, director, role, becom, john, take, find, turn, young, also, get"
Topic8,"stori, seem, well, see, much, scene, also, mani, end, great, howev, good, would, realli, even, may, feel, never, although, work"


In [17]:
pd.options.display.float_format = '{:,.5f}'.format
pd.set_option('display.max_colwidth', 200)

# Name the topic columns from the width of the document-topic matrix instead of
# a hard-coded 8, so the cell follows the number of fitted topics.
topic_names = ["Topic" + str(n) for n in range(1, dt_matrix.shape[1] + 1)]
dt_df = pd.DataFrame(dt_matrix, columns=topic_names)

# For every topic: the highest contribution it reaches in any document ...
max_contrib_topics = dt_df.max(axis=0)
dominant_topics = max_contrib_topics.index
contrib_perc = max_contrib_topics.values
# ... and the first document that reaches it. `idxmax` returns the first row
# label holding the column maximum, which is what the previous per-topic
# boolean-mask lookup selected, without re-scanning the frame for every topic.
document_numbers = dt_df.idxmax(axis=0).tolist()
documents = [norm_corpus[i] for i in document_numbers]

# The 'Topic' entry is a Series, so its index (the topic labels of `topics_df`)
# becomes the index of the resulting frame.
documents_df = pd.DataFrame({'Dominant Topic': dominant_topics,
                             'Contribution%': contrib_perc,
                             'DOCID': document_numbers,
                             'Topic': topics_df['Keywords per Topic'],
                             'Text': documents})

In [18]:
# Display, for each topic, the document in which that topic's contribution is highest.
documents_df

Unnamed: 0,Dominant Topic,Contribution%,DOCID,Topic,Text
Topic1,Topic1,0.70858,380,"good, bad, play, guy, get, realli, act, actor, scene, well, littl, kill, great, know, lot, tri, even, also, role, big",want involv show busi one day refus sequel movi may make believ get wors movi prove littl worri last batman film thought joel schumacc take tim burton would screw also assum val kilmer would screw...
Topic2,Topic2,0.0625,506,"thought, hour, keep, star, let, put, director, rather, head, hard, stori, run, call, long, happen, pictur, attempt, hope, leav, goe",film extraordinarili horrend go wast word
Topic3,Topic3,0.9873,398,"get, go, thing, even, see, look, plot, know, minut, think, end, bad, would, realli, say, much, watch, want, come, someth",robert forster found famou appear jacki brown immedi sign littl film call american perfekt almost two year ago wait patient film releas never final forgot day though perus select local video store...
Topic4,Topic4,0.71782,1718,"comedi, funni, laugh, get, big, play, high, star, friend, best, go, seem, enjoy, fun, bit, work, would, back, involv, noth",usual movi someth soil rug big lebowski new offer creator critic hit fargo say least wildli entertain origin alway strong trait coen brother movi big lebowski insan origin begin oddli enough jeff ...
Topic5,Topic5,0.9271,1178,"world, life, live, peopl, show, year, us, would, first, know, see, even, take, new, use, way, look, go, two, say",know mani peopl idea cross mind life could ongo televis show watch anoth world peopl someth use wonder younger decid first thought watch lot tv brother hit head basebal bat pretti sure andrew nicc...
Topic6,Topic6,0.97566,93,"action, scene, star, effect, plot, first, origin, even, sequenc, director, much, new, look, use, play, set, also, cast, gener, would",numer comparison made movi past sci fi suspens thriller soldier multi crossbre like termin alien offspr problem mix gene final product real mongrel well made put product got ground besid action me...
Topic7,Topic7,0.9663,1471,"love, play, perform, man, life, stori, two, year, work, best, director, role, becom, john, take, find, turn, young, also, get",costum drama set england elizabeth lush romant polit masterpiec upset cross protest queen respect cathol one court countri whole pass queen royal famili speak upset protest crowen anoth cathol plu...
Topic8,Topic8,0.93637,727,"stori, seem, well, see, much, scene, also, mani, end, great, howev, good, would, realli, even, may, feel, never, although, work",girl word mess never abl determin spike lee tri accomplish film sens film go kind coher narr point film miss girl way way theresa randl charact address phone sex workplac girl known number plot th...


In [19]:
import pyLDAvis
# pyLDAvis 3.4 renamed its scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`; try the new name first and fall back to the old one.
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn
import dill  # not used in this cell; kept in case other cells rely on it
from scipy import sparse

pyLDAvis.enable_notebook()
# Hand pyLDAvis a sparse document-term matrix, the format its scikit-learn
# adapter is documented to accept. The previous `np.matrix(...)` wrapper is
# rejected by recent scikit-learn releases when the adapter calls
# `lda.transform`.
cv_matrix2 = sparse.csr_matrix(cv_matrix)
pyldavis_sklearn.prepare(lda, cv_matrix2, cv, mds="mmds")

## Clustering documents using topic model features

In [20]:
from sklearn.cluster import KMeans

# Cluster the documents on their topic proportions.
# `n_init` is pinned to 10, the value that was the default before
# scikit-learn 1.4 changed it to 'auto', so results stay comparable across
# versions.
km = KMeans(n_clusters=3, n_init=10, random_state=0)
# `fit_predict` fits the model and returns the cluster label of each row; the
# distance matrix that `fit_transform` would return was never used.
cluster_labels = km.fit_predict(features)
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
# Attach the cluster label to each original document.
pd.concat([corpus_df, cluster_labels], axis=1)

  and should_run_async(code)


Unnamed: 0,Document,Category,ClusterLabel
0,"plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . ...",neg,2
1,the happy bastard ' s quick movie review damn that y2k bug . it ' s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding ...,neg,0
2,"it is movies like these that make a jaded movie viewer thankful for the invention of the timex indiglo watch . based on the late 1960 ' s television show by the same name , the mod squad tells the...",neg,2
3,""" quest for camelot "" is warner bros . ' first feature - length , fully - animated attempt to steal clout from disney ' s cartoon empire , but the mouse has no reason to be worried . the only othe...",neg,2
4,"synopsis : a mentally unstable man undergoing psychotherapy saves a boy from a potentially fatal accident and then falls in love with the boy ' s mother , a fledgling restauranteur . unsuccessfull...",neg,0
...,...,...,...
1995,"wow ! what a movie . it ' s everything a movie can be : funny , dramatic , interesting , weird , funny , weird and strikingly original . yep that pretty much describes this movie . it starts out l...",pos,1
1996,"richard gere can be a commanding actor , but he ' s not always in great films . everything comes together here . gere is a big time chicago defense attorney who takes on a seemingly unwinable case...",pos,2
1997,"glory -- starring matthew broderick , denzel washington , and morgan freeman -- is the true story of the 54th regiment of massachusetts , the first black fighting unit recruited by the north durin...",pos,0
1998,"steven spielberg ' s second epic film on world war ii is an unquestioned masterpiece of film . spielberg , ever the student on film , has managed to resurrect the war genre by producing one of its...",pos,0
