In [4]:
import numpy as np
import pandas as pd

In [27]:
# Load the movie-plot CSV as a NumPy array and keep column index 7
# (the plot text, judging by the sample printed further down) as a plain list.
data = np.asarray(pd.read_csv('wiki_movie_plots_deduped.csv'))
plots = [plot for plot in data[:, 7]]

In [37]:
# pre-processing data
# Pulls in gensim (tokenising, stop-word list) and NLTK (lemmatiser, stemmers),
# seeds NumPy's global random number generator and downloads the WordNet data.
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk import PorterStemmer
# NOTE(review): wildcard import -- it is not visible from here which of the
# names it brings in are actually relied on by later cells.
from nltk.stem.porter import *
import numpy as np
# Fixed seed for NumPy's global RNG.
np.random.seed(2018)
import nltk
# Presumably needed by WordNetLemmatizer; the recorded output shows the
# package was already up-to-date on the machine that ran this cell.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/andrei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [44]:
def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with a
    ``WordNetLemmatizer``; the resulting lemma is then run through a
    ``PorterStemmer``.

    Args:
        text: The token to normalise.

    Returns:
        The Porter stem of the verb lemma of ``text``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return PorterStemmer().stem(verb_lemma)
def preprocess(text):
    """Tokenize ``text`` and return its normalised tokens.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    if it is longer than three characters and is not in gensim's ``STOPWORDS``;
    every kept token is passed through ``lemmatize_stemming``.

    Args:
        text: The raw document text.

    Returns:
        A list of lemmatized-and-stemmed tokens, in document order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

In [45]:
# Run the preprocessing pipeline over every plot, keeping the original order.
processed_docs = [preprocess(plot) for plot in plots]

In [46]:
# Show the first plot next to its token list as a quick sanity check.
for sample in (plots[0], processed_docs[0]):
    print(sample)

A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]
['bartend', 'work', 'saloon', 'serv', 'drink', 'custom', 'fill', 'stereotyp', 'irish', 'bucket', 'beer', 'carri', 'nation', 'follow', 'burst', 'insid', 'assault', 'irish', 'pull', 'eye', 'dump', 'beer', 'head', 'group', 'begin', 'wreck', 'smash', 'fixtur', 'mirror', 'break', 'cash', 'regist', 'bartend', 'spray', 'seltzer', 'water', 'nation', 'face', 'group', 'policemen', 'appear', 'order', 'everybodi', 'leav']


In [49]:
# Build the token dictionary from the preprocessed documents.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Prune the vocabulary:
#   * no_below=15   -> drop tokens that appear in fewer than 15 documents
#                      (an absolute count);
#   * no_above=0.5  -> drop tokens that appear in more than half of the
#                      documents (a fraction of the corpus, not a count);
#   * keep_n=100000 -> of what remains, keep only the 100000 most frequent
#                      tokens.
dictionary.filter_extremes(
    no_below=15,
    no_above=0.5,
    keep_n=100000,
)


In [52]:
# Turn each preprocessed document into its bag-of-words form (which dictionary
# words it contains and how many times each one appears).
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [55]:
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that corpus.
from gensim import corpora, models
tfidf = models.TfidfModel(corpus=bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [None]:
# Train a 30-topic LDA model on the raw bag-of-words counts.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=30,
    passes=5,
    workers=3,
)

In [57]:
# Explore the words occurring in each topic and their relative weights.
topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_terms in lda_model.print_topics(-1):
    print(topic_template.format(topic_id, topic_terms))

Topic: 0 
Words: 0.008*"kill" + 0.007*"krishna" + 0.006*"tell" + 0.006*"famili" + 0.005*"leav" + 0.005*"devi" + 0.005*"return" + 0.005*"friend" + 0.005*"john" + 0.005*"go"
Topic: 1 
Words: 0.012*"kill" + 0.006*"attack" + 0.005*"escap" + 0.004*"forc" + 0.004*"destroy" + 0.004*"leav" + 0.004*"reveal" + 0.004*"ship" + 0.004*"earth" + 0.004*"return"
Topic: 2 
Words: 0.009*"tell" + 0.009*"leav" + 0.006*"go" + 0.006*"friend" + 0.005*"time" + 0.005*"life" + 0.005*"home" + 0.005*"come" + 0.005*"ask" + 0.005*"father"
Topic: 3 
Words: 0.007*"murder" + 0.006*"kill" + 0.006*"leav" + 0.005*"mari" + 0.005*"jimmi" + 0.004*"tell" + 0.004*"discov" + 0.004*"hous" + 0.004*"paul" + 0.004*"race"
Topic: 4 
Words: 0.007*"kill" + 0.005*"tell" + 0.005*"leav" + 0.005*"david" + 0.004*"polic" + 0.004*"take" + 0.004*"terrorist" + 0.004*"meet" + 0.004*"jam" + 0.004*"go"
Topic: 5 
Words: 0.014*"love" + 0.011*"father" + 0.010*"marri" + 0.009*"famili" + 0.008*"friend" + 0.008*"come" + 0.007*"mother" + 0.006*"get" + 0.

In [58]:
# Train a second 30-topic LDA model, this time on the TF-IDF weighted corpus.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=30,
    passes=2,
    workers=4,
)

In [60]:
# List every topic of the TF-IDF model, with a blank line after each one.
for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms), end='\n\n')

Topic: 0 Word: 0.011*"krishna" + 0.007*"devi" + 0.007*"chandu" + 0.006*"jong" + 0.004*"sook" + 0.002*"tang" + 0.002*"mala" + 0.002*"venu" + 0.002*"crux" + 0.002*"terrorist"

Topic: 1 Word: 0.003*"film" + 0.002*"love" + 0.002*"jung" + 0.002*"famili" + 0.002*"father" + 0.002*"stori" + 0.002*"life" + 0.002*"mother" + 0.002*"villag" + 0.002*"girl"

Topic: 2 Word: 0.002*"film" + 0.001*"love" + 0.001*"girl" + 0.001*"play" + 0.001*"school" + 0.001*"young" + 0.001*"year" + 0.001*"life" + 0.001*"student" + 0.001*"friend"

Topic: 3 Word: 0.007*"raja" + 0.003*"singh" + 0.002*"rama" + 0.002*"kumar" + 0.002*"famili" + 0.002*"villag" + 0.002*"love" + 0.002*"father" + 0.002*"shyam" + 0.002*"kill"

Topic: 4 Word: 0.006*"dong" + 0.003*"seoul" + 0.003*"vishnu" + 0.003*"yong" + 0.002*"korean" + 0.001*"korea" + 0.001*"kwon" + 0.001*"prison" + 0.001*"luke" + 0.001*"german"

Topic: 5 Word: 0.002*"yuki" + 0.002*"king" + 0.002*"team" + 0.002*"villag" + 0.002*"school" + 0.002*"film" + 0.002*"famili" + 0.002*"l

In [61]:
# performance evaluation: skipped

In [67]:
# Testing on unseen data: score one plot text file against the bag-of-words
# LDA model and list its topics from most to least likely.
# Other sample files tried here: 'shore.txt', 'dream.txt', 'endgame.txt'.
import os
# Read inside a context manager so the file handle is always closed
# (the handle was previously left open).
with open('cars.txt', 'r') as f:
    content = f.read()
# Apply the same preprocessing and dictionary that were used for training.
bow_vector = dictionary.doc2bow(preprocess(content))
# Sort the (topic index, score) pairs by descending score and show the top
# five words of each topic.
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.471373975276947	 Topic: 0.007*"murder" + 0.006*"kill" + 0.006*"leav" + 0.005*"mari" + 0.005*"jimmi"
Score: 0.40686270594596863	 Topic: 0.006*"leav" + 0.006*"tell" + 0.005*"jack" + 0.005*"hous" + 0.004*"polic"
Score: 0.06820639967918396	 Topic: 0.010*"kill" + 0.007*"king" + 0.005*"leav" + 0.005*"fight" + 0.005*"take"
Score: 0.05128684267401695	 Topic: 0.012*"kill" + 0.006*"attack" + 0.005*"escap" + 0.004*"forc" + 0.004*"destroy"
