In [1]:
import gc
from sklearn.decomposition import NMF

gc.enable()

In [2]:
import pickle

with open("Data/df.pkl", 'rb') as picklefile:
    df = pickle.load(picklefile)

Let's do our analysis on stemmed words so that same topic is not split (e.g. "word" and "words" should belong to the same topic):

In [3]:
# import nltk
# from textblob import TextBlob
# stemmer = nltk.stem.porter.PorterStemmer()

# def stem_getter(text):
#     return " ".join([stemmer.stem(word) for word in TextBlob(text).words])

# df.raw_text = df.raw_text.map(stem_getter)

This time around, let's remove words in all caps: they are used to indicate character lines. Using them will just create topics identifying major characters of a show/movie which is not helpful. Let's also remove non-letter characters along the way:

In [4]:
import re

In [5]:
def cap_remover(text):
    text = re.sub(r'[A-Z]+(?![a-z])', '', text)
    text = re.sub(r'[\d]+', '', text)
    text = re.sub(r' +', ' ', text)
    return re.sub(r"[^\w' ]", '', text)

In [6]:
df.raw_text = df.raw_text.map(cap_remover)

In [7]:
%pylab inline
import numpy 
import matplotlib.pyplot as plt
import sklearn
# Import all of the scikit learn stuff
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


I have also used a large list of keywords from [here](http://www.ranks.nl/stopwords) and supplemented it with Star Trek specific terms discovered in the initial LDA model so that they are not used in topic analysis:

In [8]:
with open('Data/stopwords.txt') as f:
     content = (f.read()).split()
stoplist = sorted(list(set(content)))

In [9]:
from helper import *
import warnings
warnings.filterwarnings('ignore')

In [10]:
import nltk
# nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/aleksod/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

stopwords = stoplist #set(stopwords.words('english'))
punctuation = set(string.punctuation) 
lemmatize = WordNetLemmatizer()

def cleaning(article):
    one = " ".join([i for i in article.lower().split() if i not in stopwords])
    two = "".join(i for i in one if i not in punctuation)
    three = " ".join(lemmatize.lemmatize(i) for i in two.split())
    return three

In [12]:
# df2 = df.drop(["_id","end","series","start","url","airdate"], axis=1)

In [13]:
text = df.raw_text.map(cleaning) #['raw_text']
text_list = [i.split() for i in text]
len(text_list)

678

In [14]:
# docs = df.raw_text
count_vectorizer = CountVectorizer(stop_words = stoplist, ngram_range=(1, 3))
dtm = count_vectorizer.fit_transform(text) #docs) 

In [15]:
df = None
df2 = None
docs = None
del df
del df2
del docs
gc.collect()

7

In [16]:
# gc.get_objects()

Let's see what 10 topics could be:

In [17]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_jobs=-1)
lda.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=-1, n_topics=10, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [18]:
import pyLDAvis, pyLDAvis.sklearn
from IPython.display import display


# Setup to run in Jupyter notebook
pyLDAvis.enable_notebook()

# Create the visualization
vis = pyLDAvis.sklearn.prepare(lda, dtm, count_vectorizer)

# # Export as a standalone HTML web page
# pyLDAvis.save_html(vis, 'lda.html')

# Let's view it!
display(vis)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]


In [21]:
lda.components_.shape

(10, 2748255)

In [None]:
with open("Data/ldamodel1.pkl", 'wb') as picklefile:
    pickle.dump(ldamodel, picklefile)

In [None]:
print(ldamodel.print_topics(num_topics=2, num_words=4))

In [None]:
for i in ldamodel.print_topics(): 
    for j in i: print(j)

Brilliant! My topics are based on characters that are in the respective series! Not a very useful information, but it LDA works pretty well. Let's see if we can visuzlize it.

In [None]:
ldamodel.save('topic.model')

In [None]:
from gensim.models import LdaModel
loading = LdaModel.load('topic.model')

In [None]:
print(loading.print_topics(num_topics=2, num_words=4))

In [None]:
def pre_new(doc):
    one = cleaning(doc).split()
    two = dictionary.doc2bow(one)
    return two

In [None]:
import pyLDAvis.gensim
import gensim
pyLDAvis.enable_notebook()

In [None]:
d = gensim.corpora.Dictionary.load('dictionary.dict')
c = gensim.corpora.MmCorpus('corpus.mm')
lda = gensim.models.LdaModel.load('topic.model')

In [None]:
data = pyLDAvis.gensim.prepare(lda, c, d)
data

In [None]:
pyLDAvis.save_html(data,'vis1.html')

This is a pretty good separation. Unfortunately, it is based mostly on very specific TV show / movies terms. Therefore, I will need to remove those if I want better results.