## Part 00: TF-IDF + NMF

In [3]:
import gc
from sklearn.decomposition import NMF

gc.enable()

In [4]:
# ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# # Get a sorted list of the objects and their sizes
# sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

In [5]:
import pickle

with open("Data/df.pkl", 'rb') as picklefile:
    df = pickle.load(picklefile)

Let's do our analysis on stemmed words so that same topic is not split (e.g. "word" and "words" should belong to the same topic):

In [6]:
# import nltk
# from textblob import TextBlob
# stemmer = nltk.stem.porter.PorterStemmer()

# def stem_getter(text):
#     return " ".join([stemmer.stem(word) for word in TextBlob(text).words])

# df.raw_text = df.raw_text.map(stem_getter)

This time around, let's remove words in all caps: they are used to indicate character lines. Using them will just create topics identifying major characters of a show/movie which is not helpful. Let's also remove non-letter characters along the way:

In [7]:
import re

In [8]:
def cap_remover(text):
    text = re.sub(r'[A-Z]+(?![a-z])', '', text)
    text = re.sub(r'[\d]+', '', text)
    text = re.sub(r' +', ' ', text)
    return re.sub(r"[^\w' ]", '', text)

In [9]:
df.raw_text = df.raw_text.map(cap_remover)

In [10]:
# def stopword_remover(text):
#     return " ".join([word for word in text.lower().split() if word not in stoplist])

In [11]:
# df.raw_text = df.raw_text.map(stopword_remover)

In [12]:
# df.raw_text[0]

In [13]:
# df.head()

In [14]:
%pylab inline
import numpy 
import matplotlib.pyplot as plt
import sklearn
# Import all of the scikit learn stuff
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [15]:
docs = df.raw_text

I have also used a large list of keywords from [here](http://www.ranks.nl/stopwords) and supplemented it with Star Trek specific terms discovered in the initial LDA model so that they are not used in topic analysis:

In [18]:
with open('Data/stopwords_wpropers.txt') as f:
     content = (f.read()).split()#f.readlines()
# for i in range(len(content)):
#     content[i] = content[i].replace("\n", "").lower()
stoplist = sorted(list(set(content)))
# texts = [" ".join([word for word in cluster_dict[i].lower().split() if word not in stoplist])
#          for i in cluster_dict.keys()]

In [19]:
vectorizer = TfidfVectorizer(stop_words = stoplist, ngram_range=(1, 3))
dtm = vectorizer.fit_transform(docs) 
# pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names()).head(10)

In [16]:
# vectorizer.get_feature_names()

Let's search for th optimal number of topics to explore (e.g. search until topics start make sense):

In [20]:
for num_topics in range(10,11):
    nmf_model = NMF(num_topics)

    dtm_nmf = nmf_model.fit_transform(dtm)

    # use NMF to attempt Topic Modeling
    words = vectorizer.get_feature_names()

    # get num_topic_words top topic words:
    num_topic_words = 30

    # iterate through our eigenvectors
    print("The topics for {} topic NMF are:".format(num_topics))
    for r in nmf_model.components_:
        # sort values associated with each dimension 
        a=sorted([(v,i) for i,v in enumerate(r)])[-num_topic_words:]
        # map back to words
        print([words[i[1]] for i in a])
    print()

The topics for 10 topic NMF are:
['core', 'three', 'doctor', 'lieutenant', 'astrometrics', 'commander', 'equinox', 'ensign', 'field', 'good', 'shuttle', 'people', 'course', 'find', 'help', 'temporal', 'power', 'warp', 'shields', 'crew', 'viewscreen', 'mister', 'well', 'engineering', 'sickbay', 'time', 'going', 'will', 'ship', 'bridge']
['bar', 'sees', 'man', 'smile', 'long', 'doesn', 'hand', 'room', 'good', 'ship', 'head', 'suddenly', 'table', 'continuing', 'eyes', 'console', 'station', 'reacts', 'face', 'takes', 'door', 'nods', 'going', 'will', 'time', 'smiles', 'moment', 'turns', 'moves', 'beat']
['aye sir', 'phasers', 'orbit', 'course', 'neutral', 'vessel', 'turns', 'man', 'command', 'beam', 'warp', 'phaser', 'well', 'time', 'commander', 'viewer', 'aye', 'transporter', 'lore', 'room', 'lieutenant', 'power', 'doctor', 'planet', 'continuing', 'ship', 'will', 'bridge', 'mister', 'sir']
['room', 'didn', 'council', 'find', 'ships', 'plating', 'metres', 'will', 'well', 'porthos', 'command

So it looks like having # topics in NMF model makes the most sense to me. The as far as I can tell are topics:
1. 
2. 
3. 

In [19]:
nmf_model = NMF(15)

dtm_nmf = nmf_model.fit_transform(dtm)
# dtm_nmf = Normalizer(copy=False).fit_transform(dtm_nmf)

In [45]:
dtm.shape

(678, 49838)

So we have 678 documents and the matrix has thousands and thousands of words associated with it.

In [46]:
dtm_nmf[:,:5].round(2)

array([[ 0.  ,  0.13,  0.01,  0.  ,  0.  ],
       [ 0.03,  0.18,  0.  ,  0.  ,  0.  ],
       [ 0.02,  0.13,  0.  ,  0.  ,  0.  ],
       ..., 
       [ 0.01,  0.12,  0.07,  0.  ,  0.  ],
       [ 0.  ,  0.15,  0.03,  0.  ,  0.  ],
       [ 0.01,  0.12,  0.04,  0.  ,  0.  ]])

In [47]:
dtm_nmf[:,5:10].round(2)

array([[ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.01,  0.  ,  0.  ,  0.  ,  0.07],
       ..., 
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.01]])

Looking at the NMF-transformed TF-IDF matrices above, there are no strong topics emerging out of the 10 columns. Therefore, I shall attempt to perform K-means clustering to classify the documents:

In [48]:
dtm_nmf.shape

(678, 10)

And we have two topics now (two columns in the NMF transformed matrix). And the components (word distribution among the two topics) of those two topics are:

In [49]:
nmf_model.components_ #[:,:]

array([[  6.33850614e-03,   0.00000000e+00,   4.53588539e-05, ...,
          0.00000000e+00,   0.00000000e+00,   1.94387040e-05],
       [  0.00000000e+00,   3.36421378e-04,   3.55596430e-04, ...,
          0.00000000e+00,   1.68352417e-05,   0.00000000e+00],
       [  0.00000000e+00,   2.67690335e-05,   2.80993368e-05, ...,
          0.00000000e+00,   0.00000000e+00,   1.42787006e-04],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [50]:
nmf_model.components_.shape

(10, 49838)

Let's just assign each document to its highest feature, a clustering, in a sense:

In [51]:
# nmf_data_clust=[list(in_list).index(max(in_list)) for in_list in dtm_nmf]

In [52]:
len(nmf_data_clust)

678

In [20]:
# 4) use NMF to attempt Topic Modeling
words = vectorizer.get_feature_names()

## get 7 top topic words:

# iterate through our eigenvectors
for r in nmf_model.components_:
    # sort values associated with each dimension 
    a=sorted([(v,i) for i,v in enumerate(r)])[-30:]
    # map back to words
    print([words[i[1]] for i in a])

['doctor', 'programme', 'will', 'sickbay', 'going', 'ship', 'bridge']
['sir', 'time', 'will', 'turns', 'moves', 'continuing', 'beat']
['cube', 'will', 'descent', 'collective', 'borg ship', 'hugh', 'borg']
['martok', 'damar', 'weyoun', 'dominion', 'jem', 'hadar', 'jem hadar']
['lwaxana', 'shattered mirror', 'explorers', 'visitor', 'muse', 'beat', 'jake']
['weyoun', 'afterimage', 'penumbra', 'winn', 'damar', 'vic', 'ezri']
['shran', 'sir', 'degra', 'vulcan', 'xindi', 'bridge', 'ship']
['du', 'brunt', 'maihar', 'maihar du', 'ferengi', 'zek', 'nagus']
['ship', 'planet', 'will', 'bridge', 'scott', 'mister', 'sir']
['duras', 'redemption', 'kor', 'martok', 'kurn', 'klingon', 'gowron']
['kalita', 'rascals', 'preemptive', 'conundrum', 'preemptive strike', 'ensign ro', 'ro']
['alien', 'beat', 'hunter', 'pursuit', 'captive', 'captive pursuit', 'tosk']
['bajoran', 'resurrection', 'collaborator', 'prophets', 'vedek', 'winn', 'bareil']
['blaze', 'jennifer', 'cardassians', 'hudson', 'cardassian', 'ma

In [21]:
# iterate through our eigenvectors
for r in nmf_model.components_:
    # sort values associated with each dimension 
    a=sorted([(v,i) for i,v in enumerate(r)])[-30:]
    # map back to words
    print([words[i[1]] for i in a])

['mess hall', 'course', 'find', 'warp', 'shields', 'lieutenant', 'people', 'lab', 'viewscreen', 'good', 'help', 'computer', 'mister', 'astrometrics', 'crew', 'engineering', 'kazon', 'delta', 'delta flyer', 'holodeck', 'well', 'flyer', 'time', 'doctor', 'programme', 'will', 'sickbay', 'going', 'ship', 'bridge']
['panel', 'station', 'long', 'man', 'hand', 'table', 'doctor', 'computer', 'head', 'suddenly', 'room', 'commander', 'going', 'takes', 'eyes', 'face', 'ship', 'door', 'nods', 'smiles', 'moment', 'console', 'reacts', 'sir', 'time', 'will', 'turns', 'moves', 'continuing', 'beat']
['bridge', 'borg cube', 'ruby', 'borg queen chamber', 'queen chamber', 'locutus', 'vessel', 'sphere', 'assimilate', 'icheb', 'queen', 'transwarp', 'borg queen', 'best worlds', 'nanoprobes', 'ship', 'unimatrix', 'worlds', 'lore', 'drones', 'drone', 'shelby', 'assimilated', 'cube', 'will', 'descent', 'collective', 'borg ship', 'hugh', 'borg']
['shoals', 'rocks shoals', 'call arms', 'female', 'purgatory', 'vor