# Topic Modeling on Minimally Processed Text
## Part 00: TF-IDF + NMF

In [1]:
import gc
from sklearn.decomposition import NMF

gc.enable()

In [2]:
# ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# # Get a sorted list of the objects and their sizes
# sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

In [3]:
import pickle

with open("Data/df.pkl", 'rb') as picklefile:
    df = pickle.load(picklefile)

Let's do our analysis on stemmed words so that same topic is not split (e.g. "word" and "words" should belong to the same topic):

In [4]:
# import nltk
# from textblob import TextBlob
# stemmer = nltk.stem.porter.PorterStemmer()

# def stem_getter(text):
#     return " ".join([stemmer.stem(word) for word in TextBlob(text).words])

# df.raw_text = df.raw_text.map(stem_getter)

This time around, let's remove words in all caps: they are used to indicate character lines. Using them will just create topics identifying major characters of a show/movie which is not helpful. Let's also remove non-letter characters along the way:

In [5]:
import re

In [6]:
def cap_remover(text):
    text = re.sub(r'[A-Z]+(?![a-z])', '', text)
    text = re.sub(r'[\d]+', '', text)
    text = re.sub(r' +', ' ', text)
    return re.sub(r"[^\w' ]", '', text)

In [7]:
df.raw_text = df.raw_text.map(cap_remover)

In [8]:
# def stopword_remover(text):
#     return " ".join([word for word in text.lower().split() if word not in stoplist])

In [9]:
# df.raw_text = df.raw_text.map(stopword_remover)

In [10]:
# df.raw_text[0]

In [11]:
# df.head()

In [12]:
%pylab inline
import numpy 
import matplotlib.pyplot as plt
import sklearn
# Import all of the scikit learn stuff
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [13]:
docs = df.raw_text

I have also used a large list of keywords from [here](http://www.ranks.nl/stopwords) and supplemented it with Star Trek specific terms discovered in the initial LDA model so that they are not used in topic analysis:

In [14]:
with open('Data/stopwords.txt') as f:
     content = (f.read()).split()#f.readlines()
# for i in range(len(content)):
#     content[i] = content[i].replace("\n", "").lower()
stoplist = sorted(list(set(content)))
# texts = [" ".join([word for word in cluster_dict[i].lower().split() if word not in stoplist])
#          for i in cluster_dict.keys()]

In [15]:
vectorizer = TfidfVectorizer(stop_words = stoplist, ngram_range=(1, 3))
dtm = vectorizer.fit_transform(docs) 
# pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names()).head(10)

In [16]:
# vectorizer.get_feature_names()

Let's search for th optimal number of topics to explore (e.g. search until topics start make sense):

In [17]:
for num_topics in range(2,31):
    nmf_model = NMF(num_topics)

    dtm_nmf = nmf_model.fit_transform(dtm)

    # use NMF to attempt Topic Modeling
    words = vectorizer.get_feature_names()

    # get num_topic_words top topic words:
    num_topic_words = 30

    # iterate through our eigenvectors
    print("The topics for {} topic NMF are:".format(num_topics))
    for r in nmf_model.components_:
        # sort values associated with each dimension 
        a=sorted([(v,i) for i,v in enumerate(r)])[-num_topic_words:]
        # map back to words
        print([words[i[1]] for i in a])
    print()

The topics for 2 topic NMF are:
['didn', 'years', 'vessel', 'vulcan', 'course', 'lieutenant', 'planet', 'help', 'good', 'power', 'find', 'three', 'commander', 'people', 'room', 'viewscreen', 'crew', 'engineering', 'warp', 'sickbay', 'doctor', 'well', 'mister', 'going', 'time', 'sir', 'will', 'borg', 'ship', 'bridge']
['eyes', 'weyoun', 'winn', 'bajoran', 'dominion', 'station', 'martok', 'console', 'klingon', 'takes', 'face', 'reacts', 'door', 'cardassian', 'continuing', 'nods', 'going', 'damar', 'ezri', 'time', 'will', 'smiles', 'moment', 'moves', 'turns', 'jem', 'hadar', 'jem hadar', 'jake', 'beat']

The topics for 3 topic NMF are:
['years', 'programme', 'didn', 'computer', 'course', 'help', 'lieutenant', 'power', 'three', 'vulcan', 'good', 'planet', 'find', 'commander', 'crew', 'viewscreen', 'people', 'room', 'engineering', 'warp', 'sickbay', 'doctor', 'well', 'going', 'mister', 'time', 'will', 'sir', 'ship', 'bridge']
['eyes', 'weyoun', 'bajoran', 'winn', 'klingon', 'station', 'cons

['earth', 'ships', 'brig', 'weapon control', 'gralik', 'sir', 'insectoids', 'spheres', 'species', 'hayes', 'aquatics', 'reptilian bridge', 'shuttlepod', 'chemist', 'insectoid', 'shran', 'centre', 'command centre', 'bridge', 'ship', 'council', 'sphere', 'degra ship', 'trellium', 'reptilians', 'rajiin', 'weapon', 'reptilian', 'degra', 'xindi']

The topics for 8 topic NMF are:
['kazon', 'didn', 'holodeck', 'commander', 'course', 'three', 'power', 'lieutenant', 'planet', 'vulcan', 'help', 'room', 'find', 'programme', 'good', 'people', 'viewscreen', 'crew', 'warp', 'engineering', 'sickbay', 'doctor', 'well', 'sir', 'time', 'going', 'mister', 'will', 'ship', 'bridge']
['hand', 'table', 'romulan', 'man', 'station', 'head', 'suddenly', 'cardassian', 'going', 'room', 'eyes', 'commander', 'ro', 'takes', 'face', 'door', 'klingon', 'smiles', 'nods', 'ship', 'moment', 'reacts', 'time', 'console', 'turns', 'will', 'sir', 'moves', 'continuing', 'beat']
['borg queen chamber', 'queen chamber', 'cargo b

['ferenginar', 'body parts', 'magnificent ferengi', 'rules', 'family business', 'latinum', 'motive', 'grand', 'krax', 'prophet', 'acquisition', 'leeta', 'pel', 'bar', 'rules acquisition', 'grand nagus', 'brother', 'prophet motive', 'nilva', 'moogie', 'ishka', 'ferengi love', 'profit', 'du', 'brunt', 'maihar', 'maihar du', 'ferengi', 'zek', 'nagus']
['orbit', 'men', 'commodore', 'vessel', 'saavik', 'course', 'yeoman', 'phaser', 'earth', 'transporter room', 'surface', 'planet surface', 'time', 'man', 'aye', 'lieutenant', 'transporter', 'room', 'well', 'power', 'mister scott', 'pike', 'doctor', 'ship', 'planet', 'will', 'bridge', 'scott', 'mister', 'sir']
['sins', 'bat leth', 'bat', 'leth', 'father', 'sons mogh', 'etor', 'toral', 'sins father', 'mpec', 'grilka', 'sons', 'mogh', 'empire', 'sword', 'apocalypse', 'kahless', 'ehleyr', 'council', 'will', 'warrior', 'klingons', 'beat', 'duras', 'redemption', 'kor', 'martok', 'kurn', 'klingon', 'gowron']

The topics for 11 topic NMF are:
['lieut

['phase', 'warp', 'relays', 'spacedock', 'armoury officer', 'pineapple', 'birthday', 'dinner', 'likes eat', 'green ship', 'ship armoury', 'online', 'bromelin', 'vulcan', 'launch bay', 'ship', 'favourite', 'cannon', 'recoil', 'favourite food', 'amplifier', 'metres', 'natalie', 'jupiter', 'jupiter station', 'sir', 'bridge', 'monitor', 'cannons', 'armoury']

The topics for 13 topic NMF are:
['lieutenant', 'astrometrics', 'course', 'computer', 'didn', 'three', 'commander', 'mister', 'find', 'delta flyer', 'delta', 'good', 'help', 'kazon', 'people', 'warp', 'viewscreen', 'holodeck', 'flyer', 'crew', 'engineering', 'programme', 'doctor', 'well', 'sickbay', 'will', 'time', 'going', 'ship', 'bridge']
['hand', 'long', 'man', 'doctor', 'table', 'computer', 'station', 'head', 'suddenly', 'cardassian', 'room', 'commander', 'eyes', 'takes', 'going', 'face', 'ship', 'door', 'sir', 'nods', 'smiles', 'moment', 'console', 'reacts', 'will', 'time', 'turns', 'moves', 'continuing', 'beat']
['assimilation'

['warp', 'phase', 'relays', 'spacedock', 'armoury officer', 'pineapple', 'birthday', 'dinner', 'likes eat', 'green ship', 'ship armoury', 'online', 'bromelin', 'vulcan', 'launch bay', 'ship', 'favourite', 'cannon', 'recoil', 'favourite food', 'amplifier', 'metres', 'natalie', 'jupiter', 'jupiter station', 'sir', 'bridge', 'monitor', 'cannons', 'armoury']
['life', 'cardassian', 'promenade', 'cardassians', 'reckoning', 'vedek winn', 'nerys', 'support', 'orb', 'vedek bareil', 'major', 'bek', 'life support', 'fascination', 'emissary', 'li', 'circle', 'neela', 'kubus', 'beat', 'bajor', 'kai', 'shakaar', 'bajoran', 'resurrection', 'collaborator', 'prophets', 'vedek', 'winn', 'bareil']
['demilitarized zone', 'demilitarized', 'chroniton', 'orta', 'ship', 'kennelly', 'young ro', 'parem', 'cardassians', 'santos', 'console', 'young', 'brossmer', 'cardassian', 'strike', 'phase', 'power play', 'ensign', 'mirok', 'macias', 'macduff', 'beat', 'kalita', 'rascals', 'conundrum', 'preemptive', 'preemptiv

['ferenginar', 'body parts', 'magnificent ferengi', 'rules', 'family business', 'latinum', 'motive', 'grand', 'leeta', 'krax', 'acquisition', 'prophet', 'pel', 'bar', 'rules acquisition', 'grand nagus', 'brother', 'prophet motive', 'nilva', 'moogie', 'ishka', 'ferengi love', 'profit', 'du', 'brunt', 'maihar', 'maihar du', 'ferengi', 'zek', 'nagus']
['creature', 'vessel', 'men', 'course', 'mudd', 'commodore', 'yeoman', 'earth', 'phaser', 'transporter room', 'surface', 'time', 'planet surface', 'man', 'lieutenant', 'aye', 'transporter', 'room', 'well', 'power', 'doctor', 'mister scott', 'pike', 'ship', 'planet', 'will', 'bridge', 'scott', 'mister', 'sir']
['sins', 'bat', 'bat leth', 'father', 'leth', 'sons mogh', 'etor', 'toral', 'sins father', 'mpec', 'grilka', 'sons', 'mogh', 'empire', 'sword', 'apocalypse', 'kahless', 'council', 'ehleyr', 'will', 'warrior', 'klingons', 'beat', 'duras', 'redemption', 'kor', 'martok', 'kurn', 'klingon', 'gowron']
['cardassians', 'laren', 'cardassian', '

['survivors encampment', 'laser', 'mister', 'ship', 'zoo', 'keeper', 'read thoughts', 'screen', 'starbase', 'illusions', 'flash', 'specimens', 'planet', 'pike cell', 'commodore', 'magistrate', 'read', 'mendez', 'thoughts', 'specimen', 'survivors', 'sir', 'cage', 'hearing room', 'talosians', 'talosian', 'illusion', 'talos', 'vina', 'pike']

The topics for 18 topic NMF are:
['three', 'lab', 'life', 'course', 'mess hall', 'ensign', 'commander', 'lieutenant', 'astrometrics', 'find', 'viewscreen', 'computer', 'warp', 'people', 'good', 'mister', 'help', 'crew', 'holodeck', 'kazon', 'engineering', 'well', 'programme', 'time', 'doctor', 'sickbay', 'will', 'going', 'ship', 'bridge']
['station', 'panel', 'long', 'man', 'hand', 'table', 'doctor', 'computer', 'head', 'suddenly', 'room', 'going', 'commander', 'takes', 'eyes', 'face', 'ship', 'door', 'nods', 'smiles', 'moment', 'console', 'reacts', 'sir', 'time', 'will', 'turns', 'moves', 'continuing', 'beat']
['bridge', 'borg cube', 'borg queen cha

['species', 'will', 'time', 'expanse', 'shuttlepod', 'going', 'earth', 'kemocite', 'aquatic', 'gralik', 'sir', 'weapon control', 'reptilian bridge', 'insectoids', 'hayes', 'insectoid', 'spheres', 'aquatics', 'centre', 'command centre', 'bridge', 'ship', 'council', 'sphere', 'reptilians', 'degra ship', 'weapon', 'reptilian', 'xindi', 'degra']
['ferenginar', 'body parts', 'magnificent ferengi', 'rules', 'family business', 'latinum', 'motive', 'grand', 'leeta', 'krax', 'acquisition', 'prophet', 'pel', 'bar', 'rules acquisition', 'brother', 'grand nagus', 'prophet motive', 'nilva', 'moogie', 'ishka', 'ferengi love', 'profit', 'du', 'brunt', 'maihar', 'maihar du', 'ferengi', 'zek', 'nagus']
['yeoman', 'saavik', 'vaal', 'creature', 'orbit', 'beam', 'course', 'men', 'surface', 'mudd', 'planet surface', 'transporter room', 'phaser', 'time', 'man', 'transporter', 'room', 'lieutenant', 'aye', 'well', 'power', 'doctor', 'mister scott', 'ship', 'planet', 'will', 'bridge', 'scott', 'mister', 'sir']

['life', 'cardassian', 'promenade', 'cardassians', 'reckoning', 'vedek winn', 'nerys', 'support', 'orb', 'vedek bareil', 'major', 'bek', 'life support', 'fascination', 'emissary', 'circle', 'li', 'neela', 'kubus', 'beat', 'bajor', 'kai', 'shakaar', 'bajoran', 'resurrection', 'collaborator', 'prophets', 'vedek', 'winn', 'bareil']
['cave', 'shuttlecraft', 'relora', 'ocampa', 'kazon vessel', 'carey', 'shuttle', 'caretaker', 'transporter', 'technology', 'kazonnistrim', 'kar', 'sects', 'jal', 'will', 'mister suder', 'maquis', 'federation', 'razik', 'ogla', 'mister', 'bridge', 'suder', 'ship', 'nistrim', 'trabe', 'maje', 'kazon ship', 'culluh', 'kazon']
['port', 'launch bay', 'eat', 'well', 'spacedock', 'relays', 'armoury officer', 'pineapple', 'birthday', 'dinner', 'likes eat', 'green ship', 'ship armoury', 'ship', 'online', 'bromelin', 'cannon', 'favourite', 'recoil', 'metres', 'favourite food', 'amplifier', 'natalie', 'jupiter', 'jupiter station', 'sir', 'bridge', 'cannons', 'monitor', 'a

['lokirrim ship', 'holodeck', 'anomaly', 'ranek', 'race', 'ship', 'shields', 'ares command', 'ares command module', 'cockpit', 'module', 'core', 'aft compartment', 'lokirrim', 'well', 'bridge', 'flyer aft compartment', 'going', 'irina', 'delta flyer aft', 'flyer aft', 'command module', 'delta flyer cockpit', 'flyer cockpit', 'ares', 'time', 'slipstream', 'delta', 'delta flyer', 'flyer']
['survivors encampment', 'laser', 'mister', 'ship', 'zoo', 'keeper', 'read thoughts', 'screen', 'starbase', 'illusions', 'flash', 'specimens', 'planet', 'pike cell', 'commodore', 'magistrate', 'read', 'mendez', 'thoughts', 'specimen', 'survivors', 'sir', 'cage', 'hearing room', 'talosians', 'talosian', 'illusion', 'talos', 'vina', 'pike']
['gold press latinum', 'press latinum', 'latinum', 'sir', 'bars', 'sword', 'robin hood', 'ferengi', 'hn', 'horga hn', 'kolos', 'ajur', 'marian', 'nottingham', 'tagus', 'horga', 'risa', 'disc', 'robin', 'beat', 'love', 'guy', 'jeanluc', 'qpid', 'vorgons', 'uthat', 'sova

KeyboardInterrupt: 

In [None]:
3 + 4

In [19]:
print("The topics for {} topic NMF are:".format(num_topics))
for r in nmf_model.components_:
    # sort values associated with each dimension 
    a=sorted([(v,i) for i,v in enumerate(r)])[-30:]
    # map back to words
    print([words[i[1]] for i in a])
print()

The topics for 10 topic NMF are:
['mess hall', 'course', 'shields', 'find', 'lab', 'warp', 'lieutenant', 'viewscreen', 'people', 'mister', 'good', 'computer', 'help', 'astrometrics', 'crew', 'engineering', 'delta', 'kazon', 'delta flyer', 'holodeck', 'flyer', 'well', 'time', 'doctor', 'programme', 'will', 'sickbay', 'going', 'ship', 'bridge']
['hand', 'long', 'computer', 'table', 'head', 'suddenly', 'station', 'room', 'going', 'cardassian', 'eyes', 'commander', 'takes', 'ro', 'face', 'door', 'sir', 'ship', 'klingon', 'smiles', 'nods', 'time', 'moment', 'reacts', 'console', 'will', 'turns', 'continuing', 'moves', 'beat']
['cargo bay', 'borg vessel', 'bridge', 'assimilation', 'borg cube', 'queen chamber', 'sphere', 'vessel', 'assimilate', 'icheb', 'best worlds', 'queen', 'transwarp', 'borg queen', 'ship', 'lore', 'worlds', 'nanoprobes', 'shelby', 'unimatrix', 'drones', 'drone', 'assimilated', 'will', 'descent', 'cube', 'hugh', 'borg ship', 'collective', 'borg']
['call arms', 'vorta', 'er

So it looks like having # topics in NMF model makes the most sense to me. The as far as I can tell are topics:
1. 
2. 
3. 

In [67]:
nmf_model = NMF(5)

dtm_nmf = nmf_model.fit_transform(dtm)
# dtm_nmf = Normalizer(copy=False).fit_transform(dtm_nmf)

In [45]:
dtm.shape

(678, 49838)

So we have 678 documents and the matrix has thousands and thousands of words associated with it.

In [46]:
dtm_nmf[:,:5].round(2)

array([[ 0.  ,  0.13,  0.01,  0.  ,  0.  ],
       [ 0.03,  0.18,  0.  ,  0.  ,  0.  ],
       [ 0.02,  0.13,  0.  ,  0.  ,  0.  ],
       ..., 
       [ 0.01,  0.12,  0.07,  0.  ,  0.  ],
       [ 0.  ,  0.15,  0.03,  0.  ,  0.  ],
       [ 0.01,  0.12,  0.04,  0.  ,  0.  ]])

In [47]:
dtm_nmf[:,5:10].round(2)

array([[ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.01,  0.  ,  0.  ,  0.  ,  0.07],
       ..., 
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.01]])

Looking at the NMF-transformed TF-IDF matrices above, there are no strong topics emerging out of the 10 columns. Therefore, I shall attempt to perform K-means clustering to classify the documents:

In [48]:
dtm_nmf.shape

(678, 10)

And we have two topics now (two columns in the NMF transformed matrix). And the components (word distribution among the two topics) of those two topics are:

In [49]:
nmf_model.components_ #[:,:]

array([[  6.33850614e-03,   0.00000000e+00,   4.53588539e-05, ...,
          0.00000000e+00,   0.00000000e+00,   1.94387040e-05],
       [  0.00000000e+00,   3.36421378e-04,   3.55596430e-04, ...,
          0.00000000e+00,   1.68352417e-05,   0.00000000e+00],
       [  0.00000000e+00,   2.67690335e-05,   2.80993368e-05, ...,
          0.00000000e+00,   0.00000000e+00,   1.42787006e-04],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [50]:
nmf_model.components_.shape

(10, 49838)

Let's just assign each document to its highest feature, a clustering, in a sense:

In [51]:
# nmf_data_clust=[list(in_list).index(max(in_list)) for in_list in dtm_nmf]

In [52]:
len(nmf_data_clust)

678

In [68]:
# 4) use NMF to attempt Topic Modeling
words = vectorizer.get_feature_names()

## get 7 top topic words:

# iterate through our eigenvectors
for r in nmf_model.components_:
    # sort values associated with each dimension 
    a=sorted([(v,i) for i,v in enumerate(r)])[-7:]
    # map back to words
    print([words[i[1]] for i in a])

['well', 'will', 'doctor', 'mister', 'don', 'll', 've']
['time', 'going', 'weyoun', 've', 'll', 'don', 'beat']
['time', 'going', 'degra', 'trip', 'don', 'll', 've']
['reacts', 'turns', 'console', 'will', 'moves', 'continuing', 'beat']
['will', 'cube', 'collective', 'll', 've', 'delta', 'flyer']


In [28]:
x = dtm_nmf[:,0]
y = dtm_nmf[:,10]
plt.scatter(x,y)

IndexError: index 10 is out of bounds for axis 1 with size 10

## Part 01: Word Count + LDA  
I am going to follow the guide [here](http://www.shichaoji.com/category/data-cleasing/) for (hopefully) a better topic visuzlization:

In [None]:
from helper import *
import warnings
warnings.filterwarnings('ignore')

In [None]:
# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')

In [None]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

stopwords = set(stopwords.words('english'))
punctuation = set(string.punctuation) 
lemmatize = WordNetLemmatizer()

def cleaning(article):
    one = " ".join([i for i in article.lower().split() if i not in stopwords])
    two = "".join(i for i in one if i not in punctuation)
    three = " ".join(lemmatize.lemmatize(i) for i in two.split())
    return three

In [None]:
df2 = df.drop(["_id","end","series","start","url","airdate"], axis=1)

In [None]:
text = df2.applymap(cleaning)['raw_text']
text_list = [i.split() for i in text]
len(text_list)

In [None]:
from time import time
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO,
                   filename='running.log',filemode='w')

In [None]:
df = None
df2 = None
docs = None
del df
del df2
del docs
gc.collect()

In [None]:
# gc.get_objects()

In [None]:
# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
dictionary = corpora.Dictionary(text_list)
dictionary.save('dictionary.dict')
print(dictionary)

In [None]:
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in text_list]
corpora.MmCorpus.serialize('corpus.mm', doc_term_matrix)

# print(len(doc_term_matrix))
# print(doc_term_matrix[100])

In [None]:
start = time()
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word = dictionary, passes=50)
print('used: {:.2f}s'.format(time()-start))

In [None]:
with open("Data/ldamodel1.pkl", 'wb') as picklefile:
    pickle.dump(ldamodel, picklefile)

In [None]:
print(ldamodel.print_topics(num_topics=2, num_words=4))

In [None]:
for i in ldamodel.print_topics(): 
    for j in i: print(j)

Brilliant! My topics are based on characters that are in the respective series! Not a very useful information, but it LDA works pretty well. Let's see if we can visuzlize it.

In [None]:
ldamodel.save('topic.model')

In [None]:
from gensim.models import LdaModel
loading = LdaModel.load('topic.model')

In [None]:
print(loading.print_topics(num_topics=2, num_words=4))

In [None]:
def pre_new(doc):
    one = cleaning(doc).split()
    two = dictionary.doc2bow(one)
    return two

In [None]:
import pyLDAvis.gensim
import gensim
pyLDAvis.enable_notebook()

In [None]:
d = gensim.corpora.Dictionary.load('dictionary.dict')
c = gensim.corpora.MmCorpus('corpus.mm')
lda = gensim.models.LdaModel.load('topic.model')

In [None]:
data = pyLDAvis.gensim.prepare(lda, c, d)
data

In [None]:
pyLDAvis.save_html(data,'vis1.html')

This is a pretty good separation. Unfortunately, it is based mostly on very specific TV show / movies terms. Therefore, I will need to remove those if I want better results.