# PART3: LDA
This section focuses on using Latent Dirichlet Allocation (LDA). LDA is a probabilistic topic model that assumes documents are a mixture of topics and that each word in the document is attributable to the document's topics. For our implementation of LDA, we use the Gensim package.

In [1]:
# basic imports 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import cm
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import gensim
from gensim import corpora

# input file
clean_emails_filename = 'preprocessed_emails.csv'
# Read the clean emails prepared in the first part: the first CSV column is
# used as the row index and the first line as the header; rows in which every
# value is missing are dropped.
clean_emails = (
    pd.read_csv(clean_emails_filename, header=0, index_col=0)
    .dropna(how='all')
)

In [2]:
# Preview the first few rows of the loaded emails.
clean_emails.head()

Unnamed: 0,0
0,wow
1,2011 945 latest syria aid qaddafi sid hrc memo...
2,chri steven thx
3,cairo condemn final
4,11 2011 136 huma abedin latest syria aid qadda...


Below are some Gensim-specific conversions; we also filter out extreme words (see the inline comment).

In [3]:
# Split every cleaned email (first column of the frame) into a list of word
# tokens; the dictionary below is built from these token lists.
email_token = list(map(nltk.word_tokenize, clean_emails.iloc[:, 0]))
# Build the Gensim id <-> term mapping from the tokenized emails.
email_dictionary = corpora.Dictionary(email_token)
# Prune extreme terms (comparable to the min/max df step used when creating
# the tf-idf matrix): per the Gensim docs, no_below is a minimum document
# count and no_above a maximum fraction of documents a term may appear in.
email_dictionary.filter_extremes(no_above=0.8, no_below=1)
# Turn each tokenized email into a bag-of-words vector; the resulting list is
# the document-term matrix handed to the LDA model.
email_corpus = list(map(email_dictionary.doc2bow, email_token))

The actual model is run by the following function. We used 10 passes to get better convergence, but we can see that it took a lot of time to run on our machine.

In [4]:
def create_lda_model(num_of_topics, *, passes=10, chunksize=1000,
                     update_every=5, random_state=None):
    """Train a Gensim LDA topic model on the notebook's email corpus.

    The model is fitted on the module-level ``email_corpus`` and uses
    ``email_dictionary`` as the id -> word mapping. Calling the function with
    only ``num_of_topics`` trains with the same settings as before; the
    keyword-only parameters allow tuning them without editing the function.

    Args:
        num_of_topics: Number of topics the model should extract.
        passes: Number of passes over the corpus during training.
        chunksize: Forwarded to ``LdaModel(chunksize=...)``.
        update_every: Forwarded to ``LdaModel(update_every=...)``.
        random_state: Optional seed forwarded to ``LdaModel``. Leave as
            ``None`` for Gensim's default (unseeded) behaviour; pass an int
            to make the learned topics reproducible between runs.

    Returns:
        The trained ``gensim.models.LdaModel`` instance.
    """
    return gensim.models.LdaModel(
        email_corpus,
        id2word=email_dictionary,
        num_topics=num_of_topics,
        update_every=update_every,
        chunksize=chunksize,
        passes=passes,
        random_state=random_state,
    )

In [5]:
# Start with a 5-topic LDA model; the %time magic reports how long training takes.
%time lda_model = create_lda_model(5)

CPU times: user 1min 18s, sys: 704 ms, total: 1min 18s
Wall time: 1min 20s


In [6]:
# Label the output, then list the topics of the 5-topic model together with
# their highest-weighted terms.
print('LDA Model for 5 topics')
lda_model.show_topics()

LDA Model for 5 topics


[(0,
  '0.010*"state" + 0.008*"us" + 0.007*"israel" + 0.005*"isra" + 0.005*"woman" + 0.005*"work" + 0.004*"govern" + 0.004*"secur" + 0.004*"peac" + 0.004*"peopl"'),
 (1,
  '0.015*"fyi" + 0.013*"stategov" + 0.012*"2010" + 0.008*"thank" + 0.007*"email" + 0.007*"get" + 0.007*"see" + 0.006*"2009" + 0.006*"cheryl" + 0.006*"sid"'),
 (2,
  '0.030*"secretari" + 0.025*"offic" + 0.021*"depart" + 0.017*"meet" + 0.015*"state" + 0.012*"room" + 0.009*"arriv" + 0.009*"rout" + 0.008*"time" + 0.008*"privat"'),
 (3,
  '0.007*"obama" + 0.006*"would" + 0.005*"said" + 0.005*"presid" + 0.005*"one" + 0.004*"nt" + 0.004*"new" + 0.004*"american" + 0.004*"parti" + 0.004*"time"'),
 (4,
  '0.031*"call" + 0.010*"nt" + 0.008*"senat" + 0.007*"talk" + 0.007*"tomorrow" + 0.006*"want" + 0.006*"ok" + 0.006*"vote" + 0.005*"today" + 0.005*"know"')]

We use the pyLDAvis library to show the topics. 

In [7]:
# Gensim adapter of pyLDAvis, used to visualize the trained LDA models.
import pyLDAvis.gensim

# Prepare the visualization data for the 5-topic model from the model, the
# bag-of-words corpus and the dictionary.
viz_data = pyLDAvis.gensim.prepare(lda_model, email_corpus, email_dictionary)
# Show the visualization in the notebook output.
pyLDAvis.display(viz_data)

  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)


In [8]:
# Train a second LDA model with 10 topics (timed with %time), then label and
# list its topics.
%time lda_model_10 = create_lda_model(10)
print('LDA Model for 10 topics')
lda_model_10.show_topics()

CPU times: user 1min 17s, sys: 847 ms, total: 1min 18s
Wall time: 1min 20s
LDA Model for 10 topics


[(0,
  '0.014*"state" + 0.011*"us" + 0.005*"depart" + 0.005*"would" + 0.005*"secur" + 0.005*"afghanistan" + 0.004*"work" + 0.004*"new" + 0.004*"nation" + 0.004*"unit"'),
 (1,
  '0.008*"republican" + 0.007*"american" + 0.007*"democrat" + 0.007*"parti" + 0.006*"polit" + 0.006*"israel" + 0.006*"obama" + 0.005*"would" + 0.005*"said" + 0.005*"senat"'),
 (2,
  '0.048*"call" + 0.013*"talk" + 0.011*"2010" + 0.009*"ok" + 0.009*"tomorrow" + 0.009*"nt" + 0.008*"today" + 0.007*"stategov" + 0.007*"huma" + 0.006*"want"'),
 (3,
  '0.046*"fyi" + 0.017*"print" + 0.015*"sid" + 0.015*"14" + 0.014*"email" + 0.012*"pl" + 0.011*"sent" + 0.009*"stategov" + 0.009*"thank" + 0.009*"2010"'),
 (4,
  '0.041*"secretari" + 0.039*"offic" + 0.029*"depart" + 0.024*"meet" + 0.019*"room" + 0.016*"state" + 0.015*"arriv" + 0.014*"rout" + 0.012*"privat" + 0.011*"resid"'),
 (5,
  '0.010*"work" + 0.009*"get" + 0.008*"nt" + 0.008*"see" + 0.007*"go" + 0.007*"want" + 0.007*"know" + 0.007*"speech" + 0.006*"also" + 0.006*"thank"')

In [9]:
# Prepare and show the pyLDAvis visualization of the 10-topic model.
viz_data_10 = pyLDAvis.gensim.prepare(lda_model_10, email_corpus, email_dictionary)
pyLDAvis.display(viz_data_10)

In [10]:
# Train an LDA model with 25 topics (timed with %time), then label and list
# its topics.
# NOTE(review): show_topics() is called without arguments; Gensim documents a
# default of num_topics=10, so presumably only a subset of the 25 topics is
# listed here - confirm, or pass num_topics explicitly.
%time lda_model_25 = create_lda_model(25)
print('LDA Model for 25 topics')
lda_model_25.show_topics()

CPU times: user 1min 26s, sys: 664 ms, total: 1min 26s
Wall time: 1min 27s
LDA Model for 25 topics


[(2,
  '0.008*"updat" + 0.007*"strobe" + 0.007*"brook" + 0.006*"libyan" + 0.006*"militia" + 0.005*"talbott" + 0.005*"presid" + 0.005*"funer" + 0.005*"dep" + 0.004*"http"'),
 (13,
  '0.052*"sid" + 0.028*"memo" + 0.014*"ye" + 0.012*"sent" + 0.012*"hillari" + 0.012*"2010" + 0.011*"via" + 0.010*"uk" + 0.009*"shaun" + 0.009*"sbwhoeop"'),
 (20,
  '0.007*"american" + 0.006*"obama" + 0.006*"polit" + 0.006*"parti" + 0.006*"koch" + 0.005*"would" + 0.005*"republican" + 0.005*"group" + 0.005*"year" + 0.004*"like"'),
 (14,
  '0.046*"call" + 0.017*"talk" + 0.016*"get" + 0.012*"tomorrow" + 0.011*"want" + 0.011*"work" + 0.011*"thx" + 0.010*"know" + 0.010*"ask" + 0.009*"back"'),
 (12,
  '0.019*"percent" + 0.013*"nt" + 0.013*"2010" + 0.012*"favor" + 0.009*"enough" + 0.009*"unfavor" + 0.009*"obama" + 0.008*"13" + 0.008*"10" + 0.008*"vote"'),
 (23,
  '0.057*"stategov" + 0.041*"2010" + 0.029*"cheryl" + 0.024*"mill" + 0.023*"huma" + 0.023*"2009" + 0.020*"abedin" + 0.019*"call" + 0.018*"sullivan" + 0.016*"ok

In [14]:
# Prepare and show the pyLDAvis visualization of the 25-topic model.
viz_data_25 = pyLDAvis.gensim.prepare(lda_model_25, email_corpus, email_dictionary)
pyLDAvis.display(viz_data_25)

In [15]:
# Train an LDA model with 50 topics (timed with %time), then label and list
# its topics.
# NOTE(review): show_topics() is called without arguments; Gensim documents a
# default of num_topics=10, so presumably only a subset of the 50 topics is
# listed here - confirm, or pass num_topics explicitly.
%time lda_model_50 = create_lda_model(50)
print('LDA Model for 50 topics')
lda_model_50.show_topics()

CPU times: user 2min, sys: 2.56 s, total: 2min 3s
Wall time: 2min 7s
LDA Model for 50 topics


[(18,
  '0.049*"senat" + 0.028*"republican" + 0.020*"vote" + 0.019*"democrat" + 0.015*"bill" + 0.015*"boehner" + 0.012*"hous" + 0.011*"nt" + 0.010*"said" + 0.010*"like"'),
 (13,
  '0.058*"pl" + 0.045*"lona" + 0.040*"secretari" + 0.038*"print" + 0.034*"assist" + 0.034*"schedul" + 0.034*"state" + 0.032*"valmoro" + 0.027*"special" + 0.015*"direct"'),
 (1,
  '0.014*"woman" + 0.013*"gender" + 0.012*"state" + 0.011*"secretari" + 0.009*"forc" + 0.009*"issu" + 0.008*"depart" + 0.008*"bureau" + 0.008*"base" + 0.007*"germani"'),
 (32,
  '0.077*"huma" + 0.065*"abedin" + 0.058*"stategov" + 0.041*"2009" + 0.030*"abedinh" + 0.022*"2010" + 0.012*"lauren" + 0.010*"jilotylc" + 0.010*"jiloti" + 0.008*"arturo"'),
 (8,
  '0.009*"greec" + 0.008*"right" + 0.007*"cabl" + 0.007*"exhibit" + 0.006*"illinoi" + 0.006*"woman" + 0.006*"coalit" + 0.006*"paul" + 0.006*"power" + 0.006*"highlight"'),
 (45,
  '0.010*"john" + 0.009*"kenya" + 0.009*"mubarak" + 0.008*"birthday" + 0.007*"one" + 0.007*"touch" + 0.007*"uk" + 

In [16]:
# Prepare and show the pyLDAvis visualization of the 50-topic model.
viz_data_50 = pyLDAvis.gensim.prepare(lda_model_50, email_corpus, email_dictionary)
pyLDAvis.display(viz_data_50)

In [18]:
# Write each prepared visualization to its own HTML file, one per model size.
for prepared, html_name in (
    (viz_data, '5topics.html'),
    (viz_data_10, '10topics.html'),
    (viz_data_25, '25topics.html'),
    (viz_data_50, '50topics.html'),
):
    pyLDAvis.save_html(prepared, html_name)