In [1]:
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import pickle

In [2]:
import re
import numpy as np
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import stopwords
# spacy for lemmatization
import spacy
# for plotting
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

  def _figure_formats_changed(self, name, old, new):


In [3]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [4]:
all_files=glob('emails/*.txt')

In [5]:
len(all_files)

11313

In [6]:
with open(all_files[0]) as f:
    contents = f.read()
    f.close()

In [7]:
file_data=[]
for x in tqdm(all_files):
    with open(x, encoding='utf-8',errors='ignore') as f:
        contents = f.read()
        file_data.append(contents)
    f.close()

100%|██████████| 11313/11313 [00:00<00:00, 77319.18it/s]


In [8]:
len(file_data)

11313

In [9]:
df={'emails':file_data,'files':all_files}
df = pd.DataFrame(df)

In [10]:
df.to_csv('emails.csv')

In [11]:
def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['from', 'to']
    for line in lines:
        if ':' not in line:
            message += line.strip()
            email['body'] = message
        else:
            pairs = line.split(':')
            key = pairs[0].lower()
            val = pairs[1].strip()
            if key in keys_to_extract:
                email[key] = val
    return email

In [12]:
def map_to_list(emails, key):
    results = []
    for email in emails:
        if key not in email:
            results.append('')
        else:
            results.append(email[key])
    return results

In [13]:
def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    return {
        'body': map_to_list(emails, 'body'),
        'to': map_to_list(emails, 'to'),
        'from_': map_to_list(emails, 'from')
    }

In [14]:
email_df = pd.DataFrame(parse_into_emails(df.emails))

In [15]:
email_df.head()

Unnamed: 0,body,to,from_
0,>>>>Does anyone know if all the Patrick divisi...,,hammerl@acsu.buffalo.edu (Valerie S. Hammerl)
1,">\tWhile we're on the subject, has anyone else...",,lusky@ccwf.cc.utexas.edu (Jonathan R. Lusky)
2,|>|> >Swatikas were also common in American In...,,hays@ssd.intel.com (Kirk Hays)
3,Hi!...I am searching for packages that could h...,,stjohn@math1.kaist.ac.kr (Ryou Seong Joon)
4,Here are the standings after game 1 of each of...,,andrew@idacom.hp.com (Andrew Scott)


In [16]:
# prep NLTK Stop words

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use','know','get','say','go',
                    'well','run','also'])

In [17]:
email_df.iloc[2]['body']

'|>|> >Swatikas were also common in American Indian markings/painted walls etc.  Is|> >it the Swastika that is bad?|>|> Just want to back this up with a personal anecdote.  My grandparents|> have a Navajo rug made in the 1920\'s, which they received in trade|> from the weaver while living in Flagstaff, Arizona.  The decorative motif|> consists of 4 large black swastikas, one in each corner.  What\'s more, the|> color scheme is black, white, and red.  To the casual glance it would|> undoubtedly appear to be a Nazi relic of some kind.  Yet they owned it|> ten years before Hitler and the National Socialists came to power.|>|> As I recall, they took it down in the 30\'s, and didn\'t feel quite right|> about putting it back up until the 60\'s.  It still draws comments from|> those who don\'t know what it is.Having lived, played, and worked on and near the Navajo reservationfor a number of years, I can confirm this is an ancient pattern,found in petroglyphs dated 800 to 1200 years old.Also, 

In [18]:
# Convert email body to list
data = email_df.body.values.tolist()

In [19]:
# tokenize - break down each sentence into a list of words
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [20]:
data_words = list(sent_to_words(data))

In [21]:
data_words[3]

['hi',
 'am',
 'searching',
 'for',
 'packages',
 'that',
 'could',
 'handle',
 'multi',
 'page',
 'giffiles',
 'are',
 'there',
 'any',
 'on',
 'some',
 'ftp',
 'servers',
 'll',
 'appreciate',
 'one',
 'which',
 'works',
 'on',
 'pc',
 'either',
 'on',
 'dos',
 'or',
 'windows',
 'but',
 'any',
 'package',
 'works',
 'on',
 'unix',
 'will',
 'be',
 'ok',
 'thanks',
 'in',
 'advance']

In [22]:
# Build the bigram and trigram models
bigram = Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = Phrases(bigram[data_words], threshold=100)

In [23]:
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

In [24]:
# See trigram example
trigram_mod[bigram_mod[data_words[200]]]

['on',
 'two',
 'separate',
 'occasions',
 'saw',
 'dick',
 'allen',
 'back',
 'when',
 'he',
 'was',
 'richie',
 'homer',
 'at',
 'shea',
 'off',
 'the',
 'middle',
 'of',
 'the',
 'black',
 'centerfield',
 'hitter',
 'sbackground',
 'screen',
 'think',
 'both',
 'shots',
 'would',
 'have',
 'traveled',
 'feet',
 'jay']

In [25]:
# remove stop_words, make bigrams and lemmatize
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [26]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [27]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

In [28]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [29]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB'])

In [30]:
data_lemmatized[200]

['separate',
 'occasion',
 'see',
 'homer',
 'hitter',
 'sbackground',
 'think',
 'shot',
 'travel',
 'foot']

In [31]:
# create dictionary and corpus both are needed for (LDA) topic modeling

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [32]:
with open ('id2word.pkl', 'wb') as f:
    pickle.dump(id2word,f)
f.close()

In [33]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [34]:
lda_model.print_topics()# The weights reflect how important a keyword is to that topic.

[(0,
  '0.044*"cable" + 0.030*"ram" + 0.030*"instal" + 0.023*"spare" + 0.023*"connector" + 0.021*"hook" + 0.017*"slot" + 0.016*"motherboard" + 0.015*"task" + 0.015*"simms"'),
 (1,
  '0.028*"believe" + 0.026*"claim" + 0.022*"true" + 0.016*"question" + 0.015*"word" + 0.015*"fact" + 0.015*"state" + 0.014*"people" + 0.014*"mean" + 0.013*"law"'),
 (2,
  '0.033*"smoke" + 0.028*"tear_gas" + 0.019*"skin" + 0.017*"leafs" + 0.016*"rich" + 0.016*"mirror" + 0.013*"extract" + 0.011*"closely" + 0.008*"goalie" + 0.006*"harmful"'),
 (3,
  '0.048*"system" + 0.031*"use" + 0.027*"drive" + 0.025*"bit" + 0.022*"driver" + 0.018*"card" + 0.018*"datum" + 0.016*"key" + 0.015*"memory" + 0.014*"computer"'),
 (4,
  '0.046*"oil" + 0.014*"angel" + 0.011*"officially" + 0.009*"standard_disclaimer" + 0.009*"election" + 0.006*"corporate" + 0.004*"bolt" + 0.002*"clever" + 0.000*"brake" + 0.000*"export"'),
 (5,
  '0.056*"apr" + 0.013*"penguin" + 0.008*"temp" + 0.007*"earn" + 0.007*"baerga" + 0.007*"unfair" + 0.006*"human

In [35]:
doc_lda = lda_model[corpus]

In [36]:
# Model perplexity and topic coherence provide a convenient
# measure to judge how good a given topic model is.
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -15.543145384551773


In [37]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4876290346147165


In [38]:
# Visualize the topics
pyLDAvis.enable_notebook(sort=True)
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [39]:
pyLDAvis.display(vis)

In [45]:
lda_model.save('model/my_model.model')

In [46]:
top_topics = lda_model.get_document_topics(corpus[1], minimum_probability=0.0)

In [47]:
lda_model

<gensim.models.ldamodel.LdaModel at 0x7ff93c3a9220>

In [48]:
top_topics

[(0, 0.0014234676),
 (1, 0.027277717),
 (2, 0.00074072013),
 (3, 0.24603324),
 (4, 0.027296685),
 (5, 0.0004717127),
 (6, 0.0017141411),
 (7, 0.1939084),
 (8, 0.28774044),
 (9, 0.001081198),
 (10, 0.0029121547),
 (11, 0.00066121755),
 (12, 0.09771441),
 (13, 0.0015545117),
 (14, 0.033795662),
 (15, 0.0033423344),
 (16, 0.002547096),
 (17, 0.00040381803),
 (18, 0.06607294),
 (19, 0.0033081654)]

In [49]:
lda_model.show_topic(3)

[('system', 0.047838032),
 ('use', 0.03107872),
 ('drive', 0.027355561),
 ('bit', 0.02470283),
 ('driver', 0.021570178),
 ('card', 0.017954001),
 ('datum', 0.017951597),
 ('key', 0.016409446),
 ('memory', 0.01543947),
 ('computer', 0.013766613)]