### Data Cleaning

Data cleaning is essential for LDA. It should be specific to the corpus and guided by data exploration.
Here we clean the emails by removing quoted snippets of previous emails, forwarded emails, email IDs, and the generic footers found in those emails.
Further, using POS tagging in spaCy, I remove every word except nouns, proper nouns, adjectives, and interjections. The inspiration comes from the paper at http://www.aclweb.org/anthology/U15-1013, according to which noun-only topic modeling is more efficient. Once the data is cleaned, I save it for later use in the LDA model.

In [13]:
import numpy as np 
import pandas as pd
import email
import spacy
import re
from tqdm._tqdm_notebook import tqdm_notebook,tnrange,tqdm
from collections import Counter,OrderedDict
from gensim import models,corpora
from gensim.summarization import summarize,keywords
import warnings
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS
%matplotlib inline
pyLDAvis.enable_notebook()
warnings.filterwarnings('ignore')
tqdm_notebook.pandas('Progress')
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
# Load the raw e-mail table; the path is relative to the notebook's working directory.
EMAILS_CSV = 'emails.csv'
all_emails = pd.read_csv(EMAILS_CSV)

In [19]:
## Helper functions
def get_text_from_email(msg):
    '''Return the concatenated payloads of every text/plain part of `msg`.

    `msg` is walked recursively, so nested multipart messages are covered;
    parts with any other content type are ignored.
    '''
    return ''.join(
        part.get_payload()
        for part in msg.walk()
        if part.get_content_type() == 'text/plain'
    )

def split_email_addresses(line):
    '''Split a comma-separated address string into a frozenset.

    Each address is stripped of surrounding whitespace. A falsy `line`
    (None or an empty string) yields None.
    '''
    if not line:
        return None
    return frozenset(address.strip() for address in line.split(','))

In [20]:
# Turn every raw message string into an email message object
messages = [email.message_from_string(raw) for raw in all_emails['message']]

# Add one column per header found in the first message
keys = messages[0].keys()
for key in keys:
    all_emails[key] = [parsed[key] for parsed in messages]

# Plain-text body of each email
all_emails['content'] = [get_text_from_email(parsed) for parsed in messages]

# Split the comma-separated sender / recipient strings
for address_column in ('From', 'To'):
    all_emails[address_column] = all_emails[address_column].map(split_email_addresses)

# The first path component of 'file' becomes 'user'
all_emails['user'] = all_emails['file'].map(lambda path: path.split('/')[0])

# The parsed messages are not needed past this point
del messages
all_emails.drop(['message', 'file', 'Message-ID', 'Content-Transfer-Encoding'], axis=1, inplace=True)
all_emails.head()

Unnamed: 0,Date,From,To,Subject,Mime-Version,Content-Type,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
0,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",(phillip.allen@enron.com),(tim.belden@enron.com),,1.0,text/plain; charset=us-ascii,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n,allen-p
1,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",(phillip.allen@enron.com),(john.lavorato@enron.com),Re:,1.0,text/plain; charset=us-ascii,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,allen-p
2,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",(phillip.allen@enron.com),(leah.arsdall@enron.com),Re: test,1.0,text/plain; charset=us-ascii,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,allen-p
3,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",(phillip.allen@enron.com),(randall.gay@enron.com),,1.0,text/plain; charset=us-ascii,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s...",allen-p
4,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",(phillip.allen@enron.com),(greg.piper@enron.com),Re: Hello,1.0,text/plain; charset=us-ascii,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,allen-p


In [23]:
# Drop further header/metadata columns in place.
unused_columns = [
    'X-cc', 'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName',
    'Content-Type', 'Mime-Version',
]
all_emails.drop(unused_columns, axis=1, inplace=True)

In [24]:
# Count the missing values in each column.
all_emails.isnull().sum()

Date           0
From           0
To         21847
Subject        0
X-From        29
X-To          29
content        0
user           0
dtype: int64

In [29]:
# Keep only the text that precedes the first '<' in X-To, e.g.
# 'Tim Belden <Tim Belden/Enron@EnronXGate>' -> 'Tim Belden'.
# The isinstance check passes None *and* NaN placeholders through unchanged
# (an `is not None` test would call .split on a float NaN), and strip()
# removes the whitespace that is left in front of the '<'.
all_emails['X-To'] = all_emails['X-To'].apply(
    lambda x: x.split('<')[0].strip() if isinstance(x, str) else x
)

In [30]:
# Preview the first rows of the frame.
all_emails.head()

Unnamed: 0,Date,From,To,Subject,X-From,X-To,content,user
0,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",(phillip.allen@enron.com),(tim.belden@enron.com),,Phillip K Allen,Tim Belden,Here is our forecast\n\n,allen-p
1,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",(phillip.allen@enron.com),(john.lavorato@enron.com),Re:,Phillip K Allen,John J Lavorato,Traveling to have a business meeting takes the...,allen-p
2,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",(phillip.allen@enron.com),(leah.arsdall@enron.com),Re: test,Phillip K Allen,Leah Van Arsdall,test successful. way to go!!!,allen-p
3,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",(phillip.allen@enron.com),(randall.gay@enron.com),,Phillip K Allen,Randall L Gay,"Randy,\n\n Can you send me a schedule of the s...",allen-p
4,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",(phillip.allen@enron.com),(greg.piper@enron.com),Re: Hello,Phillip K Allen,Greg Piper,Let's shoot for Tuesday at 11:45.,allen-p


In [22]:
# Number of rows (e-mails) in the frame.
len(all_emails)

517401

In [32]:
# Work on the message bodies only from here on.
email_contents = all_emails['content']

In [37]:
# Patterns applied, in this order, by cleanhtml():
#   - clean html content
#   - clean generic footer
#   - remove forwarded part of email
# They are compiled once here instead of on every call, and written as raw
# strings so that '\*' and '\=' are regex escapes rather than invalid string
# escape sequences. '[\s\S]*' matches any run of characters including
# newlines, exactly like the former '(.|\n)*' but without the per-character
# alternation and capture group.
_EMAIL_CLEANERS = (
    re.compile(r'<.*?>'),                          # <...> spans within one line
    re.compile(r'(?<=\*)[\s\S]*(?=\*)'),           # everything between the first and last '*'
    re.compile(r'(?<=\=)[\s\S]*(?=\=)'),           # everything between the first and last '='
    re.compile(r'ECT@ECT[\s\S]*'),                 # from 'ECT@ECT' to the end of the text
    re.compile(r'---Original Message---[\s\S]*'),  # from the reply marker to the end of the text
)


def cleanhtml(raw_html):
    '''Strip markup, footer-like spans and quoted/forwarded tails from a text.

    Every pattern in `_EMAIL_CLEANERS` is applied in turn and each match is
    replaced by a single space.

    Parameters
    ----------
    raw_html : str
        The text of one e-mail body.

    Returns
    -------
    str
        The text with all matched spans replaced by ' '.
    '''
    # NOTE(review): the '*' and '=' patterns are greedy, so with two or more
    # markers everything between the first and the last one is removed.
    # This matches the original behaviour; confirm it is intended.
    for cleaner in _EMAIL_CLEANERS:
        raw_html = cleaner.sub(' ', raw_html)
    return raw_html

In [38]:
# Run the regex cleaner over every message body.
email_contents_cleaned = email_contents.map(cleanhtml)

In [39]:
# Checkpoint the regex-cleaned bodies to disk: the following steps are slow
# and may take the server down, so this allows resuming from here.
refined_emails_df = email_contents_cleaned.to_frame(name='email')
refined_emails_df.to_csv('enron_clean.csv', index=False)

In [41]:
# Optional restart point: reload the checkpoint saved as 'enron_clean.csv'
# instead of re-running the cells that produce email_contents_cleaned.
# emails = pd.read_csv('enron_clean.csv')
# email_contents_cleaned = emails.email

# Load the spaCy English pipeline through the 'en' shortcut name.
# NOTE(review): the 'en' shortcut presumably requires spaCy 2.x with a linked
# English model; confirm against the installed version.
nlp = spacy.load('en')

In [42]:
# Number of cleaned message bodies about to be processed.
len(email_contents_cleaned)

517401

In [115]:
# Keep only adjectives, interjections, nouns and proper nouns.
def clean_up_spacy(doc, stop_words=None):
    '''Reduce a tagged spaCy document to a string of its content words.

    A token is kept when its coarse POS tag is ADJ, INTJ, NOUN or PROPN, it
    is purely alphabetic, it is not punctuation, and its lower-cased text is
    not a stop word.

    Parameters
    ----------
    doc : iterable of tokens
        Typically a spaCy ``Doc``; each token must expose ``pos_``,
        ``text``, ``is_alpha`` and ``is_punct``.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to the ``STOP_WORDS`` set
        imported from spaCy.

    Returns
    -------
    str
        The texts of the kept tokens joined by single spaces.
    '''
    if stop_words is None:
        stop_words = STOP_WORDS
    keep = {'ADJ', 'INTJ', 'NOUN', 'PROPN'}
    # Compare the token's *text* with the stop list. A Token object never
    # matches the str entries of the set, so the former
    # `token not in STOP_WORDS` test let every stop word through.
    filtered = [
        token.text
        for token in doc
        if token.pos_ in keep
        and token.is_alpha
        and not token.is_punct
        and token.text.lower() not in stop_words
    ]
    return ' '.join(filtered)

In [108]:
# all_texts=[]
# for text in email_contents_cleaned:
#     all_texts.append(clean_up_spacy(nlp(text)))




In [116]:
# Tag every cleaned e-mail with spaCy and keep only its content words.
# - clean_up_spacy() reads only pos_/is_alpha/is_punct/text, so the
#   dependency parser and the entity recogniser are skipped via `disable`.
# - `total` lets tqdm show a percentage and an ETA; nlp.pipe() is a generator
#   and has no length of its own.
# NOTE(review): n_threads is kept from the original call; newer spaCy
# releases presumably deprecate or reject it. Confirm for the installed version.
all_texts = [
    clean_up_spacy(doc)
    for doc in tqdm(
        nlp.pipe(email_contents_cleaned, n_threads=36, batch_size=2500,
                 disable=['parser', 'ner']),
        total=len(email_contents_cleaned),
    )
]





0it [00:00, ?it/s][A[A[A[A



1it [00:50, 50.80s/it][A[A[A[A



1092it [00:50, 21.45it/s][A[A[A[A



1782it [00:50, 34.94it/s][A[A[A[A



1782it [01:05, 27.02it/s][A[A[A[A



2501it [01:52, 22.16it/s][A[A[A[A



3606it [01:52, 31.92it/s][A[A[A[A



4341it [01:53, 38.39it/s][A[A[A[A



4860it [01:53, 42.95it/s][A[A[A[A



4860it [02:05, 38.58it/s][A[A[A[A



5001it [03:20, 24.98it/s][A[A[A[A



5127it [03:20, 25.60it/s][A[A[A[A



5358it [03:20, 26.74it/s][A[A[A[A



5625it [03:20, 28.06it/s][A[A[A[A



6119it [03:20, 30.51it/s][A[A[A[A



7085it [03:20, 35.30it/s][A[A[A[A



7085it [03:36, 32.79it/s][A[A[A[A



7501it [04:34, 27.34it/s][A[A[A[A



8054it [04:34, 29.35it/s][A[A[A[A



8732it [04:34, 31.81it/s][A[A[A[A



9130it [04:34, 33.25it/s][A[A[A[A



9638it [04:34, 35.08it/s][A[A[A[A



9638it [04:46, 33.68it/s][A[A[A[A



10001it [05:11, 32.06it/s][A[A[A[A



11425it [05:12, 36.61

158906it [1:35:02, 27.87it/s][A[A[A[A



159251it [1:35:02, 27.93it/s][A[A[A[A



159587it [1:35:02, 27.98it/s][A[A[A[A



159587it [1:35:17, 27.91it/s][A[A[A[A



160001it [1:36:33, 27.62it/s][A[A[A[A



160334it [1:36:34, 27.67it/s][A[A[A[A



160816it [1:36:34, 27.76it/s][A[A[A[A



161417it [1:36:34, 27.86it/s][A[A[A[A



162168it [1:36:34, 27.99it/s][A[A[A[A



162168it [1:36:47, 27.92it/s][A[A[A[A



162501it [1:37:21, 27.82it/s][A[A[A[A



163052it [1:37:21, 27.91it/s][A[A[A[A



164232it [1:37:21, 28.11it/s][A[A[A[A



164232it [1:37:37, 28.04it/s][A[A[A[A



165001it [1:38:38, 27.88it/s][A[A[A[A



165805it [1:38:38, 28.01it/s][A[A[A[A



166467it [1:38:38, 28.13it/s][A[A[A[A



166913it [1:38:38, 28.20it/s][A[A[A[A



166913it [1:38:57, 28.11it/s][A[A[A[A



167501it [1:40:16, 27.84it/s][A[A[A[A



167885it [1:40:16, 27.91it/s][A[A[A[A



168341it [1:40:16, 27.98it/s][A[A[A[A



168758it [

309871it [3:01:07, 28.51it/s][A[A[A[A



309871it [3:01:23, 28.47it/s][A[A[A[A



310001it [3:02:11, 28.36it/s][A[A[A[A



310682it [3:02:11, 28.42it/s][A[A[A[A



311429it [3:02:11, 28.49it/s][A[A[A[A



312254it [3:02:11, 28.56it/s][A[A[A[A



312254it [3:02:23, 28.53it/s][A[A[A[A



312501it [3:03:11, 28.43it/s][A[A[A[A



313284it [3:03:11, 28.50it/s][A[A[A[A



313983it [3:03:11, 28.57it/s][A[A[A[A



314951it [3:03:11, 28.65it/s][A[A[A[A



314951it [3:03:24, 28.62it/s][A[A[A[A



315001it [3:04:09, 28.51it/s][A[A[A[A



315698it [3:04:09, 28.57it/s][A[A[A[A



316394it [3:04:09, 28.63it/s][A[A[A[A



317423it [3:04:09, 28.73it/s][A[A[A[A



317423it [3:04:24, 28.69it/s][A[A[A[A



317501it [3:05:03, 28.59it/s][A[A[A[A



318438it [3:05:03, 28.68it/s][A[A[A[A



319286it [3:05:03, 28.75it/s][A[A[A[A



319286it [3:05:14, 28.73it/s][A[A[A[A



320001it [3:06:04, 28.66it/s][A[A[A[A



320861it [

487338it [4:17:37, 31.53it/s][A[A[A[A



487338it [4:17:50, 31.50it/s][A[A[A[A



487501it [4:18:58, 31.37it/s][A[A[A[A



488318it [4:18:58, 31.43it/s][A[A[A[A



488704it [4:18:59, 31.45it/s][A[A[A[A



489399it [4:18:59, 31.49it/s][A[A[A[A



489809it [4:18:59, 31.52it/s][A[A[A[A



489809it [4:19:10, 31.50it/s][A[A[A[A



490001it [4:19:54, 31.42it/s][A[A[A[A



490812it [4:19:54, 31.47it/s][A[A[A[A



491801it [4:19:54, 31.54it/s][A[A[A[A



491801it [4:20:10, 31.50it/s][A[A[A[A



492501it [4:20:52, 31.47it/s][A[A[A[A



493086it [4:20:52, 31.50it/s][A[A[A[A



494152it [4:20:52, 31.57it/s][A[A[A[A



494594it [4:20:52, 31.60it/s][A[A[A[A



494594it [4:21:10, 31.56it/s][A[A[A[A



495001it [4:21:36, 31.53it/s][A[A[A[A



495703it [4:21:36, 31.58it/s][A[A[A[A



497249it [4:21:37, 31.68it/s][A[A[A[A



497249it [4:21:51, 31.65it/s][A[A[A[A



497501it [4:23:11, 31.50it/s][A[A[A[A



498206it [

In [124]:

# Extra noise tokens added on top of scikit-learn's English stop list.
# ('ena' was listed twice in the original literal; the resulting set is the same.)
_extra_stopwords = [
    'ect', 'hou', 'com', 'recipient', 'PRON', 'nm', 'pm', 'cc', 'fw', 're',
    'ees', '|', 'http', 'ena', 'go', 'get', 'www', 'et', 'as', 'don', 'id',
    'ask', 'ct',
]
stopwords = ENGLISH_STOP_WORDS.union(_extra_stopwords)
def additional_clean(x, stop_words=None):
    '''Strip non-letters, stop words and very short words from a text.

    Every run of characters other than A-Z / a-z is turned into a space.
    Each remaining word is dropped when it is shorter than three characters
    or when it, or its lower-cased form, is in `stop_words`. The surviving
    words are lower-cased.

    Parameters
    ----------
    x : str
        Text to clean.
    stop_words : collection of str, optional
        Words to remove. Defaults to the notebook-level `stopwords` set.

    Returns
    -------
    str
        The kept words, lower-cased and joined by single spaces.
    '''
    if stop_words is None:
        stop_words = stopwords
    kept = []
    for word in re.sub('[^A-Za-z]+', ' ', x).split():
        lowered = word.lower()
        # Test the lower-cased form as well as the original spelling.
        # Matching only the original spelling let capitalised stop words
        # ("The", "ECT") through, and they were then lower-cased and kept.
        # Upper-case entries such as 'PRON' still match.
        if len(word) > 2 and word not in stop_words and lowered not in stop_words:
            kept.append(lowered)
    return ' '.join(kept)

In [122]:
# Number of processed texts.
len(all_texts)

517401

In [126]:
# Run the second cleaning pass over every text.
all_texts = list(map(additional_clean, all_texts))

In [1]:
# Replace the message bodies with the cleaned texts and save the whole frame.
all_emails['content'] = all_texts
all_emails.to_csv('all_emails.csv', index=False)

In [127]:
# Save the cleaned data for use in topic model
spacy_refined_emails_df = pd.DataFrame({'email': all_texts})
spacy_refined_emails_df.to_csv('emails_spacy_refined.csv', index=False)

In [72]:
# all_emails_df = pd.read_csv('emails_spacy_refined.csv')
# all_texts = list(all_emails_df.email)