In [1]:
import pandas as pd

In [2]:
data=pd.read_csv('/Users/aadiharan99/Downloads/emails.csv')

In [3]:
data=data[:1000]

In [4]:
from sklearn.model_selection import train_test_split as tts
from sklearn import pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import numpy as np
import re

In [5]:
from nltk.corpus import stopwords
sw=set(stopwords.words('english'))


In [6]:
def word_preprocessor(text):
  text=re.sub('[^a-zA-Z]', " ", str(text))
  text=text.lower()
  text_tokens=word_tokenize(text)
  text_tokens_no_sw=[w for w in text_tokens if not w in sw]
  text=TreebankWordDetokenizer().detokenize(text_tokens_no_sw)
  return text

In [7]:
data['message']

0      Message-ID: <18782981.1075855378110.JavaMail.e...
1      Message-ID: <15464986.1075855378456.JavaMail.e...
2      Message-ID: <24216240.1075855687451.JavaMail.e...
3      Message-ID: <13505866.1075863688222.JavaMail.e...
4      Message-ID: <30922949.1075863688243.JavaMail.e...
                             ...                        
995    Message-ID: <20430828.1075855696096.JavaMail.e...
996    Message-ID: <18425275.1075855696118.JavaMail.e...
997    Message-ID: <24036204.1075855666506.JavaMail.e...
998    Message-ID: <33307764.1075855696139.JavaMail.e...
999    Message-ID: <15009418.1075855696162.JavaMail.e...
Name: message, Length: 1000, dtype: object

In [8]:
def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['from', 'to']
    for line in lines:
        if ':' not in line:
            message += line.strip()
            email['body'] = message
        else:
            pairs = line.split(':')
            key = pairs[0].lower()
            val = pairs[1].strip()
            if key in keys_to_extract:
                email[key] = val
    return email

In [9]:
def map_to_list(emails, key):
    results = []
    for email in emails:
        if key not in email:
            results.append('')
        else:
            results.append(email[key])
    return results

In [10]:
def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    return {
        'body': map_to_list(emails, 'body'),
        'to': map_to_list(emails, 'to'),
        'from_': map_to_list(emails, 'from')
    }

In [11]:
email_df = pd.DataFrame(parse_into_emails(data.message))
print(email_df.head())

body                       to  \
0                               Here is our forecast     tim.belden@enron.com   
1  Traveling to have a business meeting takes the...  john.lavorato@enron.com   
2                     test successful.  way to go!!!   leah.arsdall@enron.com   
3  Randy,Can you send me a schedule of the salary...    randall.gay@enron.com   
4                                                        greg.piper@enron.com   

                     from_  
0  phillip.allen@enron.com  
1  phillip.allen@enron.com  
2  phillip.allen@enron.com  
3  phillip.allen@enron.com  
4  phillip.allen@enron.com  


In [12]:
email_df['cleaned_text']=email_df['body'].apply(word_preprocessor)

In [13]:
email_df.head()

Unnamed: 0,body,to,from_,cleaned_text
0,Here is our forecast,tim.belden@enron.com,phillip.allen@enron.com,forecast
1,Traveling to have a business meeting takes the...,john.lavorato@enron.com,phillip.allen@enron.com,traveling business meeting takes fun trip espe...
2,test successful. way to go!!!,leah.arsdall@enron.com,phillip.allen@enron.com,test successful way go
3,"Randy,Can you send me a schedule of the salary...",randall.gay@enron.com,phillip.allen@enron.com,randy send schedule salary level everyone thes...
4,,greg.piper@enron.com,phillip.allen@enron.com,


In [14]:
email_df.drop(['body'],1,inplace=True)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf=TfidfVectorizer()

In [16]:
score=tf_idf.fit_transform(email_df['cleaned_text'])

In [17]:
dense=score.todense()
dense1=dense.tolist()

In [18]:
col_names=tf_idf.get_feature_names()

In [19]:
df = pd.DataFrame(dense1,columns = col_names)


In [20]:
import numpy as np
from sklearn.decomposition import NMF

In [21]:
nmf=NMF(10,random_state=42,alpha=0.1,l1_ratio=0.5,init='nndsvd')

In [22]:
arr=nmf.fit_transform(df)

In [23]:
comps=nmf.components_

In [24]:
from sklearn.decomposition import LatentDirichletAllocation

In [25]:
lda=LatentDirichletAllocation(n_components=10)

In [26]:
model1=lda.fit_transform(df)

In [27]:
comps1=lda.components_

In [28]:
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [32]:
print_topics(lda,tf_idf,5)


Topic #0:
ina xls anytime men amon

Topic #1:
ect hou allen forwarded phillip

Topic #2:
gas testing storage natural tried

Topic #3:
forecast password west phillip deskcall

Topic #4:
add buckley trading track grigsby

Topic #5:
help burnet jeff yes phillip

Topic #6:
floor go going phillip service

Topic #7:
call get phillip image would

Topic #8:
phillip please spreadsheet lucy rentroll

Topic #9:
enron ect com hou phillip


In [33]:
print_topics(nmf,tf_idf,5)


Topic #0:
ect hou allen forwarded phillip

Topic #1:
enron com na corp ect

Topic #2:
phillip please call email get

Topic #3:
heizenrader tim forwarded allen phillip

Topic #4:
project loan would land cost

Topic #5:
deskcall hoskins password west reports

Topic #6:
front exterior roof house place

Topic #7:
buckley resume karen track round

Topic #8:
trades detailing notdistinguish buys sells

Topic #9:
lay alan ken ca wants
