In [1]:
from __future__ import print_function
from __future__ import division

import os, sys
import collections
import nltk
import numpy as np
import pandas as pd
from scipy import sparse, hstack
import email
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import *
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import random

# Helper libraries
import constants
import utils
import vocabulary

## Load in Email Bodies
Data source and exploration code: [Kaggle](https://www.kaggle.com/zichen/explore-enron)

In [2]:
# load csv dataset - download from Kaggle (linked above, ~.5gb)

# replace with local path
path = 'C:/Users/Colby/Documents/Berkeley/266_NLP/final_project/data'

# all emails for targeted search
all_emails = pd.read_csv(path + '/emails.csv')
#emails_df = pd.read_csv(path + '/emails.csv', rows = 50000)

print("Shape:", all_emails.shape)
all_emails.head()
#print(all_emails['message'][1])

Shape: (517401, 2)


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [3]:
# citation: Kaggle exploration code
# isolate email body
# run-time: ~ 3 minutes on full dataset

def get_text_from_email(msg):
    '''To get the content from email objects'''
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            parts.append( part.get_payload() )
    return ''.join(parts)

# Parse the emails into a list email objects
messages = list(map(email.message_from_string, all_emails['message']))
#emails_df.drop('message', axis=1, inplace=True)

# Parse content from emails
all_emails['content_str'] = list(map(get_text_from_email, messages))

del messages


# mini version for preprocessing (to save run-time)
size = 50000
emails_df = all_emails.loc[range(size),]
print("Full shape:", all_emails.shape)
print("Mini shape:", emails_df.shape)

all_emails.head()

Full shape: (517401, 3)
Mini shape: (50000, 3)


Unnamed: 0,file,message,content_str
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast\n\n
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.


## Query Raw Contents for Suspicious Phrases
### Store dataframe indexes
Use this block to search raw messages and store the row indexes in *target_ids*

In [4]:
# test phrase - enter between quotes
phrase = "Paso's inartful"

query = all_emails[all_emails['content_str'].str.contains(phrase, case=False)]
print(query.shape)
print("Matching indexes:", query.index.tolist())

(6, 3)
Matching indexes: [114757, 115480, 116217, 117325, 118775, 289272]


In [8]:
# enter matching index from list above
enter_id = 289272
print(query.loc[enter_id,'content_str'])

# store ids here in the form {123: 1, 456: 1, ...}
target_dict = {16212: 1, 450200: 1, 114757: 1}

Food for thought...

 -----Original Message-----
From: 	Sellers, Emily  
Sent:	Tuesday, March 27, 2001 1:01 PM
To:	Moran, Michael P.; Dornan, Dari; Pavlou, Maria; Talcott, Jim; Bargainer, David K.; Crowley, Philip; Holtzman, Staci; Huber, Lee; King Jr., Frazier; Kyle, Candace; McCoppin, Dorothy; Pryor, Tony; Raker, Colleen; Scott, Susan; Soldano, Louis; Wilkie, Kim
Subject:	FW: New York Times Article: REQUIRED READING 

Drew asked me to forward the attached message and article.


 Please take a look at this article.  El Paso's inartful internal documents are creating the same kind of fun for them that Microsoft had during the recent antitrust unpleasantness.  I know that we can't control the way people think about issues such as market behavior, price strategy, etc.  We can, however, remind ourselves and our clients to be very precise and careful in the way we talk about, and particularly, write about, such issues.  I'd like each of us and our clients to keep problems like El Paso's an

### Get Random Sample for Labeling

Reviewed 200 examples in first 50,000 emails

In [9]:
# keep seed = 24 for sample of first 50000 emails
random.seed(24)
rand_num = 100
rand_ids_1 = random.sample(range(2000), rand_num)
rand_ids_2 = random.sample(range(2000,50000), rand_num)
rand_ids = rand_ids_1 + rand_ids_2
rand_set = emails_df.loc[rand_ids,]
#print(rand_ids)
print("Sample set shape:", rand_set.shape)
#for id_ in rand_ids:
#    print(id_)
#    print(rand_set.loc[id_, 'content_str'])

# assign labels based on id
label_dict = {373: 1, 346: 1, 405: 1, 27091: 1, 13966: 1}
rand_set["suspicious_ind"] = np.zeros(len(rand_ids))
for k in label_dict.keys():
    rand_set.loc[k, "suspicious_ind"] = label_dict.get(k)

Sample set shape: (200, 3)


In [10]:
rand_set.head()

Unnamed: 0,file,message,content_str,suspicious_ind
1458,allen-p/deleted_items/381.,Message-ID: <26393700.1075862162007.JavaMail.e...,[IMAGE]\n [IMAGE][IMAGE] [IMAGE] Yahoo! ...,0.0
784,allen-p/all_documents/266.,Message-ID: <16945.1075855671306.JavaMail.evan...,"Kay & Neal,\n\nThanks for remembering my birth...",0.0
1719,allen-p/discussion_threads/212.,Message-ID: <19120705.1075855677934.JavaMail.e...,Attached are two files that illustrate the fo...,0.0
1193,allen-p/all_documents/73.,Message-ID: <22565759.1075855667097.JavaMail.e...,Put me down as a reviewer,0.0
373,allen-p/_sent_mail/439.,Message-ID: <14134673.1075855725697.JavaMail.e...,"Jacques,\n\nStill trying to close the loop on ...",1.0


## Preprocess Raw Email Contents

In [11]:
# tokenize and canonicalize each email to get vocab
# WARNING: THIS STEP TAKES ~40 MINUTES ON LOCAL MACHINE WHEN USING ALL 500K EMAILS
tokenizer = TreebankWordTokenizer()
all_tokens = []
email_tokens = []

for i, body in enumerate(emails_df["content_str"]):
    #get sentence level
    sents = nltk.tokenize.sent_tokenize(body)
    canon = []
    for sent in sents:
        #list of tokens in sentence
        sent_tokens = tokenizer.tokenize(sent)
        canon += utils.canonicalize_words(sent_tokens)
    all_tokens += canon
    email_tokens.append(canon)

emails_df["content_tokens"] = email_tokens
print("Total Number of Tokens:", len(all_tokens))

Total Number of Tokens: 15743385


In [12]:
# build vocab
# V = size
V = 10000
vocab = vocabulary.Vocabulary(all_tokens, size=V)
print("Vocabulary size: {:,}".format(vocab.size))
vocab_ids = vocab.words_to_ids(all_tokens)
print("Unigrams: ", len(vocab.unigram_counts))

Vocabulary size: 10,000
Unigrams:  167039


In [13]:
# preprocess email bodies with unknowns and sentence buffers
emails_preprocessed = []

for i, body in enumerate(emails_df["content_str"]):
    #get sentence level
    sents = nltk.tokenize.sent_tokenize(body)
    list_sents = []
    for sent in sents:
        #list of tokens in sentence
        sent_tokens = tokenizer.tokenize(sent)
        list_sents.append(sent_tokens)
    #preprocessed = list(utils.preprocess_sentences(list_sents, vocab, use_eos=True, emit_ids=False))
    #just keep word IDs
    preprocessed = list(utils.preprocess_sentences(list_sents, vocab, use_eos=True, emit_ids=True))
    emails_preprocessed.append(preprocessed)

emails_df["content_IDS"] = emails_preprocessed
emails_preprocessed[2]

[0, 954, 1894, 4, 1, 0, 219, 7, 193, 59, 59, 1, 0, 59, 1]

In [14]:
# test ID to word
vocab.ids_to_words(emails_df.loc[2, "content_IDS"])

['<s>',
 'test',
 'successful',
 '.',
 '</s>',
 '<s>',
 'way',
 'to',
 'go',
 '!',
 '!',
 '</s>',
 '<s>',
 '!',
 '</s>']

In [15]:
# final preprocessed strings for feature generation
emails_df["content_proc"] = emails_df["content_IDS"].apply(lambda x: ' '.join(vocab.ids_to_words(x)))

## Generate BOW features and longer N-grams

In [16]:
# email length feature
emails_df["email_length"] = emails_df["content_IDS"].apply(len)
emails_df.head()

Unnamed: 0,file,message,content_str,content_tokens,content_IDS,content_proc,email_length
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast\n\n,"[here, is, our, forecast]","[0, 139, 21, 74, 2697, 1]",<s> here is our forecast </s>,6
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...,"[traveling, to, have, a, business, meeting, ta...","[0, 3836, 7, 37, 14, 133, 123, 1804, 5, 1140, ...",<s> traveling to have a business meeting takes...,171
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!,"[test, successful, ., way, to, go, !, !, !]","[0, 954, 1894, 4, 1, 0, 219, 7, 193, 59, 59, 1...",<s> test successful . </s> <s> way to go ! ! <...,15
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s...","[randy, ,, can, you, send, me, a, schedule, of...","[0, 1086, 3, 69, 17, 274, 72, 14, 389, 12, 5, ...","<s> randy , can you send me a schedule of the ...",45
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.,"[let, 's, shoot, for, tuesday, at, DGDG:DGDG, .]","[0, 129, 40, 6326, 16, 245, 34, 55, 4, 1]",<s> let 's shoot for tuesday at DGDG:DGDG . </s>,10


In [106]:
# generate BOW features based on vocab.size V (above) and transform to TF-IDF
# uses canonicalized words
#bow_feats = utils.id_lists_to_sparse_bow(emails_df["content_IDS"], vocab.size)
#print("BOW shape: ", bow_feats.shape)
#transformer = TfidfTransformer()
#bow_tfidf = transformer.fit_transform(bow_feats)
#print("BOW TF-IDF:", bow_tfidf.shape)

# TOO MANY ODD TOKENS CAPTURED BY VOCAB OBJECT

BOW shape:  (50000, 5000)
BOW TF-IDF: (50000, 5000)


In [25]:
# consider longer n-grams
transformer = TfidfTransformer()
vectorize = CountVectorizer(ngram_range=(1, 2), max_df=.1, min_df=20)
n_grams = vectorize.fit_transform(emails_df["content_proc"])
print("N-grams shape: ", n_grams.shape)

n_grams_idf = transformer.fit_transform(n_grams)
print("N-grams TF-IDF:", n_grams_idf.shape)

N-grams shape:  (50000, 75864)
N-grams TF-IDF: (50000, 75864)


In [26]:
# join features
length_sparse = sparse.csr_matrix(emails_df["email_length"]).transpose()
# with length feature
#feature_vects = sparse.hstack([length_sparse, bow_tfidf, n_grams_idf])
#feature_vects = sparse.hstack([bow_tfidf, n_grams_idf])
feature_vects = n_grams_idf
feature_vects.shape

(50000, 75864)

### Fit & Evaluate Simple K-Means Clustering

In [27]:
# fit k-means (n=4, tol=.01, max_iter=100 takes ~20 mins to train)
kmeans = KMeans(n_clusters=5, tol=.01, max_iter=100)
kmeans.fit(feature_vects)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.01, verbose=0)

In [28]:
# evaluate predictions on positively labeled examples
base_preds = kmeans.predict(feature_vects)
print("First 30 clusters:", base_preds[:30])
print("Positive labels:  ", base_preds[list(label_dict.keys())])

First 30 clusters: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
Positive labels:   [2 2 2 2 2]


In [29]:
# evaluate closest examples in cluster 2 based on cosine similarity
suspicious_ids = np.zeros(len(base_preds))
for k in label_dict.keys():
    suspicious_ids[k] = label_dict.get(k)

# enter index of typical cluster
key_cluster = 2

cluster_ids = (base_preds==key_cluster).astype(int)

features_np = sparse.csr_matrix.todense(feature_vects)

labeled_feats = features_np[np.multiply(suspicious_ids, cluster_ids).astype(bool)]
print(labeled_feats.shape)

(5, 75864)


In [43]:
# cosine_similarity
print(label_dict.keys())
cos_sims = cosine_similarity(labeled_feats, feature_vects)
print(cos_sims.shape)
closest = np.argsort(cos_sims, axis = 1)
print(closest[:,-10:])
print(cos_sims[1,985])

dict_keys([346, 27091, 373, 13966, 405])
(5, 50000)
[[21177 31738 39744 13961 11650 15532  7014   966   346  2759]
 [  985  2779  1893  2769   356   976  1879   373  2787   995]
 [  357   358  1892  2771   978   977  2820  1855   405  1027]
 [17248 10977 14371 11429 16116 13982 11606 13966  9998 17433]
 [20382 19225 29563 21933 27092 30259 24055 31124 27091 20569]]
0.164362146998


In [37]:
# nearest emails flagged:
emails_df.loc[985, "content_proc"]

"<s> jacques , the agreement looks fine . </s> <s> my only comment is that george and larry might object to the language that `` the bank that was requested to finance the construction of the project declined to make the loan based on the high costs of the construction of the project '' . </s> <s> <unk> , that bank lowered the loan amount based on lower estimates of <unk> which <unk> the amount of equity that would be required . </s> <s> did i loan them $ DGDGDGDGDGDGDG ? </s> <s> i thought it was less . </s> <s> regarding exhibit a , the assets include : the land , <unk> plans , engineering completed , appraisal , and <unk> study . </s> <s> most of these items are in a state of partial completion by the consultants . </s> <s> i have been speaking directly to the architect , engineer , and <unk> engineer . </s> <s> i am unclear on what is the best way to proceed with these consultants . </s> <s> the obligations should include the fees owed to the consultants above . </s> <s> do we need