In [37]:
from __future__ import print_function
from __future__ import division

import os, sys
import collections
import nltk
import numpy as np
import pandas as pd
from scipy import sparse, hstack
import email
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import *
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import random

# Helper libraries
import constants
import utils
import vocabulary

## Load in Email Bodies
Data source and exploration code: [Kaggle](https://www.kaggle.com/zichen/explore-enron)

In [2]:
# load csv dataset - download from Kaggle (linked above, ~.5gb)

# replace with local path
path = 'C:/Users/Colby/Documents/Berkeley/266_NLP/final_project/data'

# all emails for targeted search
all_emails = pd.read_csv(path + '/emails.csv')
#emails_df = pd.read_csv(path + '/emails.csv', rows = 50000)

print("Shape:", all_emails.shape)
all_emails.head()
#print(all_emails['message'][1])

Shape: (517401, 2)


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [3]:
# citation: Kaggle exploration code
# isolate email body
# run-time: ~ 3 minutes on full dataset

def get_text_from_email(msg):
    '''To get the content from email objects'''
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            parts.append( part.get_payload() )
    return ''.join(parts)

# Parse the emails into a list email objects
messages = list(map(email.message_from_string, all_emails['message']))
#emails_df.drop('message', axis=1, inplace=True)

# Parse content from emails
all_emails['content_str'] = list(map(get_text_from_email, messages))

del messages


# mini version for preprocessing (to save run-time)
size = 50000
emails_df = all_emails.loc[range(size),]
print("Full shape:", all_emails.shape)
print("Mini shape:", emails_df.shape)

all_emails.head()

Full shape: (517401, 3)
Mini shape: (50000, 3)


Unnamed: 0,file,message,content_str
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast\n\n
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.


## Query Raw Contents for Suspicious Phrases
### Store dataframe indexes
Use this block to search raw messages and store the row indexes in *target_ids*

In [203]:
# test phrase - enter between quotes
phrase = "and stuff"

query = all_emails[all_emails['content_str'].str.contains(phrase, case=False)]
print(query.shape)
print("Matching indexes:", query.index.tolist())

(160, 3)
Matching indexes: [10682, 13050, 13303, 13670, 14541, 14730, 16044, 16212, 16939, 18363, 21033, 21037, 21517, 22948, 22955, 23896, 27214, 31595, 31599, 32079, 36857, 36889, 37145, 37341, 39709, 46081, 51752, 58238, 59058, 60853, 60854, 60859, 63049, 63065, 63081, 63168, 63254, 63373, 66148, 67658, 69838, 72252, 72256, 72257, 72740, 72759, 72813, 72939, 73895, 74539, 76896, 79697, 83561, 85227, 86243, 88247, 94130, 96350, 97913, 101716, 107626, 110149, 119809, 144020, 145633, 146735, 146776, 148558, 148966, 150518, 165518, 166075, 180538, 195232, 212248, 213120, 221997, 265332, 270277, 271537, 275666, 276081, 276083, 276085, 276086, 277318, 277319, 277321, 277324, 279530, 292182, 292297, 296283, 298164, 298166, 298167, 298168, 298169, 298170, 298171, 298172, 298173, 298174, 298175, 298177, 298178, 298179, 298180, 298181, 298186, 303544, 303545, 307533, 307535, 310739, 310740, 310741, 310742, 320741, 320742, 323096, 329104, 348760, 352897, 355211, 357850, 358102, 368444, 372226,

In [229]:
# enter matching index from list above
enter_id = 46081
print(query.loc[enter_id,'content_str'])

# store ids here in the form {123: 1, 456: 1, ...}
target_ids = {16212: 1, 450200: 1}

this is cute,  things like this help during difficult times

-----Original Message-----
From: Rick Cates [mailto:rd_cates@yahoo.com]
Sent: Monday, December 03, 2001 11:03 AM
To: Brickman, Ronnie; Guy Bruner; Campbell, Larry; Chris Gerald; raymond
hanover; Haug, Steve; Ben Howard; Bill Jones; Loveless, Rick; Nichols,
Leo; John Rose
Subject: Fwd: Retirement in a Trailer Park


> RETIREMENT IN A TRAILER  PARK THRU THE EYES OF A
> CHILD This is just too
> cute! 
> 
> After a spring  break, a teacher asked her young
> pupils how they  spent
> the holidays. One  child wrote the following: 
> 
> "We always used to spend the holidays with Grandma
> and Grandpa. They used
>  to live here in a big brick house, but Grandpa got
> retarded and they
> moved  to Arizona.    
> 
> Now they live in a  place with a lot of other
> retarded  people.  They
> live in a tin box  and have rocks painted green to
> look like grass.  They
> ride around on big tricycles and wear name tags
> because they don't  kn

### Get Random Sample for Labeling

Reviewed 200 examples in first 50,000 emails

In [6]:
# keep seed = 24 for sample of first 50000 emails
random.seed(24)
rand_num = 100
rand_ids_1 = random.sample(range(2000), rand_num)
rand_ids_2 = random.sample(range(2000,50000), rand_num)
rand_ids = rand_ids_1 + rand_ids_2
rand_set = emails_df.loc[rand_ids,]
#print(rand_ids)
print("Sample set shape:", rand_set.shape)
#for id_ in rand_ids:
#    print(id_)
#    print(rand_set.loc[id_, 'content_str'])

# assign labels based on id
label_dict = {373: 1, 346: 1, 405: 1, 27091: 1, 13966: 1}
rand_set["suspicious_ind"] = np.zeros(len(rand_ids))
for k in label_dict.keys():
    rand_set.loc[k, "suspicious_ind"] = label_dict.get(k)

Sample set shape: (200, 3)


In [7]:
rand_set.head()

Unnamed: 0,file,message,content_str,suspicious_ind
1458,allen-p/deleted_items/381.,Message-ID: <26393700.1075862162007.JavaMail.e...,[IMAGE]\n [IMAGE][IMAGE] [IMAGE] Yahoo! ...,0.0
784,allen-p/all_documents/266.,Message-ID: <16945.1075855671306.JavaMail.evan...,"Kay & Neal,\n\nThanks for remembering my birth...",0.0
1719,allen-p/discussion_threads/212.,Message-ID: <19120705.1075855677934.JavaMail.e...,Attached are two files that illustrate the fo...,0.0
1193,allen-p/all_documents/73.,Message-ID: <22565759.1075855667097.JavaMail.e...,Put me down as a reviewer,0.0
373,allen-p/_sent_mail/439.,Message-ID: <14134673.1075855725697.JavaMail.e...,"Jacques,\n\nStill trying to close the loop on ...",1.0


## Preprocess Raw Email Contents

In [8]:
# tokenize and canonicalize each email to get vocab
# WARNING: THIS STEP TAKES ~40 MINUTES ON LOCAL MACHINE WHEN USING ALL 500K EMAILS
tokenizer = TreebankWordTokenizer()
all_tokens = []
email_tokens = []

for i, body in enumerate(emails_df["content_str"]):
    #get sentence level
    sents = nltk.tokenize.sent_tokenize(body)
    canon = []
    for sent in sents:
        #list of tokens in sentence
        sent_tokens = tokenizer.tokenize(sent)
        canon += utils.canonicalize_words(sent_tokens)
    all_tokens += canon
    email_tokens.append(canon)

emails_df["content_tokens"] = email_tokens
print("Total Number of Tokens:", len(all_tokens))

Total Number of Tokens: 15743385


In [72]:
# build vocab
# V = size
V = 5000
vocab = vocabulary.Vocabulary(all_tokens, size=V)
print("Vocabulary size: {:,}".format(vocab.size))
vocab_ids = vocab.words_to_ids(all_tokens)
print("Unigrams: ", len(vocab.unigram_counts))

Vocabulary size: 5,000
Unigrams:  167039


In [73]:
# preprocess email bodies with unknowns and sentence buffers
emails_preprocessed = []

for i, body in enumerate(emails_df["content_str"]):
    #get sentence level
    sents = nltk.tokenize.sent_tokenize(body)
    list_sents = []
    for sent in sents:
        #list of tokens in sentence
        sent_tokens = tokenizer.tokenize(sent)
        list_sents.append(sent_tokens)
    #preprocessed = list(utils.preprocess_sentences(list_sents, vocab, use_eos=True, emit_ids=False))
    #just keep word IDs
    preprocessed = list(utils.preprocess_sentences(list_sents, vocab, use_eos=True, emit_ids=True))
    emails_preprocessed.append(preprocessed)

emails_df["content_IDS"] = emails_preprocessed
emails_preprocessed[2]

[0, 954, 1895, 4, 1, 0, 219, 7, 193, 59, 59, 1, 0, 59, 1]

In [74]:
# test ID to word
vocab.ids_to_words(emails_df.loc[2, "content_IDS"])

['<s>',
 'test',
 'successful',
 '.',
 '</s>',
 '<s>',
 'way',
 'to',
 'go',
 '!',
 '!',
 '</s>',
 '<s>',
 '!',
 '</s>']

In [104]:
# final preprocessed strings for feature generation
emails_df["content_proc"] = emails_df["content_IDS"].apply(lambda x: ' '.join(vocab.ids_to_words(x)))

## Generate BOW features and longer N-grams

In [105]:
# email length feature
emails_df["email_length"] = emails_df["content_IDS"].apply(len)
emails_df.head()

Unnamed: 0,file,message,content_str,content_tokens,content_IDS,email_length,content_proc
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast\n\n,"[here, is, our, forecast]","[0, 139, 21, 74, 2697, 1]",6,<s> here is our forecast </s>
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...,"[traveling, to, have, a, business, meeting, ta...","[0, 3835, 7, 37, 14, 133, 123, 1804, 5, 1140, ...",171,<s> traveling to have a business meeting takes...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!,"[test, successful, ., way, to, go, !, !, !]","[0, 954, 1895, 4, 1, 0, 219, 7, 193, 59, 59, 1...",15,<s> test successful . </s> <s> way to go ! ! <...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s...","[randy, ,, can, you, send, me, a, schedule, of...","[0, 1086, 3, 69, 17, 274, 72, 14, 389, 12, 5, ...",45,"<s> randy , can you send me a schedule of the ..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.,"[let, 's, shoot, for, tuesday, at, DGDG:DGDG, .]","[0, 129, 40, 2, 16, 245, 34, 55, 4, 1]",10,<s> let 's <unk> for tuesday at DGDG:DGDG . </s>


In [106]:
# generate BOW features based on vocab.size V (above) and transform to TF-IDF
# uses canonicalized words
bow_feats = utils.id_lists_to_sparse_bow(emails_df["content_IDS"], vocab.size)
print("BOW shape: ", bow_feats.shape)
transformer = TfidfTransformer()
bow_tfidf = transformer.fit_transform(bow_feats)
print("BOW TF-IDF:", bow_tfidf.shape)

# TOO MANY ODD TOKENS CAPTURED BY VOCAB OBJECT

BOW shape:  (50000, 5000)
BOW TF-IDF: (50000, 5000)


In [167]:
# consider longer n-grams
vectorize = CountVectorizer(ngram_range=(1, 3), max_df=.2, min_df=.01)
n_grams = vectorize.fit_transform(emails_df["content_proc"])
print("N-grams shape: ", n_grams.shape)

n_grams_idf = transformer.fit_transform(n_grams)
print("N-grams TF-IDF:", n_grams_idf.shape)

N-grams shape:  (50000, 3939)
N-grams TF-IDF: (50000, 3939)


In [168]:
# join features
length_sparse = sparse.csr_matrix(emails_df["email_length"]).transpose()
# with length feature
#feature_vects = sparse.hstack([length_sparse, bow_tfidf, n_grams_idf])
#feature_vects = sparse.hstack([bow_tfidf, n_grams_idf])
feature_vects = n_grams_idf
feature_vects.shape

(50000, 3939)

### Fit & Evaluate Simple K-Means Clustering

In [169]:
# fit k-means (n=4, tol=.01, max_iter=100 takes ~20 mins to train)
kmeans = KMeans(n_clusters=4, tol=.01, max_iter=100)
kmeans.fit(feature_vects)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=4, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.01, verbose=0)

In [170]:
# evaluate predictions on positively labeled examples
base_preds = kmeans.predict(feature_vects)
print("First 30 clusters:", base_preds[:30])
print("Positive labels:  ", base_preds[list(label_dict.keys())])

First 30 clusters: [0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 0 2 0 0]
Positive labels:   [0 0 0 0 0]


In [173]:
# evaluate closest examples in cluster 2 based on cosine similarity
suspicious_ids = np.zeros(len(base_preds))
for k in label_dict.keys():
    suspicious_ids[k] = label_dict.get(k)

# enter index of typical cluster
key_cluster = 0

cluster_ids = (base_preds==key_cluster).astype(int)

features_np = sparse.csr_matrix.todense(feature_vects)

labeled_feats = features_np[np.multiply(suspicious_ids, cluster_ids).astype(bool)]
print(labeled_feats.shape)

(5, 3939)


In [174]:
# cosine_similarity
print(label_dict.keys())
cos_sims = cosine_similarity(labeled_feats, feature_vects)
print(cos_sims.shape)
closest = np.argsort(cos_sims, axis = 1)
print(closest[:,0:10])

dict_keys([346, 27091, 373, 13966, 405])
(5, 50000)
[[    0 10648 34952 10650 10651 34951 10653 34950 10655 34949]
 [    0  4375 46359 14860 40413  4371 40412  4369  4368 46362]
 [    0 15079 15080  4657 37824 37820 37816 37815 15107 37808]
 [    0 38573 19107 19105 38576 38577 38578 19100 19099 19098]
 [    0 46059 10570 10571 36744 46055 10575 29488 10583 10584]]


In [188]:
# nearest emails flagged:
emails_df.loc[37820 , "content_proc"]

'<s> lets discuss this before we make offer . </s> <s> rick </s>'