In [1]:
from __future__ import print_function
from __future__ import division

import os, sys
import collections
import nltk
import numpy as np
import pandas as pd
import email
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import *
import random

# Helper libraries
import constants
import utils
import vocabulary

## Load in Email Bodies
Data source and exploration code: [Kaggle](https://www.kaggle.com/zichen/explore-enron)

In [2]:
# load csv dataset - download from Kaggle (linked above, ~.5gb)

# Directory that contains emails.csv. The default is the original author's
# local path; set the ENRON_DATA_DIR environment variable to point at your own
# copy instead of editing this cell.
path = os.environ.get('ENRON_DATA_DIR', '/mnt/c/Users/clay/Desktop/w266')
#path = 'C:/Users/Colby/Documents/Berkeley/266_NLP/final_project/data'

# all emails for targeted search
all_emails = pd.read_csv(os.path.join(path, 'emails.csv'))
# to read only the first rows while iterating, the read_csv keyword is `nrows`:
#emails_df = pd.read_csv(os.path.join(path, 'emails.csv'), nrows=50000)

print("Shape:", all_emails.shape)
all_emails.head()
#print(all_emails['message'][1])

Shape: (517401, 2)


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [3]:
# citation: Kaggle exploration code
# isolate email body
# run-time: ~ 3 minutes on full dataset

def get_text_from_email(msg):
    '''Return the body text of an email message object.

    Walks every part of `msg` and concatenates, in walk order and with no
    separator, the payload of each part whose content type is text/plain.
    Parts of any other content type are ignored, so a message without a
    text/plain part yields the empty string.
    '''
    return ''.join(
        section.get_payload()
        for section in msg.walk()
        if section.get_content_type() == 'text/plain'
    )

# Parse the raw message strings into a list of email message objects
messages = list(map(email.message_from_string, all_emails['message']))
#emails_df.drop('message', axis=1, inplace=True)

# Extract the plain-text body of every message into a new column
all_emails['content_str'] = list(map(get_text_from_email, messages))

# the parsed objects are not used again in this cell; drop the reference
del messages


# mini version for preprocessing (to save run-time)
size = 50000
# Take the first `size` rows by position. Unlike the label lookup
# .loc[range(size),], this also works when the frame has fewer than `size`
# rows; .copy() makes emails_df an independent frame, so the columns added to
# it later do not write into a slice of all_emails.
emails_df = all_emails.iloc[:size].copy()
print("Full shape:", all_emails.shape)
print("Mini shape:", emails_df.shape)

all_emails.head()

Full shape: (517401, 3)
Mini shape: (50000, 3)


Unnamed: 0,file,message,content_str
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast\n\n
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.


## Query Raw Contents for Suspicious Phrases
### Store dataframe indexes
Use the cells below to search the raw message bodies for a phrase, inspect the matches, and record the row indexes of the relevant emails in *target_ids*.

In [24]:
# test phrase - enter between quotes
phrase = "during the recent antitrust unpleasantness"

# Case-insensitive search of every email body for the phrase.
# regex=False makes the phrase match literally: by default str.contains treats
# the pattern as a regular expression, so a phrase containing characters such
# as '.', '?', '(' or '$' would match the wrong text or raise an error.
query = all_emails[all_emails['content_str'].str.contains(phrase, case=False, regex=False)]
print(query.shape)
print("Matching indexes:", query.index.tolist())

(6, 3)
Matching indexes: [114757, 115480, 116217, 117325, 118775, 289272]


In [25]:
# pick one of the matching indexes printed by the query cell above
enter_id = 114757
# show the full body of that email for manual review
matched_body = query.loc[enter_id, 'content_str']
print(matched_body)

# record reviewed ids here, keyed by row index, in the form {123: 1, 456: 1, ...}
target_ids = {450200: 1}

Please forward this to all ETS lawyers.  DF

All:  Please take a look at this article.  El Paso's inartful internal documents are creating the same kind of fun for them that Microsoft had during the recent antitrust unpleasantness.  I know that we can't control the way people think about issues such as market behavior, price strategy, etc.  We can, however, remind ourselves and our clients to be very precise and careful in the way we talk about, and particularly, write about, such issues.  I'd like each of us and our clients to keep problems like El Paso's and Microsoft's in mind when we put pen to paper or fingers to keyboard.  Thanks.  df       

---------------------- Forwarded by Drew Fossum/ET&S/Enron on 03/27/2001 12:06 PM ---------------------------





Business/Financial Desk; Section A
Deal for Use of Gas Pipeline Stirs Dispute on Competition
By RICHARD A. OPPEL Jr. and LOWELL BERGMAN

03/26/2001
The New York Times
Page 1, Column 4
c. 2001 New York Times Company

Early last y

### Get Random Sample for Labeling

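Manually reviewed 200 randomly sampled emails from the first 50,000: 100 drawn from the first 2,000 emails and 100 drawn from the remaining 48,000.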

In [6]:
# keep seed = 24 for sample of first 50000 emails
# (the seed and the order of the two random.sample calls determine the sample)
random.seed(24)
rand_num = 100
rand_ids_1 = random.sample(range(2000), rand_num)
rand_ids_2 = random.sample(range(2000,50000), rand_num)
rand_ids = rand_ids_1 + rand_ids_2
# explicit copy so the label column below is added to an independent frame
rand_set = emails_df.loc[rand_ids,].copy()
#print(rand_ids)
print("Sample set shape:", rand_set.shape)
#for id_ in rand_ids:
#    print(id_)
#    print(rand_set.loc[id_, 'content_str'])

# assign labels based on id
label_dict = {373: 1, 346: 1, 405: 1, 27091: 1, 13966: 1, 8861: 1, 92183: 1, 17932: 1, 12955: 1, 8101: 1, 19004: 1,\
              32643: 1, 114757: 1}
# Build the label column from the sample's own index: 0.0 unless the id is in
# label_dict. Assigning with rand_set.loc[k, ...] for every key of label_dict
# would append a new, mostly-NaN row for each id that is not in the sample
# (e.g. 92183 and 114757, which lie outside the 50,000 rows sampled from),
# so the sample would no longer contain exactly len(rand_ids) rows.
rand_set["suspicious_ind"] = [float(label_dict.get(row_id, 0)) for row_id in rand_set.index]

Sample set shape: (200, 3)


In [56]:
# Preview the first rows of the labeled sample, including the suspicious_ind column.
rand_set.head()

Unnamed: 0,file,message,content_str,content_tokens,content_IDS,suspicious_ind
1458,allen-p/deleted_items/381.,Message-ID: <26393700.1075862162007.JavaMail.e...,[IMAGE]\n [IMAGE][IMAGE] [IMAGE] Yahoo! ...,"[[, image, ], [, image, ], [, image, ], [, ima...","[0, 78, 107, 79, 78, 107, 79, 78, 107, 79, 78,...",0.0
784,allen-p/all_documents/266.,Message-ID: <16945.1075855671306.JavaMail.evan...,"Kay & Neal,\n\nThanks for remembering my birth...","[kay, &, neal, ,, thanks, for, remembering, my...","[0, 2, 43, 2, 3, 102, 16, 2, 94, 2, 4, 1, 0, 1...",0.0
1719,allen-p/discussion_threads/212.,Message-ID: <19120705.1075855677934.JavaMail.e...,Attached are two files that illustrate the fo...,"[attached, are, two, files, that, illustrate, ...","[0, 198, 41, 181, 2, 25, 2, 5, 197, 9, 44, 359...",0.0
1193,allen-p/all_documents/73.,Message-ID: <22565759.1075855667097.JavaMail.e...,Put me down as a reviewer,"[put, me, down, as, a, reviewer]","[0, 412, 72, 302, 44, 14, 2, 1]",0.0
373,allen-p/_sent_mail/439.,Message-ID: <14134673.1075855725697.JavaMail.e...,"Jacques,\n\nStill trying to close the loop on ...","[jacques, ,, still, trying, to, close, the, lo...","[0, 2, 3, 226, 846, 7, 583, 5, 2, 20, 5, 54, 1...",1.0


## Preprocess Raw Email Contents

In [4]:
# tokenize and canonicalize each email to get vocab
# WARNING: THIS STEP TAKES ~40 MINUTES ON LOCAL MACHINE WHEN USING ALL 500K EMAILS
tokenizer = TreebankWordTokenizer()
all_tokens = []    # every canonicalized token across all emails, in order
email_tokens = []  # one list of canonicalized tokens per email

for body in emails_df["content_str"]:
    # split the body into sentences, tokenize each sentence, canonicalize the
    # tokens, and flatten everything into a single token list for this email
    canon = [
        word
        for sent in nltk.tokenize.sent_tokenize(body)
        for word in utils.canonicalize_words(tokenizer.tokenize(sent))
    ]
    all_tokens.extend(canon)
    email_tokens.append(canon)

emails_df["content_tokens"] = email_tokens
print("Total Number of Tokens:", len(all_tokens))

Total Number of Tokens: 15743385


In [15]:
# build vocab
# V = the size passed to vocabulary.Vocabulary; the recorded run reports
# vocab.size == 30,000, i.e. equal to V
V = 30000
vocab = vocabulary.Vocabulary(all_tokens, size=V)
print("Vocabulary size: {:,}".format(vocab.size))
# convert the full token stream to vocabulary ids
# (presumably tokens outside the vocabulary map to an unknown-word id -- TODO confirm in vocabulary.py)
vocab_ids = vocab.words_to_ids(all_tokens)
# number of entries in vocab.unigram_counts (167,039 in the recorded run, larger than V)
print("Unigrams: ", len(vocab.unigram_counts))

Vocabulary size: 30,000
Unigrams:  167039


In [16]:
# preprocess email bodies with unknowns and sentence buffers
emails_preprocessed = []

for body in emails_df["content_str"]:
    # one token list per sentence of this email
    list_sents = [tokenizer.tokenize(sent) for sent in nltk.tokenize.sent_tokenize(body)]
    #preprocessed = list(utils.preprocess_sentences(list_sents, vocab, use_eos=True, emit_ids=False))
    #just keep word IDs
    id_stream = utils.preprocess_sentences(list_sents, vocab, use_eos=True, emit_ids=True)
    emails_preprocessed.append(list(id_stream))

emails_df["content_IDS"] = emails_preprocessed
# display the id sequence of the third email as a spot check
emails_preprocessed[2]

[0, 954, 1896, 4, 1, 0, 219, 7, 193, 59, 59, 1, 0, 59, 1]

In [17]:
# test ID to word: map the stored ids of the email at index 2 back to tokens
sample_ids = emails_df.loc[2, "content_IDS"]
vocab.ids_to_words(sample_ids)

['<s>',
 'test',
 'successful',
 '.',
 '</s>',
 '<s>',
 'way',
 'to',
 'go',
 '!',
 '!',
 '</s>',
 '<s>',
 '!',
 '</s>']

In [18]:
# Preview the working frame, now carrying the content_tokens and content_IDS columns.
emails_df.head()

Unnamed: 0,file,message,content_str,content_tokens,content_IDS
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast\n\n,"[here, is, our, forecast]","[0, 139, 21, 74, 2693, 1]"
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...,"[traveling, to, have, a, business, meeting, ta...","[0, 3838, 7, 37, 14, 133, 123, 1805, 5, 1140, ..."
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!,"[test, successful, ., way, to, go, !, !, !]","[0, 954, 1896, 4, 1, 0, 219, 7, 193, 59, 59, 1..."
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s...","[randy, ,, can, you, send, me, a, schedule, of...","[0, 1086, 3, 69, 17, 274, 72, 14, 389, 12, 5, ..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.,"[let, 's, shoot, for, tuesday, at, DGDG:DGDG, .]","[0, 129, 40, 6339, 16, 245, 34, 55, 4, 1]"


## Generate BOW features and longer N-grams

In [20]:
# generate BOW features based on vocab.size V (above) and transform to TF-IDF
# uses canonicalized words
# one row per email, built from its list of word ids
bow_feats = utils.id_lists_to_sparse_bow(emails_df["content_IDS"], vocab.size)

# re-weight the BOW features with TF-IDF
transformer = TfidfTransformer()
bow_tfidf = transformer.fit_transform(bow_feats)

print("BOW shape: ", bow_feats.shape)
print("BOW TF-IDF:", bow_tfidf.shape)

BOW shape: (50000, 30000)
TF-IDF:    (50000, 30000)


In [24]:
# consider longer n-grams
# bigrams and trigrams counted directly on the content_str column, keeping only
# n-grams found in at least 1% and at most 30% of the emails
vectorize = CountVectorizer(
    ngram_range=(2, 3),
    max_df=.3,
    min_df=.01,
)
n_grams = vectorize.fit_transform(emails_df["content_str"])
print("N-grams shape: ", n_grams.shape)

# NOTE(review): this refits the same `transformer` object that was fit on the
# BOW counts, so afterwards its learned IDF weights describe the n-gram matrix.
n_grams_idf = transformer.fit_transform(n_grams)
print("N-grams TF-IDF:", n_grams_idf.shape)

N-grams shape:  (50000, 1560)
N-grams TF-IDF: (50000, 1560)
