In [216]:
from __future__ import print_function
from __future__ import division

import os, sys
import collections
import nltk
import numpy as np
import pandas as pd
import email
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
import random

# Helper libraries
import constants
import utils
import vocabulary

## Load in Email Bodies
Data source and exploration code: [Kaggle](https://www.kaggle.com/zichen/explore-enron)

In [235]:
# Load the CSV dataset - download from Kaggle (linked above, ~.5gb).

# Data directory. Set the ENRON_DATA_DIR environment variable to point at your
# own copy; otherwise the original local path is used.
path = os.environ.get(
    'ENRON_DATA_DIR',
    'C:/Users/Colby/Documents/Berkeley/266_NLP/final_project/data')

# Number of emails to read; set to None to load the full dataset.
N_EMAILS = 2000
emails_df = pd.read_csv(os.path.join(path, 'emails.csv'), nrows=N_EMAILS)

print("Shape:", emails_df.shape)
# Show one raw message (headers + body) to see what needs parsing.
print(emails_df['message'][1])

Shape: (2000, 2)
Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>
Date: Fri, 4 May 2001 13:51:00 -0700 (PDT)
From: phillip.allen@enron.com
To: john.lavorato@enron.com
Subject: Re:
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.

As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is no

In [236]:
# citation: Kaggle exploration code
# isolate email body

def get_text_from_email(msg):
    '''Return the payloads of all text/plain parts of msg, concatenated.

    msg is a parsed email message object; every part reached by msg.walk()
    whose content type is text/plain contributes its payload, in walk order.
    Returns an empty string when there is no text/plain part.
    '''
    return ''.join(
        part.get_payload()
        for part in msg.walk()
        if part.get_content_type() == 'text/plain'
    )

# Turn every raw message string into a parsed email object.
messages = [email.message_from_string(raw_message)
            for raw_message in emails_df['message']]

# Extract the plain-text body of each parsed message into a new column.
# The raw 'message' column is kept; later cells still print from it.
emails_df['content_str'] = [get_text_from_email(parsed) for parsed in messages]

# The parsed objects are not needed once the bodies are extracted.
del messages

emails_df.head()

Unnamed: 0,file,message,content_str
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast\n\n
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.


## Preprocess Email Contents

In [237]:
# Tokenize and canonicalize each email body; the flat token list feeds the vocab.
tokenizer = TreebankWordTokenizer()
all_tokens = []    # canonicalized tokens of the whole corpus, in order
email_tokens = []  # one canonicalized token list per email

for body_text in emails_df["content_str"]:
    canonical = []
    # Split into sentences first, then tokenize each sentence on its own.
    for sentence in nltk.tokenize.sent_tokenize(body_text):
        canonical.extend(utils.canonicalize_words(tokenizer.tokenize(sentence)))
    all_tokens.extend(canonical)
    email_tokens.append(canonical)

emails_df["content_tokens"] = email_tokens
print(len(all_tokens))

445556


In [238]:
# Build the vocabulary from the canonicalized corpus tokens.
# V = vocabulary size
V = 500
vocab = vocabulary.Vocabulary(all_tokens, size=V)
print("Vocabulary size: {:,}".format(vocab.size))
# Convert the corpus to ids using all_tokens, the list the vocabulary was built
# from; no `all_words` variable is defined anywhere in this notebook, so
# referencing it fails on a fresh kernel.
vocab_ids = vocab.words_to_ids(all_tokens)
print("Unigrams: ", len(vocab.unigram_counts))

Vocabulary size: 500
Unigrams:  16832


In [239]:
# Preprocess every email body into a list of vocabulary ids
# (the output below shows <s>/</s> sentence markers and <unk> for unknown words).
emails_preprocessed = []

for body_text in emails_df["content_str"]:
    # One token list per sentence, as passed to utils.preprocess_sentences.
    tokenized_sentences = [tokenizer.tokenize(sentence)
                           for sentence in nltk.tokenize.sent_tokenize(body_text)]
    # emit_ids=True: keep just the word ids rather than the words themselves.
    id_sequence = list(utils.preprocess_sentences(
        tokenized_sentences, vocab, use_eos=True, emit_ids=True))
    emails_preprocessed.append(id_sequence)

emails_df["content_IDS"] = emails_preprocessed
emails_preprocessed[2]

[0, 2, 2, 5, 1, 0, 414, 7, 203, 75, 75, 1, 0, 75, 1]

In [240]:
# Sanity check: map the id list of email 2 back to words to confirm the
# id <-> word round trip (sentence markers and <unk> show up in the output).
vocab.ids_to_words(emails_df["content_IDS"][2])

['<s>',
 '<unk>',
 '<unk>',
 '.',
 '</s>',
 '<s>',
 'way',
 'to',
 'go',
 '!',
 '!',
 '</s>',
 '<s>',
 '!',
 '</s>']

In [241]:
# Preview the frame, which now also carries the content_tokens and content_IDS columns.
emails_df.head()

Unnamed: 0,file,message,content_str,content_tokens,content_IDS
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast\n\n,"[here, is, our, forecast]","[0, 93, 18, 89, 2, 1]"
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...,"[traveling, to, have, a, business, meeting, ta...","[0, 2, 7, 39, 12, 266, 133, 2, 6, 2, 96, 11, 6..."
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!,"[test, successful, ., way, to, go, !, !, !]","[0, 2, 2, 5, 1, 0, 414, 7, 203, 75, 75, 1, 0, ..."
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s...","[randy, ,, can, you, send, me, a, schedule, of...","[0, 2, 4, 66, 13, 216, 84, 12, 2, 11, 6, 2, 10..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.,"[let, 's, shoot, for, tuesday, at, DGDG:DGDG, .]","[0, 151, 56, 2, 14, 467, 35, 51, 5, 1]"


In [242]:
# Create the sparse bag-of-words feature matrix from each email's word-id list.
# NOTE(review): presumably one row per email and vocab.size columns — confirm
# against utils.id_lists_to_sparse_bow.
# NOTE(review): the cell copied from Assignment 2 further down also assigns
# x_sparse and would overwrite this matrix if it is run.
x_sparse = utils.id_lists_to_sparse_bow(emails_df["content_IDS"], vocab.size)

### Get Random Sample for Labeling

In [255]:
# Draw a reproducible random sample of 100 emails to label by hand.
random.seed(24)
# Sample from the rows actually loaded rather than a hard-coded 2000, so the
# cell keeps working if the number of rows read from the CSV changes.
rand_ids = random.sample(range(len(emails_df)), 100)
print(rand_ids)
train_set = emails_df.loc[rand_ids]
print(train_set.shape)
# Email 373 is in the sample for seed 24 with 2000 loaded rows; print its raw
# message so it can be read and labeled.
print(train_set['message'][373])
# Hand-assigned labels keyed by email index (the meaning of label 1 is not
# documented in this notebook).
label_dict = {373: 1}

[1458, 784, 1719, 1193, 373, 447, 342, 397, 346, 1373, 1396, 187, 1445, 1551, 309, 1651, 1447, 580, 1483, 1888, 1568, 26, 903, 957, 1773, 1476, 1300, 236, 57, 1058, 345, 1962, 1006, 1513, 1731, 909, 623, 1023, 1951, 174, 1407, 1557, 520, 1726, 1252, 322, 1395, 669, 1966, 625, 152, 1110, 1329, 751, 1353, 1830, 68, 1893, 437, 647, 694, 1832, 1521, 142, 636, 198, 503, 1362, 1221, 1636, 313, 1677, 1736, 1002, 1760, 581, 545, 1151, 560, 405, 191, 1180, 1012, 1607, 315, 1336, 293, 664, 982, 1372, 1237, 1994, 1604, 269, 974, 1035, 531, 1042, 1566, 462]
(100, 5)
Message-ID: <14134673.1075855725697.JavaMail.evans@thyme>
Date: Mon, 19 Mar 2001 01:36:00 -0800 (PST)
From: phillip.allen@enron.com
To: jacquestc@aol.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: jacquestc@aol.com
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_June2001\Notes Folders\'sent mail
X-Origin: Allen-P
X-FileName: pallen.nsf

Jacques,

Still tr

.....

# From Assignment 2

### Converting IDs to a feature vector

Many NLP models are designed to work directly with a sequence of word ids. However, for many standard machine learning models such as Naive Bayes, SVMs, or Logistic Regression, we need to convert this sequence to a fixed-length vector $x \in \mathbb{R}^d$. In the general case, we can define a set of feature extractors $f_i$ for $i = 0,1,\ldots,d-1$, such that $x_i = f_i([\mathtt{ids}])$.

The simplest way to do this is a bag-of-words model, in which we let the number of features be the size of our vocabulary ($d = |V|$), and we let each feature be a count of the number of times word $i$ appears in the sequence:

$$ f_i([\mathtt{ids}]) = \sum_{j = 0}^{n-1} \mathbf{1}[w_j = i] $$

where $w_j$ is the $j$-th word id in the sequence and $n$ is the length of the sequence.

We can do this in a very simple way by using the `collections.Counter` class:

In [9]:
# Count occurrences with collections.Counter: first keyed by word (for display),
# then keyed by word id (x_fdict, used by the cells below).
# NOTE(review): x_tokens_canonical and x_ids are not assigned in any cell shown
# in this notebook (this section was copied from Assignment 2) — define them
# before running this cell.
print("Example, with words as keys:", collections.Counter(x_tokens_canonical))
x_fdict = collections.Counter(x_ids)
x_fdict

A common step in machine learning applications is to transform this dictionary-like object, which maps keys to values, into a feature vector:

In [10]:
num_features = vocab.size  # one feature for each word
# Dense bag-of-words vector: position i holds the count of word id i,
# or 0 when that id does not occur in x_fdict.
x_vector = [x_fdict.get(i, 0) for i in range(num_features)]
x_vector

If one has multiple examples, these are represented as multiple rows of these vectors stacked on top of one another (similar to the batching you did in assignment 1).  If $|V|$ is large and the text short, it is likely most of the elements of such a matrix are zero.  A memory optimization can be made by using a [sparse vector](https://docs.scipy.org/doc/scipy/reference/sparse.html) representation. (Or for more than one example, a sparse matrix.) You may have worked with these before, as they are the preferred input format for many `scikit-learn` ML routines.

The sparse matrix constructor requires three parallel lists: the row indices, the column indices, and the corresponding values.  Note that we have just a single row (we have only one example), so all the row indices are 0.

In [11]:
#  See https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix
from scipy.sparse import csr_matrix

# The constructor takes three parallel lists: the values and their (row, column)
# coordinates. Keys and values of the same dict are listed in matching order.
col_indices = list(x_fdict.keys())    # column is the word id
values = list(x_fdict.values())       # value is that word's count
row_indices = [0] * len(col_indices)  # only a single example, so every entry is in row 0

x_sparse = csr_matrix((values, (row_indices, col_indices)), shape=[1, vocab.size])
print("Non-zero values:")
print(x_sparse)
x_sparse

We've provided a helper function, `utils.id_lists_to_sparse_bow`, that can handle this conversion over a whole dataset, and in most cases we'll handle this conversion for you in the starter code.