In [132]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

After reading in the dataset, the 'Complete' column is moved to the front of the dataframe so that the feature and label columns can be selected by position when calling train_test_split.

In [133]:
# Load the two pickled inputs from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
enron = pd.read_pickle("./enron_cleaned_stops.pkl")
# `tags` is not used in the cells visible here -- presumably consumed later; verify.
tags = pd.read_pickle("./pos_tags_stops.pkl")

In [134]:
# Display the column names of the loaded frame (metadata, text and label columns).
enron.columns

Index(['Message-ID', 'Date', 'From', 'To', 'Subject', 'X-From', 'X-To', 'X-cc',
       'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName', 'content', 'user',
       'Cat_1_level_1', 'Cat_1_level_2', 'Cat_1_weight', 'Cat_2_level_1',
       'Cat_2_level_2', 'Cat_2_weight', 'Cat_3_level_1', 'Cat_3_level_2',
       'Cat_3_weight', 'Cat_4_level_1', 'Cat_4_level_2', 'Cat_4_weight',
       'Cat_5_level_1', 'Cat_5_level_2', 'Cat_5_weight', 'Cat_6_level_1',
       'Cat_6_level_2', 'Cat_6_weight', 'Cat_7_level_1', 'Cat_7_level_2',
       'Cat_7_weight', 'Cat_8_level_1', 'Cat_8_level_2', 'Cat_8_weight',
       'Cat_9_level_1', 'Cat_9_level_2', 'Cat_9_weight', 'Cat_10_level_1',
       'Cat_10_level_2', 'Cat_10_weight', 'Cat_11_level_1', 'Cat_11_level_2',
       'Cat_11_weight', 'Cat_12_level_1', 'Cat_12_level_2', 'Cat_12_weight',
       'labeled', 'Complete'],
      dtype='object')

In [135]:
# Remove the 'Complete' column from the frame (in place) and keep it as a Series...
column_complete = enron.pop('Complete')
# ...then re-insert it at position 0 so it becomes the first column.
enron.insert(0, 'Complete', column_complete)
# Preview the first rows to confirm the new column order.
enron.head()

Unnamed: 0,Complete,Message-ID,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,...,Cat_10_level_1,Cat_10_level_2,Cat_10_weight,Cat_11_level_1,Cat_11_level_2,Cat_11_weight,Cat_12_level_1,Cat_12_level_2,Cat_12_weight,labeled
0,re confidential employee informationlenhart i ...,<9831685.1075855725804.JavaMail.evans@thyme>,2001-03-15 14:45:00,frozenset({'phillip.allen@enron.com'}),frozenset({'todd.burke@enron.com'}),Re: Confidential Employee Information/Lenhart,Phillip K Allen,Todd Burke,,,...,,,,,,,,,,True
1,re personal and confidential compensation info...,<21041312.1075855725847.JavaMail.evans@thyme>,2001-03-15 14:11:00,frozenset({'phillip.allen@enron.com'}),frozenset({'kim.bolton@enron.com'}),RE: PERSONAL AND CONFIDENTIAL COMPENSATION INF...,Phillip K Allen,Kim Bolton,,,...,,,,,,,,,,True
2,fw western wholesale activities gas & power co...,<5907100.1075858639941.JavaMail.evans@thyme>,2001-06-20 17:04:51,frozenset({'k..allen@enron.com'}),"frozenset({'matt.smith@enron.com', 'matthew.le...",FW: Western Wholesale Activities - Gas & Power...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...","Lenhart, Matthew </O=ENRON/OU=NA/CN=RECIPIENTS...",,,...,,,,,,,,,,True
3,fw western wholesale activities gas & power co...,<26625142.1075858639964.JavaMail.evans@thyme>,2001-06-20 17:09:00,frozenset({'k..allen@enron.com'}),"frozenset({'matt.smith@enron.com', 'matthew.le...",FW: Western Wholesale Activities - Gas & Power...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...","Lenhart, Matthew </O=ENRON/OU=NA/CN=RECIPIENTS...",,,...,,,,,,,,,,True
4,fw western wholesale activities gas & power co...,<19730598.1075858642129.JavaMail.evans@thyme>,2001-08-09 12:30:58,frozenset({'k..allen@enron.com'}),"frozenset({'matt.smith@enron.com', 'm..tholt@e...",FW: Western Wholesale Activities - Gas & Power...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...","Smith, Matt </O=ENRON/OU=NA/CN=RECIPIENTS/CN=M...",,,...,,,,,,,,,,True


# Split the dataset into a train and test set

In [136]:
# X = the first 15 columns of the dataframe: 'Complete' (moved to position 0)
#     followed by the 14 original columns up to and including 'user'.
# y = the label columns. The remaining columns are all labels, except for the
#     trailing "labeled" column, which is excluded by the `:-1` slice.
# A fixed random_state makes the 85/15 split (and every score computed from it)
# reproducible across kernel restarts.
train_X, test_X, train_y, test_y = train_test_split(
    enron.iloc[:, 0:15],
    enron.iloc[:, 15:-1],
    test_size=0.15,
    random_state=42,
)

train_X.head()

Unnamed: 0,Complete,Message-ID,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
1293,memo please call me memo,<7493853.1075849875061.JavaMail.evans@thyme>,2001-06-13 16:52:00,frozenset({'steven.kean@enron.com'}),"frozenset({'john.lavorato@enron.com', 'kevin.h...",MEMO,Steven J Kean,"John J Lavorato, Kevin Hannon, Jeff Skilling","Kelly Kimberly, Karen Denne, Mark Palmer",,\Steven_Kean_Nov2001_1\Notes Folders\All docum...,KEAN-S,skean.nsf,Please call me. ---------------------- Forward...,kean-s
828,re nascar craftsman truck series proposal than...,<27270309.1075847580577.JavaMail.evans@thyme>,2001-05-17 11:29:00,frozenset({'steven.kean@enron.com'}),frozenset({'dlpits@yahoo.com'}),Re: NASCAR Craftsman Truck Series Proposal,Steven J Kean,Lorna Clark <dlpits@yahoo.com>,,,\Steven_Kean_June2001_1\Notes Folders\All docu...,KEAN-S,skean.nsf,"Thank you for the information. Unfortunately, ...",kean-s
1429,re meeting request july yes maureen please sch...,<15628808.1075858884279.JavaMail.evans@thyme>,2001-07-06 17:54:00,frozenset({'steven.kean@enron.com'}),frozenset({'michael.grimescorpenron@enron.com'}),Re: Meeting Request - July 25th,Steven J Kean,Michael GrimesCORPENRON <Michael GrimesCORPENR...,Maureen McVicker <Maureen McVicker/NA/Enron@En...,,"\SKEAN (Non-Privileged)\Kean, Steven J.\Sent I...",Kean-S,SKEAN (Non-Privileged).pst,Yes. Maureen - please schedule. From: Michael ...,kean-s
50,re confidential concern sharon i suggest that ...,<28937390.1075853126342.JavaMail.evans@thyme>,2001-07-26 13:54:59,frozenset({'michelle.cash@enron.com'}),frozenset({'rob.walls@enron.com'}),RE: Confidential Concern,"Cash, Michelle </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Walls Jr., Rob </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Butcher, Sharon </O=ENRON/OU=NA/CN=RECIPIENTS/...",,"\MCASH (Non-Privileged)\Cash, Michelle\Sent Items",Cash-M,MCASH (Non-Privileged).pst,"Sharon, I suggest that we ask Valeria Hope to ...",cash-m
79,re energy issues miyung you seem to be finding...,<29038107.1075843442826.JavaMail.evans@thyme>,2001-04-25 10:16:00,frozenset({'joseph.alamo@enron.com'}),frozenset({'miyung.buster@enron.com'}),Re: Energy Issues,Joseph Alamo,Miyung Buster,Jeff Dasovich,,\Jeff_Dasovich_June2001\Notes Folders\All docu...,DASOVICH-J,jdasovic.nsf,"Miyung, You seem to be finding these okay by y...",dasovich-j


# Supervised Models

## CountVectorizer and Tfidf Vectorizer

In [137]:
# Bag-of-words features built from the cleaned message text.
cvec = CountVectorizer(analyzer='word',
                      ngram_range=(1,1),     # unigrams only
                      max_features=None,     # no cap on vocabulary size
                      stop_words='english',
                      min_df=2,              # ignore terms found in fewer than 2 documents
                      max_df=0.95)           # ignore terms found in more than 95% of documents

# Replace missing texts with empty strings before fitting: CountVectorizer
# rejects NaN documents, and this mirrors the handling used for the TF-IDF features.
train_counts = cvec.fit_transform(train_X['Complete'].fillna(''))

In [138]:
# Vectorize the held-out texts with the vocabulary learned on the training split.
# Missing texts become empty strings so a NaN document cannot make transform fail.
test_counts = cvec.transform(test_X['Complete'].fillna(''))

In [139]:
# TF-IDF features built from the cleaned message text.
tfvec = TfidfVectorizer(
    ngram_range=(1, 1),    # unigrams only
    max_features=None,     # no cap on vocabulary size
    stop_words='english',
    min_df=2,              # ignore terms found in fewer than 2 documents
    max_df=0.95,           # ignore terms found in more than 95% of documents
)

# Missing texts are replaced by empty strings before the vocabulary is learned.
train_tf = tfvec.fit_transform(train_X.Complete.fillna(''))

In [140]:
# Vectorize the held-out texts with the fitted TF-IDF vocabulary, filling
# missing texts with the same empty-string placeholder used for the training split.
test_tf = tfvec.transform(test_X['Complete'].fillna(''))

## Naive Bayes Classifier
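scikit-learn's documentation notes that multinomial naive Bayes is designed for discrete features such as word counts, so a count vectorizer should be the more appropriate input for this model, although tf-idf values often work in practice as well. I will try both.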

First, count vectorizer

In [141]:
# Fit multinomial naive Bayes on the raw term counts, using the first label
# column as the target, then report mean accuracy on the held-out split.
# NOTE(review): the warning emitted from the `class_log_prior_` computation,
# together with the perfect 1.0 score, suggests this target column may contain
# only a single class in the training split -- confirm with
# `train_y.iloc[:, 0].value_counts()` before trusting the score.
count_n_bayes = MultinomialNB().fit(train_counts, train_y.iloc[:, 0])

count_n_bayes.score(test_counts, test_y.iloc[:, 0])

  self.class_log_prior_ = (np.log(self.class_count_) -


1.0

And now Tfidf

In [142]:
# Fit multinomial naive Bayes on the TF-IDF features, using the first label
# column as the target, then report mean accuracy on the held-out split.
# NOTE(review): the warning emitted from the `class_log_prior_` computation,
# together with the perfect 1.0 score, suggests this target column may contain
# only a single class in the training split -- confirm with
# `train_y.iloc[:, 0].value_counts()` before trusting the score.
tf_n_bayes = MultinomialNB().fit(train_tf, train_y.iloc[:, 0])

tf_n_bayes.score(test_tf, test_y.iloc[:, 0])

  self.class_log_prior_ = (np.log(self.class_count_) -


1.0