In [152]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

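After reading in the dataset, the 'Complete' column is moved to the front of the dataframe. This places the text column next to the other non-label columns, so the features can be selected as one contiguous block when calling `train_test_split`.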

In [153]:
# Load the cleaned e-mail dataframe and the part-of-speech tags from local pickles.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
enron, tags = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("./enron_cleaned_stops.pkl", "./pos_tags_stops.pkl")
)

In [154]:
# Show the column labels of the loaded dataframe to see where the text and label columns sit.
enron.columns

Index(['Message-ID', 'Date', 'From', 'To', 'Subject', 'X-From', 'X-To', 'X-cc',
       'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName', 'content', 'user',
       'Cat_1_level_1', 'Cat_1_level_2', 'Cat_1_weight', 'Cat_2_level_1',
       'Cat_2_level_2', 'Cat_2_weight', 'Cat_3_level_1', 'Cat_3_level_2',
       'Cat_3_weight', 'Cat_4_level_1', 'Cat_4_level_2', 'Cat_4_weight',
       'Cat_5_level_1', 'Cat_5_level_2', 'Cat_5_weight', 'Cat_6_level_1',
       'Cat_6_level_2', 'Cat_6_weight', 'Cat_7_level_1', 'Cat_7_level_2',
       'Cat_7_weight', 'Cat_8_level_1', 'Cat_8_level_2', 'Cat_8_weight',
       'Cat_9_level_1', 'Cat_9_level_2', 'Cat_9_weight', 'Cat_10_level_1',
       'Cat_10_level_2', 'Cat_10_weight', 'Cat_11_level_1', 'Cat_11_level_2',
       'Cat_11_weight', 'Cat_12_level_1', 'Cat_12_level_2', 'Cat_12_weight',
       'labeled', 'Complete'],
      dtype='object')

In [155]:
# Take the 'Complete' column out of the frame (pop removes it in place) ...
column_complete = enron.pop('Complete')
# ... and put it back as the very first column.
enron.insert(loc=0, column='Complete', value=column_complete)

# Preview the reordered frame.
enron.head()

Unnamed: 0,Complete,Message-ID,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,...,Cat_10_level_1,Cat_10_level_2,Cat_10_weight,Cat_11_level_1,Cat_11_level_2,Cat_11_weight,Cat_12_level_1,Cat_12_level_2,Cat_12_weight,labeled
0,re confidential employee informationlenhart i ...,<9831685.1075855725804.JavaMail.evans@thyme>,2001-03-15 14:45:00,frozenset({'phillip.allen@enron.com'}),frozenset({'todd.burke@enron.com'}),Re: Confidential Employee Information/Lenhart,Phillip K Allen,Todd Burke,,,...,,,,,,,,,,True
1,re personal and confidential compensation info...,<21041312.1075855725847.JavaMail.evans@thyme>,2001-03-15 14:11:00,frozenset({'phillip.allen@enron.com'}),frozenset({'kim.bolton@enron.com'}),RE: PERSONAL AND CONFIDENTIAL COMPENSATION INF...,Phillip K Allen,Kim Bolton,,,...,,,,,,,,,,True
2,fw western wholesale activities gas & power co...,<5907100.1075858639941.JavaMail.evans@thyme>,2001-06-20 17:04:51,frozenset({'k..allen@enron.com'}),"frozenset({'matt.smith@enron.com', 'matthew.le...",FW: Western Wholesale Activities - Gas & Power...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...","Lenhart, Matthew </O=ENRON/OU=NA/CN=RECIPIENTS...",,,...,,,,,,,,,,True
3,fw western wholesale activities gas & power co...,<26625142.1075858639964.JavaMail.evans@thyme>,2001-06-20 17:09:00,frozenset({'k..allen@enron.com'}),"frozenset({'matt.smith@enron.com', 'matthew.le...",FW: Western Wholesale Activities - Gas & Power...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...","Lenhart, Matthew </O=ENRON/OU=NA/CN=RECIPIENTS...",,,...,,,,,,,,,,True
4,fw western wholesale activities gas & power co...,<19730598.1075858642129.JavaMail.evans@thyme>,2001-08-09 12:30:58,frozenset({'k..allen@enron.com'}),"frozenset({'matt.smith@enron.com', 'm..tholt@e...",FW: Western Wholesale Activities - Gas & Power...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...","Smith, Matt </O=ENRON/OU=NA/CN=RECIPIENTS/CN=M...",,,...,,,,,,,,,,True


# Split the dataset into a train and test set

In [156]:
# X = the 15 non-label columns: 'Complete' (moved to the front above) through 'user'.
# y = the 'Cat_1_level_2' column only. The remaining Cat_* columns are further labels that
#     are not used here, and the "labeled" column is True for every sample.
# Columns are selected by name rather than by position so that the split does not silently
# pick the wrong target if the column order changes.
# random_state is fixed so the split, and every score computed from it, is reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    enron.loc[:, 'Complete':'user'],
    enron['Cat_1_level_2'],
    test_size=0.15,
    random_state=42,
)

train_X.head()

Unnamed: 0,Complete,Message-ID,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
257,confidential calif peaking plant phil enron is...,<30080497.1075860886243.JavaMail.evans@thyme>,2001-01-30 01:48:00,frozenset({'kevin.hyatt@enron.com'}),frozenset({'plombardo@plantsystems.com'}),Confidential - Calif Peaking Plant,Kevin Hyatt,PLombardo <PLombardo@plantsystems.com>,,,"\Kevin_Hyatt_Mar2002\Hyatt, Kevin\Projects\McK...",Hyatt-K,khyatt (Non-Privileged).pst,Phil-- Enron is currently evaluating the possi...,hyatt-k
525,re daily update legislative activity i attende...,<25619454.1075846170746.JavaMail.evans@thyme>,2000-08-24 21:45:00,frozenset({'mona.petrochko@enron.com'}),frozenset({'bruno.gaillard@enron.com'}),Re: Daily Update/ Legislative activity - 08/24/00,Mona L Petrochko,Bruno Gaillard,"West GA, Edward Hamb, Jennifer Rudolph, Chris ...",,\Steven_Kean_Dec2000_1\Notes Folders\All docum...,KEAN-S,skean.nsf,I attended Cmmr. Wood's two-day hearing on Who...,kean-s
1512,re ihi arbitration privileged and confidential...,<1683661.1075853209714.JavaMail.evans@thyme>,2001-03-07 15:46:00,frozenset({'richard.sanders@enron.com'}),frozenset({'gail.brownfeld@enron.com'}),RE: IHI Arbitration: PRIVILEGED AND CONFIDENTI...,Richard B Sanders,Gail Brownfeld,,,\Richard_Sanders_Oct2001\Notes Folders\All doc...,Sanders-R,rsanders.nsf,That all depends on your definition of the wor...,sanders-r
1243,mou with india oil corp. fyi mou with india oi...,<7182453.1075846162997.JavaMail.evans@thyme>,2000-08-02 15:07:00,frozenset({'steven.kean@enron.com'}),frozenset({'mitchell.taylor@enron.com'}),MOU With India Oil Corp.,Steven J Kean,Mitchell Taylor,,,\Steven_Kean_Dec2000_1\Notes Folders\All docum...,KEAN-S,skean.nsf,fyi ---------------------- Forwarded by Steven...,kean-s
832,generator organization generator organization ...,<3498886.1075847581164.JavaMail.evans@thyme>,2001-05-16 02:09:00,frozenset({'steven.kean@enron.com'}),frozenset({'maureen.mcvicker@enron.com'}),GENERATOR ORGANIZATION,Steven J Kean,Maureen McVicker,,,\Steven_Kean_June2001_1\Notes Folders\All docu...,KEAN-S,skean.nsf,---------------------- Forwarded by Steven J K...,kean-s


In [157]:
train_y

257     1.0
525     1.0
1512    1.0
1243    1.0
832     4.0
1359    1.0
1445    3.0
1452    5.0
1162    4.0
301     4.0
1285    1.0
427     6.0
1206    1.0
1581    1.0
989     3.0
1334    1.0
1337    4.0
1019    1.0
964     3.0
1674    4.0
1094    1.0
878     6.0
244     1.0
1567    1.0
1102    4.0
194     1.0
1701    2.0
1473    1.0
1255    4.0
934     1.0
       ... 
359     1.0
60      1.0
72      1.0
698     4.0
921     4.0
456     4.0
27      1.0
474     4.0
889     4.0
99      1.0
1686    4.0
329     2.0
722     1.0
184     1.0
310     3.0
923     4.0
1451    1.0
169     1.0
293     2.0
255     1.0
467     6.0
783     6.0
376     4.0
887     1.0
1215    4.0
1343    1.0
164     6.0
739     4.0
1319    3.0
1409    3.0
Name: Cat_1_level_2, Length: 1446, dtype: float64

# Supervised Models

## CountVectorizer and TfidfVectorizer

In [158]:
# Bag-of-words features: unigram counts with English stop words removed.
# min_df=2 drops terms that occur in fewer than two documents; max_df=0.95 drops terms
# that occur in more than 95% of the documents.
cvec = CountVectorizer(analyzer='word',
                      ngram_range=(1,1),
                      max_features=None,
                      stop_words='english',
                      min_df=2,
                      max_df=0.95)

# Missing documents are replaced by empty strings, exactly as is done for the tf-idf
# features; CountVectorizer raises an error when it is handed NaN documents.
train_counts = cvec.fit_transform(train_X['Complete'].fillna(''))

In [159]:
# Project the test documents onto the vocabulary learned from the training set.
# Missing documents are treated as empty strings, as is done for the tf-idf features.
test_counts = cvec.transform(test_X['Complete'].fillna(''))

In [160]:
# Tf-idf features built from word unigrams, with English stop words removed and the same
# document-frequency cut-offs (min_df / max_df) as the count vectorizer.
tfvec = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    max_features=None,
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary and idf weights on the training text only; missing documents
# are replaced by empty strings first.
train_tf = tfvec.fit_transform(train_X.Complete.fillna(''))

In [161]:
# Apply the fitted tf-idf transformation to the test text. Missing documents are replaced
# by empty strings, the same placeholder that is used for the training text.
test_tf = tfvec.transform(test_X['Complete'].fillna(''))

## Naive Bayes Classifier
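Scikit-learn's documentation states that the multinomial Naive Bayes classifier normally expects integer feature counts, so the count vectorizer should be the more appropriate input for this model. It also notes that fractional counts such as tf-idf may work in practice, so I will try both.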

First, count vectorizer

In [162]:
# Train a multinomial Naive Bayes classifier on the raw term counts
# (fit returns the fitted estimator, so it can be chained onto the constructor).
count_n_bayes = MultinomialNB().fit(train_counts, train_y)

# Mean accuracy on the held-out test set.
count_n_bayes.score(test_counts, test_y)

0.5390625

And now Tfidf

In [163]:
# Train a second multinomial Naive Bayes classifier, this time on the tf-idf features
# (fit returns the fitted estimator, so it can be chained onto the constructor).
tf_n_bayes = MultinomialNB().fit(train_tf, train_y)

# Mean accuracy on the held-out test set.
tf_n_bayes.score(test_tf, test_y)

0.53515625