In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.naive_bayes import MultinomialNB

from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

After reading in the dataset, the 'Complete' column is moved to the front of the dataframe so that the feature columns form one contiguous slice, which makes the call to train_test_split easier to write.

In [2]:
# Load the two pickled inputs from the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
enron, tags = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("./enron_cleaned.pkl", "./pos_tags.pkl")
)

In [3]:
# Display the 'Complete' text of the row labelled 17 as a quick sanity check.
enron.loc[17, 'Complete']

'caiso notice cmr recommrendation board approval market participant iso recommendation congestion management reform present approval iso governing board september post iso web site reminder informational conference call a.m. p.m. tomorrow august provide opportunity clarify content format ask board approve teleconference information follow call password leader byron woertz byron woertz director client relation '

In [4]:
# Display the column labels of the loaded frame.
enron.columns

Index(['Message-ID', 'Date', 'From', 'To', 'Subject', 'X-From', 'X-To', 'X-cc',
       'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName', 'content', 'user',
       'Cat_1_level_1', 'Cat_1_level_2', 'Cat_1_weight', 'Cat_2_level_1',
       'Cat_2_level_2', 'Cat_2_weight', 'Cat_3_level_1', 'Cat_3_level_2',
       'Cat_3_weight', 'Cat_4_level_1', 'Cat_4_level_2', 'Cat_4_weight',
       'Cat_5_level_1', 'Cat_5_level_2', 'Cat_5_weight', 'Cat_6_level_1',
       'Cat_6_level_2', 'Cat_6_weight', 'Cat_7_level_1', 'Cat_7_level_2',
       'Cat_7_weight', 'Cat_8_level_1', 'Cat_8_level_2', 'Cat_8_weight',
       'Cat_9_level_1', 'Cat_9_level_2', 'Cat_9_weight', 'Cat_10_level_1',
       'Cat_10_level_2', 'Cat_10_weight', 'Cat_11_level_1', 'Cat_11_level_2',
       'Cat_11_weight', 'Cat_12_level_1', 'Cat_12_level_2', 'Cat_12_weight',
       'labeled', 'Complete'],
      dtype='object')

In [5]:
# Detach the 'Complete' column from the frame (in place) ...
column_complete = enron.pop('Complete')
# ... and re-attach it as the very first column.
enron.insert(loc=0, column='Complete', value=column_complete)

# Preview the first rows to confirm the new column order.
enron.head()

Unnamed: 0,Complete,Message-ID,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,...,Cat_10_level_1,Cat_10_level_2,Cat_10_weight,Cat_11_level_1,Cat_11_level_2,Cat_11_weight,Cat_12_level_1,Cat_12_level_2,Cat_12_weight,labeled
0,confidential employee informationlenhart also ...,<9831685.1075855725804.JavaMail.evans@thyme>,2001-03-15 14:45:00,frozenset({'phillip.allen@enron.com'}),frozenset({'todd.burke@enron.com'}),Re: Confidential Employee Information/Lenhart,Phillip K Allen,Todd Burke,,,...,,,,,,,,,,True
1,personal confidential compensation information...,<21041312.1075855725847.JavaMail.evans@thyme>,2001-03-15 14:11:00,frozenset({'phillip.allen@enron.com'}),frozenset({'kim.bolton@enron.com'}),RE: PERSONAL AND CONFIDENTIAL COMPENSATION INF...,Phillip K Allen,Kim Bolton,,,...,,,,,,,,,,True
2,fw western wholesale activity gas & power conf...,<5907100.1075858639941.JavaMail.evans@thyme>,2001-06-20 17:04:51,frozenset({'k..allen@enron.com'}),"frozenset({'matt.smith@enron.com', 'matthew.le...",FW: Western Wholesale Activities - Gas & Power...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...","Lenhart, Matthew </O=ENRON/OU=NA/CN=RECIPIENTS...",,,...,,,,,,,,,,True
3,fw western wholesale activity gas & power conf...,<26625142.1075858639964.JavaMail.evans@thyme>,2001-06-20 17:09:00,frozenset({'k..allen@enron.com'}),"frozenset({'matt.smith@enron.com', 'matthew.le...",FW: Western Wholesale Activities - Gas & Power...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...","Lenhart, Matthew </O=ENRON/OU=NA/CN=RECIPIENTS...",,,...,,,,,,,,,,True
4,fw western wholesale activity gas & power conf...,<19730598.1075858642129.JavaMail.evans@thyme>,2001-08-09 12:30:58,frozenset({'k..allen@enron.com'}),"frozenset({'matt.smith@enron.com', 'm..tholt@e...",FW: Western Wholesale Activities - Gas & Power...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...","Smith, Matt </O=ENRON/OU=NA/CN=RECIPIENTS/CN=M...",,,...,,,,,,,,,,True


# Split the dataset into a train and test set
X = the first 15 columns of the dataframe: 'Complete' (moved to the front above) plus the 14 original columns up to and including the 'user' column

y = the label columns. The remaining columns are all labels, except for the "labeled" column which is True for every sample.

_For now, let's only look at the `Cat_1_level_2` column as our topic label. 
This column represents 8 different coarse genres:_
* _1 Company Business, Strategy, etc. (855 cnt.)_
* _2 Purely Personal (49 cnt.)_
* _3 Personal but in professional context (e.g., it was good working with you) (165 cnt.)_
* _4 Logistic Arrangements (meeting scheduling, technical support, etc) (533 cnt.)_
* _5 Employment arrangements (job seeking, hiring, recommendations, etc) (96 cnt.)_
* _6 Document editing/checking (collaboration) (176 cnt.)_
* _7 Empty message (due to missing attachment) (25 cnt.)_
* _8 Empty message (26 cnt.)_


In [6]:
# Select the features and the label by column name instead of by position, so
# the split cannot silently pick the wrong columns if the column order differs.
LABEL_COLUMN = 'Cat_1_level_2'
# Fixed seed so that re-running the notebook yields the same train/test split.
SPLIT_SEED = 42

train_X, test_X, train_y, test_y = train_test_split(
    enron.loc[:, :'user'],   # every column from the first one up to and including 'user'
    enron[LABEL_COLUMN],
    test_size=0.20,
    random_state=SPLIT_SEED,
)

# Supervised Models

## CountVectorizer, Tfidf Vectorizer, and Similarity Matrix

In [7]:
# Bag-of-words term counts: word unigrams, English stop words removed, and
# terms kept only if they appear in at least 2 documents and in at most 95%
# of the documents.
cvec = CountVectorizer(analyzer='word',
                      ngram_range=(1,1),
                      max_features=None,
                      stop_words='english',
                      min_df=2,
                      max_df=0.95)

# Replace missing texts with empty strings before vectorizing, consistent
# with the other vectorizing cells of this notebook; CountVectorizer raises
# on NaN documents.
# The vocabulary is learned from the training texts only and then reused
# unchanged for the test texts.
train_counts = cvec.fit_transform(train_X['Complete'].fillna(''))
test_counts = cvec.transform(test_X['Complete'].fillna(''))

In [8]:
# TF-IDF weighted features with the same tokenization settings as the count
# vectorizer: word unigrams, English stop words removed, min_df=2, max_df=0.95.
tfvec = TfidfVectorizer(analyzer='word',
                       ngram_range=(1,1),
                       max_features=None,
                       stop_words='english',
                       min_df=2,
                       max_df=0.95)

# Use the same placeholder (an empty string) for missing texts in both the
# train and the test set; the vocabulary and IDF weights are learned from the
# training texts only.
train_tf = tfvec.fit_transform(train_X['Complete'].fillna(''))
test_tf = tfvec.transform(test_X['Complete'].fillna(''))

## Naive Bayes Classifier
Sklearn's documentation states that a count vectorizer should be more appropriate for this model. I will try both.

First, count vectorizer

In [9]:
# Multinomial Naive Bayes with default settings, trained on the raw term counts.
n_bayes = MultinomialNB().fit(train_counts, train_y)

# Mean accuracy on the held-out test set (displayed as the cell output).
n_bayes.score(test_counts, test_y)

0.5454545454545454

In [10]:
# Same Naive Bayes model, this time trained on the TF-IDF features.
n_bayes = MultinomialNB().fit(train_tf, train_y)

# Mean accuracy on the held-out test set (displayed as the cell output).
n_bayes.score(test_tf, test_y)

0.5513196480938416

The tfidf seems to work a little bit better (Depending on the train test split, the tfidf outperforms by up to 4%). This makes sense, because the tfidf adds a weight factor for every word, and this is expected to be important to classification of text. It is interesting that the tfidf matrix works just as well, if not better, than the count matrix despite the documentation recommending the use of a count vector over tfidf.

While the accuracy itself was not great, I am expecting better results from the version that does not include stopwords. Accuracy is also likely affected by the small sample size for both train and test sets.

## SVM
We will try a few different versions of SVM.

First is the standard 'svc'

In [11]:
# Kernel SVM with library-default hyper-parameters on the term counts.
svm = SVC().fit(train_counts, train_y)

# Mean accuracy on the held-out test set (displayed as the cell output).
svm.score(test_counts, test_y)

0.49560117302052786

In [12]:
# Kernel SVM with library-default hyper-parameters on the TF-IDF features.
svm = SVC().fit(train_tf, train_y)

# Mean accuracy on the held-out test set (displayed as the cell output).
svm.score(test_tf, test_y)

0.49853372434017595

Next, LinearSVC

In [13]:
# Linear SVM with library-default hyper-parameters on the term counts.
svm = LinearSVC().fit(train_counts, train_y)

# Mean accuracy on the held-out test set (displayed as the cell output).
svm.score(test_counts, test_y)

0.6070381231671554

In [33]:
# Linear SVM with library-default hyper-parameters on the TF-IDF features.
svm = LinearSVC().fit(train_tf, train_y)

# Mean accuracy on the held-out test set (displayed as the cell output).
svm.score(test_tf, test_y)

0.7038123167155426

Last, SGDClassifier which uses gradient descent

In [15]:
# Linear SVM (hinge loss) fitted with stochastic gradient descent on the term
# counts. The optimizer is randomized, so the seed is pinned to make the
# reported score reproducible across re-runs of this cell.
svm = SGDClassifier(loss='hinge', random_state=42)
svm.fit(train_counts, train_y)

# Mean accuracy on the held-out test set (displayed as the cell output).
svm.score(test_counts, test_y)



0.6158357771260997

In [16]:
# Linear SVM (hinge loss) fitted with stochastic gradient descent on the
# TF-IDF features. The optimizer is randomized, so the seed is pinned to make
# the reported score reproducible across re-runs of this cell.
svm = SGDClassifier(loss='hinge', random_state=42)
svm.fit(train_tf, train_y)

# Mean accuracy on the held-out test set (displayed as the cell output).
svm.score(test_tf, test_y)



0.6832844574780058

Across multiple runs, the LinearSVC model outperforms the standard SVC model by upwards of 10%. The SGDClassifier is closer in accuracy, although LinearSVC still slightly outperforms it.

Interestingly, the SGDClassifier's score can vary by over 9 percentage points for the same train/test samples. While the high end of its scores can be around 66% for this dataset, its low end is closer to 50%. This variance makes the model unreliable.

## Logistic Regression
In other works dealing with text classification, logistic regression seems to perform somewhere between the LinearSVC and Multinomial Naive Bayes models.

In [17]:
# Multinomial logistic regression (lbfgs solver, fixed seed) on the TF-IDF
# features.
log_reg = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
).fit(train_tf, train_y)

# Mean accuracy on the held-out test set (displayed as the cell output).
log_reg.score(test_tf, test_y)

0.6774193548387096

# Running Supervised Models with optimal parameters
Above, I went through some models using mostly default values. I got decent results, but now I want to see how high each model can score if it uses optimal parameters.

Luckily, sklearn provides a pretty easy way to do this. The main drawback, however, is that this method can be very slow.

## Naive Bayes

In [18]:
# Three-stage pipeline: raw text -> term counts -> (optional) TF-IDF
# re-weighting -> Multinomial Naive Bayes.
bayes_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB()),
])

# Search space, keyed as "<step name>__<parameter name>".
bayes_params = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf__use_idf=(True, False),
    bayes__alpha=(1e-2, 1e-3),
)


In [19]:
# Exhaustive search over bayes_params with n_jobs=-1 (all available
# processors); missing texts are replaced by a blank string before fitting.
gridSearch = GridSearchCV(
    estimator=bayes_pipeline,
    param_grid=bayes_params,
    n_jobs=-1,
).fit(train_X['Complete'].fillna(' '), train_y)

In [20]:
# Report the best cross-validated score and the parameters that achieved it,
# one per line.
print(gridSearch.best_score_, gridSearch.best_params_, sep='\n')

0.6583394562821455
{'bayes__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}


## SVM

### Simple SVC

In [21]:
# Three-stage pipeline: raw text -> term counts -> (optional) TF-IDF
# re-weighting -> kernel SVM.
svc_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svm', SVC()),
])

# Search space, keyed as "<step name>__<parameter name>".
svc_params = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    svm__C=(0.2, 0.5, 1.0),
    svm__tol=(1e-2, 1e-3),
)


In [22]:
# Exhaustive search over svc_params with n_jobs=-1 (all available
# processors); missing texts are replaced by a blank string before fitting.
gridSearch = GridSearchCV(
    estimator=svc_pipeline,
    param_grid=svc_params,
    n_jobs=-1,
).fit(train_X['Complete'].fillna(' '), train_y)

In [23]:
# Report the best cross-validated score and the parameters that achieved it,
# one per line.
print(gridSearch.best_score_, gridSearch.best_params_, sep='\n')

0.5033063923585599
{'svm__C': 0.2, 'svm__tol': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


### LinearSVC

In [24]:
# Three-stage pipeline: raw text -> term counts -> (optional) TF-IDF
# re-weighting -> linear SVM.
lin_svc_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svm', LinearSVC()),
])

# Search space, keyed as "<step name>__<parameter name>".
lin_svc_params = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    svm__C=(0.2, 0.5, 1.0),
    svm__tol=(1e-3, 1e-4),
    svm__dual=(True, False),
)


In [25]:
# Exhaustive search over lin_svc_params with n_jobs=-1 (all available
# processors); missing texts are replaced by a blank string before fitting.
gridSearch = GridSearchCV(
    estimator=lin_svc_pipeline,
    param_grid=lin_svc_params,
    n_jobs=-1,
).fit(train_X['Complete'].fillna(' '), train_y)

In [26]:
# Report the best cross-validated score and the parameters that achieved it,
# one per line.
print(gridSearch.best_score_, gridSearch.best_params_, sep='\n')

0.6722997795738428
{'svm__C': 1.0, 'svm__dual': True, 'svm__tol': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}


### SGDClassifier

In [27]:
# Three-stage pipeline: raw text -> term counts -> (optional) TF-IDF
# re-weighting -> linear model fitted with stochastic gradient descent.
# The SGD optimizer is randomized; pinning its seed keeps the grid-search
# scores (and therefore the selected parameters) reproducible across runs.
sgd_pipeline = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('svm', SGDClassifier(random_state=42))])

# Search space, keyed as "<step name>__<parameter name>".
sgd_params = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'svm__alpha': (0.0005, 0.0001, 0.00008),
                  'svm__loss': ('hinge', 'perceptron', 'modified_huber')}


In [28]:
# Exhaustive search over sgd_params with n_jobs=-1 (all available
# processors); missing texts are replaced by a blank string before fitting.
gridSearch = GridSearchCV(
    estimator=sgd_pipeline,
    param_grid=sgd_params,
    n_jobs=-1,
).fit(train_X['Complete'].fillna(' '), train_y)



In [29]:
# Report the best cross-validated score and the parameters that achieved it,
# one per line.
print(gridSearch.best_score_, gridSearch.best_params_, sep='\n')

0.6700955180014695
{'svm__alpha': 0.0005, 'svm__loss': 'hinge', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


## Logistic Regression

In [30]:
# Three-stage pipeline: raw text -> term counts -> (optional) TF-IDF
# re-weighting -> logistic regression.
logreg_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('logreg', LogisticRegression()),
])

# Search space, keyed as "<step name>__<parameter name>".
logreg_params = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    logreg__tol=(1e-4, 1e-3),
    logreg__solver=('lbfgs', 'liblinear'),
    logreg__class_weight=(None, 'balanced'),
)


In [31]:
# Exhaustive search over logreg_params with n_jobs=-1 (all available
# processors); missing texts are replaced by a blank string before fitting.
gridSearch = GridSearchCV(
    estimator=logreg_pipeline,
    param_grid=logreg_params,
    n_jobs=-1,
).fit(train_X['Complete'].fillna(' '), train_y)

In [32]:
# Report the best cross-validated score and the parameters that achieved it,
# one per line.
print(gridSearch.best_score_, gridSearch.best_params_, sep='\n')

0.6700955180014695
{'logreg__class_weight': 'balanced', 'logreg__solver': 'liblinear', 'logreg__tol': 0.0001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f