# Initialization

In [25]:
# Re-load the autoreload extension so that edits to imported modules are picked up live.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

In [26]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler

from sklearn.dummy import DummyClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import confusion_matrix

# Class, for use in pipelines, to select certain columns from a DataFrame and convert to a numpy array
# From A. Geron: Hands-On Machine Learning with Scikit-Learn & TensorFlow, O'Reilly, 2017
# Modified by Derek Bridge to allow for casting in the same ways as pandas.DataFrame.astype
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns from a DataFrame and returns a numpy array.

    Parameters
    ----------
    attribute_names : list of column labels
        Columns to select from the incoming DataFrame.
    dtype : optional
        Anything accepted by ``pandas.DataFrame.astype`` (a single dtype or a
        column -> dtype mapping). ``None`` (the default) means "do not cast".
    """

    def __init__(self, attribute_names, dtype=None):
        self.attribute_names = attribute_names
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing to learn; present only to satisfy the estimator API."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as a numpy array, cast if requested."""
        X_selected = X[self.attribute_names]
        # Compare against None instead of relying on truthiness: a valid dtype
        # spec can be falsy (NumPy dtype objects report their field count as
        # their length, which is 0 for plain dtypes), and the cast would then
        # be skipped silently.
        if self.dtype is not None:
            return X_selected.astype(self.dtype).values
        return X_selected.values

# Reading in the dataset 

We define some functions to read the dataset from the files provided. 

In [28]:
def read_files_from_dir(directory, encoding=None, errors=None):
    """Read every regular file in ``directory`` into a pandas Series of strings.

    Files are visited in sorted file-name order so that the resulting Series
    is the same on every run and on every platform (``os.listdir`` makes no
    ordering guarantee). Sub-directories and other non-file entries are
    skipped instead of crashing ``open``.

    Parameters
    ----------
    directory : str
        Path of the directory to read.
    encoding, errors : str, optional
        Forwarded to ``open``. The defaults (``None``) keep the platform
        default decoding; pass e.g. ``encoding='latin-1'`` or
        ``errors='ignore'`` for corpora containing bytes that are not valid
        in the default encoding.

    Returns
    -------
    pandas.Series
        One element per file, holding the file's full text. The dtype is
        always ``object``, so the ``.str`` accessor works even when the
        directory is empty.
    """
    files_contents = []
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            # e.g. a nested directory left over from extracting the archive
            continue
        with open(file_path, encoding=encoding, errors=errors) as f:
            files_contents.append(f.read())
    return pd.Series(files_contents, dtype=object)

def to_pd_DF_with_label(ser, label):
    """Wrap ``ser`` in a DataFrame and attach a constant numeric label.

    The returned frame has two columns: 'text', holding the elements of
    ``ser``, and 'label', holding ``label`` repeated once per row.
    """
    frame = pd.DataFrame()
    frame['text'] = ser
    # One copy of the label per row, aligned on the frame's own index.
    repeated_label = np.ones(len(frame), dtype=np.int64) * label
    frame['label'] = pd.Series(repeated_label, index=frame.index)
    return frame

Now we will use the functions we defined, to actually read in the dataset.
These lines of code assume that the spam and ham archives have been extracted to spam and ham directories.

In [29]:
# Hams are the negative class, so they are labelled 0.
ham_texts = read_files_from_dir('ham')
hams = to_pd_DF_with_label(ham_texts, 0)
# Spams are the positive class, so they are labelled 1.
spam_texts = read_files_from_dir('spam')
spams = to_pd_DF_with_label(spam_texts, 1)

In [30]:
# Sanity check: print (rows, columns) of each frame to confirm the read succeeded.
for loaded_frame in (hams, spams):
    print(loaded_frame.shape)

(1650, 2)
(1248, 2)


Now that we have two separate dataframes, we should append one to the other, to have all the data in a single dataframe. After the append, we know that all hams are before all the spams, so we should shuffle the dataset to avoid problems with k-fold in the future.

In [31]:
# Fixed seed so the shuffle (and every cross-validation score derived from it)
# is reproducible after Restart & Run All.
RANDOM_SEED = 42

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# sample(frac=1) returns all rows in random order; the index is then rebuilt
# so that positions and labels agree again.
emails = (
    pd.concat([hams, spams], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
print(emails.shape)

(2898, 2)


# Cleaning the dataset

Opening the email files we can see that the first lines of all the emails are data about the email itself (metadata). Since we do not want to conduct metadata analysis of the email, we can delete this metadata, leaving us with the title and the body of the email. 
In order to strip the metadata we have to identify it. After opening a few files, I noticed a pattern: the metadata is delimited by an empty line in the files.

In [32]:
# Remove everything up to and including the first blank line (n=1), i.e. the
# leading metadata block. regex=True is passed explicitly: since pandas 2.0
# str.replace treats the pattern as a literal string by default, and flags
# are only accepted for regular-expression replacement.
emails['stripped_metadata'] = emails['text'].str.replace(
    r'(.*?)\n\n', '', n=1, flags=re.MULTILINE | re.DOTALL, regex=True)

Now that we have removed the metadata, the next thing I consider unnecessary is the HTML markup, i.e. the tags themselves (everything between `<` and `>`), so we remove those too, leaving only the plain text of the documents.

In [33]:
# Delete every <...> span (DOTALL lets a tag run across line breaks) from the
# metadata-free text. regex=True is passed explicitly: since pandas 2.0
# str.replace treats the pattern as a literal string by default, and flags
# are only accepted for regular-expression replacement.
emails['just_text'] = emails['stripped_metadata'].str.replace(
    r"<(.*?)>", '', flags=re.MULTILINE | re.DOTALL, regex=True)

In order to run some tests later, I will also strip the HTML tags while leaving the metadata in place, so we can compare these two methods.

In [40]:
# Delete every <...> span from the raw text, keeping the metadata.
# regex=True is passed explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, and flags are only accepted for
# regular-expression replacement.
emails['stripped_html'] = emails['text'].str.replace(
    r"<(.*?)>", '', flags=re.MULTILINE | re.DOTALL, regex=True)

In [41]:
# (display name, column of `emails`) pairs: one entry per text variant
# that the classifiers are evaluated on.
preprocessings = [
    ('raw text', 'text'),
    ('stripped metadata', 'stripped_metadata'),
    ('stripped HTML tags', 'stripped_html'),
    ('stripped metadata and HTML', 'just_text'),
]

In [64]:
def _text_pipeline(step_name, vectorizer):
    """Chain ``vectorizer`` (registered as ``step_name``) with a default LogisticRegression."""
    return Pipeline([
        (step_name, vectorizer),
        ('classifier', LogisticRegression()),
    ])


# Bag-of-words and tf-idf variants, each with and without English stop words.
count_vect_eng_pipeline = _text_pipeline('vectorizer', CountVectorizer(stop_words='english'))
tfidf_eng_pipeline = _text_pipeline('tfidf', TfidfVectorizer(stop_words='english'))
count_vect_pipeline = _text_pipeline('vectorizer', CountVectorizer())
tfidf_pipeline = _text_pipeline('tfidf', TfidfVectorizer())

# Baseline: always predicts the most frequent class.
dummy_pipeline = Pipeline([
    ('selector', DataFrameSelector(['label'])),
    ('dummy', DummyClassifier(strategy='most_frequent')),
])

# (display name, pipeline) pairs compared in the evaluation loop.
pipelines = [
    ('Count vectorizer with English stop words', count_vect_eng_pipeline),
    ('tf-idf vectorizer with English stop words', tfidf_eng_pipeline),
    ('Count vectorizer', count_vect_pipeline),
    ('tf-idf vectorizer', tfidf_pipeline),
]

In [36]:
# Target vector: the 'label' column of `emails` as a numpy array.
y = emails.loc[:, 'label'].values

Check how the dummy classifier performs, which will predict the most frequent class.

In [37]:
# Mean accuracy of the baseline over 10 cross-validation folds.
dummy_fold_scores = cross_val_score(dummy_pipeline, emails, y, scoring='accuracy', cv=10)
dummy_fold_scores.mean()

0.56935926500417611

Checking the accuracy of the classifier, with 10-fold cross validation

In [38]:
# Mean accuracy of the count-vectorizer pipeline over 10 cross-validation folds.
count_vect_fold_scores = cross_val_score(
    count_vect_pipeline, emails['stripped_metadata'], y, scoring='accuracy', cv=10)
count_vect_fold_scores.mean()

0.97308197112516415

The confusion matrix:

In [39]:
# Out-of-fold predictions for every email, then the confusion matrix
# comparing the true labels with those predictions.
stripped_metadata_texts = emails['stripped_metadata']
y_predicted = cross_val_predict(count_vect_pipeline, stripped_metadata_texts, y, cv=10)
confusion_matrix(y, y_predicted)

array([[1601,   49],
       [  29, 1219]])

As the confusion matrix shows, the classifier makes few false positives (ham classified as spam): 49 of the 1650 hams. This is the type of error we most want to avoid.

In [66]:
# Evaluate every pipeline on every text variant with 10-fold cross-validation
# and print the mean score of each combination.
for pipeline_name, pipeline in pipelines:
    for preproc_name, preproc in preprocessings:
        fold_scores = cross_val_score(pipeline, emails[preproc], y, cv=10)
        print(pipeline_name, preproc_name, fold_scores.mean())

Count vectorizer with English stop words raw text 0.9799856819
Count vectorizer with English stop words stripped metadata 0.973081971125
Count vectorizer with English stop words stripped HTML tags 0.981017778308
Count vectorizer with English stop words stripped metadata and HTML 0.972728791314
tf-idf vectorizer with English stop words raw text 0.954103328958
tf-idf vectorizer with English stop words stripped metadata 0.9630783916
tf-idf vectorizer with English stop words stripped HTML tags 0.959966591099
tf-idf vectorizer with English stop words stripped metadata and HTML 0.97031499821
Count vectorizer raw text 0.980330509486
Count vectorizer stripped metadata 0.976529053812
Count vectorizer stripped HTML tags 0.981706240305
Count vectorizer stripped metadata and HTML 0.976872688223
tf-idf vectorizer raw text 0.955137811717
tf-idf vectorizer stripped metadata 0.967218708985
tf-idf vectorizer stripped HTML tags 0.962728791314
tf-idf vectorizer stripped metadata and HTML 0.971699081255


In [62]:
# Look at one fully cleaned email (row label 13) to inspect the stripping result.
emails.loc[13, 'just_text']

'\n \n        \n        \n\nZDNet AnchorDesk Newsletter\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n        \n\n\n      \n\n\n\n\n\n-->\n\n\n\n\n\n\n\n\n\n\n        \n                \n                        \n                        \n                        \n                        \n                        \n                                TUESDAY, JULY 16, 2002                                                        \n                        \n                        \n                        \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n   \n      \n         \n             \n         \n      \n   \n\n\n      \n          \n             \n                &nbsp;David Coursey\n             \n          \n          \n             \n          \n      \n   \n\n   \n      \n         \n            \n         \n      \n      \n         \n            \n         \n      \n   \n-->\n\n   \n   \n   Why we\'re changing our publishing schedule\n\n\n\nDear Reader,\n\nAs of Monday, July 15, AnchorDesk is being publishe