In [75]:
# (Re)load IPython's autoreload extension; %reload_ext can be re-run on a kernel that already loaded it.
%reload_ext autoreload
# Mode 2: reload all imported modules before each cell executes, so edits to .py files are picked up.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [76]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [77]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import confusion_matrix

# Class, for use in pipelines, to select certain columns from a DataFrame and convert to a numpy array
# From A. Geron: Hands-On Machine Learning with Scikit-Learn & TensorFlow, O'Reilly, 2017
# Modified by Derek Bridge to allow for casting in the same ways as pandas.DataFrame.astype
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects DataFrame columns and returns them as a numpy array.

    Parameters
    ----------
    attribute_names : column label or list of column labels
        Used directly as ``X[attribute_names]`` in `transform`.
    dtype : optional
        When not ``None``, forwarded to ``astype`` on the selected data before
        it is converted to an array. ``None`` (the default) leaves the dtypes
        of the selected columns untouched.
    """

    def __init__(self, attribute_names, dtype=None):
        self.attribute_names = attribute_names
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self so the step can be chained."""
        return self

    def transform(self, X):
        """Return the selected columns of `X` as a numpy array, cast when `dtype` was given."""
        X_selected = X[self.attribute_names]
        # Compare against None instead of relying on truthiness: a valid dtype
        # specification can itself be falsy (e.g. a plain numpy.dtype instance,
        # whose len() is 0), which would silently skip the requested cast.
        if self.dtype is not None:
            X_selected = X_selected.astype(self.dtype)
        return X_selected.values

In [78]:
def read_files_from_dir(directory, encoding=None, errors=None):
    """Read every regular file directly inside `directory` into a pandas Series.

    Parameters
    ----------
    directory : str
        Path of the directory to scan (not recursive).
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default, exactly as before.
    errors : str, optional
        Decoding error policy passed to ``open`` (e.g. ``'ignore'`` or
        ``'replace'``). ``None`` keeps the default strict behaviour.

    Returns
    -------
    pandas.Series
        One element per file, holding that file's full text, ordered by
        file name.
    """
    files_contents = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and everything derived from it) reproducible across runs.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # Skip sub-directories and other non-regular entries instead of
        # crashing when open() is called on them.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, encoding=encoding, errors=errors) as f:
            files_contents.append(f.read())
    return pd.Series(files_contents)

def to_pd_DF_with_label(ser, label):
    """Build a two-column DataFrame: the given texts plus a constant label.

    The 'text' column holds `ser`; the 'label' column repeats `label` once
    per row, aligned on the frame's index.
    """
    frame = pd.DataFrame({'text': ser})
    # int64 ones scaled by the label give one label value per row.
    label_values = label * np.ones(len(frame), dtype=np.int64)
    frame['label'] = pd.Series(label_values, index=frame.index)
    return frame

In [79]:
# Load each corpus directory into a labelled frame: 'ham' files get label 0, 'spam' files get label 1.
hams, spams = (
    to_pd_DF_with_label(read_files_from_dir(folder), label)
    for folder, label in (('ham', 0), ('spam', 1))
)

In [80]:
# Show (rows, columns) for the ham frame, then the spam frame, one per line.
print(hams.shape, spams.shape, sep='\n')

(1650, 2)
(1248, 2)


In [81]:
# Stack ham and spam into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
emails = pd.concat([hams, spams], ignore_index=True)
# Shuffle the rows so the two classes are interleaved; the fixed seed makes
# the ordering (and therefore the cross-validation folds) reproducible.
emails = emails.sample(frac=1, random_state=42)
print(emails.shape)

(2898, 2)


In [82]:
# Two-step model: turn raw text into token counts, then fit a logistic regression on them.
pipeline = Pipeline(
    steps=[
        # Token counts with scikit-learn's built-in English stop-word list removed.
        ('vectorizer', CountVectorizer(stop_words='english')),
        # Logistic regression with its default settings.
        ('classifier', LogisticRegression()),
    ]
)

In [83]:
# Target vector: the 0/1 labels as a numpy array.
y = emails['label'].values
# Mean accuracy over 10 cross-validation folds of the text pipeline.
cross_val_score(
    estimator=pipeline,
    X=emails['text'],
    y=y,
    scoring='accuracy',
    cv=10,
).mean()

0.98102016465815522

In [85]:
# NB: cross_val_predict (not cross_val_score) -- it returns the out-of-fold
# prediction for every email rather than one score per fold.
y_predicted = cross_val_predict(
    estimator=pipeline,
    X=emails['text'],
    y=y,
    cv=10,
)
# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y, y_pred=y_predicted)

array([[1622,   28],
       [  27, 1221]])