An optional exercise to download prepare the data used for training the spam classifier. Data for this exercise can be found in the spam assassin public corpus: http://spamassassin.apache.org/old/publiccorpus/.
The data consists of email examples either labeled as spam or ham and categorized by detection difficulty.

In [1]:
import os
import re
import pandas as pd
import numpy as np

In [2]:
# Change filenames in directory to make them readable text files.

for folder in os.listdir('/Users/user/Spam_Emails'):
    if 'ham' in folder or 'spam' in folder:
        for filename in os.listdir(folder):
            if not filename.endswith(".txt"):
                os.rename(folder + '/' + filename, folder + '/' + filename + '.txt')

In [3]:
# Iterate through folders containing the word ham(non spam emails) storing email contents in ham_list variable.

ham_list = []
for folder in os.listdir('/Users/user/Spam_Emails'):
    if 'ham' in folder:
        for filename in os.listdir(folder):
            if len(filename) > 10:
                with open(folder + '/' + filename, 'r', errors='ignore') as email:
                    contents=email.read()
                    ham_list.append(contents)

In [4]:
# Iterate through folders containing the word spam(spam emails) storing email contents in spam_list variable.

spam_list = []
for folder in os.listdir('/Users/user/Spam_Emails'):
    if 'spam' in folder:
        for filename in os.listdir(folder):
            if len(filename) > 10:
                with open(folder + '/' + filename, 'r', errors='ignore') as email:
                    contents=email.read()
                    spam_list.append(contents)

In [5]:
ham_array = np.array(ham_list)
spam_array = np.array(spam_list)

In [6]:
# Convert to dataframe and add label to indicate class membership.

ham_data = pd.DataFrame({'email': ham_array, 'label' : 0})
spam_data = pd.DataFrame({'email': spam_array, 'label' : 1})

In [7]:
# Concatenate ham/spam data and shuffle to randomize order.

data = pd.concat([ham_data, spam_data])
data.reset_index(drop=True, inplace=True)
data = data.sample(frac=1).reset_index(drop=True)

In [8]:
data.head()

Unnamed: 0,email,label
0,From submit27@desertmail.com Wed Jul 24 02:48...,1
1,Return-Path: <Online#3.20502.53-KAuEgBlRpyKZc9...,0
2,From qtopic+admin@quicktopic.com Sun Oct 6 2...,0
3,Received: from b.smtp-out.sonic.net (b.smtp-ou...,1
4,Received: from hq.pro-ns.net (localhost [127.0...,1


In [9]:
import nltk
from nltk.stem.porter import *

In [10]:
# The vocab list is a list of predetermined words to use as features in the model's predictions.

vocab = pd.read_table('/Users/user/Spam_Emails', header=None)
vocab_list = vocab.drop([0], axis = 1).values
vocab_list = vocab_list.ravel().tolist()


In [11]:
def process_email(email):
    # reads through the contents of an email and returns a list of numbers 
    # signifying at what index each word occurs in the vocab list. (To be used as a feature vector).
    
    # Simplifies words down to their root form. Spamilicious -> spam.
    stemmer = PorterStemmer()
    
    # Converts links, numbers, email handles, and dollar signs to simple one word features.
    email = email[email.find('\n\n'):].lower()
    email = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email)
    email = re.sub('[0-9]+.*', 'number', email)
    email = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email)
    email = re.sub(r'[$]+', 'dollar', email)
    
    # Tokenize email, stem words, and remove non alphanumeric characters.
    email = nltk.word_tokenize(email)
    email = [re.sub(r'[^a-zA-Z0-9]', '', i) for i in email]
    email = [stemmer.stem(word) for word in email if len(word) >= 1]
    word_indices = [vocab_list.index(i) for i in email if i in vocab_list]
    
    return word_indices

In [12]:
def email_features(word_indices):
    # Converts the word index arrays into a sparse array of of zeros and ones denoting 
    # whether or not a word in the vocab list appeared in the email.
    
    feature = np.zeros([len(vocab_list), 1])
    feature[word_indices] = 1
    
    return feature

In [13]:
# Apply functions to dataframe.

data['word_indices'] = data.email.apply(process_email)

In [14]:
data['features'] = data.word_indices.apply(email_features)

In [15]:
data['features'] = data.features.apply(np.ravel)

In [16]:
feats = data['features'].apply(pd.Series)

In [17]:
X = feats

In [18]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1889,1890,1891,1892,1893,1894,1895,1896,1897,1898
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [19]:
y = data.label
y = y.values.reshape(len(y),)

In [20]:
y

array([1, 0, 0, ..., 1, 1, 0])

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [22]:
# Split data into training and testing sets.

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [23]:
# Instantiate and fit classifier.

clf = SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
# Spam detection accuracy on test set.

clf.score(X_test, y_test)

0.99181243021957577

In [25]:
# K-fold cross validation performed on the training set.

scores = cross_val_score(clf, X_train, y_train, cv=5)

In [26]:
# Results. 

scores

array([ 0.99007444,  0.98573201,  0.98945409,  0.98635236,  0.98819876])