# Downloading the data

In [4]:
import os
import tarfile
import requests

import re
import glob

In [45]:
def get_data():
    DL_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'

    # get file names
    r = requests.get(DL_ROOT)
    pat = re.compile('<a href=[\'"]([\w\._]+)["\']>([\w\._]+)</a>')
    links = re.finditer(pat, r.text)
    links = [link.group(1) for link in links]
    README = links[-1]
    links = links[:-1]

    SPAM_PATH = os.path.join('datasets', 'spam')
    os.makedirs(SPAM_PATH, exist_ok=True)

    # fetch readme file
    path = os.path.join(SPAM_PATH, 'readme.txt')
    DL_URL = DL_ROOT + README
    r = requests.get(DL_URL)
    with open(path, 'wb') as f:
        f.write(r.content)

    # get spam files    
    def fetch_data(file_name):
        bz2_path = os.path.join(SPAM_PATH, file_name)
        DL_URL = DL_ROOT + file_name
        r = requests.get(DL_URL)
        with open(bz2_path, 'wb') as f:
            f.write(r.content)
            file_bz2 = tarfile.open(bz2_path)
            file_bz2.extractall(path=SPAM_PATH)
            file_bz2.close()

    for link in links:
        fetch_data(link)
        
    # clean up
    for path in glob.glob(os.path.join(SPAM_PATH, '*.bz2')):
        os.remove(path)

In [43]:
get_data()

In [61]:
def get_readme():
    file = os.path.join('datasets', 'spam', 'readme.txt')
    with open(file, 'r') as f:
        result = f.read()
    return result

In [63]:
print(get_readme())

<pre>

REVISION HISTORY OF THIS CORPUS:

(**update**: Oct 21 2002 jm: added nearly 3000 more messages.)
(**update**: Nov 24 2002 jm: removed Replied: and Forwarded: headers.)
(**update**: Dec  4 2002 jm: removed a German message, some left-over
SpamAssassin markup, and quite a few duplicate messages.  Also replaced header
obfuscation using "example.com" with "spamassassin.taint.org", since
example.com has no MX record.)
(**update**: Feb 28 2003 jm: Bob Dickinson reported some leftover markup
that should have been removed from the headers.  Now cleaned.)
(**update**: Apr 23 2003 jm: removed 3 messages with malicious Javascript)
(**update**: Oct 10 2003 jm: noted that we'd love to hear about papers ;)
(**update**: Dec 16 2004 jm: changed a couple of hostnames in
headers, in 20021010*/hard_ham/0198* and 20030228*/hard_ham/00230*.)
(**update**: Mar  2 2005 jm: added note about live testing)
(**update**: Mar 11 2005 jm: removed a listed-as-spam mail that was really
a misclassified non-spam,

# Creating a dataset

- each directory in datasets/spam contains text files
- extract these text files
- need to figure out text encoding
- choose which data to use (all?)
- attach labels to the text files
- separate a stratified test set
- do we have enough data for hold-out validation, or should we use cross validation? ..if we want to use a hold-out set, we need to separate that as well

In [157]:
import subprocess

def get_encoding(file_path):
    file_info = subprocess.run(['file', '-i', file_path],
                                capture_output=True)
    file_info = file_info.stdout.decode('utf-8').split(' ')
    encoding = file_info[-1].lstrip('charset=').rstrip('\n')
    return encoding

In [144]:
def get_email(dirs, encoding_='utf-8'):
    email = []
    error = []
    for dir in dirs:
        PATH = os.path.join(SPAM_PATH, dir)
        dir_email = []
        dir_error = []
        for file in os.listdir(PATH):
            file_path = os.path.join(PATH, file)
            try:
                with open(file_path, 'r', encoding=encoding_) as f:
                    dir_email.append(f.read())
            except UnicodeError:
                try:
                    with open(file_path, 'r',
                              encoding=get_encoding(file_path)) as f:
                        dir_email.append(f.read())
                except LookupError as e:
                    dir_error.append('LookupError: {}\tFile: {}'
                                     .format(e, file_path))
                except UnicodeError as e:
                    try:
                        with open(file_path, 'r',
                                  encoding='windows-1253') as f:
                            dir_email.append(f.read())
                    except UnicodeError as e2:
                        dir_error.append('UnicodeErr: {}{}\tFile: {}'
                                         .format(e, e2, file_path))
        email.append(dir_email)
        error.append(dir_error)

    return list(zip(dirs, email)), error

In [158]:
def extract_emails(path=SPAM_PATH):
    """
    Return two lists, ham_emails and spam_emails,
    that contain pairs of the form (dir_name, list_of_emails),
    where dir_name is the name of a directory, and list_of_emails
    is a list of emails in that directory as strings.
    
    Note: 110 emails from the spam directories had unknown encodings
    and they are not included in the output.
    """
    SPAM_PATH = os.path.join('datasets', 'spam')

    dirs = [dir for dir in os.listdir(SPAM_PATH)
            if not dir.endswith('.txt')]
    ham_dirs = [dir for dir in dirs if 'ham' in dir]
    spam_dirs = [dir for dir in dirs if 'spam' in dir]
    
    ham_emails, ham_errors = get_email(ham_dirs, encoding_='windows-1252')
    spam_emails, spam_errors = get_email(spam_dirs)
    
    return ham_emails, spam_emails

## Train/val/test split

Let's try 60/20/20; we need to stratify spam and ham.

In [159]:
ham, spam = extract_emails()

In [164]:
import random

In [173]:
def train_val_test(ham, spam):
    all_ham, all_spam = [], []
    for x in ham:
        all_ham.extend(x[1])
    for x in spam:
        all_spam.extend(x[1])
        
    random.shuffle(all_ham)
    random.shuffle(all_spam)
    
    all_ham = [(x, 0) for x in all_ham]
    all_spam = [(x, 1) for x in all_spam]
    
    n_ham = len(all_ham)
    n_spam = len(all_spam)
    
    a, b = int(0.6 * n_ham), int(0.8 * n_ham)
    c, d = int(0.6 * n_spam), int(0.8 * n_spam)
    
    train = all_ham[:a] + all_spam[:c]
    val = all_ham[a:b] + all_spam[c:d]
    test = all_ham[b:] + all_spam[d:]
    
    random.shuffle(train)
    random.shuffle(val)
    random.shuffle(test)
    
    return train, val, test

In [174]:
train, val, test = train_val_test(ham, spam)

In [176]:
print(len(train), len(val), len(test))

5546 1849 1849


In [177]:
def unzip(x):
    y, z = zip(*x)
    return list(y), list(z)

In [178]:
X_train, y_train = unzip(train)
X_val, y_val = unzip(val)
X_test, y_test = unzip(test)

# Processing emails

- bag of words?
- bigrams? n-grams?
- header, address, salutation, signature features? (e.g. r'([-\w]+): (.+)')
- all lowercase, drop punctuation?
- drop common words? (might lose salutation...)
- use BeautifulSoup? 
- want to have different features as hyperparameters eventually

- Need a list of words to use... could combine output of all cleaning
- remove very frequent or very infrequent words?
- how can we remove "non-word" words?

In [401]:
import pandas as pd

In [407]:
def clean1(email):
    pat = re.compile(r'([-\w]+: .+|<.*|.*>|.*NextPart.*|charset=.*|\w+\.\w+\.?\w*\.?\w*\.?\w*)')
    email2 = pat.sub(' ', email)
    return [x.lower()
            for x in re.findall(r'[A-Za-z]+', email2)
            if len(x) > 2]

def create_corpus(f, X):
    "X training set, f maps email to list of words"
    corpus = {}
    for email in X:
        for word in f(email):
            corpus[word] = corpus.get(word, 0) + 1
    return corpus

def create_df(X, word_freq):
    template = {k: 0 for k in sorted(word_freq.index.to_list())}
    keys = template.keys()

    def feature_vec1(email):
        vec = template.copy()
        for x in clean1(email):
            if x in keys:
                vec[x] = 1
        return vec

    df = pd.DataFrame({i: feature_vec1(email)
                       for i, email in enumerate(X)})

    return df.T

In [410]:
word_freq = pd.Series(create_corpus(clean1, X_train))
filt = word_freq.between(6, 900)

In [422]:
X_train_df = create_df(X_train, word_freq[filt])

In [421]:
X_train_df.shape

(5546, 12418)

In [411]:
X_val_df = create_df(X_val, word_freq[filt])

In [420]:
X_val_df.shape

(1849, 13988)

In [412]:
y_train_df = pd.Series(y_train)

In [431]:
y_train_df.head()

0    0
1    0
2    1
3    0
4    1
dtype: int64

In [413]:
y_val_df = pd.Series(y_val)

# Training a quick model!

## Logistic regression

In [414]:
import sklearn

In [415]:
from sklearn.linear_model import LogisticRegression

In [423]:
clf = LogisticRegression()

In [424]:
clf.fit(X_train_df, y_train_df)

LogisticRegression()

In [425]:
clf.score(X_train_df, y_train_df)

0.998196898665705

In [426]:
clf.score(X_val_df, y_val_df)

0.9929691725256896

In [427]:
from sklearn.metrics import confusion_matrix

In [428]:
y_train_preds = clf.predict(X_train_df)
y_val_preds = clf.predict(X_val_df)

In [429]:
confusion_matrix(y_train_df, y_train_preds)

array([[4172,    0],
       [  10, 1364]])

In [430]:
confusion_matrix(y_val_df, y_val_preds)

array([[1391,    0],
       [  13,  445]])

In [432]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [433]:
def score(truth, preds):
    print(f'Precision: {precision_score(truth, preds):.3f}')
    print(f'Recall: {recall_score(truth, preds):.3f}')
    print(f'F_1 score: {f1_score(truth, preds):.3f}')

In [434]:
score(y_train_df, y_train_preds)

Precision: 1.000
Recall: 0.993
F_1 score: 0.996


In [435]:
score(y_val_df, y_val_preds)

Precision: 1.000
Recall: 0.972
F_1 score: 0.986


## Naive Bayes

In [437]:
from sklearn.naive_bayes import BernoulliNB

In [438]:
nb_clf = BernoulliNB()

In [439]:
nb_clf.fit(X_train_df, y_train_df)

BernoulliNB()

In [440]:
nb_train_preds = nb_clf.predict(X_train_df)

In [441]:
nb_val_preds = nb_clf.predict(X_val_df)

In [442]:
score(y_train_df, nb_train_preds)

Precision: 0.971
Recall: 0.926
F_1 score: 0.948


In [443]:
score(y_val_df, nb_val_preds)

Precision: 0.979
Recall: 0.913
F_1 score: 0.945


## Linear SVC

In [444]:
from sklearn.svm import LinearSVC

In [447]:
svc_clf = LinearSVC(max_iter=10000)

In [448]:
svc_clf.fit(X_train_df, y_train_df)

LinearSVC(max_iter=10000)

In [450]:
svc_train_preds = svc_clf.predict(X_train_df)

In [451]:
svc_val_preds = svc_clf.predict(X_val_df)

In [452]:
score(y_train_df, svc_train_preds)

Precision: 0.995
Recall: 1.000
F_1 score: 0.997


In [453]:
score(y_val_df, y_val_preds)

Precision: 1.000
Recall: 0.972
F_1 score: 0.986


# Improving predictions

- set up pipeline to search over different word frequency thresholds
- word frequency vectors vs. bag of words. (Need to rescale if we use word frequencies)
- think of other features to add
- think of other processing steps (stemming, tagging (e.g. each image, or each link)
- tune regularization parameters
- examine false negatives
- probably don't need to look outside of linear models, since they're already performing quite well
- it could just be coincidence, but Logit and SVC performed differently on the training set, so maybe we could ensemble

## Making a transformer for our processing steps