# 1. Setup

In [3]:
# file paths
import os

# math and data operations
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# save models and scores
import joblib

# Directory (relative to the notebook) where save_model/load_model keep their files.
MODELS_PATH = os.path.join(".", "_models", "spamassassin")

# exist_ok=True keeps the cell idempotent on re-run and removes the
# check-then-create race of the isdir()/makedirs() pair.
os.makedirs(MODELS_PATH, exist_ok=True)

def save_model(model, model_id):
    """Persist `model` with joblib under MODELS_PATH, using `model_id` as the file name.

    Prints a short "Saving" notice before writing. Returns None.
    """
    print("Saving ", model_id)
    joblib.dump(model, os.path.join(MODELS_PATH, model_id))

def load_model(model_id):
    """Load and return the object stored under MODELS_PATH/<model_id> by `save_model`.

    NOTE(review): joblib.load unpickles the file, so only load files you trust.
    """
    model_file = os.path.join(MODELS_PATH, model_id)
    model = joblib.load(model_file)
    return model

# 2. Get Data

## Fetch Spam Data

In [5]:
import tarfile
import urllib.request

# Remote location of the SpamAssassin public corpus and the two archives used here.
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
# Local directory that receives the downloaded archives and their extracted contents.
SPAM_PATH = os.path.join("_datasets", "spamassassin")

def fetch_spam_data(ham_url=HAM_URL, spam_url=SPAM_URL, spam_path=SPAM_PATH):
    """Download (when missing) and extract the ham and spam archives into `spam_path`.

    Parameters
    ----------
    ham_url, spam_url : str
        URLs of the ham and spam `.tar.bz2` archives.
    spam_path : str
        Target directory; created if it does not exist.

    The archives are stored as `ham.tar.bz2` and `spam.tar.bz2` inside `spam_path`.
    An archive that is already on disk is not downloaded again, but both archives
    are (re-)extracted on every call.
    """
    os.makedirs(spam_path, exist_ok=True)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            # Download to a temporary name first so an interrupted transfer never
            # leaves a truncated archive that later runs would mistake for complete.
            partial_path = path + ".part"
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, path)
        # The context manager closes the archive even if extraction raises.
        with tarfile.open(path) as tar_bz2_file:
            if hasattr(tarfile, "data_filter"):
                # Extraction filters (where this Python provides them) refuse
                # members that would be written outside `spam_path`.
                tar_bz2_file.extractall(path=spam_path, filter="data")
            else:
                tar_bz2_file.extractall(path=spam_path)

In [6]:
fetch_spam_data()

In [37]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")

def _list_email_filenames(directory):
    """Return the sorted entries of `directory` whose names are at least 38 characters long."""
    return [entry for entry in sorted(os.listdir(directory)) if len(entry) >= 38]

# The length cut-off is how non-email entries such as cmds or .ipynb_checkpoints
# are excluded (presumably every corpus message has a 38+ character name -- verify).
ham_filenames = _list_email_filenames(HAM_DIR)
spam_filenames = _list_email_filenames(SPAM_DIR)

In [38]:
len(ham_filenames)

2500

In [39]:
len(spam_filenames)

500

- We want to get the text of these emails and combine the ham and spam into a DataFrame `spam_data`
- Then we want to split the `spam_data` into training and test sets
    - We may implement hash splitting later, but currently we just want a quick framework for the project
- Afterwards, we will set aside the test set and create a pipeline to convert the email into numerical data (1s and 0s for existence of words in the email)
- Then we will test some classifier models (svm, randomforest, knn)
- Fine tune the model's inherent hyperparams and get cross-val-scores
- Go back to feature engineering and data preparation and possibly include these in a pipeline for fine-tuning
- Test till a reasonable accuracy
- Evaluate on test set
- Analyze errors and potentially repeat steps to create a better model

## Get Email Objects

In [41]:
import email
import email.policy

# open and parse a single email file using Python's email library
def load_email(path, filename):
    """Parse the file `path/filename` and return it as an email message object.

    The file is read in binary mode and parsed with `email.policy.default`.
    """
    # Import the parser explicitly: `import email` alone does not load the
    # `email.parser` submodule, so relying on `email.parser.<...>` only works
    # when some other import happens to have pulled it in first.
    from email.parser import BytesParser

    with open(os.path.join(path, filename), "rb") as f:
        return BytesParser(policy=email.policy.default).parse(f)

ham_emails = [load_email(HAM_DIR, name) for name in ham_filenames]
spam_emails = [load_email(SPAM_DIR, name) for name in spam_filenames]

In [50]:
print(ham_emails[0].get_content())

    Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55

## Structure of Email Objects

Emails can have different structures; the structure is determined by the payload of the email object. Let's create functions for getting the structure of an email and for counting how many emails share each structure.

In [60]:
def get_email_structure(email):
    """Return a string describing the MIME layout of `email`.

    - A plain `str` is returned unchanged.
    - A message whose payload is a list of sub-messages yields
      "multipart(<child>, <child>, ...)", built recursively.
    - Any other message yields its content type.
    """
    if isinstance(email, str):
        return email
    parts = email.get_payload()
    if not isinstance(parts, list):
        return email.get_content_type()
    described_parts = (get_email_structure(part) for part in parts)
    return "multipart({})".format(", ".join(described_parts))

Now let's create a function that collects the structures of all emails and counts how many occurrences there are of each.

In [68]:
from collections import Counter

def structures_counter(emails):
    """Count how many of `emails` share each structure string.

    Returns a `Counter` mapping the result of `get_email_structure` to the
    number of emails that produced it.
    """
    return Counter(get_email_structure(message) for message in emails)

In [82]:
ham_structures = structures_counter(ham_emails).most_common()

ham_structures

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [81]:
spam_structures = structures_counter(spam_emails).most_common()

spam_structures

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

<br></br>
Insights from email's **structure**
- spam has a lot more html texts and is never signed with a pgp-signature
- we can use structure as a valuable feature

## Create Training and Test Sets

We will use `train_test_split` because it is quick. Other approaches (such as hash-based splitting) are better for reproducibility and for keeping the train and test sets intact when new data is added.
* Combine ham and spam emails into `X`
* Create similar length label arrays with labels 0 for ham and 1 for spam - `y`
* Apply train_test_split and get `X_train`, `X_test`, `y_train`, `y_test`

In [78]:
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [83]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2400,), (600,), (2400,), (600,))

# 3. Prepare Data

## Convert Email to Plain Text

The following function converts **HTML to plain text**
- Drops `<head>` section
- Converts all `<a>` tags to text HYPERLINK
- Gets rid of all HTML tags
- Replaces multiple newlines with single newline
- Unescapes HTML entities

In [84]:
import re
from html import unescape

def html_to_plain_text(html):
    """Convert an HTML string to rough plain text using regular expressions.

    Steps, in order:
    1. drop the whole <head>...</head> section (case-insensitive);
    2. replace every opening <a ...> tag with the word " HYPERLINK ";
    3. remove all remaining tags;
    4. collapse runs of blank or whitespace-only lines into a single newline;
    5. unescape HTML entities (e.g. "&amp;" -> "&").

    Returns the resulting text.
    """
    # Raw strings throughout: '\s' in a normal string literal is an invalid
    # escape sequence that Python warns about.
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

The following is an HTML spam email.

In [86]:
html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

<HTML><HEAD><TITLE></TITLE><META http-equiv="Content-Type" content="text/html; charset=windows-1252"><STYLE>A:link {TEX-DECORATION: none}A:active {TEXT-DECORATION: none}A:visited {TEXT-DECORATION: none}A:hover {COLOR: #0033ff; TEXT-DECORATION: underline}</STYLE><META content="MSHTML 6.00.2713.1100" name="GENERATOR"></HEAD>
<BODY text="#000000" vLink="#0033ff" link="#0033ff" bgColor="#CCCC99"><TABLE borderColor="#660000" cellSpacing="0" cellPadding="0" border="0" width="100%"><TR><TD bgColor="#CCCC99" valign="top" colspan="2" height="27">
<font size="6" face="Arial, Helvetica, sans-serif" color="#660000">
<b>OTC</b></font></TD></TR><TR><TD height="2" bgcolor="#6a694f">
<font size="5" face="Times New Roman, Times, serif" color="#FFFFFF">
<b>&nbsp;Newsletter</b></font></TD><TD height="2" bgcolor="#6a694f"><div align="right"><font color="#FFFFFF">
<b>Discover Tomorrow's Winners&nbsp;</b></font></div></TD></TR><TR><TD height="25" colspan="2" bgcolor="#CCCC99"><table width="100%" border="0" 

***
Same email converted to plain text.

In [88]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


OTC
 Newsletter
Discover Tomorrow's Winners 
For Immediate Release
Cal-Bay (Stock Symbol: CBYI)
Watch for analyst "Strong Buy Recommendations" and several advisory newsletters picking CBYI.  CBYI has filed to be traded on the OTCBB, share prices historically INCREASE when companies get listed on this larger trading exchange. CBYI is trading around 25 cents and should skyrocket to $2.66 - $3.25 a share in the near future.
Put CBYI on your watch list, acquire a position TODAY.
REASONS TO INVEST IN CBYI
A profitable company and is on track to beat ALL earnings estimates!
One of the FASTEST growing distributors in environmental & safety equipment instruments.
Excellent management team, several EXCLUSIVE contracts.  IMPRESSIVE client list including the U.S. Air Force, Anheuser-Busch, Chevron Refining and Mitsubishi Heavy Industries, GE-Energy & Environmental Research.
RAPIDLY GROWING INDUSTRY
Industry revenues exceed $900 million, estimates indicate that there could be as much as $25 billi

***
The following function converts an email object to plain text, whatever its format.
- Loops through all parts of the email
- Returns the content of the first plain-text part, if there is one
- Otherwise converts the HTML content to plain text and returns the result

In [89]:
def email_to_text(email):
    """Return the text of an email object, or None if it has no text part.

    Walks every part of the message. The content of the first "text/plain"
    part is returned as-is. If there is no plain-text part, the last
    "text/html" part seen is converted with `html_to_plain_text` and returned.
    Parts of any other content type are ignored.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Decoding can fail (e.g. an unknown charset); fall back to the raw
            # payload. `Exception` rather than a bare `except` so that
            # KeyboardInterrupt / SystemExit still propagate.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [90]:
print(email_to_text(sample_html_spam)[:100], "...")


OTC
 Newsletter
Discover Tomorrow's Winners 
For Immediate Release
Cal-Bay (Stock Symbol: CBYI)
Wat ...


## Stemming using NLTK Library

You will need to install nltk
- `pip install nltk`

In [91]:
try:
    import nltk

    stemmer = nltk.PorterStemmer()
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


## Replace urls with text URL

We will use library urlextract to replace urls with "URL"
- `pip install urlextract`

In [92]:
try:
    import urlextract # may require an Internet connection to download root domain names
    
    url_extractor = urlextract.URLExtract()
    print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
    print("Error: replacing URLs requires the urlextract module.")
    url_extractor = None

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


## Transform Email to Array of Word Counts

The following code converts an array of email objects into an array of counter objects that track word count in the email.
- Loops through each email in array of emails
- Transforms the email to text
- Counts the words using Counter
- Append each counter to list
- After looping through all emails, return list of counters as array

In [93]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Turn a sequence of email objects into an array of word-count `Counter`s.

    Each processing step is switched on or off by a constructor flag:
    lower-casing, replacing URLs with " URL ", replacing numbers with
    "NUMBER", replacing non-word characters with spaces, and stemming.
    URL replacement and stemming are skipped when the module-level
    `url_extractor` / `stemmer` are None.

    `strip_headers` is stored as a parameter but `transform` does not use it.
    """
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def _count_words(self, message):
        """Return a Counter of the (optionally normalised and stemmed) words of one email."""
        # Emails without any text part produce an empty string.
        text = email_to_text(message) or ""
        if self.lower_case:
            text = text.lower()
        if self.replace_urls and url_extractor is not None:
            # Replace the longest URLs first.
            found_urls = sorted(set(url_extractor.find_urls(text)), key=len, reverse=True)
            for found_url in found_urls:
                text = text.replace(found_url, " URL ")
        if self.replace_numbers:
            text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
        if self.remove_punctuation:
            text = re.sub(r'\W+', ' ', text, flags=re.M)
        counts = Counter(text.split())
        if not (self.stemming and stemmer is not None):
            return counts
        # Merge the counts of words that share a stem.
        stemmed_counts = Counter()
        for token, occurrences in counts.items():
            stemmed_counts[stemmer.stem(token)] += occurrences
        return stemmed_counts

    def transform(self, X, y=None):
        """Return a NumPy array holding one word-count Counter per email in X."""
        return np.array([self._count_words(message) for message in X])

Let's test this.

In [104]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
print(X_few[1].get_content())
print(X_few_wordcounts[1])


Some interesting quotes...

http://www.postfun.com/pfp/worbois.html


Thomas Jefferson:

"I have examined all the known superstitions of the word, and I do not
find in our particular superstition of Christianity one redeeming feature.
They are all alike founded on fables and mythology. Millions of innocent
men, women and children, since the introduction of Christianity, have been
burnt, tortured, fined and imprisoned. What has been the effect of this
coercion? To make one half the world fools and the other half hypocrites;
to support roguery and error all over the earth."

SIX HISTORIC AMERICANS,
by John E. Remsburg, letter to William Short
Jefferson again:

"Christianity...(has become) the most perverted system that ever shone on
man. ...Rogueries, absurdities and untruths were perpetrated upon the
teachings of Jesus by a large band of dupes and importers led by Paul, the
first great corrupter of the teaching of Jesus."



Counter({'the': 11, 'of': 9, 'and': 8, 'all': 3, 'christian':

## Converting Word Counts to Vectors

In the code below
- `fit` creates a vocabulary of the top 1000 (default) words from the word count array
- `transform` creates a sparse matrix (2D array)
    - rows are instances
    - column 0 collects the words that are not in the vocabulary; the remaining columns are the top 1000 most common words
    - each value is the number of times that column's word occurs in the instance

In [111]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Convert word-count Counters into a sparse count matrix over a learned vocabulary.

    `fit` keeps the `vocabulary_size` most common words and assigns them column
    indices starting at 1. `transform` sums the counts of all other words into
    column 0.
    """
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Build `vocabulary_` (word -> column index, 1 = most common) from the Counters in X."""
        capped_totals = Counter()
        for email_counts in X:
            # A single email contributes at most 10 occurrences of any word.
            capped_totals.update({token: min(occurrences, 10)
                                  for token, occurrences in email_counts.items()})
        ranked = capped_totals.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {token: rank for rank, (token, _) in enumerate(ranked, start=1)}
        return self

    def transform(self, X, y=None):
        """Return a CSR matrix of shape (len(X), vocabulary_size + 1) of word counts."""
        row_idx, col_idx, values = [], [], []
        for i, email_counts in enumerate(X):
            for token, occurrences in email_counts.items():
                row_idx.append(i)
                # Words outside the vocabulary map to column 0; csr_matrix sums
                # entries that share the same (row, column) position.
                col_idx.append(self.vocabulary_.get(token, 0))
                values.append(occurrences)
        return csr_matrix((values, (row_idx, col_idx)),
                          shape=(len(X), self.vocabulary_size + 1))

In [131]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [132]:
X_few_vectors.toarray()

array([[ 6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [99, 11,  9,  8,  3,  1,  3,  1,  3,  2,  3],
       [67,  0,  1,  2,  3,  4,  1,  2,  0,  1,  0]])

The first number in each vector belongs to column 0: it counts the words that are not in the top-10 vocabulary. For the first email this is 6, and those 6 words are all the words in that email.

We can look at the vocabulary below to see which numbers correspond to which words.

In [133]:
vocab_transformer.vocabulary_

{'the': 1,
 'of': 2,
 'and': 3,
 'to': 4,
 'url': 5,
 'all': 6,
 'in': 7,
 'christian': 8,
 'on': 9,
 'by': 10}

## Data Pipeline

Now let's make this into a pipeline.
- Convert emails into array of word counts Counter objects
- Convert word counts objects into vectors of number of instances of top words

In [154]:
from sklearn.pipeline import Pipeline

preparation_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer())
])

X_train_transformed = preparation_pipeline.fit_transform(X_train)

In [155]:
X_train_transformed.shape

(2400, 1001)

2400 instances and 1000 top words and one more column for words that aren't in the top 1000.

# 4. Model Selection

There are many models that can classify email, but I don't know the more complicated ones, so I will just try out the following:
- Logistic Regression
- SVM Classifier
- Random Forest Classifier
- k Nearest Neighbors Classifier

In [147]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression(max_iter=1000, random_state=42)
svm_clf = SVC(gamma="auto")
forest_clf = RandomForestClassifier(random_state=42)
knn_clf = KNeighborsClassifier()

## Logistic Regression

In [148]:
# Use the output of `preparation_pipeline` (X_train_transformed); the name
# X_train_prepared is never defined in this notebook and fails on a fresh kernel.
scores = cross_val_score(log_reg, X_train_transformed, y_train,
                         scoring="accuracy", cv=3)
scores.mean()

0.985

## SVC

In [144]:
# Use the output of `preparation_pipeline` (X_train_transformed); the name
# X_train_prepared is never defined in this notebook and fails on a fresh kernel.
scores = cross_val_score(svm_clf, X_train_transformed, y_train,
                         scoring="accuracy", cv=3)
scores.mean()

0.9504166666666666

## RandomForest

In [145]:
# Use the output of `preparation_pipeline` (X_train_transformed); the name
# X_train_prepared is never defined in this notebook and fails on a fresh kernel.
scores = cross_val_score(forest_clf, X_train_transformed, y_train,
                         scoring="accuracy", cv=3)
scores.mean()

0.9820833333333333

## kNN

In [146]:
# Use the output of `preparation_pipeline` (X_train_transformed); the name
# X_train_prepared is never defined in this notebook and fails on a fresh kernel.
scores = cross_val_score(knn_clf, X_train_transformed, y_train,
                         scoring="accuracy", cv=3)
scores.mean()

0.9208333333333334

## Precision-Recall

The logistic regression model performed the best, but this is probably because the dataset is an easy one. On a more complex dataset we would likely see different results.

In [156]:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = preparation_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 96.88%
Recall: 97.89%
