In [1]:
import pandas as pd
import numpy as np
from os import makedirs, path, remove, rename, rmdir
from tarfile import open as open_tar
from shutil import rmtree
from urllib import request, parse
from glob import glob
from os import path
from re import sub
from email import message_from_file
from glob import glob
from sklearn.model_selection import StratifiedShuffleSplit
from collections import defaultdict
from functools import partial
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score)
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
import gc

``This code block downloads the folder with spam and non-spam emails from the online source and 
creates local directories where this python notebook is stored``

In [2]:
def download_corpus(dataset_dir: str = 'data'):
    base_url = 'https://spamassassin.apache.org'
    corpus_path = 'old/publiccorpus'
    files = {
        '20021010_easy_ham.tar.bz2': 'ham',
        '20021010_hard_ham.tar.bz2': 'ham',
        '20021010_spam.tar.bz2': 'spam',
        '20030228_easy_ham.tar.bz2': 'ham',
        '20030228_easy_ham_2.tar.bz2': 'ham',
        '20030228_hard_ham.tar.bz2': 'ham',
        '20030228_spam.tar.bz2': 'spam',
        '20030228_spam_2.tar.bz2': 'spam',
        '20050311_spam_2.tar.bz2': 'spam' }
    
    #creates the folders: downloads, ham and spam
    downloads_dir = path.join(dataset_dir, 'downloads')
    ham_dir = path.join(dataset_dir, 'ham')
    spam_dir = path.join(dataset_dir, 'spam')

    makedirs(downloads_dir, exist_ok=True)
    makedirs(ham_dir, exist_ok=True)
    makedirs(spam_dir, exist_ok=True)
    
    
    for file, spam_or_ham in files.items():
        # download files from URL of each specific .bz2 file 
        url = parse.urljoin(base_url, f'{corpus_path}/{file}')
        tar_filename = path.join(downloads_dir, file)
        request.urlretrieve(url, tar_filename)
        
        #list e-mails in the compressed .bz2 file
        emails = []
        with open_tar(tar_filename) as tar:
            tar.extractall(path=downloads_dir)
            for tarinfo in tar:
                if len(tarinfo.name.split('/')) > 1:
                    emails.append(tarinfo.name)
        
        # move e-mails to ham or spam directory
        for email in emails:
            directory, filename = email.split('/')
            directory = path.join(downloads_dir, directory)
            
            if not path.exists(path.join(dataset_dir, spam_or_ham, filename)):
                rename(path.join(directory, filename),
                   path.join(dataset_dir, spam_or_ham, filename))
                
        rmtree(directory)

download_corpus()

In [3]:
#How many e-mails are classified in our dataset as either Spam or not Spam?
ham_dir = path.join('data', 'ham')
spam_dir = path.join('data', 'spam')

print('Number of Non-Spam E-mails:', len(glob(f'{ham_dir}/*')))  
print('\nNumber of Spam E-mails:', len(glob(f'{spam_dir}/*')))  

Number of Non-Spam E-mails: 6952

Number of Spam E-mails: 2399


```This class retreives a clean string for each e-mail with a subject line text and body text```

In [4]:
class SimpleEmail:
    def __init__(self, subject: str, body: str):
        self.subject = subject
        self.body = body
    
    #as long as its not a letter, make it blank
    @property
    def clean(self):
        sanitizer = '[^A-Za-z]+' #non-letters
        clean = sub(sanitizer, ' ', f'{self.subject} {self.body}') #replace non-letters with space
        clean = clean.lower()
        return sub('\s+', ' ', clean) 
    
    #this function classifies the subject and body of e-mail by first new line \n. Returns email as a string
    def __str__(self):
        subject = f'subject: {self.subject}'
        body_first_line = self.body.split('\n')[0]
        body = f'body: {body_first_line}...'
        return f'{subject}\n{body}' #output is subject and body Text

```This class treats the entire folder of emails as if it were a list, as an iterable object then 
applies a parsing function to each list element```

In [5]:
class EmailIterator:
 
    #This function pulls every file from whatever our imput directory, positions it's iteration 0  
    def __init__(self, directory: str):
        self._files = glob(f'{directory}/*')
        self._pos = 0
    
    def __iter__(self):
        self._pos = -1
        return self
    
    #if the position is not at the end of the list of emails, we apply parse_emails and returns the output
    def __next__(self):
        if self._pos < len(self._files) - 1:
            self._pos += 1
            return self.parse_email(self._files[self._pos])
        raise StopIteration()

    #This function defines each file as a SimpleEmail class object with subject and body
    @staticmethod
    def parse_email(filename: str) -> SimpleEmail:
        with open(filename, encoding='utf-8', errors='replace') as fp:
            message = message_from_file(fp)
        
        subject = None
        for item in message.raw_items():  
            if item[0] == 'Subject':
                subject = item[1] 
        
        if message.is_multipart(): #is this multipart? if so add to body list of sub email message objects
            body = []
            for b in message.get_payload():  #add each part to the body and iterate through it 
                body.append(str(b))
            body = '\n'.join(body)
        else:
            body = message.get_payload() #just one part of the body the payload is one part
        
        return SimpleEmail(subject, body)

```Apply the EmailIterator class to our Spam and Non-Spam e-mail folders and then transform it into
an array of cleaned E-mail objects with subject and body```

In [8]:
#Apply the class to each class of e-mails
ham_emails = EmailIterator('data/ham')
spam_emails = EmailIterator('data/spam')

#Numpy arrays of our Ham and Spam e-mails. Our data!
hams = np.array([email.clean for email in ham_emails])
spams = np.array([email.clean for email in spam_emails])

#purge memory we are not using
del ham_emails
del spam_emails
gc.collect() 

#Train test split 80, 20 for our classifier by balancing the % of training and testing for each class
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
emails = np.concatenate((hams, spams))
labels = np.concatenate((np.zeros(hams.size), np.ones(spams.size)))

#clears memory that doesnt get used
del hams
del spams
gc.collect()

#Ensuring labels are aligned with emails, each label is correctly associated with each email
for train_index, test_index in split.split(emails, labels):
    emails_train, labels_train = \
        emails[train_index], labels[train_index]
    emails_test, labels_test = \
        emails[test_index], labels[test_index]
    
#Importing word dictionary to create term document matrix, counting occurances of each word
dictionary = defaultdict(int)
for email in emails_train:
    for word in email.split(' '):
        dictionary[word] += 1

#Selecting top 1000 most occured words, and lists its occurrances in each email
top = 1000
descending_dictionary = sorted(dictionary.items(), key=lambda v: v[1], reverse=True)
dictionary = [word for (word, occur) in descending_dictionary if len(word) > 1][:top]

``The following function creates a term document matrix covering the top 1000 most occuring words in the
corpus (document) either one-hot encoding format if binary is True or frequency encoding if binary is False``

In [9]:
def encode_email(email: SimpleEmail, dictionary_: list, binary: bool = False) -> np.array:
    encoded = np.zeros(dictionary_.size)
    words = email.split(' ')
    
    for word in words:
        index = np.where(dictionary_ == word)[0]
        if index.size == 1:  #we ignore unknown words if they do not appear in top 1000
            if binary: 
                encoded[index[0]] = 1 #set to 1 if the word exists
            else:
                encoded[index[0]] += 1 #otherwise add to the count i.e. frequency of the word
    return encoded

``Note: the Term Document matrix we ouput will have each e-mail in our data is a row 
and there are 1000 columns (for each word)``

In [10]:
dictionary = np.array(dictionary)  

#Every email is encoded as a different term document matrix row
_encode_email = partial(encode_email, dictionary_=dictionary)

#Train as numpy arrays
encoded_train = np.array(list(map(_encode_email, emails_train))) #entire term document matrix with 80%
encoded_test = np.array(list(map(_encode_email, emails_test))) #testing the term document matrix with 20%

In [11]:
#The classifer we use is K-Nearest Neighbor from Sci-Kit Learn
knn_clf = KNeighborsClassifier()

#Testing with 5 cross validation folds
labels_pred = cross_val_predict(knn_clf, encoded_train,labels_train, cv=5)
print('Accuracy of Model:', accuracy_score(labels_train, labels_pred))
print('Precision of Model:', precision_score(labels_train, labels_pred))
print('Recall of Model:', recall_score(labels_train, labels_pred))
print('F1 Score of Model:', f1_score(labels_train, labels_pred))

Accuracy of Model: 0.9637700534759358
Precision of Model: 0.9629213483146067
Recall of Model: 0.8931735278791036
F1 Score of Model: 0.9267369559340363


In [12]:
#Testing with 10 cross validation folds
labels_pred = cross_val_predict(knn_clf, encoded_train,labels_train, cv=10)
print('Accuracy of Model:', accuracy_score(labels_train, labels_pred))
print('Precision of Model:', precision_score(labels_train, labels_pred))
print('Recall of Model:', recall_score(labels_train, labels_pred))
print('F1 Score of Model:', f1_score(labels_train, labels_pred))

Accuracy of Model: 0.9643048128342246
Precision of Model: 0.9619686800894854
Recall of Model: 0.8963001563314226
F1 Score of Model: 0.9279741030482871


In [13]:
#Testing with 15 cross validation folds
labels_pred = cross_val_predict(knn_clf, encoded_train,labels_train, cv=15)
print('Accuracy of Model:', accuracy_score(labels_train, labels_pred))
print('Precision of Model:', precision_score(labels_train, labels_pred))
print('Recall of Model:', recall_score(labels_train, labels_pred))
print('F1 Score of Model:', f1_score(labels_train, labels_pred))

Accuracy of Model: 0.9649732620320856
Precision of Model: 0.9605336297943302
Recall of Model: 0.9004689942678479
F1 Score of Model: 0.9295320064550834


``References``

https://colab.research.google.com/github/PseudoCodeNerd/blog/blob/master/_notebooks/2019-10-19-spamClassifier-Oreilly-homework-chapter3.ipynb#scrollTo=dePbzCrARDID

https://www.youtube.com/watch?v=8rXD5-xhemo&t=1103s

https://medium.com/@thiagolcmelo/train-you-own-spam-detector-57725e8e81c0

https://towardsdatascience.com/spam-or-ham-introduction-to-natural-language-processing-part-2-a0093185aebd

https://github.com/happilyeverafter95/Medium/blob/master/spam_or_ham.py

