# Spam classifier

### Step 1: Download dataset
Downloading examples of spam and ham from Apache SpamAssassin’s public datasets and splitting the datasets into a training set and a test set. 

In [30]:
import os
import numpy as np
import glob
from urllib.request import urlretrieve
import tarfile
import shutil
import sklearn.utils
from sklearn.model_selection import train_test_split


def download_dataset():

    def download_url(url, dataset_dir="data"):

        tar_dir = os.path.join(dataset_dir, "tar")
        if not os.path.isdir(tar_dir):
            os.makedirs(tar_dir)

        filename = url.rsplit("/", 1)[-1]
        tarpath = os.path.join(tar_dir, filename)

        try:
            tarfile.open(tarpath)
        except:
            urlretrieve(url, tarpath)

        with tarfile.open(tarpath) as tar:
            dirname = os.path.join(dataset_dir, tar.getnames()[0])
            if os.path.isdir(dirname):
                shutil.rmtree(dirname)
            tar.extractall(path=dataset_dir)

            cmds_path = os.path.join(dirname, "cmds")
            if os.path.isfile(cmds_path):
                os.remove(cmds_path)

        return dirname

    def load_dataset(dirpath):
        files = []
        filepaths = glob.glob(dirpath + "/*")
        for path in filepaths:
            with open(path, "rb") as f:
                byte_content = f.read()
                str_content = byte_content.decode("utf-8", errors="ignore")
                files.append(str_content)
        return files

    spam_url = "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
    easy_ham_url = "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2"
    hard_ham_dir = "https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"

    spam = load_dataset(download_url(spam_url))
    easy_ham = load_dataset(download_url(easy_ham_url))
    hard_ham = load_dataset(download_url(hard_ham_dir))

    X = spam + easy_ham + hard_ham
    y = np.concatenate((
        np.ones(len(spam)),
        np.zeros(len(easy_ham) + len(hard_ham)),
    ))

    return X, y


# Download dataset.
X, y = download_dataset()

# Split dataset into training and testing sets.
X, y = sklearn.utils.shuffle(X, y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)

print(f"The number of training samples: {len(X_train)}")
print(f"The number of test samples: {len(X_test)}")

The number of training samples: 2436
The number of test samples: 610


### Step 2: Feature extraction

Doing some data cleaning and feature extraction:

- Transforming an email into a (sparse) vector that indicates the presence or absence of each possible word. For example, if all emails only ever contain four words, "Hello," "how," "are," "you," then the email "Hello you Hello Hello you" would be converted into a vector [1, 0, 0, 1] (meaning ["Hello" is present, "how" is absent, "are" is absent, "you" is present]), or [3, 0, 0, 2] if counting the number of occurrences of each word.

In [31]:
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


class EmailCleaner(BaseEstimator, TransformerMixin):

    def __init__(self,
                 no_header=True,
                 to_lowercase=True,
                 url_to_word=True,
                 num_to_word=True,
                 remove_punc=True):
        self.no_header = no_header
        self.to_lowercase = to_lowercase
        self.url_to_word = url_to_word
        self.num_to_word = num_to_word
        self.remove_punc = remove_punc

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_cleaned = []
        for email in X:
            if self.no_header:
                email = EmailCleaner.remove_header(email)
            if self.to_lowercase:
                email = EmailCleaner.lower_letters(email)

            email_words = email.split()
            if self.url_to_word:
                email_words = EmailCleaner.convert_url_to_word(email_words)
            if self.num_to_word:
                email_words = EmailCleaner.convert_num_to_word(email_words)
            email = " ".join(email_words)
            if self.remove_punc:
                email = EmailCleaner.remove_punctuation(email)
            X_cleaned.append(email)
        return X_cleaned

    @staticmethod
    def remove_header(email):
        return email[email.index("\n\n"):]

    @staticmethod
    def is_url(s):
        url = re.match(
            "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|"
            "[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", s)
        return url is not None

    @staticmethod
    def convert_url_to_word(words):
        for i, word in enumerate(words):
            if EmailCleaner.is_url(word):
                words[i] = "URL"
        return words

    @staticmethod
    def lower_letters(email):
        return email.lower()

    @staticmethod
    def convert_num_to_word(words):
        for i in range(len(words)):
          try:
            words[i] = int(words[i])
            words[i] = "NUM"
          except:
            continue

        return words

    @staticmethod
    def remove_punctuation(email):
        email = re.sub(r"[^\w\s]", "", email)
        return email

In [32]:
# Here are some unit tests to check code.

# Check lower_letters().
src_string = "Message-Id: <LISTMANAGERSQL-25343"
dst_string = "message-id: <listmanagersql-25343"
assert EmailCleaner.lower_letters(src_string) == dst_string

# Check convert_num_to_word().
src_string = "Date: Wed, 10 Jul 2002"
src_words = src_string.split()
dst_words = ["Date:", "Wed,", "NUM", "Jul", "NUM"]
assert EmailCleaner.convert_num_to_word(src_words) == dst_words

# Check remove_punctuation().
src_string = "superstars -- you'll find investing more fun..."
dst_string = "superstars  youll find investing more fun"
assert EmailCleaner.remove_punctuation(src_string) == dst_string

In [33]:
# Step 1 of pipeline: data cleaning.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

email_cleaner = EmailCleaner()

# Step 2 of pipeline: CountVectorizer.
count_vectorizer = CountVectorizer()

# Build pipeline.
prepare_pipeline = Pipeline([
    ("email_cleaner", email_cleaner),
    ("count_vectorizer", count_vectorizer),
])

# Run preprocessing.
X_all = X_train + X_test
prepare_pipeline.fit(X_all)
X_all = prepare_pipeline.transform(X_all)
num_train = len(X_train)
X_train = X_all[:num_train]
X_test = X_all[num_train:]

print(X_train.shape)
print(X_test.shape)

(2436, 109039)
(610, 109039)


### Step 3: Train a spam classifier

Building a spam classifier, and training a classifier with the training set.

In [34]:
from sklearn.linear_model import LogisticRegression
      
logis = LogisticRegression(C=100, random_state=0, solver="liblinear")

logis.fit(X_train, y_train)

LogisticRegression(C=100, random_state=0, solver='liblinear')

### Step 4: Evaluating the classifier

Testing the classifier with the test set and printing the precision and recall.

In [35]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

y_pred = logis.predict(X_test)

acc = accuracy_score(y_true=y_test, y_pred=y_pred)
ps = precision_score(y_true=y_test, y_pred=y_pred)
rs = recall_score(y_true=y_test, y_pred=y_pred)

print(f"The precision score for Logistic Regression individually is {ps:.5f}")
print(f"The recall score for Logistic Regression individually is {rs:.5f}")

The precision score for Logistic Regression individually is 0.98195
The recall score for Logistic Regression individually is 0.97143


### Step 5: Ensemble of classifiers

1. Implementing 4 new classifiers (in total, we have 5 classifiers now).
2. Using hard or soft voting to ensemble those classifiers.
3. Train our ensemble model on the training set. Reporting training/testing precision and recall.

In [36]:
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

print("******************* For Individual Classifiers *******************\n")

print(f"The accuracy score for Logistic Regression individually is {acc:.5f}\n")

# Perceptron
perc = Perceptron()
perc.fit(X_train, y_train)
perc_pred = perc.predict(X_test)
perc_acc = accuracy_score(y_true=y_test, y_pred=perc_pred)
print(f"The accuracy score for Perceptron individually is {perc_acc:.5f}\n")

# SVM
svm = SVC(kernel="linear", C=1, random_state=1)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
svm_acc = accuracy_score(y_true=y_test, y_pred=svm_pred)
print(f"The accuracy score for SVM individually is {svm_acc:.5f}\n")

# Decision Tree
tree = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=2)
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)
tree_acc = accuracy_score(y_true=y_test, y_pred=tree_pred)
print(f"The accuracy score for Decision Tree individually is {tree_acc:.5f}\n")

# KNN
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric="minkowski")
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn_acc = accuracy_score(y_true=y_test, y_pred=knn_pred)
print(f"The accuracy score for KNN individually is {knn_acc:.5f}\n")

print("******************* For Ensemble *******************\n")

# Ensemble
estimators = [("logis", logis), ("perc", perc), ("svm", svm), ("tree", tree), ("knn", knn)]
ensemble = VotingClassifier(estimators=estimators)
ensemble.fit(X_train, y_train)
ensemble_pred_train = ensemble.predict(X_train)
ensemble_acc_train = accuracy_score(y_true=y_train, y_pred=ensemble_pred_train)
ensemble_pre_train = precision_score(y_true=y_train, y_pred=ensemble_pred_train)
ensemble_rec_train = recall_score(y_true=y_train, y_pred=ensemble_pred_train)

print(f"The accuracy score for ensemble using train set is {ensemble_acc_train:.5f}\n")
print(f"The precision score for ensemble using train set is {ensemble_pre_train:.5f}\n")
print(f"The recall score for ensemble using train set is {ensemble_rec_train:.5f}\n\n")

ensemble_pred_test = ensemble.predict(X_test)
ensemble_acc_test = accuracy_score(y_true=y_test, y_pred=ensemble_pred_test)
ensemble_pre_test = precision_score(y_true=y_test, y_pred=ensemble_pred_test)
ensemble_rec_test = recall_score(y_true=y_test, y_pred=ensemble_pred_test)

print(f"The accuracy score for ensemble using test set is {ensemble_acc_test:.5f}\n")
print(f"The precision score for ensemble using test set is {ensemble_pre_test:.5f}\n")
print(f"The recall score for ensemble using test set is {ensemble_rec_test:.5f}\n")

******************* For Individual Classifiers *******************

The accuracy score for Logistic Regression individually is 0.97869

The accuracy score for Perceptron individually is 0.96393

The accuracy score for SVM individually is 0.97705

The accuracy score for Decision Tree individually is 0.92459

The accuracy score for KNN individually is 0.91967

******************* For Ensemble *******************

The accuracy score for ensemble using train set is 1.00000

The precision score for ensemble using train set is 1.00000

The recall score for ensemble using train set is 1.00000


The accuracy score for ensemble using test set is 0.98689

The precision score for ensemble using test set is 0.98921

The recall score for ensemble using test set is 0.98214

