# Spam classificator

# Problem definition

We need to classify either if an email is SPAM or not.
This is a BINARY CLASSIFICATION problem, where the label will be spam and get binary values 0/1.

## Precission vs Recall tradeoff

We also want to study the tradeoff between PRECISION and RECALL.

**Precision**: rate of correct classifications. If from 100 mails we classify 70 as spam and 50 are spam, the precision is 50/70. It is, from all the classifications we made, how many of them are correct.

**Recall**: rate of detected spams. If from 100 mails there are 80 spams, and we classify correctly 70, the recall is 70/80. It is how many of the mails that are true spams are correctly classified.

*Precision* = TP / TP+FP

*Recall* = TP / TP+FN

So for a spam filter, we want to maximize the recall.

# Data gathering

### Downloading

In [43]:
import os
import tarfile
import urllib.request
import numpy as np


DOWNLOAD_PATH = "datasets"

SPAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2'
SPAM_2_URL = 'https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2'
HAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2'

def fetch_spam_data(spam_url=SPAM_URL, spam2_url=SPAM_2_URL, ham_url=HAM_URL, download_path=DOWNLOAD_PATH):
    if not os.path.isdir(download_path):
        os.makedirs(download_path)
    for url in (spam_url, spam2_url, ham_url):
        filename = url.split('/')[-1]
        path = os.path.join(download_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        with tarfile.open(path) as tar:
            tar.extractall(path=download_path)

fetch_spam_data()

### Loading data

In [44]:
from os import listdir
import pandas as pd

SPAM_FOLDER = os.path.join(DOWNLOAD_PATH, 'spam')
SPAM2_FOLDER = os.path.join(DOWNLOAD_PATH, 'spam_2')
HAM_FOLDER = os.path.join(DOWNLOAD_PATH, 'easy_ham')

exclude = ['0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1']

def load_spam_data(spam_folder=SPAM_FOLDER,  spam2_folder=SPAM2_FOLDER, ham_folder=HAM_FOLDER):
    data = []
    for filename in listdir(spam_folder):
        if filename in exclude:
            continue
        with open(os.path.join(spam_folder, filename), 'rb') as f:
            text = f.read().decode('latin-1')
            data.append((text, 1))
    for filename in listdir(spam2_folder):
        with open(os.path.join(spam2_folder, filename), 'rb') as f:
            text = f.read().decode('latin-1')
            data.append((text, 1))
    for filename in listdir(ham_folder):
        with open(os.path.join(ham_folder, filename), 'rb') as f:
            text = f.read().decode('latin-1')
            data.append((text, 0))       

    spam_df = pd.DataFrame(data, columns=['text', 'spam'])
    return spam_df

spam_df = load_spam_data()
# save in csv
spam_df.to_csv(os.path.join(DOWNLOAD_PATH, 'spam.csv'), index=False)

In [45]:
spam_df.head()

Unnamed: 0,text,spam
0,From 12a1mailbot1@web.de Thu Aug 22 13:17:22 ...,1
1,From ilug-admin@linux.ie Thu Aug 22 13:27:39 ...,1
2,From sabrina@mx3.1premio.com Thu Aug 22 14:44...,1
3,From wsup@playful.com Thu Aug 22 16:17:00 200...,1
4,From social-admin@linux.ie Thu Aug 22 16:37:3...,1


In [46]:
print(spam_df['spam'].value_counts())

spam
0    2551
1    1897
Name: count, dtype: int64


# Data preparation

## Train test split

In [47]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(spam_df, spam_df['spam']):
    strat_train_set = spam_df.iloc[train_index]
    strat_test_set = spam_df.iloc[test_index]

print(f"Training set size: {len(strat_train_set)}")
print(f"Test set size: {len(strat_test_set)}")
print(f"Training set spam ratio: {strat_train_set['spam'].value_counts() / len(strat_train_set)}")
print(f"Test set spam ratio: {strat_test_set['spam'].value_counts() / len(strat_test_set)}")

Training set size: 3558
Test set size: 890
Training set spam ratio: spam
0    0.573637
1    0.426363
Name: count, dtype: float64
Test set spam ratio: spam
0    0.573034
1    0.426966
Name: count, dtype: float64


## Feature engineering

I am going to create two types of new features:
- stats about the email: proportion of upper/lower chars, num of exclamations, question marks, etc
- vector of processed words

### Mail stats

In [48]:
# Function to calc stats from sender
import re
def sender_stats(text):
    sender = re.findall(r'From: (.*)', text)[0]
    if sender == '':
        return np.array([0, 0, 0])
    sender_num_rate = len([c for c in sender if c.isdigit()]) / len(sender)
    sender_upper_rate = len([c for c in sender if c.isupper()]) / len(sender)
    sender_exclamation_rate = len([c for c in sender if c == '!']) / len(sender)

    return np.array([sender_num_rate, sender_upper_rate, sender_exclamation_rate])

# Function to calc stats from subject

CURRENCY_SYMBOLS = ['$', '£', '€', '¥', '₹', '₽', '₩', '₴', '₱', '₲', '₪', '₫', '₵', '₭', '₦', '₸', '₼', '₡', '₢', '₯', '₠', '₧', '₣', '₤', '₶', '₸', '₺', '₼', '₽', '₾', '₿']

def subject_stats(text):
    subject = re.findall(r'Subject: (.*)', text)[0]
    if subject == '':
        return np.array([0, 0, 0, 0])
    subject_num_rate = len([c for c in subject if c.isdigit()]) / len(subject)
    subject_upper_rate = len([c for c in subject if c.isupper()]) / len(subject)
    subject_currency_rate = len([c for c in subject if c in CURRENCY_SYMBOLS]) / len(subject)
    subject_exclamation_rate = len([c for c in subject if c == '!']) / len(subject)
    
    return np.array([subject_num_rate, subject_upper_rate, subject_currency_rate, subject_exclamation_rate])

In [49]:
# Create a custom Transformer that will be called from a ColumnTransformer
# This custom transformer will ingest a Pandas Df of dimmension (n, 1) and return a numpy array of dimmension (n, x), x = number of new features

from sklearn.base import BaseEstimator, TransformerMixin
class AttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # X is a Pandas Df of dimmension (n, 1)
        # Each new feature is a np array of dimmension (n, 1)

        # If len in words is 0, return np array of zeros
        avg_word_len = X['text'].apply(lambda x: np.divide(len(x), len(x.split()), out=np.zeros_like(float(len(x))), where=len(x.split())!=0)).values.reshape(-1, 1)
        rate_upper = X['text'].apply(lambda x: np.divide(len([c for c in x if c.isupper()]), len(x), out=np.zeros_like(float(len(x))), where=len(x)!=0)).values.reshape(-1, 1)
        rate_exclamation = X['text'].apply(lambda x: np.divide(len([c for c in x if c == '!']), len(x), out=np.zeros_like(float(len(x))), where=len(x)!=0)).values.reshape(-1, 1)
        rate_question = X['text'].apply(lambda x: np.divide(len([c for c in x if c == '?']), len(x), out=np.zeros_like(float(len(x))), where=len(x)!=0)).values.reshape(-1, 1)
        sender_num_rate, sender_upper_rate, sender_exclamation_rate = np.array(list(X['text'].apply(sender_stats))).T
        subject_num_rate, subject_upper_rate, subject_currency_rate, subject_exclamation_rate = np.array(list(X['text'].apply(subject_stats))).T


        return np.c_[
            avg_word_len, rate_upper, rate_exclamation, rate_question, 
            sender_num_rate, sender_upper_rate, sender_exclamation_rate,
            subject_num_rate, subject_upper_rate, subject_currency_rate, subject_exclamation_rate
        ]

In [50]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

stats_preprocessing_pipeline = Pipeline([
    ('add_stats', AttributesAdder()),
    ('std_scaler', StandardScaler())
])

# Create column transformer that will apply the stats_preprocessing_pipeline to the text column
stats_column_transformer = ColumnTransformer([
    ('stats', stats_preprocessing_pipeline, ['text'])
], remainder='passthrough')

In [51]:
# Test on sample data
sample = strat_train_set.sample(5)
print(sample)

sample_pr = stats_column_transformer.fit_transform(sample)
print(sample_pr)

                                                   text  spam
113   From thisisagreatfreepornmovie@framesetup.com ...     1
1746  From eighbor2k@hotmail.com  Tue Aug  6 10:56:5...     1
4253  From rssfeeds@jmason.org  Fri Oct  4 11:02:01 ...     0
2981  From exmh-workers-admin@redhat.com  Wed Sep 11...     0
2617  From fork-admin@xent.com  Fri Sep 20 11:32:42 ...     0
[[ 1.4744794  -1.88372521 -0.64264869 -1.14362847 -0.5        -0.561505
   0.         -0.13054181 -1.21175003  0.          0.          1.        ]
 [-0.1889133   0.33962339  1.93929861  0.11491267  2.         -0.90933209
   0.          1.96499771 -1.21175003  0.          0.          1.        ]
 [-0.6542118  -0.06082571 -0.64264869 -1.14362847 -0.5        -0.90933209
   0.         -0.6114853   0.72809218  0.          0.          0.        ]
 [ 0.72425675  0.65306607 -0.01135254  1.01039893 -0.5         0.87517209
   0.         -0.6114853   0.62861309  0.          0.          0.        ]
 [-1.35561106  0.95186145 -0.64264

#### Word Vectorizer

In [52]:
import re
from nltk.stem import PorterStemmer

def spamEmailPreprocessor(
    email_text,
    stop_words_set,
    remove_html_header=True,
    remove_punctuation=True,
    stem=True
):
    currency_symbols = ['$', '£', '€', '¥', '₹', '₽', '₩', '₴', '₱', '₲', '₪', '₫', '₵', '₭', '₦', '₸', '₼', '₡', '₢', '₯', '₠', '₧', '₣', '₤', '₶', '₸', '₺', '₼', '₽', '₾', '₿']
    punctuation_symbols = ['!', '"', '#', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', 
                   '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

    if remove_html_header:
        # HTML header is separated from the rest of the email by at least one blank line
        split_email = email_text.split('\n\n', 1)
        email_text = split_email[1] if len(split_email) > 1 else split_email[0]

    email_text = email_text.lower()
    email_text = ' '.join([word.lower() for word in email_text.split() if word.lower() not in stop_words_set])

    if remove_punctuation:
        for symbol in punctuation_symbols:
            email_text = email_text.replace(symbol, ' ')

    if stem:
        stemmer = PorterStemmer()
        email_text = ' '.join([stemmer.stem(word) for word in email_text.split()])

    # Replace numbers with NUMBER xx or xx.xx
    email_text = re.sub(r'\d+\.\d+', ' NUMBER ', email_text)
    email_text = re.sub(r'\d+', ' NUMBER ', email_text)

    # Replace currency symbols with CURRENCY
    for symbol in currency_symbols:
        email_text = email_text.replace(symbol, ' CURRENCY ')

    # Replace email addresses with EMAIL
    email_text = re.sub(r'\S+@\S+', ' EMAIL ', email_text)

    # Replace URLs with URL
    email_text = re.sub(r'http\S+', ' URL ', email_text)

    # Replace IP addresses with IP
    email_text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' IP ', email_text)
    
    return email_text

In [53]:
from nltk.corpus import stopwords
stop_words_en = set(stopwords.words('english'))

In [54]:
# Test on rand email
rand_email = strat_train_set.sample(1, random_state=41)['text'].values[0]
# Preprocess
rand_email_pr = spamEmailPreprocessor(rand_email, stop_words_en)
print(rand_email_pr)

got bad credit fix it ye fix credit report easi use softwar attorney fee simpli download way good credit get rid neg credit report easili nucredit avail two version nucredit individu use  CURRENCY  NUMBER   NUMBER  credit pro use  NUMBER  peopl  CURRENCY  NUMBER   NUMBER  follow link start repair credit today http  NUMBER   NUMBER   NUMBER   NUMBER  inbox NUMBER  wish remov mail list pleas click link indic address receiv origin email properli remov list mailto nucredit NUMBER  yahoo com subject remov


In [55]:
# Create a custom Transformer, which Transform will apply the spamEmailPreprocessor function to the text column

stop_words_set = set(stopwords.words('english'))

class EmailPreprocessorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, remove_html_header=True, remove_punctuation=True, stem=True):
        self.stop_words_set = stop_words_set
        self.remove_html_header = remove_html_header
        self.remove_punctuation = remove_punctuation
        self.stem = stem

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_pr = X['text'].apply(
            lambda x: spamEmailPreprocessor(
                x, 
                self.stop_words_set, 
                self.remove_html_header, 
                self.remove_punctuation, 
                self.stem
            )
        ).values.reshape(-1, 1)

        return X_pr

In [56]:
# Vectorizer custom transformer

from sklearn.feature_extraction.text import CountVectorizer

class VectorizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, max_features=1000):
        self.max_features = max_features

    def fit(self, X, y=None):
        self.vectorizer = CountVectorizer(max_features=self.max_features)
        self.vectorizer.fit(X.ravel())
        return self

    def transform(self, X, y=None):
        return self.vectorizer.transform(X.ravel())

In [57]:
preprocessor = EmailPreprocessorTransformer()

text_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('vectorizer', VectorizerTransformer())
])

In [58]:
# Final pipeline

from sklearn.pipeline import FeatureUnion

full_pipeline = ColumnTransformer([
    ('stats', stats_preprocessing_pipeline, ['text']),
    ('text', text_pipeline, ['text'])
], remainder='passthrough')

# Test on sample data
sample = strat_train_set.sample(5)
# print(sample)

sample_pr = full_pipeline.fit_transform(sample)
print(sample_pr.shape)

(5, 400)


# Model training

## Choosing a classifier

In [59]:
# Number of samples
spam_df.shape

(4448, 2)

Linear SVC -> Naive Bayes

### Linear SVC

In [60]:
strat_train_set_pr = full_pipeline.fit_transform(strat_train_set)

KeyboardInterrupt: 

In [None]:
X_train_pr = strat_train_set_pr[:, :-1]
print(X_train_pr.shape)
print(y_train.shape)

(3558, 1011)
(3558, 1)


In [None]:
# SVC Classifier with GridSearchCV

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# Those are hyperparameters that will be tuned for the SVC

pipeline_with_svc = Pipeline([
    ('preprocessing', full_pipeline),
    ('svc', SVC())
])

param_grid = [
    {
        'preprocessing__text__preprocessor__remove_html_header': [True, False], 
        'preprocessing__text__preprocessor__remove_punctuation': [True, False], 
        'preprocessing__text__preprocessor__stem': [True, False], 
        'preprocessing__text__vectorizer__max_features': [1000, 2000, 3000, 4000, 5000], 
        'svc__kernel': ['linear', 'rbf'], 
        'svc__C': [0.1, 1, 10, 100, 1000]
    }
]

svc_clf = SVC()

grid_search = GridSearchCV(pipeline_with_svc, param_grid, cv=5, scoring='accuracy', verbose=10, n_jobs=-1)

grid_search.fit(strat_train_set_pr, strat_train_set['spam'])

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


KeyboardInterrupt: 