# Spam filter using a Support Vector Machine Model 
In this notebook, we will try to develop a machine learning model that is able to classify an email as spam or not-spam. This problem combines two aspects of machine learning techniques: Natural Language Processing and Classification (binary in this case).

**Import section**

In [241]:
from sklearn.feature_extraction.text import TfidfVectorizer
from html.parser import HTMLParser
import re
import os
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import pickle
from sklearn.svm import SVR
import time
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm 
import re

## Helper functions
Here we define some useful functions that will help us process the data.

In [3]:
# Extends HTMLParser to adapt it to our data
class HtmlHrefParser(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def handle_starttag(self, tag, attrs):
        if(tag=='a'):
            for (att, val) in attrs:
                if (att=='href'):
                    self.fed.append(' hreflink ')
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    """Uses the HtmlHrefParser class to format the email text"""
    s = HtmlHrefParser()
    s.feed(html)
    return s.get_data()

# Words in the english language
word_list = list(set(words.words()))
    
def read_email(filename):
    """Reads the data from a file, strips the tags using 'strip_tags' function, and returns the email text and the email vocabulary tokens"""
    vocab = []
    with open(filename, 'r') as File:  
        try:
            email = re.sub(r'[^a-zA-Z]', ' ', strip_tags(re.sub(r'http\S+', ' hreflink ', File.read()))).strip().lower()
            for word in word_tokenize(email):
                if (len(word) >= 2) and (word in word_list):
                    vocab.append(word)
            return email, vocab
        except UnicodeDecodeError:
            print('couldnt read {}'.formate(filename))
            return None, vocab

In [232]:
LIMIT = 500 # limit samples

In [200]:
def read_from_dir(path, label=None):
    """Reads data from a directory using 'read_email' and 'read_unlabeled_email' """
    corpus = []
    vocabulary = []
    corpus_tokenized = []
    count = 0
    labels = []
    files = os.listdir(path)
    max_idx = len(files) if len(files)<LIMIT else LIMIT
    for filename in tqdm(files[:max_idx]):
        if (LIMIT is not None) and (count > LIMIT):
            break

        em, v = read_email(path+filename)
        if(label is not None):
            labels.append(label)

        if (em is None):
            continue

        corpus.append(em)
        corpus_tokenized.append(v)
        count += 1
        
        if(label):
            vocabulary = vocabulary + v

    return corpus, corpus_tokenized, vocabulary, labels

## Reading Data
In this part, we will read the training and testing data and build the training data vocabulary. 

In [20]:
corpus = []
corpus_tokenized = []
vocabulary = []
    
start = time.time()

spam_train_path = './training/spam/'
c, corpus_tokenized, v, labels = read_from_dir(spam_train_path,1)
corpus = corpus + c
vocabulary = vocabulary + v
    
    
ham_train_path = './training/ham/'
c, ct, v, lbls = read_from_dir(ham_train_path,0)
corpus = corpus + c
corpus_tokenized = corpus_tokenized + ct
vocabulary = vocabulary + v
labels = labels + lbls

end = time.time()
print('done in %.2f s' % (end - start))

100%|██████████| 500/500 [36:12<00:00,  2.08s/it]  
100%|██████████| 500/500 [33:09<00:00,  3.90s/it]


done in 4162.30 s


In [21]:
# Removing duplicates
vocabulary = list(set(vocabulary))
print(len(vocabulary))

5366


In [34]:
#labels = np.concatenate((np.ones(LIMIT), np.zeros(LIMIT)), axis=0)

### Reading variables
This is the section we can use to load our processed data from disk that we have saved in another section below

In [None]:
var_disct = {}
vars_path = './vars/'
for filename in tqdm(os.listdir(vars_path)):
    f = open(vars_path+filename, 'wb')
    var_disct[re.sub('\.pckl$', '', filename)] = pickle.load(f)
    f.close()
    
var_disct

In [None]:
# loading train vars
f = open('./vars/corpus.pckl', 'rb')
corpus = pickle.load(f)
f.close()

f = open('./vars/corpus_tokenized.pckl', 'rb')
corpus_tokenized = pickle.load(f)
f.close()

f = open('./vars/vocabulary.pckl', 'rb')
vocabulary = pickle.load(f)
f.close()

f = open('./vars/labels.pckl', 'rb')
labels = pickle.load(f)
f.close()




#CV vars
f = open('./vars/spam_cv_corpus.pckl', 'wb')
labels = pickle.load(f)
f.close()

f = open('./vars/spam_cv_corpus_tokenized.pckl', 'wb')
labels = pickle.load(f)
f.close()

f = open('./vars/ham_cv_corpus.pckl', 'wb')
labels = pickle.load(f)
f.close()

f = open('./vars/ham_cv_corpus_tokenized.pckl', 'wb')
labels = pickle.load(f)
f.close()

#test vars
f = open('./vars/spam_test_corpus.pckl', 'wb')
labels = pickle.load(f)
f.close()

f = open('./vars/spam_test_corpus_tokenized.pckl', 'wb')
labels = pickle.load(f)
f.close()

f = open('./vars/ham_test_corpus.pckl', 'wb')
labels = pickle.load(f)
f.close()

f = open('./vars/ham_test_corpus_tokenized.pckl', 'wb')
labels = pickle.load(f)
f.close()






## Vectorisation with Tf-Idf
This is the section we will vectorize our data using the Tf-Idf technique

In [44]:
#vectorization tf-idf style
vectorizer = TfidfVectorizer(vocabulary=vocabulary)
tfidf = vectorizer.fit_transform(corpus)

In [45]:
print(len(vectorizer.get_feature_names()))

5366


## SVM model
In this part we will try to come up with a model (using cross-validation for parameter search) to classify the data. 

In [212]:
#Trying different params
C_params = [0.1,1,10,100,1000]
gamma_params = [0.01,0.1,1,10]
svr_rbf_array = []

start = time.time()
for C_param in C_params:
    for gamma_param in gamma_params:
        svr_rbf = SVR(kernel='rbf',C=C_param, gamma=gamma_param)
        svr_rbf.fit(tfidf.toarray(), labels) 
        svr_rbf_array.append(svr_rbf)
end = time.time()
print('done in %.2f s' % (end - start))

done in 99.37 s


**Reading cross-validation and test data**

In [233]:
# Cross-validation
ham_cv_corpus = []
spam_cv_corpus = []
ham_cv_path = './cross_validation/ham/'
spam_cv_path = './cross_validation/spam/'

# read ham cross-validation data
ham_cv_corpus, ham_cv_corpus_tokenized, v, lbls = read_from_dir(ham_cv_path)

# read spam cross-validation data
spam_cv_corpus, spam_cv_corpus_tokenized, v, lbls = read_from_dir(spam_cv_path)

100%|██████████| 280/280 [09:16<00:00,  1.65s/it]
100%|██████████| 231/231 [35:35<00:00, 14.13s/it] 


Vetorizing the CV data using TF-IDF

In [236]:
spam_cv_tfidf = vectorizer.fit_transform(spam_cv_corpus)
ham_cv_tfidf = vectorizer.fit_transform(ham_cv_corpus)

**Testing various parameters using the CV data**

In [237]:
preds_spam = []
preds_ham = []
for m in svr_rbf_array:
    pred = m.predict(ham_cv_tfidf.toarray())
    preds_ham.append(sum(num < 0.5 for num in pred)/len(pred))
    pred = m.predict(spam_cv_tfidf.toarray())
    preds_spam.append(sum(num >= 0.5 for num in pred)/len(pred))

In [238]:
zip_preds = zip(preds_ham, preds_spam)
best_svm = svr_rbf_array[np.argmax([x + y for x, y in zip_preds])]
print(best_svm)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.01,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


**Testing our final model on the test data**

In [239]:
# Test
ham_test_corpus = []
spam_test_corpus = []
ham_test_path = './testing/ham/'
spam_test_path = './testing/spam/'

# read ham test data
ham_test_corpus, ham_test_corpus_tokenized, v, lbls = read_from_dir(ham_test_path)
    
# read spam test data
spam_test_corpus, spam_test_corpus_tokenized, v, lbls = read_from_dir(spam_test_path)

100%|██████████| 275/275 [14:40<00:00,  2.33s/it]
100%|██████████| 259/259 [16:57<00:00, 10.15s/it] 


In [240]:
spam_test_tfidf = vectorizer.fit_transform(spam_test_corpus)
ham_test_tfidf = vectorizer.fit_transform(ham_test_corpus)

pred = best_svm.predict(spam_test_tfidf.toarray())
print('performance on spam test set {0:.0%}'.format(sum(num >= 0.5 for num in pred)/len(pred)))

pred = best_svm.predict(ham_test_tfidf.toarray())
print('performance on ham test set {0:.0%}'.format(sum(num < 0.5 for num in pred)/len(pred)))

performance on spam test set 76%
performance on ham test set 68%


### Saving variables
We will save the variables to disk so that we don't have to go through the whole data processing step -which takes a lot of time- the next time we want to run this  notebook on the same data. 

In [None]:
vars_to_save = {'corpus': corpus,
'corpus_tokenized': corpus_tokenized,
'vocabulary': vocabulary,
'labels': labels,
'spam_cv_corpus': spam_cv_corpus,
'spam_cv_corpus_tokenized': spam_cv_corpus_tokenized,
'ham_cv_corpus': ham_cv_corpus,
'ham_cv_corpus_tokenized': ham_cv_corpus_tokenized,
'spam_test_corpus': spam_test_corpus,
'spam_test_corpus_tokenized': spam_test_corpus_tokenized,
'ham_test_corpus': ham_test_corpus,
'ham_test_corpus_tokenized': ham_test_corpus_tokenized}



In [43]:
#train vars
f = open('./vars/corpus.pckl', 'wb')
pickle.dump(corpus, f)
f.close()

f = open('./vars/corpus_tokenized.pckl', 'wb')
pickle.dump(corpus_tokenized, f)
f.close()

f = open('./vars/vocabulary.pckl', 'wb')
pickle.dump(vocabulary, f)
f.close()

f = open('./vars/labels.pckl', 'wb')
pickle.dump(labels, f)
f.close()

#CV vars
f = open('./vars/spam_cv_corpus.pckl', 'wb')
pickle.dump(spam_cv_corpus, f)
f.close()

f = open('./vars/spam_cv_corpus_tokenized.pckl', 'wb')
pickle.dump(spam_cv_corpus_tokenized, f)
f.close()

f = open('./vars/ham_cv_corpus.pckl', 'wb')
pickle.dump(ham_cv_corpus, f)
f.close()

f = open('./vars/ham_cv_corpus_tokenized.pckl', 'wb')
pickle.dump(ham_cv_corpus_tokenized, f)
f.close()

#test vars
f = open('./vars/spam_test_corpus.pckl', 'wb')
pickle.dump(spam_test_corpus, f)
f.close()

f = open('./vars/spam_test_corpus_tokenized.pckl', 'wb')
pickle.dump(spam_test_corpus_tokenized, f)
f.close()

f = open('./vars/ham_test_corpus.pckl', 'wb')
pickle.dump(ham_test_corpus, f)
f.close()

f = open('./vars/ham_test_corpus_tokenized.pckl', 'wb')
pickle.dump(ham_test_corpus_tokenized, f)
f.close()

---
## Using Word2Vec
In this section, we will try to use **Word2Vec** instead of **TF-IDF**

In [219]:
len(min(corpus_tokenized, key=len))

53

In [220]:
len(max(corpus_tokenized, key=len))

6355

In [221]:
len(min(ham_cv_corpus_tokenized, key=len))

119

In [225]:
len(max(ham_cv_corpus_tokenized, key=len))

209

In [226]:
len(min(spam_cv_corpus_tokenized, key=len))

100

In [227]:
len(max(spam_cv_corpus_tokenized, key=len))

339

In [228]:
len(min(ham_test_corpus_tokenized, key=len))

199

In [229]:
len(max(ham_test_corpus_tokenized, key=len))

256

In [230]:
len(min(spam_test_corpus_tokenized, key=len))

57

In [231]:
len(max(spam_test_corpus_tokenized, key=len))

138

In [None]:
word_vec_size = 100
def to_corpus_matrix(crps_tokenized, model, em_limit):
    corpus_tokenized_limit = [(crt[:em_limit] if len(crt)>em_limit else crt) for crt in crps_tokenized]
    corpus_mat = []
    for cs_tn in corpus_tokenized_limit:
        if len(cs_tn)>em_limit:
            print(len(cs_tn))
        mat = [model.wv[w] for w in cs_tn]
        paddig = np.zeros((em_limit-len(cs_tn), 100))
        mat = np.concatenate((mat, paddig), axis=0)
        corpus_mat.append(mat)
    return np.array(corpus_mat)

In [None]:
w2v = Word2Vec(corpus_tokenized, size=word_vec_size, min_count=1)

In [None]:
email_limit = 300
corpus_matrix = to_corpus_matrix(corpus_tokenized, w2v, email_limit)

In [None]:
corpus_matrix.shape

In [None]:
len(corpus_matrix)

In [None]:
#Trying different params
C_params = [0.1,1,10,100,1000]
gamma_params = [0.01,0.1,1,10]
svr_rbf_array = []

start = time.time()
for C_param in C_params:
    for gamma_param in gamma_params:
        svr_rbf = SVR(kernel='rbf',C=C_param, gamma=gamma_param)
        svr_rbf.fit(np.reshape(corpus_matrix, (len(corpus_matrix),-1)), labels) 
        svr_rbf_array.append(svr_rbf)
end = time.time()
print('done in %.2f s' % (end - start))

In [182]:
arr = np.array([
    [
        [3,2],[1,5]
    ],
    [
        [0,9],[10,8]
    ],
    [
        [7,19],[12,6]
    ]
])
arr

array([[[ 3,  2],
        [ 1,  5]],

       [[ 0,  9],
        [10,  8]],

       [[ 7, 19],
        [12,  6]]])

In [183]:
arr.shape

(3, 2, 2)

In [194]:
arr_re = np.reshape(arr, (3,4))
arr_re

array([[ 3,  2,  1,  5],
       [ 0,  9, 10,  8],
       [ 7, 19, 12,  6]])

In [195]:
np.reshape(arr_re, (3,2,2))

array([[[ 3,  2],
        [ 1,  5]],

       [[ 0,  9],
        [10,  8]],

       [[ 7, 19],
        [12,  6]]])

(502, 100)

235892

'uplifting'