# 6.2 Spam Classification with SVMs 
## Part 1: Email Preprocessing

In [1]:
# import key libraries (remember to pip install numpy etc. first)
import numpy as np
import sys
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
print('\nPreprocessing sample email (emailSample1.txt)\n')
PATH="./data/"
DATA="emailSample1.txt"

with open(f'{PATH}{DATA}', 'r') as email:
    file_contents=email.read()#.replace('\n', '')
    email.close()


Preprocessing sample email (emailSample1.txt)



In [3]:
def getVocabList():
    #GETVOCABLIST reads the fixed vocabulary list in vocab.txt and returns a
    #cell array of the words
    #   vocabList = GETVOCABLIST() reads the fixed vocabulary list in vocab.txt 
    #   and returns a cell array of the words in vocabList.
    DATA="vocab.txt"
    vocablist = []
    with open(f'{PATH}{DATA}', 'r') as vocabfile:
        lines=vocabfile.readlines()#.replace('\n','')
        vocabfile.close()
    for line in lines:
        vocablist += [line[line.find('\t')+1:-1]]
    return np.array(vocablist)

In [4]:
getVocabList()

array(['aa', 'ab', 'abil', ..., 'zdnet', 'zero', 'zip'], dtype='<U40')

In [5]:
import re # regular expressions

In [6]:
def regexprep (email, reg, repl):
    regex = re.compile(reg, re.IGNORECASE)
    email_contents, count = regex.subn(repl, email)
    return email_contents

In [7]:
def processEmail(email_contents, show=False):
    #PROCESSEMAIL preprocesses a the body of an email and
    #returns a list of word_indices
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses
    #   the body of an email and returns a list of indices of the
    #   words contained in the email.
    #

    if show:
        print (f"=======ORIGINAL EMAIL BEG==========\n{email_contents}\n=======ORIGINAL EMAIL END==========")
    # Load Vocabulary
    vocabList = getVocabList()
    
    # Init return value
    word_indices = np.array([])

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = strfind(email_contents, ([char(10) char(10)]));
    # email_contents = email_contents(hdrstart(1):end);

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and replace
    # and does not have any < or > in the tag it with a space

    email_contents = regexprep(email_contents, r'<[^<>]+>', ' ')

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = regexprep(email_contents, r'[0-9]+', 'number')

    # Handle URLS
    # Look for strings starting with http:// or https://

    email_contents = regexprep(email_contents, r'(http|https)://[^\s]*', 'httpaddr')

    # Handle Email Addresses
    # 3 Look for strings with @ in the middle
    email_contents = regexprep(email_contents, r'[^\s]+@[^\s]+', 'emailaddr');

    # Handle $ sign
    email_contents = regexprep(email_contents, r'[$]+', 'dollar');

    # Output the email to screen as well
    if show:
        print (f"=======PROCESSED EMAIL BEG==========")

    # Process file
    l = 0
    
    # Tokenize and also get rid of any punctuation
    regex = re.compile(r'[][ @$/#.-:&*+=?!(){},''">_<;%\n\t]', re.IGNORECASE)
    string_splits = re.split(regex, email_contents) 

    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    count = 0

    for str in string_splits:

        # Remove any non alphanumeric characters
        str = regexprep(str, '[^a-zA-Z0-9]', '')

        # Stem the word
        # (the porterStemmer sometimes has issues, so we use a try catch block)
        str=ps.stem(str)

        # Skip the word if it is too short
        if len(str) < 1:
           continue

        #wordindx = find(ismember(vocabList, str)==1)
        wordindx = np.argwhere(np.in1d(vocabList, str)).ravel()
        #print(f'word_indices{word_indices} \n wordindx{wordindx}')

        if len(wordindx)>0:
            word_indices = np.r_[word_indices, wordindx]

        # Print to screen, ensuring that the output lines are not too long
        if show:
            if (l + len(str) + 1) > 70:
                print()
                l = 0
            print(f'{str} ', end='')
            l = l + len(str) + 1
    if show:
        print (f"\n=======PROCESSED EMAIL END==========")
        print (word_indices.astype(int))

    return word_indices.astype(int)

In [8]:
word_indices  = processEmail(file_contents, show=True)

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com


anyon know how much it cost to host a web portal well it depend on 
how mani visitor your expect thi can be anywher from less than number 
buck a month to a coupl of dollarnumb you should checkout httpaddr or 
perhap amazon ecnumb if your run someth big to unsubscrib yourself 
from thi mail list send an email to emailaddr 
[  85  915  793 1076  882  369 1698  789 1821 1830  882  430 1170  793
 1001 1894  591 1675  237  161   88  687  944 1662 1119 1061 1698  374
 1161  478 1892 1509  798 1181 1236  809 1894 1439 1546  180 1698 1757
 1895  687 1675  991  960 1476   70  529 1698  530]


## Part 2: Feature Extraction

In [9]:
len(getVocabList())

1899

In [10]:
def emailFeatures(word_indices):
    #EMAILFEATURES takes in a word_indices vector and produces a feature vector
    #from the word indices
    #   x = EMAILFEATURES(word_indices) takes in a word_indices vector and
    #   produces a feature vector from the word indices.

    # Total number of words in the dictionary
    n = len(getVocabList())
    
    # binary feature vector
    x = np.zeros((n, 1))
    x[word_indices]=1

    return x

In [11]:
features = emailFeatures(word_indices)

# Print Stats
print(f'Length of feature vector: {len(features)}')
print(f'Number of non-zero entries: {np.sum(features > 0)}')
print('Program paused. Press enter to continue.\n')

Length of feature vector: 1899
Number of non-zero entries: 44
Program paused. Press enter to continue.



## Part 3: Train Linear SVM for Spam Classification

In [12]:
# import csv data
PATH="./data/"
DATA="spamTrain.mat"

import scipy.io
mat = scipy.io.loadmat(f'{PATH}{DATA}') # training data stored in arrays X, y
X = mat['X']
y = mat['y']

print('\nTraining Linear SVM (Spam Classification)\n')
print('(this may take 1 to 2 minutes) ...\n')

from sklearn import svm


C = 0.1
#model = svmTrain(X, y, C, @linearKernel);
clf = svm.SVC(C, kernel='linear', tol=1e-3, max_iter=10000) # note using svm built in linear kernel
clf.fit(X, y.ravel())  

p = clf.predict(X)
#p = svmPredict(model, X);

print(f'Training Accuracy: {np.mean((p.ravel() == y.ravel())*1.0) * 100}')


Training Linear SVM (Spam Classification)

(this may take 1 to 2 minutes) ...

Training Accuracy: 99.825


## Part 4: Test Spam Classification

In [13]:
# import csv data
PATH="./data/"
DATA="spamTest.mat"

import scipy.io
mat = scipy.io.loadmat(f'{PATH}{DATA}') # training data stored in arrays X, y
Xtest = mat['Xtest']
ytest = mat['ytest']

In [14]:
print('\nEvaluating the trained Linear SVM on a test set ...\n')

p = clf.predict(Xtest)

print(f'Test Accuracy: {np.mean((p.ravel() == ytest.ravel())*1.0) * 100}');


Evaluating the trained Linear SVM on a test set ...

Test Accuracy: 98.9


## Part 5: Top Predictors of Spam

In [50]:
idx = np.argsort(clf.coef_.ravel())[::-1] # Need ::-1 as want descending, so need to invert argsort result
vocab = getVocabList()
vocab[idx[0:10]]

weight = clf.coef_.ravel()[idx]

for i in range(15):
    print(f' {vocab[idx[i]]:15}({weight[i]:.6f})')

 our            (0.500614)
 click          (0.465916)
 remov          (0.422869)
 guarante       (0.383622)
 visit          (0.367710)
 basenumb       (0.345064)
 dollar         (0.323632)
 will           (0.269724)
 price          (0.267298)
 pleas          (0.261169)
 most           (0.257298)
 nbsp           (0.253941)
 lo             (0.253467)
 ga             (0.248297)
 hour           (0.246404)


## Part 6: Try Your Own Emails

In [64]:
print('\nPreprocessing sample email (emailSample1.txt)\n')
PATH="./data/"
DATA="spamSample1.txt"

with open(f'{PATH}{DATA}', 'r') as email:
    file_contents=email.read()#.replace('\n', '')
    email.close()


word_indices  = processEmail(file_contents, show=True)
x = emailFeatures(word_indices).T
p = clf.predict(x)

print(f'\n\n======SPAM CLASSIFIER=========\nProcessed {DATA}\n\nSpam Classification: {p}\n')
print('(1 indicates spam, 0 indicates not spam)\n\n')


Preprocessing sample email (emailSample1.txt)

Do You Want To Make $1000 Or More Per Week?

 

If you are a motivated and qualified individual - I 
will personally demonstrate to you a system that will 
make you $1,000 per week or more! This is NOT mlm.

 

Call our 24 hour pre-recorded number to get the 
details.  

 

000-456-789

 

I need people who want to make serious money.  Make 
the call and get the facts. 

Invest 2 minutes in yourself now!

 

000-456-789

 

Looking forward to your call and I will introduce you 
to people like yourself who
are currently making $10,000 plus per week!

 

000-456-789



3484lJGv6-241lEaN9080lRmS6-271WxHo7524qiyT5-438rjUv5615hQcf0-662eiDB9057dMtVl72


do you want to make dollarnumb or more per week if you are a motiv 
and qualifi individu i will person demonstr to you a system that will 
make you dollarnumb number per week or more thi is not mlm call our 
number hour prerecord number to get the detail numbernumbernumb i 
need peopl who want t