# 6.2 Spam Classification with SVMs 
## Part 1: Email Preprocessing

In [1]:
# import key libraries (remember to pip install numpy etc. first)
import numpy as np
import sys
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
print('\nPreprocessing sample email (emailSample1.txt)\n')
PATH="./data/"
DATA="emailSample1.txt"

with open(f'{PATH}{DATA}', 'r') as email:
    file_contents=email.read()#.replace('\n', '')
    email.close()


Preprocessing sample email (emailSample1.txt)



In [3]:
file_contents

"> Anyone knows how much it costs to host a web portal ?\n>\nWell, it depends on how many visitors you're expecting.\nThis can be anywhere from less than 10 bucks a month to a couple of $100. \nYou should checkout http://www.rackspace.com/ or perhaps Amazon EC2 \nif youre running something big..\n\nTo unsubscribe yourself from this mailing list, send an email to:\ngroupname-unsubscribe@egroups.com\n\n"

In [117]:
def getVocabList():
    #GETVOCABLIST reads the fixed vocabulary list in vocab.txt and returns a
    #cell array of the words
    #   vocabList = GETVOCABLIST() reads the fixed vocabulary list in vocab.txt 
    #   and returns a cell array of the words in vocabList.
    DATA="vocab.txt"
    vocablist = []
    with open(f'{PATH}{DATA}', 'r') as vocabfile:
        lines=vocabfile.readlines()#.replace('\n','')
        vocabfile.close()
    for line in lines:
        vocablist += [line[line.find('\t')+1:-1]]
    return np.array(vocablist)

In [118]:
getVocabList()

array(['aa', 'ab', 'abil', ..., 'zdnet', 'zero', 'zip'], dtype='<U40')

In [5]:
import re # regular expressions

In [72]:
def regexprep (email, reg, repl):
    regex = re.compile(reg, re.IGNORECASE)
    email_contents, count = regex.subn(repl, email)
    return email_contents

In [161]:
def processEmail(email_contents):
    #PROCESSEMAIL preprocesses a the body of an email and
    #returns a list of word_indices
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses
    #   the body of an email and returns a list of indices of the
    #   words contained in the email.
    #

    # Load Vocabulary
    vocabList = getVocabList()

    # Init return value
    word_indices = np.array([])

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = strfind(email_contents, ([char(10) char(10)]));
    # email_contents = email_contents(hdrstart(1):end);

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and replace
    # and does not have any < or > in the tag it with a space

    email_contents = regexprep(email_contents, r'<[^<>]+>', ' ')

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = regexprep(email_contents, r'[0-9]+', 'number')

    # Handle URLS
    # Look for strings starting with http:// or https://

    email_contents = regexprep(email_contents, r'(http|https)://[^\s]*', 'httpaddr')

    # Handle Email Addresses
    # 3 Look for strings with @ in the middle
    email_contents = regexprep(email_contents, r'[^\s]+@[^\s]+', 'emailaddr');

    # Handle $ sign
    email_contents = regexprep(email_contents, r'[$]+', 'dollar');

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n');

    # Process file
    l = 0
    
    # Tokenize and also get rid of any punctuation
    regex = re.compile(r'[][ @$/#.-:&*+=?!(){},''">_<;%\n\t]', re.IGNORECASE)
    string_splits = re.split(regex, email_contents) 

    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    count = 0
    for str in string_splits:

        # Remove any non alphanumeric characters
        str = regexprep(str, '[^a-zA-Z0-9]', '')

        # Stem the word
        # (the porterStemmer sometimes has issues, so we use a try catch block)
        str=ps.stem(str)

        # Skip the word if it is too short
        if len(str) < 1:
           continue

        #wordindx = find(ismember(vocabList, str)==1)
        wordindx = np.argwhere(np.in1d(vocabList, str)).ravel()
        #print(f'word_indices{word_indices} \n wordindx{wordindx}')

        if len(wordindx)>0:
            word_indices = np.r_[word_indices, wordindx]

        # Print to screen, ensuring that the output lines are not too long
        if (l + len(str) + 1) > 78:
            print('\n')
            l = 0
        print(f'{str} ', end='')
        l = l + len(str) + 1

    return word_indices

In [162]:
word_indices  = processEmail(file_contents)


==== Processed Email ====


anyon know how much it cost to host a web portal well it depend on how mani 

visitor your expect thi can be anywher from less than number buck a month to 

a coupl of dollarnumb you should checkout httpaddr or perhap amazon ecnumb if 

your run someth big to unsubscrib yourself from thi mail list send an email 

to emailaddr 

In [138]:
# Load Vocabulary
vocabList = getVocabList()

# Init return value
word_indices = []

# ========================== Preprocess Email ===========================

# Find the Headers ( \n\n and remove )
# Uncomment the following lines if you are working with raw emails with the
# full headers

# hdrstart = strfind(email_contents, ([char(10) char(10)]));
# email_contents = email_contents(hdrstart(1):end);

# Lower case
email_contents = email_contents.lower()
print("Lower case\n", email_contents)

# Strip all HTML
# Looks for any expression that starts with < and ends with > and replace
# and does not have any < or > in the tag it with a space
email_contents = regexprep(email_contents, r'<[^<>]+>', ' ')
print("Strip All HTML\n", email_contents)

# Handle Numbers
# Look for one or more characters between 0-9
email_contents = regexprep(email_contents, r'[0-9]+', 'number')
print("Handle Numbers\n",email_contents)

# Handle URLS
# Look for strings starting with http:// or https://

email_contents = regexprep(email_contents, r'(http|https)://[^\s]*', 'httpaddr')
print("Handle URLs\n", email_contents)

# Handle Email Addresses
# 3 Look for strings with @ in the middle
email_contents = regexprep(email_contents, r'[^\s]+@[^\s]+', 'emailaddr');
print("Handle email\n", email_contents)

# Handle $ sign
email_contents = regexprep(email_contents, r'[$]+', 'dollar');
print("Handle $\n", email_contents)

# Output the email to screen as well
print('\n==== Processed Email ====\n\n');

# Process file
l = 0


Lower case
 dollarnumber blah number number blah httpaddr    dude > anyone knows how much it costs to host a web portal ?
>
well, it depends on how many visitors you're expecting.
this can be anywhere from less than number bucks a month to a couple of dollarnumber. 
you should checkout httpaddr or perhaps amazon ecnumber 
if youre running something big..

to unsubscribe yourself from this mailing list, send an email to:
emailaddr


Strip All HTML
 dollarnumber blah number number blah httpaddr    dude > anyone knows how much it costs to host a web portal ?
>
well, it depends on how many visitors you're expecting.
this can be anywhere from less than number bucks a month to a couple of dollarnumber. 
you should checkout httpaddr or perhaps amazon ecnumber 
if youre running something big..

to unsubscribe yourself from this mailing list, send an email to:
emailaddr


Handle Numbers
 dollarnumber blah number number blah httpaddr    dude > anyone knows how much it costs to host a web portal 

In [106]:
regex = re.compile(r'[][ @$/#.-:&*+=?!(){},''">_<;%\n\t]', re.IGNORECASE)
re.split(regex, email_contents)

['dollarnumber',
 'blah',
 'number',
 'number',
 'blah',
 'httpaddr',
 '',
 '',
 '',
 'dude',
 '',
 '',
 'anyone',
 'knows',
 'how',
 'much',
 'it',
 'costs',
 'to',
 'host',
 'a',
 'web',
 'portal',
 '',
 '',
 '',
 '',
 'well',
 '',
 'it',
 'depends',
 'on',
 'how',
 'many',
 'visitors',
 "you're",
 'expecting',
 '',
 'this',
 'can',
 'be',
 'anywhere',
 'from',
 'less',
 'than',
 'number',
 'bucks',
 'a',
 'month',
 'to',
 'a',
 'couple',
 'of',
 'dollarnumber',
 '',
 '',
 'you',
 'should',
 'checkout',
 'httpaddr',
 'or',
 'perhaps',
 'amazon',
 'ecnumber',
 '',
 'if',
 'youre',
 'running',
 'something',
 'big',
 '',
 '',
 '',
 'to',
 'unsubscribe',
 'yourself',
 'from',
 'this',
 'mailing',
 'list',
 '',
 'send',
 'an',
 'email',
 'to',
 '',
 'emailaddr',
 '',
 '']

In [94]:

while len(email_contents)!=0:

    # Tokenize and also get rid of any punctuation
    [str, email_contents] = ...
       strtok(email_contents, ...
              [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);

    # Remove any non alphanumeric characters
    str = regexprep(str, '[^a-zA-Z0-9]', '');

    # Stem the word
    # (the porterStemmer sometimes has issues, so we use a try catch block)
    try str = porterStemmer(strtrim(str));
    catch str = ''; continue;
    end;

    # Skip the word if it is too short
    if length(str) < 1:
       continue

    wordindx = find(ismember(vocabList, str)==1);
    if length(wordindx)==1
      word_indices = [word_indices; wordindx];
    end

    # Print to screen, ensuring that the output lines are not too long
    if (l + length(str) + 1) > 78:
        print('\n')
        l = 0
    print(f'{str}')
    l = l + length(str) + 1

return word_indices

SyntaxError: invalid syntax (<ipython-input-94-16b14a71f01a>, line 4)

In [145]:
wordindx = np.argwhere(np.in1d(vocabList, 'how')).ravel()

In [146]:
wordindx.shape, vocabList.shape

((1,), (1899,))

In [157]:
ps = PorterStemmer()

In [158]:
for i in words: 
    print(ps.stem(i))


game
game
game
game


In [159]:
words

['game', 'gaming', 'gamed', 'games']

In [160]:
ps.stem('expecting')

'expect'