In [77]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import re
from bs4 import BeautifulSoup
from sklearn import svm

In [9]:
# Examples from the dataset
!cat ex6/emailSample1.txt

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com



In [10]:
!cat ex6/emailSample2.txt

Folks,
 
my first time posting - have a bit of Unix experience, but am new to Linux.

 
Just got a new PC at home - Dell box with Windows XP. Added a second hard disk
for Linux. Partitioned the disk and have installed Suse 7.2 from CD, which went
fine except it didn't pick up my monitor.
 
I have a Dell branded E151FPp 15" LCD flat panel monitor and a nVidia GeForce4
Ti4200 video card, both of which are probably too new to feature in Suse's default
set. I downloaded a driver from the nVidia website and installed it using RPM.
Then I ran Sax2 (as was recommended in some postings I found on the net), but
it still doesn't feature my video card in the available list. What next?
 
Another problem. I have a Dell branded keyboard and if I hit Caps-Lock twice,
the whole machine crashes (in Linux, not Windows) - even the on/off switch is
inactive, leaving me to reach for the power cable instead.
 
If anyone can help me in any way with these probs., I'd be really grateful -
I

In [70]:
import nltk, nltk.stem.porter
def processEmail(email):
    """Email preprocessing and normalization"""
    
    # Convert to lower-case
    email = email.lower()
    
    # Strip HTML
    email = BeautifulSoup(email, 'html.parser').get_text()
    
    # Normalize numbers
    email = re.sub('[0-9]+','number',email)
    
    # Normalize URLs
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email)
    
    # Normalize emails
    email = re.sub('[^\s]+@[^\s]+', 'emailaddr', email)
    
    # Normalize dollars
    email = re.sub('[$]+', 'dollar', email)
    
    return email

def tokenizeEmail(email):
    """
    Convert email to token list.
    """
    token_list = []
    
    # Pre-processing
    email = processEmail(email)
    
    # Split into tokens
    tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)
    
    # Use porter stemmer to stem individual words in the email
    # Stemmers remove derivational afixes.
    stemmer = nltk.stem.porter.PorterStemmer()
    
    for word in tokens:
        
        # Remove non-alphanumeric characters
        word = re.sub('[^a-zA-Z0-9]', '',word)
        
        # Stem
        word = stemmer.stem(word)
        if len(word) < 1: 
            continue
        
        token_list.append(word)
    
    return token_list

### Vocabulary list

In [66]:
def getVocabList():
    """
    Extract all words in vocabulary and their indices.
    """
    vocabulary = {}
    with open('ex6/vocab.txt') as vocab:
        for line in vocab:
            (index, word) = line.split()
            vocabulary[word] = index
            
    return vocabulary


In [67]:
vocab = getVocabList()

In [68]:
def emailIndices(email, vocabulary):
    """
    Map words (tokens) from the email to indices in the vocabulary.
    """
    # Get email tokens
    email_tokens = tokenizeEmail(email)
    
    # Get email token indices
    indices = [int(vocabulary[word]) for word in email_tokens if word in vocabulary]
    
    return indices

### Extracting features from emails

In [72]:
def emailFeatures(email, vocabulary):
    num_features = len(vocabulary)
    features = np.zeros((num_features,1))
    indices = emailIndices(email, vocabulary)
    for index in indices:
        features[index] = 1
    return features

# Once you have implemented emailFeatures, you should see that the feature
# vector has length 1899 and 45 non-zero entries

email_sample = open( 'ex6/emailSample1.txt', 'r' ).read()
email_features = emailFeatures(email_sample, vocab)

print sum(email_features == 1)
print len(email_features)

[45]
1899


### Training SVM for Spam Classification

In [89]:
# Load training and test data
training_data = sio.loadmat('ex6/spamTrain.mat')
testing_data = sio.loadmat('ex6/spamTest.mat')

# Labels and feature vectors
X, Y = training_data['X'], training_data['y']
Xtest , Ytest = testing_data['Xtest'], testing_data['ytest']

# Train classifier on training data
C = 0.1
model = svm.SVC(C=C, kernel='linear')
model.fit(X,Y.ravel())

# Once the training completes, you should see that the classifier gets,
# a training accuracy of about 99.8% and a test accuracy of about 98.5%

print model.score(X,Y)
print model.score(Xtest, Ytest)

0.99825
0.989


### Top predictors for spam

In [107]:
# Determine what are the words with the highest weights in the classifier 
# = words that are most likely indicators of spam.
sorted_indices = np.argsort( model.coef_, axis=None )[::-1]

vocabulary = {}
with open('ex6/vocab.txt') as vocab:
    for line in vocab:
        (index, word) = line.split()
        vocabulary[int(index)] = word

print "Clearest indicators of spam are:"
for i in range(30):
    idx = sorted_indices[i]
    print '* ' + vocabulary[idx]


Clearest indicators of spam are:
* otherwis
* clearli
* remot
* gt
* visa
* base
* doesn
* wife
* previous
* player
* mortgag
* natur
* ll
* futur
* hot
* air
* cv
* script
* wall
* dollarac
* believ
* entri
* receiv
* numberanumb
* creativ
* multi
* page
* boi
* black
* weblog
