In [1]:
## Machine Learning Online Class
#  Exercise 6 | Spam Classification with SVMs
#
#  Instructions
#  ------------
# 
#  This file contains code that helps you get started on the
#  exercise. You will need to complete the following functions:
#
#     gaussianKernel.m
#     dataset3Params.m
#     processEmail.m
#     emailFeatures.m
#
#  For this exercise, you will not need to change any code in this file,
#  or any other files other than those mentioned above.
#
import numpy as np
import scipy.io
from sklearn import svm
from collections import OrderedDict

from processEmail import processEmail
from emailFeatures import emailFeatures
from getVocabList import getVocabList


In [2]:
 ## ==================== Part 1: Email Preprocessing ====================
#  To use an SVM to classify emails into Spam vs. Non-Spam, you first need
#  to convert each email into a vector of features. In this part, you will
#  implement the preprocessing steps for each email. You should
#  complete the code in processEmail.m to produce a word indices vector
#  for a given email.

print('Preprocessing sample email (emailSample1.txt)')

# Extract Features
# Read the sample email inside a context manager so the file handle is
# closed as soon as the lines have been loaded, even if reading fails.
with open('emailSample1.txt', 'r') as file:
    file_contents = file.readlines()

# The lines are joined back together so that processEmail is handed the
# whole email as a single string.
word_indices = processEmail(''.join(file_contents))

# Print Stats
print('Word Indices: ')
print(word_indices)



Preprocessing sample email (emailSample1.txt)
==== Processed Email ====

anyon
know
how
much
it
cost
to
host
a
web
portal
well
it
depend
on
how
mani
visitor
your
expect
thi
can
be
anywher
from
less
than
number
buck
a
month
to
a
coupl
of
dollar
number
you
should
checkout
httpaddr
or
perhap
amazon
ecnumb
if
your
run
someth
big
to
unsubscrib
yourself
from
thi
mail
list
send
an
email
to
emailaddr

Word Indices: 
[85, 915, 793, 1076, 882, 369, 1698, 789, 1821, 1830, 882, 430, 1170, 793, 1001, 1894, 591, 1675, 237, 161, 88, 687, 944, 1662, 1119, 1061, 1698, 374, 1161, 476, 1119, 1892, 1509, 798, 1181, 1236, 809, 1894, 1439, 1546, 180, 1698, 1757, 1895, 687, 1675, 991, 960, 1476, 70, 529, 1698, 530]


In [3]:
## ==================== Part 2: Feature Extraction ====================
#  Now, you will convert each email into a vector of features in R^n.
#  emailFeatures is called with the word indices of the sample email and
#  its result is used below as the feature vector for that email.

print('Extracting features from sample email (emailSample1.txt)')

# Extract Features
features = emailFeatures(word_indices)

# Print Stats
# np.count_nonzero counts the True entries of the mask in a single
# vectorised call and returns a plain int whatever the shape of `features`.
# The builtin sum() walks the array element by element and only yields a
# scalar when the array is 1-D.
print('Length of feature vector: %d' % features.size)
print('Number of non-zero entries: %d' % np.count_nonzero(features > 0))

Extracting features from sample email (emailSample1.txt)
Length of feature vector: 1899
Number of non-zero entries: 44


In [4]:
## =========== Part 3: Train Linear SVM for Spam Classification ========
#  Fit a linear-kernel SVM on the spam training set and report how well it
#  reproduces the training labels.

# Load the Spam Email dataset; X holds the features and y the labels,
# flattened to one dimension.
data = scipy.io.loadmat('spamTrain.mat')
X, y = data['X'], data['y'].flatten()

print('Training Linear SVM (Spam Classification)')
print('(this may take 1 to 2 minutes) ...')

# Regularisation strength handed to the classifier.
C = 0.1

# NOTE(review): max_iter=200 is a hard cap on solver iterations, so training
# may stop before the tolerance is reached -- confirm the cap is intended.
clf = svm.SVC(kernel='linear', C=C, max_iter=200, tol=1e-3)
clf.fit(X, y)
model = clf  # fit() returns the estimator itself, so both names refer to it

# Mean accuracy on the same data the model was trained on.
acc = model.score(X, y)

print('Training Accuracy: %f' % acc)


Training Linear SVM (Spam Classification)
(this may take 1 to 2 minutes) ...




Training Accuracy: 0.996000


In [5]:
## =================== Part 4: Test Spam Classification ================
#  After training the classifier, we can evaluate it on a test set. We have
#  included a test set in spamTest.mat

# Load the test dataset
# You will have Xtest, ytest in your environment
data = scipy.io.loadmat('spamTest.mat')
Xtest = data['Xtest']
# Flatten the labels to one dimension, exactly as the training labels `y`
# are prepared, so both label vectors reach the classifier in the same form.
ytest = data['ytest'].flatten()

print('Evaluating the trained Linear SVM on a test set ...')
# Mean accuracy of the trained model on the held-out test set.
acc = model.score(Xtest, ytest)

print('Test Accuracy: %f' % acc)



Evaluating the trained Linear SVM on a test set ...
Test Accuracy: 0.981000


In [6]:
## ================= Part 5: Top Predictors of Spam ====================
#  The model is a linear SVM, so its learned weights can be inspected
#  directly: the words carrying the largest weights are the ones the
#  classifier treats as the strongest indicators of spam.
#
# Look at the words with the largest weights, i.e. the words most strongly
# associated with spam.

# Pair every weight with its position, then order the pairs by weight,
# largest first.
ranked = sorted(enumerate(model.coef_[0]), key=lambda pair: pair[1], reverse=True)
idx = [position for position, _ in ranked]  # positions start from 0
weight = [value for _, value in ranked]

vocabList = getVocabList()

print('Top predictors of spam: ')
for rank in range(15):
    print(' %-15s (%f)' % (vocabList[idx[rank]], weight[rank]))

#print 'Program paused. Press enter to continue.'


Top predictors of spam: 
 our             (0.391337)
 click           (0.379293)
 remov           (0.365469)
 visit           (0.335558)
 guarante        (0.327508)
 basenumb        (0.292663)
 dollar          (0.258750)
 bodi            (0.233422)
 ga              (0.222452)
 below           (0.212933)
 price           (0.209824)
 most            (0.201052)
 will            (0.199862)
 al              (0.198814)
 am              (0.189730)


In [7]:
## =================== Part 6: Try Your Own Emails =====================
#  Now that you've trained the spam classifier, you can use it on your own
#  emails! In the starter code, we have included spamSample1.txt,
#  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
#  The following code reads in one of these emails and then uses your
#  learned SVM classifier to determine whether the email is Spam or
#  Not Spam

# Set the file to be read in (change this to spamSample2.txt,
# emailSample1.txt or emailSample2.txt to see different predictions on
# different emails types). Try your own emails as well!
filename = 'myEmail.txt'

# Read and predict
# The context manager closes the file handle as soon as the lines have been
# loaded, even if reading fails.
with open(filename) as file:
    file_contents = file.readlines()

# Same pipeline as for the sample email: raw text -> word indices -> features.
word_indices = processEmail(''.join(file_contents))
x = emailFeatures(word_indices)

# The single feature vector is wrapped in a list so that predict receives a
# batch containing one sample; p[0] is the prediction for that sample.
p = model.predict([x])

print('Processed %s\n\nSpam Classification: %d' % (filename, p[0]))
print('(1 indicates spam, 0 indicates not spam)')


==== Processed Email ====

dear
reader
from
polit
to
busi
and
financ
from
scienc
and
technolog
to
cultur
and
the
art
our
write
help
our
subscrib
explor
the
world
differ
with
full
access
to
the
economist
app
and
economistcom
you
can
cut
through
the
nois
and
give
yourself
the
advantag
of
an
independ
worldviewani
time
anywher
now
you
can
enjoi
an
immers
experi
of
our
awardwin
journal
with
number
off
an
annual
digit
subscript

Processed myEmail.txt

Spam Classification: 1
(1 indicates spam, 0 indicates not spam)
