# SVM

This is the code to accompany the Lesson 2 (SVM) mini-project.

Use an SVM to identify emails from the Enron corpus by their authors:

- Sara has label 0
- Chris has label 1

In [1]:
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess



In [2]:
### preprocess() is imported from the email_preprocess module found via ../tools/.
### Its four return values are unpacked here as:
###   features_train, features_test - the features for the training and
###                                   testing datasets, respectively
###   labels_train, labels_test     - the corresponding item labels
###                                   (per the notebook header: Sara = 0, Chris = 1)
features_train, features_test, labels_train, labels_test = preprocess()

no. of Chris training emails: 7936
no. of Sara training emails: 7884


In [3]:
from sklearn.svm import SVC

In [4]:
# Support vector classifier with a linear kernel; only `kernel` is set here,
# so every other hyperparameter keeps the library default.
clf = SVC(kernel='linear')

In [5]:
# Fit the classifier on the training data and report how long the fit took.
t0 = time()
clf.fit(features_train, labels_train)
# A single-argument print(...) emits the same line on Python 2 and Python 3,
# unlike the comma-separated print statement, which is Python-2-only syntax.
print("training time: %s s" % round(time() - t0, 3))

training time: 179.855 s


In [6]:
# Score the fitted classifier on the test data. The timer is started before
# clf.score(...) is evaluated, so the reported time covers the scoring call.
t0 = time()
print('SVM Score: %f' % clf.score(features_test, labels_test))
# Single-argument print(...) keeps the output identical on Python 2 and 3.
print("prediction time: %s s" % round(time() - t0, 3))

SVM Score: 0.984073
prediction time: 17.17 s


### Slicing the training set to 1% of its size

In [7]:
# Keep only the first 1% of the training features and labels.
# `//` makes the floor division explicit: a plain `/` only yields an integer
# slice bound under Python 2's classic division and produces a float (an
# invalid slice index) under true division.
# NOTE: this cell overwrites features_train / labels_train with a slice of
# themselves, so running it again shrinks the data a second time.
features_train = features_train[:len(features_train) // 100]
labels_train = labels_train[:len(labels_train) // 100]

In [8]:
# New linear-kernel classifier instance (replaces the previously fitted `clf`);
# the next cell fits it on the current features_train / labels_train.
clf = SVC(kernel='linear')

In [9]:
# Fit on the current (reduced) training data and report the elapsed time.
t0 = time()
clf.fit(features_train, labels_train)
# Single-argument print(...) works unchanged on both Python 2 and Python 3.
print("training time: %s s" % round(time() - t0, 3))

training time: 0.128 s


In [10]:
# Score the classifier on the test data; the timer spans the clf.score(...)
# call and the print of its result.
t0 = time()
print('SVM Score after slicing: %f' % clf.score(features_test, labels_test))
# Single-argument print(...) keeps the output identical on Python 2 and 3.
print("prediction time: %s s" % round(time() - t0, 3))

SVM Score after slicing: 0.884528
prediction time: 1.008 s


### Using RBF kernel

In [13]:
# Switch to an RBF kernel; only `kernel` is set, so C and the other
# hyperparameters keep the library defaults. Reading the notebook top to
# bottom, features_train / labels_train still hold the 1% slice here.
clf = SVC(kernel='rbf')

In [14]:
# Fit the RBF classifier on the current training data and report the time.
t0 = time()
clf.fit(features_train, labels_train)
# Single-argument print(...) works unchanged on both Python 2 and Python 3.
print("training time: %s s" % round(time() - t0, 3))

training time: 0.156 s


In [15]:
# Score the RBF classifier on the test data; the timer spans the
# clf.score(...) call and the print of its result.
t0 = time()
# The message previously misspelled the kernel name as "rfb".
print('SVM Score after using rbf: %f' % clf.score(features_test, labels_test))
# Single-argument print(...) keeps the output identical on Python 2 and 3.
print("prediction time: %s s" % round(time() - t0, 3))

SVM Score after using rfb: 0.616041
prediction time: 1.183 s


### Optimizing the C parameter

In [18]:
# Sweep the C parameter for the RBF kernel: for each candidate value, fit a
# new classifier, then print the fit time, the test score and the scoring time.
# NOTE(review): every candidate is scored on features_test / labels_test, so
# choosing C from this output tunes the model on the test data. A held-out
# validation split or cross-validation would give an unbiased choice.
for c in [10, 100, 1000, 10000]:
    clf = SVC(kernel='rbf', C=c)

    t0 = time()
    clf.fit(features_train, labels_train)
    # Single-argument print(...) works unchanged on Python 2 and Python 3.
    print("training time: %s s" % round(time() - t0, 3))

    t0 = time()
    print('SVM Score with C=%d: %f' % (c, clf.score(features_test, labels_test)))
    print("prediction time: %s s" % round(time() - t0, 3))

    # Visual separator between candidates.
    print('-' * 50)

training time: 0.147 s
SVM Score with C=10: 0.616041
prediction time: 1.195 s
--------------------------------------------------
training time: 0.109 s
SVM Score with C=100: 0.616041
prediction time: 1.242 s
--------------------------------------------------
training time: 0.098 s
SVM Score with C=1000: 0.821388
prediction time: 1.08 s
--------------------------------------------------
training time: 0.097 s
SVM Score with C=10000: 0.892491
prediction time: 0.92 s
--------------------------------------------------


### Using SVM with RBF kernel and optimized value for C on the full training set

In [21]:
# Call preprocess() again to rebind all four datasets: features_train and
# labels_train were overwritten with a 1% slice earlier in the notebook, so
# this is how the full training set is restored.
features_train, features_test, labels_train, labels_test = preprocess()

no. of Chris training emails: 7936
no. of Sara training emails: 7884


In [22]:
# Fit an RBF-kernel classifier with C=10000 on the current training data, then
# report the fit time, the test score and the scoring time.
c = 10000
clf = SVC(kernel='rbf', C=c)

t0 = time()
clf.fit(features_train, labels_train)
# Single-argument print(...) works unchanged on Python 2 and Python 3,
# unlike the comma-separated print statement.
print("training time: %s s" % round(time() - t0, 3))

# The timer is started before clf.score(...) is evaluated, so the reported
# time covers the scoring call.
t0 = time()
print('The Score of SVM with RBF kernel and C=%d: %f' % (c, clf.score(features_test, labels_test)))
print("prediction time: %s s" % round(time() - t0, 3))

training time: 107.814 s
The Score of SVM with RBF kernel and C=10000: 0.990899
prediction time: 10.829 s


### Finding predicted labels for some of the points in the test set

In [23]:
# Cut the training features and labels down to their first 1% again.
# `//` makes the floor division explicit: a plain `/` only yields an integer
# slice bound under Python 2's classic division and produces a float (an
# invalid slice index) under true division.
# NOTE: the names are rebound to slices of themselves, so re-running this
# cell shrinks the data further each time.
features_train = features_train[:len(features_train) // 100]
labels_train = labels_train[:len(labels_train) // 100]

In [24]:
# Fit an RBF-kernel classifier with C=10000 on the current training data and
# keep the predicted label for every test sample in `predictions`.
c = 10000
clf = SVC(kernel='rbf', C=c)

t0 = time()
clf.fit(features_train, labels_train)
# Single-argument print(...) works unchanged on Python 2 and Python 3,
# unlike the comma-separated print statement.
print("training time: %s s" % round(time() - t0, 3))

t0 = time()
predictions = clf.predict(features_test)
print("prediction time: %s s" % round(time() - t0, 3))

training time: 0.11 s
prediction time: 0.897 s


In [26]:
# Print the predicted author for a few test-set indices.
# Label encoding from the notebook header: 0 is Sara, 1 is Chris.
for point in [10, 26, 50]:
    label = predictions[point]
    author = 'Chris' if label == 1 else 'Sara'
    # Single-argument print(...) works unchanged on Python 2 and Python 3.
    print('predictions[%d] = %s(%d)' % (point, author, label))

predictions[10] = Chris(1)
predictions[26] = Sara(0)
predictions[50] = Chris(1)


### How many emails are predicted to be from Chris?

In [27]:
# Call preprocess() again to rebind all four datasets: the preceding section
# replaced features_train / labels_train with a 1% slice, so this is how the
# full training set is restored.
features_train, features_test, labels_train, labels_test = preprocess()

no. of Chris training emails: 7936
no. of Sara training emails: 7884


In [28]:
# Fit an RBF-kernel classifier with C=10000 on the current training data and
# keep the predicted label for every test sample in `predictions`.
c = 10000
clf = SVC(kernel='rbf', C=c)

t0 = time()
clf.fit(features_train, labels_train)
# Single-argument print(...) works unchanged on Python 2 and Python 3,
# unlike the comma-separated print statement.
print("training time: %s s" % round(time() - t0, 3))

t0 = time()
predictions = clf.predict(features_test)
print("prediction time: %s s" % round(time() - t0, 3))

training time: 108.859 s
prediction time: 10.84 s


In [29]:
# Count the test emails predicted as label 1 (Chris, per the notebook header).
# `predictions == 1` is an element-wise boolean mask; its own .sum() counts the
# True entries without iterating element by element in Python.
# Single-argument print(...) works unchanged on Python 2 and Python 3.
print('Number of Chris emails predicted: %d' % (predictions == 1).sum())

Number of Chris emails predicted: 877
