In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

In [2]:
from sklearn import preprocessing, metrics, cross_validation
import utils
import scipy.io
import numpy as np
from linear_classifier import LinearSVM_twoclass

# load the SPAM email training dataset

X,y = utils.load_mat('data/spamTrain.mat')
# Re-encode the labels for the SVM: entries equal to 0 become -1, the rest stay +1.
yy = np.ones(y.shape)
yy[y==0] = -1

# load the SPAM email test dataset

test_data = scipy.io.loadmat('data/spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest'].flatten()

##################################################################################
#  YOUR CODE HERE for training the best performing SVM for the data above.       #
#  what should C be? What should num_iters be? Should X be scaled?               #
#  should X be kernelized? What should the learning rate be? What should the     #
#  number of iterations be?                                                      #
##################################################################################

# Candidate grids searched by the model-selection cells below: Cvals is passed
# as `reg` to LinearSVM_twoclass.train, sigma_vals as the Gaussian kernel width.
Cvals = [0.01,0.03,0.1,0.3,1,3,10,30]
sigma_vals = [0.01,0.03,0.1,0.3,1,3,10,30]

# Hold out 20% of the training data for validation.  From here on X / yy refer
# to the remaining 80% only.  The seed is fixed so that the split -- and hence
# every accuracy reported below -- is the same on each Restart & Run All.
# NOTE(review): sklearn.cross_validation was deprecated in favour of
# sklearn.model_selection in later scikit-learn releases -- confirm the pinned version.
SPLIT_SEED = 0
X, Xval, yy, yyval = cross_validation.train_test_split(X, yy, test_size=0.2, random_state=SPLIT_SEED)



In [3]:
##################################################################################
#                            Using Linear Model                                  #
##################################################################################

print "1. Using Linear Model:"

best_C = None
best_acc = 0
best_svm = None

for C in Cvals:
    svm = LinearSVM_twoclass()
    svm.theta = np.zeros((X.shape[1],))
    svm.train(X,yy,learning_rate=1e-4,reg=C,num_iters=2000,verbose=False,batch_size=X.shape[0])

    y_pred = svm.predict(Xval)
    acc = metrics.accuracy_score(yyval,y_pred)
    print "While C =", C, ", accuracy on validation data =", acc

    if (acc > best_acc):
        best_acc = acc
        best_C = C
        best_svm = svm

print "The best case: best_C =", best_C, ", best_accuracy on training data =", best_acc

##################################################################################
# YOUR CODE HERE for testing your best model's performance                       #
# what is the accuracy of your best model on the test set? On the training set?  #
##################################################################################

# training dataset
y_pred = best_svm.predict(X)
print "Accuracy on training data = ", metrics.accuracy_score(yy,y_pred)

# test dataset
yy_test = np.ones(y_test.shape)
yy_test[y_test==0] = -1
test_pred = best_svm.predict(X_test)
print "Accuracy on test data = ", metrics.accuracy_score(yy_test,test_pred)

1. Using Linear Model:
While C = 0.01 , accuracy on validation data = 0.68625
While C = 0.03 , accuracy on validation data = 0.68625
While C = 0.1 , accuracy on validation data = 0.68625
While C = 0.3 , accuracy on validation data = 0.68625
While C = 1 , accuracy on validation data = 0.69625
While C = 3 , accuracy on validation data = 0.89
While C = 10 , accuracy on validation data = 0.965
While C = 30 , accuracy on validation data = 0.97375
The best case: best_C = 30 , best_accuracy on training data = 0.97375
Accuracy on training data =  0.9709375
Accuracy on test data =  0.973


In [4]:
##################################################################################
#                              Using Gaussian Kernel                             #
##################################################################################

print "2. Using Gaussian Kernel:"

best_C = None
best_sigma = None
best_acc = 0
best_svm = None

for sigma in sigma_vals:
    K = np.array([utils.gaussian_kernel(x1,x2,sigma) for x1 in X for x2 in X]).reshape(X.shape[0],X.shape[0])
    scaler = preprocessing.StandardScaler().fit(K)
    scaleK = scaler.transform(K)
    KK = np.vstack([np.ones((scaleK.shape[0],)),scaleK]).T
    
    Kval = np.array([utils.gaussian_kernel(x1,x2,sigma) for x1 in Xval for x2 in X]).reshape(Xval.shape[0], X.shape[0])
#     scaler = preprocessing.StandardScaler().fit(Kval)
    scaleKval = scaler.transform(Kval)
    KKval = np.vstack([np.ones((scaleKval.shape[0],)),scaleKval.T]).T
    
    for C in Cvals:
        svm = LinearSVM_twoclass()
        svm.theta = np.zeros((KK.shape[1],))
        svm.train(KK,yy,learning_rate=1e-4,reg=C,num_iters=2000,verbose=False,batch_size=KK.shape[0])
        
        y_pred = svm.predict(KKval)
        acc = metrics.accuracy_score(yyval,y_pred)
        print "While sigma =", sigma, ", C =", C, ", accuracy on validation data =", acc
        
        if (acc > best_acc):
            best_acc = acc
            best_C = C
            best_sigma = sigma
            best_svm = svm

print "The best case: best_sigma =", best_sigma, ", best_C =", best_C, ", best_accuracy on training data =", best_acc

##################################################################################
# YOUR CODE HERE for testing your best model's performance                       #
# what is the accuracy of your best model on the test set? On the training set?  #
##################################################################################

# training dataset
K = np.array([utils.gaussian_kernel(x1,x2,best_sigma) for x1 in X for x2 in X]).reshape(X.shape[0],X.shape[0])
scaler = preprocessing.StandardScaler().fit(K)
scaleK = scaler.transform(K)
KK = np.vstack([np.ones((scaleK.shape[0],)),scaleK.T]).T
y_pred = best_svm.predict(KK)
print "Accuracy on training data = ", metrics.accuracy_score(yy,y_pred)

# test dataset
K_test = np.array([utils.gaussian_kernel(x1,x2,best_sigma) for x1 in X_test for x2 in X]).reshape(X_test.shape[0],X.shape[0])
# scaler_test = preprocessing.StandardScaler().fit(K_test)
scaleK_test = scaler.transform(K_test)
KK_test = np.vstack([np.ones((scaleK_test.shape[0],)),scaleK_test.T]).T
yy_test = np.ones(y_test.shape)
yy_test[y_test==0] = -1
test_pred = best_svm.predict(KK_test)
print "Accuracy on test data = ", metrics.accuracy_score(yy_test,test_pred)

2. Using Gaussian Kernel:
While sigma = 0.01 , C = 0.01 , accuracy on validation data = 0.77625
While sigma = 0.01 , C = 0.03 , accuracy on validation data = 0.77625
While sigma = 0.01 , C = 0.1 , accuracy on validation data = 0.77625
While sigma = 0.01 , C = 0.3 , accuracy on validation data = 0.77625
While sigma = 0.01 , C = 1 , accuracy on validation data = 0.77625
While sigma = 0.01 , C = 3 , accuracy on validation data = 0.78125
While sigma = 0.01 , C = 10 , accuracy on validation data = 0.78125
While sigma = 0.01 , C = 30 , accuracy on validation data = 0.78125
While sigma = 0.03 , C = 0.01 , accuracy on validation data = 0.77625
While sigma = 0.03 , C = 0.03 , accuracy on validation data = 0.77625
While sigma = 0.03 , C = 0.1 , accuracy on validation data = 0.77625
While sigma = 0.03 , C = 0.3 , accuracy on validation data = 0.77625
While sigma = 0.03 , C = 1 , accuracy on validation data = 0.77625
While sigma = 0.03 , C = 3 , accuracy on validation data = 0.78125
While sigma = 

In [4]:
##################################################################################
# Concluded from above, the best model is to use linear model with C = 30        #
##################################################################################

best_C = 30

best_svm = LinearSVM_twoclass()
best_svm.theta = np.zeros((X.shape[1],))
best_svm.train(X,yy,learning_rate=1e-4,reg=best_C,num_iters=20000,verbose=False,batch_size=X.shape[0])

# training dataset
y_pred = best_svm.predict(X)
print "Accuracy on training data = ", metrics.accuracy_score(yy,y_pred)

# test dataset
yy_test = np.ones(y_test.shape)
yy_test[y_test==0] = -1
test_pred = best_svm.predict(X_test)
print "Accuracy on test data = ", metrics.accuracy_score(yy_test,test_pred)

##################################################################################
# YOUR CODE HERE for finding top 15 words predicted to be spam and ham           #
##################################################################################

words, inv_words = utils.get_vocab_dict()

index = np.argsort(best_svm.theta)[-15:]
print "Top 15 words predicted to be spam are:"
for i in reversed(index):
    print words[i+1]

index = np.argsort(best_svm.theta)[0:15]
print "Top 15 words predicted to be ham are:"
for i in index:
    print words[i+1]

##################################################################################
# END                                                                            #
##################################################################################

Accuracy on training data =  0.993125
Accuracy on test data =  0.988
Top 15 words predicted to be spam are:
click
remov
our
nbsp
basenumb
free
your
will
guarante
pleas
you
here
most
visit
offer
Top 15 words predicted to be ham are:
wrote
date
the
httpaddr
url
spamassassin
re
numbertnumb
it
thei
user
list
my
author
prefer
