In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

In [5]:
from sklearn import preprocessing, metrics
import utils
import scipy.io
import numpy as np
from linear_classifier import LinearSVM_twoclass

# Read the SPAM email training set and recode its labels: entries of y equal
# to 0 become -1, every other entry becomes +1.

X, y = utils.load_mat('data/spamTrain.mat')
yy = np.where(y == 0, -1.0, 1.0)

# Hold out everything from row `val_indx` onwards as the validation set and
# keep the first `val_indx` rows for training.
val_indx = 3000
n_examples = y.shape[0]
Xval, yval = X[val_indx:n_examples], yy[val_indx:n_examples]
X, yy = X[:val_indx], yy[:val_indx]

# Read the SPAM email test set; the label array is flattened to 1-D.

test_data = scipy.io.loadmat('data/spamTest.mat')
X_test, y_test = test_data['Xtest'], test_data['ytest'].flatten()

##################################################################################
#  YOUR CODE HERE for training the best performing SVM for the data above.       #
#  what should C be? What should num_iters be? Should X be scaled?               #
#  should X be kernelized? What should the learning rate be? What should the     #
#  number of iterations be?                                                      #

In [None]:
##################################################################################
# Should X be scaled? Yes
# Should X be kernelized? Yes
from linear_classifier import LinearSVM
sigma = 10


def _gaussian_kernel_matrix(rows, cols, sigma):
    """Return the (len(rows), len(cols)) matrix of utils.gaussian_kernel values.

    Entry (i, j) is utils.gaussian_kernel(rows[i], cols[j], sigma). The kernel
    is evaluated pair by pair in Python, so this is slow for large inputs.
    """
    return np.array(
        [utils.gaussian_kernel(x1, x2, sigma) for x1 in rows for x2 in cols]
    ).reshape(rows.shape[0], cols.shape[0])


def _with_intercept(features):
    """Return `features` with a leading column of ones (the intercept term)."""
    return np.vstack([np.ones((features.shape[0],)), features.T]).T


# Training set: kernelize X against itself, then fit the scaler on the
# training kernel matrix. This is the only scaler that gets fitted; the
# validation and test matrices below are transformed with it so that all three
# sets share the same feature scaling (no separate scaler is fitted on
# held-out data).
K = _gaussian_kernel_matrix(X, X, sigma)
scaler = preprocessing.StandardScaler().fit(K)
scaleK = scaler.transform(K)
KK = _with_intercept(scaleK)

# Validation set: kernel of each validation row against the training rows,
# scaled with the training scaler.
Kval = _gaussian_kernel_matrix(Xval, X, sigma)
scaleK_val = scaler.transform(Kval)
KKval = _with_intercept(scaleK_val)

# Test set: kernel of each test row against the training rows, scaled with
# the training scaler.
Ktest = _gaussian_kernel_matrix(X_test, X, sigma)
scaleK_test = scaler.transform(Ktest)
KKtest = _with_intercept(scaleK_test)

# Classifier instance with theta initialised to zeros, one entry per column
# of KK (intercept included).
svm = LinearSVM_twoclass()
svm.theta = np.zeros((KK.shape[1],))

In [111]:
# what should C be?
# Try each candidate C (passed to train() as `reg`) and score the resulting
# model on the validation set.
Best_accuracy, Best_C = 0, 0
Cvals = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]

# Settings shared by every run of this sweep; the batch size equals the
# number of rows of KK.
c_sweep_args = dict(
    learning_rate=1e-4,
    num_iters=1000,
    verbose=False,
    batch_size=KK.shape[0],
)

for C in Cvals:
    svm = LinearSVM_twoclass()
    svm.train(KK, yy, reg=C, **c_sweep_args)
    predy = svm.predict(KKval)
    accuracy = np.mean(predy == yval)
    print("Accuracy is ", accuracy, "when C = ", C)
    # Strict comparison: on a tie the earliest candidate is kept.
    if Best_accuracy < accuracy:
        Best_accuracy, Best_C = accuracy, C

print("Best accuracy = ", Best_accuracy, "Best C = ", Best_C)

Accuracy is  0.724 when C =  0.01
Accuracy is  0.725 when C =  0.03
Accuracy is  0.742 when C =  0.1
Accuracy is  0.769 when C =  0.3
Accuracy is  0.779 when C =  1
Accuracy is  0.781 when C =  3
Accuracy is  0.781 when C =  10
Accuracy is  0.781 when C =  30
Accuracy is  0.781 when C =  100
Best accuracy =  0.781 Best C =  3


In [116]:
# What should the learning rate be?
# Try each candidate learning rate with the previously selected Best_C and
# score the resulting model on the validation set.
Best_accuracy, Best_lr = 0, 0
learning_rates = [1e-2, 1e-3, 1e-4, 1e-5]

# Settings shared by every run of this sweep; the batch size equals the
# number of rows of KK.
lr_sweep_args = dict(
    reg=Best_C,
    num_iters=1000,
    verbose=False,
    batch_size=KK.shape[0],
)

for lr in learning_rates:
    svm = LinearSVM_twoclass()
    svm.train(KK, yy, learning_rate=lr, **lr_sweep_args)
    predy = svm.predict(KKval)
    accuracy = np.mean(predy == yval)
    print("Accuracy is ", accuracy, "when learning rate = ", lr)
    # Strict comparison: on a tie the earliest candidate is kept.
    if Best_accuracy < accuracy:
        Best_accuracy, Best_lr = accuracy, lr

print("Best accuracy = ", Best_accuracy, "Best learning rate = ", Best_lr)

Accuracy is  0.781 when learning rate =  0.01
Accuracy is  0.781 when learning rate =  0.001
Accuracy is  0.781 when learning rate =  0.0001
Accuracy is  0.757 when learning rate =  1e-05
Best accuracy =  0.781 Best learning rate =  0.01


In [122]:
# What should the number of iterations be?
# Try each candidate iteration count with the previously selected Best_lr and
# Best_C and score the resulting model on the validation set.
Best_accuracy, Best_ni = 0, 0
num_iters = [10000, 15000, 20000]

# Settings shared by every run of this sweep; the batch size equals the
# number of rows of KK.
ni_sweep_args = dict(
    learning_rate=Best_lr,
    reg=Best_C,
    verbose=False,
    batch_size=KK.shape[0],
)

for ni in num_iters:
    svm = LinearSVM_twoclass()
    svm.train(KK, yy, num_iters=ni, **ni_sweep_args)
    predy = svm.predict(KKval)
    accuracy = np.mean(predy == yval)
    print("Accuracy is ", accuracy, "when number of iteration = ", ni)
    # Strict comparison: on a tie the earliest candidate is kept.
    if Best_accuracy < accuracy:
        Best_accuracy, Best_ni = accuracy, ni

print("Best accuracy = ", Best_accuracy, "Best iteration number = ", Best_ni)

Accuracy is  0.781 when number of iteration =  1000


KeyboardInterrupt: 

In [None]:
##################################################################################
# YOUR CODE HERE for testing your best model's performance                       #
# what is the accuracy of your best model on the test set? On the training set?  #
##################################################################################
# Retrain on the training split with the selected hyperparameters. The
# iteration count is Best_ni (the value chosen by the search), not `ni`, which
# is merely whatever value the search loop happened to stop on.
svm = LinearSVM_twoclass()
svm.train(KK,yy,learning_rate=Best_lr,reg=Best_C,num_iters=Best_ni,verbose=False,batch_size=KK.shape[0])

# y_test is read from the .mat file unchanged, whereas the training labels
# were recoded (0 -> -1, otherwise +1) and the validation cells compare
# predictions directly against those recoded labels. Recode the test labels
# the same way so the comparison below is between like encodings.
yy_test = np.ones(y_test.shape)
yy_test[y_test == 0] = -1

predy = svm.predict(KKtest)
accuracy = np.mean(predy == yy_test)
print("Best accuracy = ", accuracy, "when best C is", Best_C, "Best learning rate is", Best_lr, "Best iteration number is", Best_ni)

# Accuracy of the same model on the data it was trained on.
train_accuracy = np.mean(svm.predict(KK) == yy)
print("Training accuracy = ", train_accuracy)

In [3]:
##################################################################################
# ANALYSIS OF MODEL: Print the top 15 words that are predictive of spam and for  #
# ham. Hint: use the coefficient values of the learned model                     #
##################################################################################
words, inv_words = utils.get_vocab_dict()

# Combine the learned coefficients (intercept entry dropped) with the rows of
# X to obtain one score per column of X, then order the column indices from
# lowest to highest score.
word_scores = np.dot(svm.theta[1:], X)
XXX = np.argsort(word_scores)

# Highest-scoring columns first; the vocabulary is looked up with index + 1.
print("top 15 word that are predictive of spam")
for i in XXX[:-16:-1]:
    print(words[i + 1])

# Lowest-scoring columns, most negative first.
print("top 15 word that are predictive of ham")
for i in XXX[:15]:
    print(words[i + 1])
##################################################################################
#                    END OF YOUR CODE                                            #
##################################################################################

NameError: name 'sigma' is not defined