In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

In [3]:
from sklearn import preprocessing, metrics
import utils
import scipy.io
import numpy as np
from linear_classifier import LinearSVM_twoclass
from sklearn.model_selection import train_test_split

# load the SPAM email training dataset

X,y = utils.load_mat('data/spamTrain.mat')
yy = np.ones(y.shape)
yy[y==0] = -1

# load the SPAM email test dataset

test_data = scipy.io.loadmat('data/spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest'].flatten()


##################################################################################
#  YOUR CODE HERE for training the best performing SVM for the data above.       #
#  what should C be? What should num_iters be? Should X be scaled?               #
#  should X be kernelized? What should the learning rate be? What should the     #
#  number of iterations be?                                                      #
##################################################################################

X_train, X_val, yy_train, yy_val = train_test_split(X, yy, test_size=0.20)
Cvals = [1e1, 1e2, 1e3, 1e4]
itervals = [2000, 10000, 20000]
lrvals = [1e-5, 1e-4, 1e-3]

best_acc = 0
best_C = 0
best_lr = 0

print('Iteration begins!')

for C in Cvals:
    for it in itervals:
        for lr in lrvals:
            print('C, numiter, lr', C, it, lr)
            svm = LinearSVM_twoclass()
            svm.theta = np.zeros((X_train.shape[1],))
            svm.train(X_train, yy_train, reg=C, learning_rate=lr, num_iters=it,verbose=False)
            y_pred = svm.predict(X_val)
            acc = np.mean(y_pred == yy_val)
            print('accuracy:', acc)
        
            if acc > best_acc:
                best_acc = acc
                best_C = C
                best_learning_rate = lr
                best_iter = it
    
print('the best acc is', best_acc)
print('the best C is', best_C)
print('the best learning rate is', best_learning_rate)
print('it is achieved in', it, 'iterations')

Iteration begins!
C, numiter, lr 10.0 2000 1e-05
accuracy: 0.69
C, numiter, lr 10.0 2000 0.0001
accuracy: 0.955
C, numiter, lr 10.0 2000 0.001
accuracy: 0.97875
C, numiter, lr 10.0 10000 1e-05
accuracy: 0.93375
C, numiter, lr 10.0 10000 0.0001
accuracy: 0.97625
C, numiter, lr 10.0 10000 0.001
accuracy: 0.98125
C, numiter, lr 10.0 20000 1e-05
accuracy: 0.955
C, numiter, lr 10.0 20000 0.0001
accuracy: 0.97875
C, numiter, lr 10.0 20000 0.001
accuracy: 0.98
C, numiter, lr 100.0 2000 1e-05
accuracy: 0.955
C, numiter, lr 100.0 2000 0.0001
accuracy: 0.97875
C, numiter, lr 100.0 2000 0.001
accuracy: 0.9825
C, numiter, lr 100.0 10000 1e-05
accuracy: 0.97625
C, numiter, lr 100.0 10000 0.0001
accuracy: 0.98125
C, numiter, lr 100.0 10000 0.001
accuracy: 0.97875
C, numiter, lr 100.0 20000 1e-05
accuracy: 0.97875
C, numiter, lr 100.0 20000 0.0001
accuracy: 0.98
C, numiter, lr 100.0 20000 0.001
accuracy: 0.97875
C, numiter, lr 1000.0 2000 1e-05
accuracy: 0.97875
C, numiter, lr 1000.0 2000 0.0001
accu

[NOTE] Although the highest accuracy is achieved in 20000 iterations, we can see by simply using 2000 iterations is able to give us an accuracy that is good enough in far less time.

In [15]:
##################################################################################
# YOUR CODE HERE for testing your best model's performance                       #
# what is the accuracy of your best model on the test set? On the training set?  #
##################################################################################
yy_test = np.ones(y_test.shape)
yy_test[y_test==255] = -1

best_linearsvm = LinearSVM_twoclass()
best_linearsvm.theta = np.zeros((X_train.shape[1],))
best_linearsvm.train(X_train, yy_train, reg=best_C,learning_rate=best_learning_rate, num_iters=2000, verbose=False)
y_test_pred = best_linearsvm.predict(X_test)
acc = np.mean(y_test_pred == yy_test)
print('The best linear svm accuracy is', acc)

The best linear svm accuracy is 0.982


In [16]:
##################################################################################
# ANALYSIS OF MODEL: Print the top 15 words that are predictive of spam and for  #
# ham. Hint: use the coefficient values of the learned model                     #
##################################################################################
words, inv_words = utils.get_vocab_dict()
index = np.argsort(best_linearsvm.theta)[-15:]
for i in range(15):
    print(words[index[i]+1])

##################################################################################
#                    END OF YOUR CODE                                            #
##################################################################################

market
your
we
guarante
emailaddr
busi
am
inform
instruct
dollarnumb
hour
click
will
remov
our
