In [1]:
# general
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

%matplotlib inline

from pylab import rcParams
rcParams['figure.figsize'] = 8, 5

# first used in exercise one
import linearsvm as svm
from sklearn import preprocessing # for scale
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [2]:
# Reload the local linearsvm module (imported above as `svm`) so that edits
# made to it are picked up without restarting the kernel.
import importlib
importlib.reload(svm)

<module 'linearsvm' from '/Users/andrewenfield/work/github/Data558/Week08/linearsvm.py'>

Note: Per the request in the "Collaboration policy" note, I've discussed at least part of this assignment with many of the MS employees in the class, including Abhishek, Geoff, Suman, and Charles. (Different weeks/different assignments have different people, depending upon who attends our study groups, but I'll probably just include this blurb w/ each homework since it's generally correct.) I've also gotten input from the discussion board.

# Exercise one

_Compute the gradient ∇F(β) of F._

![Gradient](gradient.png)

_Consider the Spam dataset from The Elements of Statistical Learning. Standardize the data, if you have not done so already._

In [3]:
# Load the Spam dataset: space-delimited values, no header row.
spam = pd.read_csv('data/spam.data', sep=' ', header=None)
spam.shape

(4601, 58)

In [4]:
# Peek at the first two rows.
spam.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1


In [5]:
# Load the train/test split indicator into a single named column.
spam_traintest = pd.read_csv(
    'data/spam.traintest', sep='\t', header=None, names=['TestIndicator'])
spam_traintest.shape

(4601, 1)

In [6]:
# Peek at the first three indicator rows.
spam_traintest.head(3)

Unnamed: 0,TestIndicator
0,1
1,0
2,1


In [7]:
# Convert the label column from 0/1 to -1/+1.
# Thresholding on "> 0" (rather than testing "== 0") keeps this cell
# idempotent: the previous mapping sent an already-converted -1 to +1, so
# re-running the cell silently turned every label into +1.
spam[57] = np.where(spam[57] > 0, 1, -1)

In [8]:
# Count each label value (NaN included) to confirm the conversion.
spam.loc[:, 57].value_counts(dropna=False)

-1    2788
 1    1813
Name: 57, dtype: int64

In [9]:
# Columns 0-56 are the features; column 57 holds the label.
X, y = spam.values[:, :57], spam.values[:, 57]
X.shape, y.shape

((4601, 57), (4601,))

In [10]:
# Standardize the feature columns with sklearn's preprocessing.scale.
# NOTE(review): the scaling is applied to all rows of X, i.e. before the
# train/test split below, so the test rows influence the scaling statistics.
# Confirm this is acceptable for the assignment.
X_scaled = preprocessing.scale(X)
X_scaled.shape

(4601, 57)

In [11]:
# Boolean row masks from the provided split indicator (0 -> train, 1 -> test).
is_train = (spam_traintest['TestIndicator'] == 0).values
is_test = (spam_traintest['TestIndicator'] == 1).values

X_scaled_train, X_scaled_test = X_scaled[is_train, :], X_scaled[is_test, :]
y_train, y_test = y[is_train], y[is_test]

X_scaled_train.shape, X_scaled_test.shape, y_train.shape, y_test.shape

((3065, 57), (1536, 57), (3065,), (1536,))

_Write a function mylinearsvm that implements the fast gradient algorithm to train the linear support vector machine with the squared hinge loss. The function takes as input the initial step-size value for the backtracking rule and a maximum number of iterations._

I implemented the mylinearsvm function, and all of the supporting functions including the gradient and objective functions, in the file linearsvm.py, which I imported into this notebook with the alias svm.

_Train your linear support vector machine with the squared hinge loss on the Spam dataset for λ = 1. Report your misclassification error for this value of λ._

In [25]:
# Train on the training split with lambda = 1, then show the last five rows
# of the returned results.
results = svm.fastgradalgo(
    X_scaled_train,
    y_train,
    t_init=0.01,
    grad_func=svm.compute_linearsvm_gradient,
    obj_func=svm.compute_linearsvm_objective,
    lam=1,
)
results[-5:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
96,0.004722,-0.017823,0.041006,0.022067,0.063296,0.054764,0.113741,0.060633,0.043391,0.024409,...,-0.023846,-0.023559,-0.017875,-0.011616,0.057524,0.105725,0.022038,0.024625,0.054172,0.056568
97,0.004746,-0.017818,0.041007,0.022053,0.063268,0.054747,0.113744,0.060613,0.043376,0.024423,...,-0.023843,-0.023565,-0.017887,-0.011614,0.057509,0.105748,0.022037,0.02463,0.054171,0.056561
98,0.004766,-0.017812,0.041009,0.022039,0.063242,0.054732,0.113743,0.060593,0.043364,0.024438,...,-0.023841,-0.023566,-0.0179,-0.011609,0.057496,0.105773,0.022036,0.024635,0.05417,0.056552
99,0.004783,-0.017804,0.041013,0.022026,0.063219,0.054721,0.113737,0.060574,0.043353,0.024454,...,-0.023839,-0.023562,-0.017913,-0.011603,0.057483,0.105801,0.022035,0.024639,0.054168,0.056542
100,0.004795,-0.017797,0.041018,0.022014,0.0632,0.054712,0.113726,0.060555,0.043346,0.024469,...,-0.023837,-0.023553,-0.017926,-0.011596,0.057472,0.10583,0.022033,0.024642,0.054167,0.056531


In [13]:
# Display the final coefficients extracted from the training results.
svm.get_final_coefs(results)

array([[ 0.00479497, -0.0177966 ,  0.04101844,  0.02201367,  0.0632002 ,
         0.05471219,  0.11372629,  0.06055507,  0.04334626,  0.0244685 ,
         0.04295497, -0.01973514,  0.01350283,  0.01194526,  0.03780159,
         0.10502347,  0.06651882,  0.05194878,  0.05429956,  0.04372403,
         0.09503506,  0.04485843,  0.09438071,  0.06443725, -0.05358436,
        -0.03836966, -0.04613438, -0.01988765, -0.01862775, -0.02545586,
        -0.00798179, -0.0042151 , -0.02998457, -0.00438627, -0.01557435,
        -0.00621837, -0.03105852, -0.01273458, -0.02824564,  0.00899199,
        -0.01605101, -0.03340787, -0.02412678, -0.02500915, -0.04566112,
        -0.04228721, -0.01269574, -0.02383744, -0.02355306, -0.01792578,
        -0.01159551,  0.05747161,  0.10582991,  0.02203296,  0.0246417 ,
         0.05416728,  0.05653074]])

In [14]:
def get_accuracy(beta_coefs, X, y, threshold=0):
    """
    Score a linear classifier whose labels are -1 / +1.

    The decision values X . beta_coefs^T are flattened to a 1-D vector. A value
    strictly greater than `threshold` is predicted as +1, anything else as -1.
    Returns accuracy_score(y, predictions).
    """
    decision_values = np.ravel(X.dot(beta_coefs.T))
    # start with every prediction at -1, then flip the ones above the threshold
    predicted_labels = np.full(decision_values.shape, -1)
    predicted_labels[decision_values > threshold] = 1
    return accuracy_score(y, predicted_labels)

In [23]:
# Performance is measured on the held-out test split, not the training data.
final_coefs_lam1 = svm.get_final_coefs(results)
accuracy = get_accuracy(final_coefs_lam1, X_scaled_test, y_test)
misclassification_error = 1 - accuracy
print("Accuracy: {0:.1%}".format(accuracy))
print("Misclassification error: {0:.1%}".format(misclassification_error))

Accuracy: 90.4%
Misclassification error: 9.6%


**TODO**: Compare against a basic scikit-learn implementation and check whether the coefficients and accuracy are close. This is not needed for this assignment, but it could be worthwhile because I'll be using this fastgradalgo implementation for both the polished code release and the final project.

_Run cross-validation to find the optimal value of λ. Report your misclassification error for that value of λ._

In [16]:
def train_and_test_single_fold(X_full, y_full, lam, train_index, test_index,
                               t_init=0.01):
    """
    Train on one cross-validation fold and score on its held-out rows.

    Parameters
    ----------
    X_full, y_full : feature matrix and label vector that the index arrays
        refer to.
    lam : lambda value forwarded to svm.fastgradalgo.
    train_index : indices of the rows used for training.
    test_index : indices of the rows used for evaluation.
    t_init : t_init value forwarded to svm.fastgradalgo. Defaults to 0.01,
        the value that was previously hard-coded here.

    Returns
    -------
    The accuracy reported by get_accuracy on the rows in test_index.
    """
    beta_vals = svm.fastgradalgo(
        X_full[train_index], y_full[train_index], t_init=t_init,
        grad_func=svm.compute_linearsvm_gradient,
        obj_func=svm.compute_linearsvm_objective,
        lam=lam)

    final_coefs = svm.get_final_coefs(beta_vals)

    return get_accuracy(final_coefs, X_full[test_index], y_full[test_index])

In [17]:
def train_and_test_for_all_folds(X_full, y_full, train_indices, 
                                 test_indices, lam):
    """
    Train and test once per fold and return the mean accuracy over all folds.

    Parameters
    ----------
    X_full, y_full : feature matrix and label vector that the index arrays
        refer to.
    train_indices, test_indices : equal-length sequences; element i holds the
        training / evaluation row indices of fold i. Any number of folds is
        accepted (the fold count used to be hard-coded to 10).
    lam : lambda value used for every fold.

    Raises
    ------
    ValueError if the two sequences describe a different number of folds.
    """
    if len(train_indices) != len(test_indices):
        raise ValueError(
            "train_indices and test_indices must contain the same number of folds")

    # iterate over the folds actually supplied rather than assuming ten
    accuracy_scores = [
        train_and_test_single_fold(X_full, y_full, lam, fold_train, fold_test)
        for fold_train, fold_test in zip(train_indices, test_indices)]
    return np.mean(accuracy_scores)

In [18]:
# Build 10 pairs of train/test index arrays - one pair per fold.
kf = KFold(10, shuffle=True, random_state=42)

train_indices_list = []
test_indices_list = []
for train_index, test_index in kf.split(X_scaled_train):
    train_indices_list.append(train_index)
    test_indices_list.append(test_index)

# Keep the folds as plain lists of index arrays. The 3065 training rows do
# not divide evenly into 10 folds, so the per-fold arrays have different
# lengths; calling np.array() on such a ragged list raises ValueError on
# NumPy >= 1.24 (and only produced a deprecated object array before that).
# Indexing by fold number (train_indices[i]) works the same on a list.
train_indices = train_indices_list
test_indices = test_indices_list

In [19]:
# Candidate lambda values: powers of ten from 1e-10 up to 10, in ascending order.
lambdas = [10 ** exponent for exponent in range(-10,2)]
lambdas

[1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10]

In [20]:
# Alternative lambda grid (currently disabled):
# lambdas = np.logspace(-5, 0.1, num=10)
# lambdas

In [21]:
# Finally, run 10-fold cross-validation for each value of lambda and pair
# each lambda with the mean accuracy across its folds (these are accuracy
# scores, not MSEs).
accuracy_values_by_lambda = [train_and_test_for_all_folds(X_scaled_train, y_train, 
                                train_indices, test_indices, 
                                lam) for lam in lambdas]
list(zip(lambdas, accuracy_values_by_lambda))

[(1e-10, 0.91322198803517074),
 (1e-09, 0.91322198803517074),
 (1e-08, 0.91322198803517074),
 (1e-07, 0.91322198803517074),
 (1e-06, 0.91322198803517074),
 (1e-05, 0.91322198803517074),
 (0.0001, 0.91322198803517074),
 (0.001, 0.91289412616295151),
 (0.01, 0.91093972876881479),
 (0.1, 0.91093334184922603),
 (1, 0.90929083902833663),
 (10, 0.8939569095825084)]

**TODO**: Why do so many of the small λ values give exactly the same (lowest) misclassification error? Try different maximum iteration counts to see whether that changes things, and try other parameter changes. More generally, think about what else could be causing this. Would it make sense for λ to be essentially zero, so that nothing is penalized?

In [22]:
# Select the lambda with the highest mean cross-validated accuracy instead of
# hard-coding a value that was never part of the searched grid. np.argmax
# returns the first maximum, and `lambdas` is in ascending order, so ties
# resolve to the smallest lambda that was tried.
best_lambda = lambdas[int(np.argmax(accuracy_values_by_lambda))]

In [26]:
# Refit on the training split using the selected lambda, then show the last
# five rows of the returned results.
results_best = svm.fastgradalgo(
    X_scaled_train,
    y_train,
    t_init=0.01,
    grad_func=svm.compute_linearsvm_gradient,
    obj_func=svm.compute_linearsvm_objective,
    lam=best_lambda,
)
results_best[-5:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
96,-0.00456,-0.02452,0.053345,0.168683,0.098476,0.07119,0.37908,0.09602,0.051182,0.021623,...,-0.1305,-0.142286,-0.069444,-0.026484,0.07829,0.60565,0.16415,0.0535,0.272586,0.105239
97,-0.004803,-0.024611,0.053362,0.170677,0.098611,0.071314,0.377722,0.096268,0.051379,0.021627,...,-0.131481,-0.14314,-0.069599,-0.026641,0.078389,0.608849,0.165773,0.052071,0.273975,0.105322
98,-0.005029,-0.024698,0.053376,0.172679,0.098738,0.071425,0.376326,0.096513,0.051553,0.021634,...,-0.132447,-0.143984,-0.069753,-0.026791,0.078482,0.611983,0.167397,0.050609,0.275349,0.10541
99,-0.005238,-0.024781,0.053385,0.174689,0.098857,0.071523,0.374894,0.096753,0.051702,0.021641,...,-0.133399,-0.144815,-0.069908,-0.026933,0.078567,0.615051,0.169022,0.049117,0.276707,0.105503
100,-0.005428,-0.024858,0.053391,0.176707,0.098968,0.071606,0.373429,0.096984,0.051826,0.021649,...,-0.134336,-0.145633,-0.070062,-0.027067,0.078642,0.618054,0.17065,0.047593,0.278051,0.1056


In [27]:
# Performance is measured on the held-out test split, not the training data.
final_coefs_best = svm.get_final_coefs(results_best)
accuracy = get_accuracy(final_coefs_best, X_scaled_test, y_test)
misclassification_error_best = 1 - accuracy
print("Accuracy: {0:.1%}".format(accuracy))
print("Misclassification error: {0:.1%}".format(misclassification_error_best))

Accuracy: 91.5%
Misclassification error: 8.5%
