In [5]:
import numpy as np
from hw8_dataload import LFD_Data2
from sklearn import svm

# HW 8
## Primal vs Dual Problem

- The SVM **primal** problem minimizes 0.5w<sup>T</sup>w subject to y<sub>n</sub>(w<sup>T</sup>x<sub>n</sub>+b)&ge;1 for n = 1,2,...,N
- Since w is a d-dimensional vector (corresponding to the dimension of x) and we can also vary b, the primal problem involves a quadratic programming problem with **d+1 variables**.

## Polynomial Kernels

- Implementing polynimal kernels with a soft-margin SVM using the given data set of handwritten digits from the US Postal Service Zip Code data set with extracted features digit, intensity, and symmetry. 
- The polynomial kernel K(x<sub>n</sub>, x<sub>m</sub>) = (1+x<sub>n</sub><sup>T</sup>x<sub>m</sub>)<sup>Q</sup>
- Training **two** types of binary classifiers:
    - one-vs-one (one digit class is +1, another is -1, rest are ignored)
    - one-vs-all (one digit class is +1, everything else is -1)

In [9]:
hw8_train = "datasets/features.train"
hw8_test = "datasets/features.test"
hw8_C = 0.01
hw8_Q = 2
hw8_data = LFD_Data2(hw8_train, hw8_test)

my_svm = svm.SVC(C = 0.01, kernel = 'poly',degree = 2, coef0 = 1.0, gamma = 1.0)

In [12]:
alphas_odd = np.array([])
alphas_even = np.array([])

for cur_num in range(10):
    #cur_num-vs-all
    hw8_data.set_filter([cur_num])
    cur_X = hw8_data.get_X("train")
    cur_Y = hw8_data.get_Y("train")
    my_svm.fit(cur_X, cur_Y)
    cur_score = my_svm.score(cur_X, cur_Y)
    cur_numalphas = my_svm.n_support_
    cur_asum = np.array(cur_numalphas).sum()
    print("%d-vs-all binary classifier in-sample error: %f" % (cur_num, (1.0 - cur_score)))
    if cur_num % 2 == 0:
        alphas_even = np.concatenate((alphas_even, [cur_asum]))
    else:
        alphas_odd = np.concatenate((alphas_odd, [cur_asum]))
    
    
aodd_sum = np.sum(alphas_odd)
aeven_sum = np.sum(alphas_even)
a_diff = abs(aodd_sum - aeven_sum)
print("Diff in number of sv's between odd and even: %d" % a_diff)


0-vs-all binary classifier in-sample error: 0.105884
1-vs-all binary classifier in-sample error: 0.014401
2-vs-all binary classifier in-sample error: 0.100261
3-vs-all binary classifier in-sample error: 0.090248
4-vs-all binary classifier in-sample error: 0.089425
5-vs-all binary classifier in-sample error: 0.076258
6-vs-all binary classifier in-sample error: 0.091071
7-vs-all binary classifier in-sample error: 0.088465
8-vs-all binary classifier in-sample error: 0.074338
9-vs-all binary classifier in-sample error: 0.088328
Diff in number of sv's between odd and even: 2071


With C = 0.01, Q=2 and with a n-vs-all classifier, it turns out that 0 out of all the evens has the highest in-sample error and 1 out of all the odds has the lowest in-sample error.

In [13]:
#loading 1-vs-5 data

hw8_data.set_filter([1,5])
x_1v5_train = hw8_data.get_X("train")
y_1v5_train= hw8_data.get_Y("train")
x_1v5_test = hw8_data.get_X("test")
y_1v5_test= hw8_data.get_Y("test")

print(x_1v5_train.shape, y_1v5_train.shape, x_1v5_test.shape, y_1v5_test.shape)

(1561, 2) (1561,) (424, 2) (424,)


In [14]:
pk_Q = [2,5]
pk_C = [pow(10, -x) for x in reversed(range(5))]

for Q in pk_Q:
    my_svm.degree = Q
    print("~~~ For polynomial kernels of degree Q = %d ~~~" % Q)
    for C in pk_C:
        my_svm.C = C
        my_svm.fit(x_1v5_train, y_1v5_train)
        cur_ein = 1.0 - my_svm.score(x_1v5_train, y_1v5_train)
        cur_eout = 1.0 - my_svm.score(x_1v5_test, y_1v5_test)
        cur_numalphas = my_svm.n_support_
        cur_asum = np.array(cur_numalphas).sum()
        print("C = %f | E_in = %f, E_out = %f, num_sv = %d" % (C, cur_ein, cur_eout, cur_asum))
    print("")
        
        

~~~ For polynomial kernels of degree Q = 2 ~~~
C = 0.000100 | E_in = 0.008969, E_out = 0.016509, num_sv = 236
C = 0.001000 | E_in = 0.004484, E_out = 0.016509, num_sv = 76
C = 0.010000 | E_in = 0.004484, E_out = 0.018868, num_sv = 34
C = 0.100000 | E_in = 0.004484, E_out = 0.018868, num_sv = 24
C = 1.000000 | E_in = 0.003203, E_out = 0.018868, num_sv = 24

~~~ For polynomial kernels of degree Q = 5 ~~~
C = 0.000100 | E_in = 0.004484, E_out = 0.018868, num_sv = 26
C = 0.001000 | E_in = 0.004484, E_out = 0.021226, num_sv = 25
C = 0.010000 | E_in = 0.003844, E_out = 0.021226, num_sv = 23
C = 0.100000 | E_in = 0.003203, E_out = 0.018868, num_sv = 25
C = 1.000000 | E_in = 0.003203, E_out = 0.021226, num_sv = 21



From the previous output, we can see that the maximum C (C being the upper bound on &alpha;<sub>i</sub>'s, which are "weights" for support vector contributions) achieves the lowest E<sub>in</sub>. Furthermore, the number support vectors for Q = 5 (Q being the degree of the polynomial kernel) is lower for C = 0.001.

## Cross Validation

With Q = 2 and C &isin; {0.0001, 0.001, 0.01, 0.1, 1} using a polynomial kernel on the 1 vs 5 classifier (as used above), we will now use 10-fold cross validation, with the cross-validation erro E<sub>CV</sub> = (1/N) &sum;(n=1;N){e<sub>n</sub>}.e<sub>n</sub> is the validation error for a fold and if g<sup>-</sup><sub>n</sub> is the hypothesis learned on that fold, e<sub>n</sub> = e(g<sup>-</sup><sub>n</sub>(x<sub>n</sub>), y<sub>n</sub>)

In [19]:
from sklearn.model_selection import KFold

cv_Q = 2
cv_C = [pow(10, -x) for x in reversed(range(5))]
cv_runs = 100 #number of runs
cv_splits = 10 #number of splits

#note that k-fold validation is not in sklearn 0.17



e_cvs = np.ndarray((0, len(cv_C)))
cv_winner = np.zeros(len(cv_C)) #record of the winning C each run
#iterate over runs
for cur_run in range(cv_runs):
    #iterate over possible c values
    cur_ecvs = np.array([])
    cv_kf = KFold(n_splits=cv_splits, shuffle=False)
    for C in cv_C:
        my_svm.C = C
        e_vals = np.array([]) #array of validation errors
        #iterate over each fold
        for train_idx, test_idx in cv_kf.split(x_1v5_train):
            cv_xtrain, cv_xtest = x_1v5_train[train_idx], x_1v5_train[test_idx]
            cv_ytrain, cv_ytest = y_1v5_train[train_idx], y_1v5_train[test_idx]
            my_svm.fit(cv_xtrain, cv_ytrain)
            cur_err = 1.0 - my_svm.score(cv_xtest, cv_ytest)
            e_vals = np.concatenate((e_vals, [cur_err]))
        cur_ecv = np.average(e_vals) #current cv error
        cur_ecvs = np.concatenate((cur_ecvs, [cur_ecv]))
    win_idx = np.argmin(cur_ecvs) #index of the winning C
    #mark winner in our records
    cv_winner[win_idx] = cv_winner[win_idx] + 1
    #add cv errors to our records
    e_cvs = np.vstack((e_cvs, cur_ecvs))

#find average e_cvs for each C
ecv_avg = np.average(e_cvs, axis=0)
overall_winner = np.argmax(cv_winner)
ecv_win = ecv_avg[overall_winner]

print("C = %f is selected most often with average E_cv %f" % (cv_C[overall_winner], ecv_win))
        


C = 0.000100 is selected most often with average E_cv 0.004483


## RBF Kernel

Now we are going to use the radial basis function (RBF) kernel K(x<sub>n</sub>, x<sub>m</sub>) = exp(-&Vert;x<sub>n</sub> - x<sub>m</sub>&Vert;<sup>2</sup>) with a soft-margin SVM as a 1 vs 5 classifier.

We will test our SVM on various values of C &isin; {0.01, 1, 100, 10<sup>4</sup>, 10<sup>6</sup>}

In [5]:
rbf_C = [pow(10,x) for x in range(-2,7,2)]

my_svm.kernel = 'rbf'
my_svm.gamma = 1

for C in rbf_C:
    my_svm.C = C
    my_svm.fit(x_1v5_train, y_1v5_train)
    cur_ein = 1.0 - my_svm.score(x_1v5_train, y_1v5_train)
    cur_eout = 1.0 - my_svm.score(x_1v5_test, y_1v5_test)
    print("C = %f | E_in = %f, E_out = %f" % (C, cur_ein, cur_eout))
    

C = 0.010000 | E_in = 0.003844, E_out = 0.023585
C = 1.000000 | E_in = 0.004484, E_out = 0.021226
C = 100.000000 | E_in = 0.003203, E_out = 0.018868
C = 10000.000000 | E_in = 0.002562, E_out = 0.023585
C = 1000000.000000 | E_in = 0.000641, E_out = 0.023585


We see that C = 10<sup>6</sup> has the lowest E<sub>in</sub> while C = 100 has the lowest E<sub>out</sub>