In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

### Polynomial kernels

In [3]:
train = pd.read_table('datasets/features.train', header=None, sep='  ', engine='python')
test = pd.read_table('datasets/features.test', header=None, sep='  ', engine='python')

In [4]:
print('Train size: {}, test size: {}'.format(train.shape, test.shape))

Train size: (7291, 3), test size: (2007, 3)


In [5]:
train.head()

Unnamed: 0,0,1,2
0,6.0,0.341092,-4.528937
1,5.0,0.444131,-5.496812
2,4.0,0.231002,-2.88675
3,7.0,0.200275,-3.534375
4,3.0,0.291936,-4.352062


In [6]:
X_train = np.asarray(train.loc[:,1:])
y_train = np.asarray(train[0])

X_test = np.asarray(test.loc[:,1:])
y_test = np.asarray(test[0])

In [7]:
def classification_error(y_true, y_pred):
    return np.sum(y_true!=y_pred) / len(y_true)

#### Question 2, 3

In [35]:
models = []
Ein_models = []

for digit in range(10):
    y_train_tmp = (y_train==digit).astype(int)
    y_train_tmp[y_train_tmp==0] = -1
    model = SVC(kernel='poly', C=.01, degree=2, coef0=1, gamma=1)
    model.fit(X_train, y_train_tmp)
    pred = model.predict(X_train)
    Ein = classification_error(y_train_tmp, pred)
    models.append(model)
    Ein_models.append(Ein)
    print('{}-versus-all classifier: accuracy = {:.4f}'.format(digit, Ein))

0-versus-all classifier: accuracy = 0.1059
1-versus-all classifier: accuracy = 0.0144
2-versus-all classifier: accuracy = 0.1003
3-versus-all classifier: accuracy = 0.0902
4-versus-all classifier: accuracy = 0.0894
5-versus-all classifier: accuracy = 0.0763
6-versus-all classifier: accuracy = 0.0911
7-versus-all classifier: accuracy = 0.0885
8-versus-all classifier: accuracy = 0.0743
9-versus-all classifier: accuracy = 0.0883


In [13]:
print('Model with highest Ein: {}-versus-all'.format(np.argmax(Ein_models)))
print('Model with lowest Ein: {}-versus-all'.format(np.argmin(Ein_models)))

Model with highest Ein: 0-versus-all
Model with lowest Ein: 1-versus-all


#### Question 4

In [19]:
print('Difference in the number of support vectors: {}'
     .format(np.abs(len(models[0].support_vectors_) - len(models[1].support_vectors_))))

Difference in the number of support vectors: 1793


#### Question 5

In [37]:
y_train_tmp = y_train[(y_train==1) | (y_train==5)]
y_train_tmp[y_train_tmp==1] = 1
y_train_tmp[y_train_tmp==5] = -1
X_train_tmp = X_train[(y_train==1) | (y_train==5)]

y_test_tmp = y_test[(y_test==1) | (y_test==5)]
y_test_tmp[y_test_tmp==1] = 1
y_test_tmp[y_test_tmp==5] = -1
X_test_tmp = X_test[(y_test==1) | (y_test==5)]

In [22]:
C = [.001, .01, .1, 1]

print('Running 1-versus-5 classifier with Q = 2\n')
for c in C:
    model = SVC(kernel='poly', C=c, degree=2, coef0=1, gamma=1)
    model.fit(X_train_tmp, y_train_tmp)
    pred_train = model.predict(X_train_tmp)
    pred_test = model.predict(X_test_tmp)
    Ein = classification_error(y_train_tmp, pred_train)
    Eout = classification_error(y_test_tmp, pred_test)
    num_sv = len(model.support_vectors_)
    print('* C = {}: Ein = {:.4f}, Eout = {:.4f}, number of support vectors = {}'.format(c, Ein, Eout, num_sv))

Running 1-versus-5 classifier with Q = 2

* C = 0.001: Ein = 0.0045, Eout = 0.0165, number of support vectors = 76
* C = 0.01: Ein = 0.0045, Eout = 0.0189, number of support vectors = 34
* C = 0.1: Ein = 0.0045, Eout = 0.0189, number of support vectors = 24
* C = 1: Ein = 0.0032, Eout = 0.0189, number of support vectors = 24


#### Question 6

In [38]:
C = [.0001, .001, .01, 1]

print('Running 1-versus-5 classifier\n')
for c in C:
    for q in [2, 5]:
        model = SVC(kernel='poly', C=c, degree=q, coef0=1, gamma=1)
        model.fit(X_train_tmp, y_train_tmp)
        pred_train = model.predict(X_train_tmp)
        pred_test = model.predict(X_test_tmp)
        Ein = classification_error(y_train_tmp, pred_train)
        Eout = classification_error(y_test_tmp, pred_test)
        num_sv = len(model.support_vectors_)
        print('* C = {}, Q = {}: Ein = {:.5f}, Eout = {:.5f}, number of support vectors = {}'.format(c, q, Ein, Eout, num_sv))

Running 1-versus-5 classifier

* C = 0.0001, Q = 2: Ein = 0.00897, Eout = 0.01651, number of support vectors = 236
* C = 0.0001, Q = 5: Ein = 0.00448, Eout = 0.01887, number of support vectors = 26
* C = 0.001, Q = 2: Ein = 0.00448, Eout = 0.01651, number of support vectors = 76
* C = 0.001, Q = 5: Ein = 0.00448, Eout = 0.02123, number of support vectors = 25
* C = 0.01, Q = 2: Ein = 0.00448, Eout = 0.01887, number of support vectors = 34
* C = 0.01, Q = 5: Ein = 0.00384, Eout = 0.02123, number of support vectors = 23
* C = 1, Q = 2: Ein = 0.00320, Eout = 0.01887, number of support vectors = 24
* C = 1, Q = 5: Ein = 0.00320, Eout = 0.02123, number of support vectors = 21


### Cross-validation

#### Question 7

In [24]:
from sklearn.model_selection import KFold

In [44]:
C = [.0001, .001, .01, .1, 1]
folds = 10
selected_models = []
test_cv = []

for n in range(100):
    Ecv_models = []
    kf = KFold(n_splits=folds, shuffle=True)
    for c in C:
        Ecv_folds = []
        for train_idx, val_idx in kf.split(X_train_tmp, y_train_tmp):
            model = SVC(C=c, kernel='poly', degree=2, gamma=1, coef0=1)
            model.fit(X_train_tmp[train_idx], y_train_tmp[train_idx])
            pred = model.predict(X_train_tmp[val_idx])
            Ecv_folds.append(classification_error(y_train_tmp[val_idx], pred))
        Ecv_models.append(np.mean(Ecv_folds))
    selected_models.append(np.argmin(Ecv_models))
    
for m in range(len(C)):
    print('Model with C = {} is selected {} times.'.format(C[m], selected_models.count(m)))

Model with C = 0.0001 is selected 0 times.
Model with C = 0.001 is selected 43 times.
Model with C = 0.01 is selected 25 times.
Model with C = 0.1 is selected 22 times.
Model with C = 1 is selected 10 times.


#### Question 8

In [60]:
Ecvs = []
for n in range(100):
    kf = KFold(n_splits=10, shuffle=True)
    Ecv_folds = []
    for train_idx, val_idx in kf.split(X_train_tmp, y_train_tmp):
        model = SVC(C=.001, kernel='poly', degree=2, coef0=1, gamma=1)
        model.fit(X_train_tmp[train_idx], y_train_tmp[train_idx])
        pred = model.predict(X_train_tmp[val_idx])
        Ecv_folds.append(classification_error(y_train_tmp[val_idx], pred))
    Ecvs.append(np.mean(Ecv_folds))
    
print('Average Ecv: {:.4f}'.format(np.mean(Ecvs)))

Average Ecv: 0.0048


### RBF kernel
#### Question 9, 10

In [62]:
C = [.01, 1, 100, 10**4, 10**6]

for c in C:
    model = SVC(kernel='rbf', C=c, degree=2, gamma=1)
    model.fit(X_train_tmp, y_train_tmp)
    pred_train = model.predict(X_train_tmp)
    pred_test = model.predict(X_test_tmp)
    Ein = classification_error(y_train_tmp, pred_train)
    Eout = classification_error(y_test_tmp, pred_test)
    print('Model with C = {}: Ein = {:.4f}, Eout = {:.4f}'.format(c, Ein, Eout))

Model with C = 0.01: Ein = 0.0038, Eout = 0.0236
Model with C = 1: Ein = 0.0045, Eout = 0.0212
Model with C = 100: Ein = 0.0032, Eout = 0.0189
Model with C = 10000: Ein = 0.0026, Eout = 0.0236
Model with C = 1000000: Ein = 0.0006, Eout = 0.0236
