In [23]:
# Problem 2-10 on https://work.caltech.edu/homework/hw8.pdf

# the format of each row is: digit, intensity, symmetry
# we will train two types of binary classifiers: 
# - one-versus-one (one digit is class +1 and another digit is class -1, with the rest of digits disregarded),
# - one-versus-all (one digit is class +1 and the rest are class -1)

# When evaluating E_in and E_out, use binary classification error
# Do not scale the data

import numpy as np
from sklearn import svm
from pprint import pp

# Each row of the feature files is: digit, intensity, symmetry.
training_data, testing_data = (
    np.loadtxt(path) for path in ('features.train', 'features.test')
)

# Column 0 is the digit label; columns 1-2 are the two input features.
training_ys, training_xs = training_data[:, 0], training_data[:, 1:3]
testing_ys, testing_xs = testing_data[:, 0], testing_data[:, 1:3]


## Polynomial Kernels

Covers problems 2-6

In [26]:
# Adapted from https://github.com/workflow/caltech-machine-learning-homework/blob/master/HW8.ipynb

def make_binary(ys, digit):
    """Relabel digit labels as +1 / -1 for a one-versus-all classifier.

    Args:
        ys: Array-like of digit labels.
        digit: The label mapped to class +1; every other label maps to -1.

    Returns:
        Integer NumPy array shaped like `ys`, holding 1 where the label
        equals `digit` and -1 everywhere else.
    """
    # Vectorised comparison instead of a per-element Python loop; asarray lets
    # plain lists be passed as well as arrays, and the result keeps an integer
    # dtype even when `ys` is empty.
    return np.where(np.asarray(ys) == digit, 1, -1)

# Problems 2-4: one-versus-all classifiers with a polynomial kernel.

def classify(digits, C, Q):
    """Fit a one-versus-all polynomial-kernel SVM for each digit in `digits`.

    For every digit, the module-level training labels are relabelled with
    make_binary (that digit -> +1, all others -> -1) and an SVC with
    kernel='poly', degree=Q, gamma=1.0, coef0=1.0 and penalty C is fitted on
    the module-level training features.

    Args:
        digits: Iterable of digit labels to treat as the positive class.
        C: Soft-margin penalty passed to the SVC.
        Q: Degree of the polynomial kernel.

    Returns:
        Dict mapping each digit to {'e_in': in-sample error (1 - training
        accuracy), 'svs': number of support vectors}.
    """
    # A single estimator is reused; it is refit for each digit.
    model = svm.SVC(kernel='poly', C=C, degree=Q, gamma=1.0, coef0=1.0)
    summary = {}
    for target in digits:
        labels = make_binary(training_ys, target)
        model.fit(training_xs, labels)
        summary[target] = {
            'e_in': 1 - model.score(training_xs, labels),
            'svs': len(model.support_vectors_),
        }

    return summary

# pp(classify([0,2,4,6,8], .01, 2))
# pp(classify([1,3,5,7,9], .01, 2))

def make_one_v_one(xs, ys, digit_1, digit_2):
    """Keep only the samples whose label is `digit_1` or `digit_2`.

    Args:
        xs: Feature array, indexed along its first axis by the label mask.
        ys: Label array aligned with `xs`.
        digit_1: First label to keep.
        digit_2: Second label to keep.

    Returns:
        Tuple (xs, ys) filtered by the same boolean mask. Labels keep their
        original values; they are not remapped to +1 / -1.
    """
    keep = np.logical_or(ys == digit_1, ys == digit_2)
    return xs[keep], ys[keep]

def one_vs_five_classify():
    """Sweep C and Q for a 1-versus-5 polynomial-kernel SVM.

    Training and test sets are restricted to digits 1 and 5 with
    make_one_v_one, then one SVC (kernel='poly', gamma=1.0, coef0=1.0) is
    fitted per (C, Q) combination.

    Returns:
        Dict keyed by "<C>_<Q>" mapping to {'e_in': in-sample error,
        'e_out': test error, 'svs': number of support vectors}.
    """
    fit_xs, fit_ys = make_one_v_one(training_xs, training_ys, 1, 5)
    held_xs, held_ys = make_one_v_one(testing_xs, testing_ys, 1, 5)

    # Same iteration order as nested loops: C outermost, Q innermost.
    grid = [(C, Q) for C in (.0001, .001, .01, .1, 1) for Q in (2, 5)]

    summary = {}
    for C, Q in grid:
        model = svm.SVC(kernel='poly', C=C, degree=Q, gamma=1.0, coef0=1.0)
        model.fit(fit_xs, fit_ys)
        summary[f"{C}_{Q}"] = {
            'e_in': 1 - model.score(fit_xs, fit_ys),
            'e_out': 1 - model.score(held_xs, held_ys),
            'svs': len(model.support_vectors_),
        }

    return summary

# Pretty-print E_in, E_out and support-vector count for every (C, Q) pair;
# result keys have the form "<C>_<Q>".
pp(one_vs_five_classify())
            

{'0.0001_2': {'e_in': 0.008968609865470878,
              'e_out': 0.01650943396226412,
              'svs': 236},
 '0.0001_5': {'e_in': 0.004484304932735439,
              'e_out': 0.018867924528301883,
              'svs': 26},
 '0.001_2': {'e_in': 0.004484304932735439,
             'e_out': 0.01650943396226412,
             'svs': 76},
 '0.001_5': {'e_in': 0.004484304932735439,
             'e_out': 0.021226415094339646,
             'svs': 25},
 '0.01_2': {'e_in': 0.004484304932735439,
            'e_out': 0.018867924528301883,
            'svs': 34},
 '0.01_5': {'e_in': 0.0038436899423446302,
            'e_out': 0.021226415094339646,
            'svs': 23},
 '0.1_2': {'e_in': 0.004484304932735439,
           'e_out': 0.018867924528301883,
           'svs': 24},
 '0.1_5': {'e_in': 0.0032030749519538215,
           'e_out': 0.018867924528301883,
           'svs': 25},
 '1_2': {'e_in': 0.0032030749519538215,
         'e_out': 0.018867924528301883,
         'svs': 24},
 '1_5': {'e_in

## Cross-Validation



In [37]:
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold

# TODO: see if results differ between k fold and stratified k fold..

# 1-vs-5 subset of the training set; the cross-validation below runs on it.
train_xs_1_v_5, train_ys_1_v_5 = make_one_v_one(training_xs, training_ys, 1, 5)

# NOTE: no random_state is passed, so the folds (and therefore the counts
# printed at the end) vary from run to run.
rkf = RepeatedKFold(n_splits=10, n_repeats=100)

winners = []
# Split the same array that is indexed inside the loop. This keeps the cell
# runnable on a fresh kernel and guarantees the fold indices always span the
# whole 1-vs-5 subset, even when the cell is re-run.
for train_indexes, val_indexes in rkf.split(train_xs_1_v_5):
    # Per-fold names are distinct from the full subset so nothing is clobbered.
    fold_train_xs = train_xs_1_v_5[train_indexes]
    fold_train_ys = train_ys_1_v_5[train_indexes]
    fold_val_xs = train_xs_1_v_5[val_indexes]
    fold_val_ys = train_ys_1_v_5[val_indexes]

    # Start below any attainable accuracy so a winner is always recorded.
    best_score = -1
    winner = None
    for C in [.0001, .001, .01, .1, 1]:
        clf = svm.SVC(kernel='poly', C=C, degree=2, gamma=1.0, coef0=1.0)
        clf.fit(fold_train_xs, fold_train_ys)
        score = clf.score(fold_val_xs, fold_val_ys)
        # Strict '>' means ties keep the earlier, i.e. smaller, C.
        if score > best_score:
            best_score = score
            winner = C

    winners.append(winner)

# Tally how often each C achieved the best validation accuracy.
Cs, counts = np.unique(winners, return_counts = True)
print("Cs", Cs)
print("counts", counts)



Cs [1.e-04 1.e-03 1.e-02 1.e-01 1.e+00]
counts [388 500  47  24  41]


In [39]:
from sklearn.model_selection import cross_val_score

# Estimate the cross-validation error for a single value of C, using
# 100 repetitions of 10-fold CV on the 1-vs-5 training subset.
C = .001
clf = svm.SVC(kernel='poly', degree=2, C=C, coef0=1.0, gamma=1.0)
rkf = RepeatedKFold(n_repeats=100, n_splits=10)

scores = cross_val_score(
    clf,
    train_xs_1_v_5,
    train_ys_1_v_5,
    cv=rkf,
)

# scores holds one accuracy per fold; report the mean error instead.
print(1 - scores.mean())


0.004683610975012242


## RBF Kernel

In [43]:
# 1-vs-5 subset of the test set, used to measure E_out.
test_xs_1_v_5, test_ys_1_v_5 = make_one_v_one(testing_xs, testing_ys, 1, 5)

Cs = [.01, 1, 100, 10**4, 10**6]

# Both dicts are keyed by C.
e_ins, e_outs = {}, {}
for C in Cs:
    # NOTE(review): coef0 is documented as only affecting the 'poly' and
    # 'sigmoid' kernels, so it is presumably a no-op here; confirm before
    # removing it.
    clf = svm.SVC(kernel='rbf', C=C, gamma=1.0, coef0=1.0)
    clf.fit(train_xs_1_v_5, train_ys_1_v_5)

    e_in = 1 - clf.score(train_xs_1_v_5, train_ys_1_v_5)
    e_out = 1 - clf.score(test_xs_1_v_5, test_ys_1_v_5)
    e_ins[C], e_outs[C] = e_in, e_out

print("e_ins", e_ins)
print("e_outs", e_outs)
    

{0.01: 0.0038436899423446302, 1: 0.004484304932735439, 100: 0.0032030749519538215, 10000: 0.002562459961563124, 1000000: 0.0006406149903908087}
{0.01: 0.02358490566037741, 1: 0.021226415094339646, 100: 0.018867924528301883, 10000: 0.02358490566037741, 1000000: 0.02358490566037741}
