In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from crossval import cross_validate

from svm_multiple_labels import SvmMultipleLabels

In [3]:
pd_train_data = pd.read_csv("../dataset/train.csv") #reading the csv files using pandas

np_labels = pd_train_data['label']
np_data = pd_train_data.drop(columns = 'label')

In [4]:
# Preprocessing
np_data = np_data / 255.0

In [5]:
num_models = 3
num_datasets = 10
num_folds = 4

train_data = np_data[:1000].to_numpy()
test_data = np_data[1001:11001].to_numpy()
train_labels = np_labels[:1000].to_numpy()
test_labels = np_labels[1001:11001].to_numpy()

train_data = np.array_split(np.transpose(train_data), num_datasets, axis=1)
test_data = np.array_split(np.transpose(test_data), num_datasets, axis=1)
train_labels = np.array_split(np.transpose(train_labels), num_datasets)
test_labels = np.array_split(np.transpose(test_labels), num_datasets)

# initialize matrix to store test accuracies
test_accuracy = np.zeros((num_datasets, num_models))

In [6]:
# run linear SVM

c_vals = 10 ** np.linspace(-3, 1, 5)

for i in range(num_datasets):
    best_params = []
    best_score = 0

    for j in range(len(c_vals)):
        params = {
            'kernel': 'linear',
            'C': c_vals[j]
        }
        
        cv_score, _ = cross_validate(SvmMultipleLabels, train_data[i], train_labels[i], num_folds, params)
        
        if cv_score > best_score:
            best_score = cv_score
            best_params = params
    
    lin_svm = SvmMultipleLabels(best_params)
    lin_svm_model = lin_svm.fit(train_data[i], train_labels[i])
    predictions = lin_svm.predict(test_data[i])
    test_accuracy[i, 0] = np.mean(predictions == test_labels[i])
    
    print("Linear SVM had test accuracy %f on Dataset %d" % (test_accuracy[i, 0], i))
    print("with C = %f" % (best_params['C']))

Linear SVM had test accuracy 0.769000 on Dataset 0
with C = 1.000000
Linear SVM had test accuracy 0.739000 on Dataset 1
with C = 0.100000
Linear SVM had test accuracy 0.733000 on Dataset 2
with C = 0.100000
Linear SVM had test accuracy 0.745000 on Dataset 3
with C = 0.100000
Linear SVM had test accuracy 0.732000 on Dataset 4
with C = 1.000000
Linear SVM had test accuracy 0.717000 on Dataset 5
with C = 0.100000
Linear SVM had test accuracy 0.702000 on Dataset 6
with C = 0.100000
Linear SVM had test accuracy 0.709000 on Dataset 7
with C = 0.100000
Linear SVM had test accuracy 0.708000 on Dataset 8
with C = 0.100000
Linear SVM had test accuracy 0.699000 on Dataset 9
with C = 0.100000


In [7]:
# run polynomial SVM

c_vals = 10 ** np.linspace(-3, 1, 5)
orders = [1, 2, 3, 4]

for i in range(num_datasets):
    best_params = {}
    best_score = 0
    
    for j in range(len(c_vals)):
        for k in range(len(orders)):
            params = {
                'kernel': 'polynomial',
                'C': c_vals[j],
                'order': orders[k]
            }
        
            cv_score, _ = cross_validate(SvmMultipleLabels, train_data[i], train_labels[i], num_folds, params)
        
            if cv_score > best_score:
                best_score = cv_score
                best_params = params
    
    poly_svm = SvmMultipleLabels(best_params)
    lin_svm_model = poly_svm.fit(train_data[i], train_labels[i])
    predictions = poly_svm.predict(test_data[i])
    test_accuracy[i, 1] = np.mean(predictions == test_labels[i])

    print("Polynomial SVM had test accuracy %f on Dataset %d" % (test_accuracy[i, 1], i))
    print("with C = %f, order = %d" % (best_params['C'], best_params['order']))

ssing all-zeros solution.
Polynomial SVM had test accuracy 0.708000 on Dataset 9
with C = 0.001000, order = 2


In [8]:
# run RBF SVM

c_vals = 10 ** np.linspace(-3, 3, 7)
sigmas = np.linspace(1, 100, 20)


for i in range(num_datasets):
    best_params = []
    best_score = 0

    for j in range(len(c_vals)):
        for k in range(len(sigmas)):
            params = {
                'kernel': 'rbf',
                'C': c_vals[j],
                'sigma': sigmas[k]
            }
        
            cv_score, _ = cross_validate(SvmMultipleLabels, train_data[i], train_labels[i], num_folds, params)
        
            if cv_score > best_score:
                best_score = cv_score
                best_params = params
    
    rbf_svm = SvmMultipleLabels(best_params)
    lin_svm_model = rbf_svm.fit(train_data[i], train_labels[i])
    predictions = rbf_svm.predict(test_data[i])
    test_accuracy[i, 2] = np.mean(predictions == test_labels[i])
    
    print("RBF SVM had test accuracy %f on Dataset %d" % (test_accuracy[i, 2], i))
    print("with C = %f, sigma = %f" % (best_params['C'], best_params['sigma']))

RBF SVM had test accuracy 0.744000 on Dataset 0
with C = 0.001000, sigma = 11.421053
RBF SVM had test accuracy 0.774000 on Dataset 1
with C = 10.000000, sigma = 6.210526
RBF SVM had test accuracy 0.752000 on Dataset 2
with C = 10.000000, sigma = 6.210526
RBF SVM had test accuracy 0.774000 on Dataset 3
with C = 10.000000, sigma = 6.210526
RBF SVM had test accuracy 0.773000 on Dataset 4
with C = 10.000000, sigma = 6.210526
RBF SVM had test accuracy 0.744000 on Dataset 5
with C = 10.000000, sigma = 11.421053
RBF SVM had test accuracy 0.769000 on Dataset 6
with C = 10.000000, sigma = 6.210526
RBF SVM had test accuracy 0.713000 on Dataset 7
with C = 100.000000, sigma = 42.684211
RBF SVM had test accuracy 0.760000 on Dataset 8
with C = 10.000000, sigma = 6.210526
RBF SVM had test accuracy 0.733000 on Dataset 9
with C = 10.000000, sigma = 6.210526


In [13]:
# print accuracy table

methods = ['LinSVM\t', 'PolySVM\t', 'RBFSVM\t']

print((" " * 12) + "TEST ACCURACIES (Percent)")
print("-" * 54)

print("Set\t   " + "   ".join(["%d" % number for number in range(num_datasets)]) + "  AVG")
print("-" * 54)
for i in range(len(methods)):
    line = [methods[i]]
    sum_of_acc = 0
    for j in range(num_datasets):
        line += ["%d" % (100 * test_accuracy[j, i])]
        sum_of_acc += (100 * test_accuracy[j, i])
    line += ["%d" % (sum_of_acc / num_datasets)]
    print("  ".join(line))

            TEST ACCURACIES (Percent)
------------------------------------------------------
Set	   0   1   2   3   4   5   6   7   8   9  AVG
------------------------------------------------------
LinSVM	  76  73  73  74  73  71  70  70  70  69  72
PolySVM	  76  75  74  74  74  71  74  70  73  70  73
RBFSVM	  74  77  75  77  77  74  76  71  76  73  75
