In [1]:
import numpy as np
from sklearn import svm, metrics
from sklearn.model_selection import cross_val_score, KFold

## Load data and split into training and testing sets

In [2]:
train_dataset = np.loadtxt("features.train")
test_dataset = np.loadtxt("features.test")

# Initialise normal training data
train = train_dataset[:, 1:]
train_target = train_dataset[:, 0]

# Initialise testing data
test = test_dataset[:, 1:]
test_target = test_dataset[:, 0]

## Comparing one-versus-all classifiers

### Question 2
C = 0.001, Q = 2
Comparing in sample error of the following classifiers:
    0 versus all
    2 versus all
    4 versus all
    6 versus all
    8 versus all
            

In [3]:
def x_v(x, target):
    """
    Return classifier from US Postal Service Zip Code data set of 
    type x-versus-all. 
    Does not modify target.
    
    x - digit in postal code dataset
    target - dataset
    
    """
    data = np.copy(target)
    for point in np.nditer(data, op_flags=['readwrite']):
        if point == x:
            point[...] = 1
        else:
            point[...] = -1
    return data

In [4]:
classifier = svm.SVC(C=0.01, kernel='poly', degree=2)
for x in [0, 2, 4, 6, 8]:
   target_tmp = x_v(x, train_target)
   classifier.fit(train, target_tmp)
   print(x, classifier.score(train, target_tmp))

0 0.836236455905
2 0.899739404746
4 0.910574681114
6 0.908928816349
8 0.925661774791


### Question 3
C = 0.01, Q = 3
Comparing in sample error of the following classifiers:
    1 versus all
    3 versus all
    5 versus all
    7 versus all
    9 versus all

In [5]:
classifier = svm.SVC(C=0.01, kernel='poly', degree=2)
for x in [1, 3, 5, 7, 9]:
   target_tmp = x_v(x, train_target)
   classifier.fit(train, target_tmp)
   print(x, classifier.score(train, target_tmp))

1 0.984227129338
3 0.909751748731
5 0.923741599232
7 0.911534768893
9 0.91167192429


### Question 4
Comparing the number of support vectors in classifiers selected in Questions 2 and 3. 
Selected classifier for question 2 was '0 versus all'
Selected classifier for question 2 was '1 versus all'

In [6]:
classifier_1 = svm.SVC(C=0.01, kernel='poly', degree=2)
target_tmp = x_v(0, train_target)
classifier_1.fit(train, target_tmp)
print('Number of support vectors for Q2: ', len(classifier_1.support_vectors_))

classifier_2 = svm.SVC(C=0.01, kernel='poly', degree=2)
target_tmp = x_v(1, train_target)
classifier_2.fit(train, target_tmp)
print('Number of support vectors for Q3: ', len(classifier_2.support_vectors_))

Number of support vectors for Q2:  2390
Number of support vectors for Q3:  536


# Comparing one-versus-one classifiers

One digit is class +1 and another digit is class -1, with the rest of the digits disregarded. For the following problems, we use a 1-versus-5 classifier. 

In [7]:
def one_v_one(x, y, dataset):
    """
    Return classifier of type x-versus-all
    does not modify target
    """
    result = []
    for row in dataset:
        if row[0] == x:
            result.append([1, row[1], row[2]])
        elif row[0] == y:
            result.append([-1, row[1], row[2]])
    return np.array(result)

In [8]:
# Initialise 1v5 training data
train_dataset_tmp = one_v_one(1, 5, train_dataset)
train = train_dataset_tmp[:, 1:]
train_target = train_dataset_tmp[:, 0]

# Initialise 1v5 testing data
test_dataset_tmp = one_v_one(1, 5, test_dataset)
test = test_dataset_tmp[:, 1:]
test_target = test_dataset_tmp[:, 0]

## Question 5
Consider the 1 versus 5 classifier with Q = 2 and C = [0.001, 0.01, 0.1, 1]. Finding effect of an increase in C on:
    number of support vectors, 
    in sample error and 
    out of sample error.

In [9]:
number_support_vectors = []
in_sample_score = []
out_sample_score = []

for C in [0.001, 0.01, 0.1, 1]:
   classifier = svm.SVC(C=C, kernel='poly', degree=2, gamma=1.0)
   classifier.fit(train, train_target)

   number_support_vectors.append((len(classifier.support_vectors_)))
   in_sample_score.append(classifier.score(train, train_target))
   out_sample_score.append(classifier.score(test, test_target))

print('SV: ', number_support_vectors)
print('Training score: ', in_sample_score)
print('Testing score: ', out_sample_score)

SV:  [80, 34, 24, 24]
Training score:  [0.99551569506726456, 0.99551569506726456, 0.99551569506726456, 0.99615631005765537]
Testing score:  [0.98349056603773588, 0.98113207547169812, 0.98113207547169812, 0.98113207547169812]


## Question 6
Consider the 1 versus 5 classifier with Q = 2 and Q = 5. C = [0.0001, 0.001, 0.01, 1]. 

Finding effect of an increase in C with different Q on: number of support vectors, in sample error and out of sample error.

In [10]:
number_support_vectors_5 = []
in_sample_score_5 = []
out_sample_score_5 = []

number_support_vectors_2 = []
in_sample_score_2 = []
out_sample_score_2 = []

for C in [0.0001, 0.001, 0.01, 1]:
    classifier = svm.SVC(C=C, kernel='poly', degree=5, gamma=1.0,
                         decision_function_shape='ovo')
    classifier.fit(train, train_target)

    number_support_vectors_5.append((len(classifier.support_vectors_)))
    in_sample_score_5.append(1- classifier.score(train, train_target))
    out_sample_score_5.append(1- classifier.score(test, test_target))

for C in [0.0001, 0.001, 0.01, 1]:
    classifier = svm.SVC(C=C, kernel='poly', degree=2, gamma=1.0,
                         decision_function_shape='ovo')
    classifier.fit(train, train_target)

    number_support_vectors_2.append((len(classifier.support_vectors_)))
    in_sample_score_2.append(1 - classifier.score(train, train_target))
    out_sample_score_2.append(1- classifier.score(test, test_target))


print('SV 2: ', number_support_vectors_2)
print('SV 5: ', number_support_vectors_5)

print('Training score 2: ', in_sample_score_2)
print('Training score 5: ', in_sample_score_5)

print('Testing score 2: ', out_sample_score_2)
print('Testing score 5: ', out_sample_score_5)

SV 2:  [244, 80, 34, 24]
SV 5:  [26, 26, 27, 24]
Training score 2:  [0.010249839846252384, 0.004484304932735439, 0.004484304932735439, 0.0038436899423446302]
Training score 5:  [0.004484304932735439, 0.004484304932735439, 0.0051249199231262477, 0.004484304932735439]
Testing score 2:  [0.01650943396226412, 0.01650943396226412, 0.018867924528301883, 0.018867924528301883]
Testing score 5:  [0.01650943396226412, 0.01650943396226412, 0.01650943396226412, 0.01650943396226412]


## Cross Validation
In the next two problems, we experiment with 10-fold cross validation for the polynomial kernel. Because Ecv is a random variable that depends on the random partition of the data, we will try 100 runs with different partitions and base our answer on how many runs lead to a particular choice.

### Question 7
Consider the 1 versus 5 classiffer with Q = 2. Use Ecv to select between C = [0.0001, 0.001, 0.01, 1]. Which C is selected most often?

In [11]:
results = []
for i in range(10):
   scores = []
   for C in [0.0001, 0.001, 0.01, 0.1, 1]:
       classifier = svm.SVC(C=C, kernel='poly', degree=2, gamma=1.0)
       kf = KFold(n_splits=10)
       scores.append(np.mean(cross_val_score(classifier, train, train_target, cv=kf)))
   results.append(np.argmax(scores))

print(results)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


### Question 8
Again, consider the 1 versus 5 classiffer with Q = 2. For the winning selection in the previous problem, what is the average value of Ecv over the 100 runs?

In [12]:
results = []
for i in range(10):
   classifier = svm.SVC(C=0.001, kernel='poly', degree=2, gamma=1.0)
   kf = KFold(n_splits=10)
   results.append(np.mean(cross_val_score(classifier, train, train_target, cv=kf)))

print(1- np.mean(results))

0.00448309652131


## RBF Kernel

### Questions 9 and 10
Consider the radial basis function (RBF) kernel in the soft-margin SVM approach. Focus on the 1 versus 5 classiffer. Which c = [0.01, 1, 100, 10^4, 10^6] results in the lowest Ein and Eout?

In [13]:
in_sample_score = []
out_sample_score = []

for C in [0.01, 1, 100, 10**4, 10**6]:
   classifier = svm.SVC(C=C, kernel='rbf', gamma=1.0)
   classifier.fit(train, train_target)

   in_sample_score.append(classifier.score(train, train_target))
   out_sample_score.append(classifier.score(test, test_target))

print('Training score: ', in_sample_score)
print('Testing score: ', out_sample_score)

Training score:  [0.99615631005765537, 0.99551569506726456, 0.99679692504804618, 0.99743754003843688, 0.99935938500960919]
Testing score:  [0.97641509433962259, 0.97877358490566035, 0.98113207547169812, 0.97641509433962259, 0.97641509433962259]
