In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from numpy import linalg
from sklearn import svm
from sklearn.model_selection import KFold
sns.set_context('notebook')
%matplotlib inline

In [4]:
E_in = []
support_vectors =[]

# Loop through the values 0 - 9 for all digits.
for i in range(10):
    # Parse data into form [(x1, x2), y]
    with open("features.train.txt", "r") as train_set:
        raw_data = [line.strip().split('\t') for line in train_set]
    
    split = []
    for point in raw_data:
        split.append(point[0].split())
    
    train = []
    for point in split:
        train.append([np.array([float(point[1]), float(point[2])]), int(float(point[0]))])
    
    X_training = []
    Y_training = []
    
    # Classification for digit vs all classifier. i iterates through all digits.
    for point in train:
        if point[1] == i:
            point[1] = 1.0
        else:
            point[1] = -1.0
        X_training.append(point[0])
        Y_training.append(point[1])
    
    # Run svm package, with Q = 2 and C = 0.01
    svm1 = svm.SVC(kernel = 'poly', degree = 2, C = 0.01, coef0 = 1.0, gamma = 1.0)
    svm1.fit(X_training, Y_training)
    support_vectors.append(len(svm1.support_vectors_))
    
    error = 0.0
    
    for point in train:
        svm_prediction = svm1.predict([point[0]])
        # Find errors when the prediction doesn't line up with actual classification.
        if svm_prediction[0] != point[1]:
            error += 1
    error /= len(train)
    
    E_in.append((error, i))
        

In [4]:
E_in

[(0.10588396653408312, 0),
 (0.014401316691811822, 1),
 (0.10026059525442327, 2),
 (0.09024825126868742, 3),
 (0.08942531888629818, 4),
 (0.07625840076807022, 5),
 (0.09107118365107666, 6),
 (0.08846523110684405, 7),
 (0.07433822520916199, 8),
 (0.08832807570977919, 9)]

In [5]:
# maximum error for 0, 2, 4, 6, and 8 vs all
max(E_in[:9:2])

(0.10588396653408312, 0)

In [6]:
# Minimum error for 1, 3, 5, 7, and 9 vs all
min(E_in[1:10:2])

(0.014401316691811822, 1)

In [7]:
sv_max = max(E_in[:9:2])[1]
sv_min = min(E_in[1:10:2])[1]
# Difference in number of support vectors from the answers to the last two problems.
np.abs(support_vectors[sv_max] - support_vectors[sv_min])

1793

In [8]:
first_digit = 1.0
second_digit = 5.0
C_values = [0.001, 0.01, 0.1, 1]
E_in = []
E_out = []
support_vectors = []

# Loop through all C values
for c in C_values:
    with open("features.train.txt", "r") as train_set:
        raw_data = [line.strip().split('\t') for line in train_set]
    
    split = []
    for point in raw_data:
        split.append(point[0].split())
    
    train = []
    
    # Parse data for 1 vs 5 classifier into training set
    for point in split:
        if float(point[0]) == first_digit:
            train.append([np.array([float(point[1]), float(point[2])]), 1.0])
        elif float(point[0]) == second_digit:
            train.append([np.array([float(point[1]), float(point[2])]), -1.0])
            
    X_train = []
    Y_train = []
    
    for point in train:
        X_train.append(point[0])
        Y_train.append(point[1])
        
    with open('features.test.txt','r') as test_set:
        raw_data = [line.strip().split('\t') for line in test_set]
        
    split = []
    for point in raw_data:
        split.append(point[0].split())
        
    test = []
    
    # Parse data for 1 vs 5 classifier in test set
    for point in split:
        if float(point[0]) == first_digit:
            test.append([np.array([float(point[1]), float(point[2])]), 1.0])
        elif float(point[0]) == second_digit:
            test.append([np.array([float(point[1]), float(point[2])]), -1.0])
    
    # Run svm package on Q = 2 and whatever values of c is currently at
    svm1 = svm.SVC(kernel = 'poly', degree = 2, C = c, coef0 = 1.0, gamma = 1.0)
    svm1.fit(X_train, Y_train)
    # Keep track of support vectors required for the different values of C
    support_vectors.append((len(svm1.support_vectors_), c))
    
    e_in = 0.0
    for point in train:
        # Run prediction on each point in training set and calculate classification error
        svm_prediction = svm1.predict([point[0]])
        if svm_prediction[0] != point[1]:
            e_in += 1
        
    e_in /= len(train)
    
    E_in.append((e_in, c))
    
    e_out = 0.0
    for point in test:
        # Run predicition on each point in test set and calculate classification error
        svm_prediction = svm1.predict([point[0]])
        if svm_prediction[0] != point[1]:
            e_out += 1
    
    e_out /= len(test)
    
    E_out.append((e_out, c))
    
        

In [9]:
E_out

[(0.01650943396226415, 0.001),
 (0.018867924528301886, 0.01),
 (0.018867924528301886, 0.1),
 (0.018867924528301886, 1)]

In [10]:
E_in

[(0.004484304932735426, 0.001),
 (0.004484304932735426, 0.01),
 (0.004484304932735426, 0.1),
 (0.0032030749519538757, 1)]

In [11]:
support_vectors

[(76, 0.001), (34, 0.01), (24, 0.1), (24, 1)]

In [12]:
first_digit = 1.0
second_digit = 5.0
C_values = [0.0001, 0.001, 0.01, 0.1, 1]
E_in1 = []
E_in2 = []
E_out1 = []
E_out2 = []
support_vectors1 = []
support_vectors2 = []

for c in C_values:
    with open("features.train.txt", "r") as train_set:
        raw_data = [line.strip().split('\t') for line in train_set]
    
    split = []
    for point in raw_data:
        split.append(point[0].split())
    
    train = []
    # Parse data for 1 vs 5 classifier in training set
    for point in split:
        if float(point[0]) == first_digit:
            train.append([np.array([float(point[1]), float(point[2])]), 1.0])
        elif float(point[0]) == second_digit:
            train.append([np.array([float(point[1]), float(point[2])]), -1.0])
            
    X_train = []
    Y_train = []
    
    for point in train:
        X_train.append(point[0])
        Y_train.append(point[1])
        
    with open('features.test.txt','r') as test_set:
        raw_data = [line.strip().split('\t') for line in test_set]
        
    split = []
    for point in raw_data:
        split.append(point[0].split())
        
    test = []
    # Parse data for 1 vs 5 classifier in test set
    for point in split:
        if float(point[0]) == first_digit:
            test.append([np.array([float(point[1]), float(point[2])]), 1.0])
        elif float(point[0]) == second_digit:
            test.append([np.array([float(point[1]), float(point[2])]), -1.0])
    
    # Run svm package on Q = 2 for all C values
    svm1 = svm.SVC(kernel = 'poly', degree = 2, C = c, coef0 = 1.0, gamma = 1.0)
    svm1.fit(X_train, Y_train)
    support_vectors1.append((len(svm1.support_vectors_), c))
    
    # Run svm package on Q = 5 for all C values
    svm2 = svm.SVC(kernel = 'poly', degree = 5, C = c, coef0 = 1.0, gamma = 1.0)
    svm2.fit(X_train, Y_train)
    support_vectors2.append((len(svm2.support_vectors_), c))
    
    e_in1 = 0.0
    e_in2 = 0.0
    for point in train:
        # Calculate classification error for both svm predictions (Q = 2 and Q = 5)
        svm_prediction1 = svm1.predict([point[0]])
        svm_prediction2 = svm2.predict([point[0]])
        if svm_prediction1[0] != point[1]:
            e_in1 += 1
        if svm_prediction2[0] != point[1]:
            e_in2 += 1
        
    e_in1 /= len(train)
    e_in2 /= len(train)
    
    E_in1.append((e_in1, c))
    E_in2.append((e_in2, c))
    
    e_out1 = 0.0
    e_out2 = 0.0
    for point in test:
        # Calculate classification error for both svm predictions (Q = 2 and Q = 5)
        svm_prediction1 = svm1.predict([point[0]])
        svm_prediction2 = svm2.predict([point[0]])
        if svm_prediction1[0] != point[1]:
            e_out1 += 1
        if svm_prediction2[0] != point[1]:
            e_out2 += 1
    
    e_out1 /= len(test)
    e_out2 /= len(test)
    
    E_out1.append((e_out1, c))
    E_out2.append((e_out2, c))
    
        

In [13]:
E_in1

[(0.008968609865470852, 0.0001),
 (0.004484304932735426, 0.001),
 (0.004484304932735426, 0.01),
 (0.004484304932735426, 0.1),
 (0.0032030749519538757, 1)]

In [14]:
E_in2

[(0.004484304932735426, 0.0001),
 (0.004484304932735426, 0.001),
 (0.003843689942344651, 0.01),
 (0.0032030749519538757, 0.1),
 (0.0032030749519538757, 1)]

In [15]:
E_out1

[(0.01650943396226415, 0.0001),
 (0.01650943396226415, 0.001),
 (0.018867924528301886, 0.01),
 (0.018867924528301886, 0.1),
 (0.018867924528301886, 1)]

In [16]:
E_out2

[(0.018867924528301886, 0.0001),
 (0.02122641509433962, 0.001),
 (0.02122641509433962, 0.01),
 (0.018867924528301886, 0.1),
 (0.02122641509433962, 1)]

In [17]:
support_vectors1

[(236, 0.0001), (76, 0.001), (34, 0.01), (24, 0.1), (24, 1)]

In [18]:
support_vectors2

[(26, 0.0001), (25, 0.001), (23, 0.01), (25, 0.1), (21, 1)]

In [19]:
with open("features.train.txt", "r") as train_set:
    raw_data = [line.strip().split('\t') for line in train_set]
    
split = []
for point in raw_data:
    split.append(point[0].split())
    
train = []
# Parse data for 1 vs 5 classifier in training set
for point in split:
    if float(point[0]) == 1:
        train.append([np.array([float(point[1]), float(point[2])]), 1.0])
    elif float(point[0]) == 5:
        train.append([np.array([float(point[1]), float(point[2])]), -1.0])
        
X_values = []
Y_values = []

for point in train:
    X_values.append(point[0])
    Y_values.append(point[1])

# Initialize dictionary to keep track of wins for each value of C
Cwins = {0.0001:0, 0.001:0, 0.01:0, 0.1:0, 1:0}

C_values = [0.0001, 0.001, 0.01, 0.1, 1]
# For 100 random runs
for i in range(100):
    E_cv_average = []
    for c in C_values:
        # Run svm package with Q = 2 and for all C values
        svm1 = svm.SVC(kernel = 'poly', degree = 2, C = c, coef0 = 1.0, gamma = 1.0)
        # Use KFold to split up data for training and validation. Shuffle = true for randomization.
        kf = KFold(n_splits=10, shuffle = True)
        E_cv_values = []
        for i_train, i_test in kf.split(X_values):
            X_train = []
            Y_train = []
            # Put the points set aside for training in lists
            for j in i_train:
                X_train.append(X_values[j])
                Y_train.append(Y_values[j])
            
            svm1.fit(X_train, Y_train)
            X_test = []
            Y_test = []
             
            # Put the points set aside for testing in lists
            for k in i_test:
                X_test.append(X_values[k])
                Y_test.append(Y_values[k])
            
            E_cv = 0.0
            
            for j in range(len(X_test)):
                svm_prediction = svm1.predict([X_test[j]])
                if svm_prediction[0] != Y_test[j]:
                    E_cv += 1
            E_cv /= len(X_test)
            
            E_cv_values.append(E_cv)
        E_cv_average.append((np.mean(E_cv_values), c))
    Cwins[min(E_cv_average)[1]] += 1
    

In [20]:
Cwins

{0.0001: 0, 0.001: 37, 0.01: 31, 0.1: 15, 1: 17}

In [21]:
E_cv_average

[(0.0089702760084925687, 0.0001),
 (0.0044871794871794869, 0.001),
 (0.0044830965213130819, 0.01),
 (0.0038379879144210352, 0.1),
 (0.0057651478033643646, 1)]

In [22]:
with open("features.train.txt", "r") as train_set:
    raw_data = [line.strip().split('\t') for line in train_set]
    
split = []
for point in raw_data:
    split.append(point[0].split())
    
train = []
# Parse data for 1 vs 5 classifier in training set
for point in split:
    if float(point[0]) == 1:
        train.append([np.array([float(point[1]), float(point[2])]), 1.0])
    elif float(point[0]) == 5:
        train.append([np.array([float(point[1]), float(point[2])]), -1.0])
        
X_values = []
Y_values = []

for point in train:
    X_values.append(point[0])
    Y_values.append(point[1])
    
with open("features.test.txt", "r") as test_set:
    raw_data = [line.strip().split('\t') for line in test_set]
    
split = []
for point in raw_data:
    split.append(point[0].split())
    
test = []
# Parse data for 1 vs 5 classifier
for point in split:
    if float(point[0]) == 1:
        test.append([np.array([float(point[1]), float(point[2])]), 1.0])
    elif float(point[0]) == 5:
        test.append([np.array([float(point[1]), float(point[2])]), -1.0])
        
C_values = [.01, 1, 100, 10 ** 4, 10 ** 6]
E_in = []
E_out = []

for c in C_values:
    # Use the rbf kernel for this svm
    svm1 = svm.SVC(kernel = 'rbf', C = c, coef0 = 1.0, gamma = 1.0)
    svm1.fit(X_values, Y_values)
    
    e_in = 0.0
    
    for i in range(len(Y_values)):
        # Run svm_prediction on training set to find E_in
        svm_prediction = svm1.predict([X_values[i]])
        if svm_prediction[0] != Y_values[i]:
            e_in += 1
            
    e_in /= len(X_values)
    E_in.append((e_in, c))
    
    e_out = 0.0
    
    for point in test:
        # Run svm prediction on test set to find E_out
        svm_prediction = svm1.predict([point[0]])
        if svm_prediction != point[1]:
            e_out += 1
    e_out /= len(test)
    E_out.append((e_out, c))
    
    
    

In [23]:
E_in


[(0.003843689942344651, 0.01),
 (0.004484304932735426, 1),
 (0.0032030749519538757, 100),
 (0.0025624599615631004, 10000),
 (0.0006406149903907751, 1000000)]

In [24]:
E_out

[(0.02358490566037736, 0.01),
 (0.02122641509433962, 1),
 (0.018867924528301886, 100),
 (0.02358490566037736, 10000),
 (0.02358490566037736, 1000000)]