In [1]:
# Fabian Ardeljan
# Data Mining
# Dr. Ye

import numpy as np
from libsvm.svmutil import *
from sklearn.model_selection import KFold

In [2]:
y, X = svm_read_problem('./DogsVsCats/DogsVsCats.train')
y_testset, X_testset = svm_read_problem('./DogsVsCats/DogsVsCats.test')

(1) SVM Accuracy and Cross-Validation for Model Selection

In [3]:
kf = KFold(n_splits = 10, random_state=12345, shuffle=True)
fold_num = 0
linear_val_acc = []
polynomial_val_acc = []

for train_index, test_index in kf.split(X):
    fold_num += 1
    print("FOLD " + str(fold_num) + ":   TRAIN:", train_index, "TEST:", test_index)
    y_train = np.array(y)[train_index]
    X_train = np.array(X)[train_index]
    y_test = np.array(y)[train_index]
    X_test = np.array(X)[train_index]
    
    print("Training linear model")
    linear_model = svm_train(y_train, X_train, '-t 0')
    print("Testing linear model")
    p_label, p_acc, p_val = svm_predict(y_test, X_test, linear_model)
    linear_val_acc.append(p_acc[0])
    
    print("Training polynomial model")
    polynomial_model = svm_train(y_train, X_train, '-t 1 -d 5')
    print("Testing polynomial model")
    p_label, p_acc, p_val = svm_predict(y_test, X_test, polynomial_model)
    polynomial_val_acc.append(p_acc[0])
    print()
    
lin_val_acc = np.mean(linear_val_acc)
pol_val_acc = np.mean(polynomial_val_acc)
print("Linear kernel average validation accuracy: " + "{:.2f}".format(lin_val_acc) + "%")
print("Polynomial kernel average validation accuracy: " + "{:.2f}".format(pol_val_acc) + "%")

FOLD 1:   TRAIN: [    0     1     3 ... 12497 12498 12499] TEST: [    2    28    29 ... 12476 12490 12491]
Training linear model
Testing linear model
Accuracy = 60.0356% (6754/11250) (classification)
Training polynomial model
Testing polynomial model
Accuracy = 50.1333% (5640/11250) (classification)

FOLD 2:   TRAIN: [    0     1     2 ... 12497 12498 12499] TEST: [   10    15    26 ... 12462 12486 12489]
Training linear model
Testing linear model
Accuracy = 60.2667% (6780/11250) (classification)
Training polynomial model
Testing polynomial model
Accuracy = 50.1244% (5639/11250) (classification)

FOLD 3:   TRAIN: [    0     1     2 ... 12497 12498 12499] TEST: [   16    20    23 ... 12482 12492 12494]
Training linear model
Testing linear model
Accuracy = 59.8756% (6736/11250) (classification)
Training polynomial model
Testing polynomial model
Accuracy = 50.0356% (5629/11250) (classification)

FOLD 4:   TRAIN: [    0     2     3 ... 12497 12498 12499] TEST: [    1    22    45 ... 12444 

In [4]:
print("Training linear model")
linear_model = svm_train(y, X, '-t 0')
print("Testing linear model on training data")
p_label, p_acc, p_val = svm_predict(y, X, linear_model)
lin_trn_acc = p_acc[0]
print("Testing linear model on test data")
p_label, p_acc, p_val = svm_predict(y_testset, X_testset, linear_model)
lin_tst_acc = p_acc[0]
print()

print("Training polynomial model")
polynomial_model = svm_train(y, X, '-t 1 -d 5')
print("Testing polynomial model on training data")
p_label, p_acc, p_val = svm_predict(y, X, polynomial_model)
pol_trn_acc = p_acc[0]
print("Testing polynomial model on test data")
p_label, p_acc, p_val = svm_predict(y_testset, X_testset, polynomial_model)
pol_tst_acc = p_acc[0]

Training linear model
Testing linear model on training data
Accuracy = 60.12% (7515/12500) (classification)
Testing linear model on test data
Accuracy = 59.2% (7400/12500) (classification)

Training polynomial model
Testing polynomial model on training data
Accuracy = 50.024% (6253/12500) (classification)
Testing polynomial model on test data
Accuracy = 50.048% (6256/12500) (classification)


In [5]:
print("Results")
print()
print("Linear Kernel:")
print("Training accuracy: " + "{:.2f}".format(lin_trn_acc) + "%")
print("Validation accuracy: " + "{:.2f}".format(lin_val_acc) + "%")
print("Testing accuracy: " + "{:.2f}".format(lin_tst_acc) + "%")
print()
print("Polynomial Kernel:")
print("Training accuracy: " + "{:.2f}".format(pol_trn_acc) + "%")
print("Validation accuracy: " + "{:.2f}".format(pol_val_acc) + "%")
print("Testing accuracy: " + "{:.2f}".format(pol_tst_acc) + "%")

Results

Linear Kernel:
Training accuracy: 60.12%
Validation accuracy: 60.07%
Testing accuracy: 59.20%

Polynomial Kernel:
Training accuracy: 50.02%
Validation accuracy: 50.08%
Testing accuracy: 50.05%


The validation accuracy gives a very similar prediction of the test accuracy as the training accuracy. The linear kernel has higher test accuracy than the polynomial kernel.

(2) Boosting SVM

In [6]:
def AdaBoost(K):
    weights = np.ones(len(X))
    weights[:] = 1/len(X)
    alphas = []
    models = []
    
    for i in range(K):
        print("Iteration " + str(i + 1) + ":")
        
        params = '-t 0'
        print("Weights: " + str(weights))
        for w in range(len(weights)):
            params += ' -w' + str(w) + ' ' + str(weights[w] * len(weights))
        
        print("Training model " + str(i + 1))
        linear_model = svm_train(y, X, params)
        models.append(linear_model)
        print("Testing model " + str(i + 1) + " on training data")
        p_label, p_acc, p_val = svm_predict(y, X, linear_model)
        
        E = 0
        for p in range(len(p_label)):
            if (p_label[p] != y[p]):
                E += weights[p]
        print("Error: " + str(E))
        alpha = 0.5 * np.log((1-E)/E)
        alphas.append(alpha)
        print("Alpha: " + str(alpha))
    
        new_weights = []
        for w in range(len(weights)):
            new_weights.append(weights[w] * np.exp(np.array(-(alpha * y[w] * p_label[w]), dtype=np.float32)))
        weights = new_weights/sum(new_weights)
        print()
        
    predictions = []
    for m in range(len(models)):
        print("Testing model " + str(m + 1) + " on test data")
        p_label, p_acc, p_val = svm_predict(y_testset, X_testset, models[m])
        predictions.append(p_label)
    print()
        
    correct = 0
    for x in range(len(X_testset)):
        h = 0
        for a in range(len(alphas)):
            h += alphas[a] * predictions[a][x]
        if (h > 0 and y[x] > 0) or (h <= 0 and y[x] < 0):
            correct += 1
            
    h_accuracy = correct/len(X_testset)

    print("Ensemble classifier accuracy: " + "{:.2f}".format(100 * h_accuracy) + "%")

In [7]:
AdaBoost(10)

Iteration 1:
Weights: [8.e-05 8.e-05 8.e-05 ... 8.e-05 8.e-05 8.e-05]
Training model 1
Testing model 1 on training data
Accuracy = 60.12% (7515/12500) (classification)
Error: 0.39880000000003674
Alpha: 0.20523380989549153

Iteration 2:
Weights: [6.65336015e-05 1.00300900e-04 6.65336015e-05 ... 1.00300900e-04
 6.65336015e-05 6.65336015e-05]
Training model 2
Testing model 2 on training data
Accuracy = 57.472% (7184/12500) (classification)
Error: 0.44564097840580097
Alpha: 0.1091494417873163

Iteration 3:
Weights: [6.00094876e-05 1.12535544e-04 7.46493305e-05 ... 9.04656512e-05
 6.00094876e-05 6.00094876e-05]
Training model 3
Testing model 3 on training data
Accuracy = 50.424% (6303/12500) (classification)
Error: 0.4964012998408128
Alpha: 0.0071975246035184655

Iteration 4:
Weights: [6.04445312e-05 1.13351379e-04 7.51905069e-05 ... 8.98191864e-05
 5.95806616e-05 5.95806616e-05]
Training model 4
Testing model 4 on training data
Accuracy = 50.336% (6292/12500) (classification)
Error: 0.5002

After 10 iterations, the test accuracy of the ensemble classifier is exactly as high as that of its best performing model, namely the unboosted model. This indicates that boosting in this case has failed. This is evident by the fact that the accuracy keeps dropping from each iteration and tends to stabilize around 50%. As a result, only the first model has relatively strong accuracy and the remaining models only pose a neglegible effect, or cancel each other out. One possible cause for this is the linear kernel not being complex enough to fit the data set.

(3) Increasing the number of boosting iterations

In [8]:
AdaBoost(20)

Iteration 1:
Weights: [8.e-05 8.e-05 8.e-05 ... 8.e-05 8.e-05 8.e-05]
Training model 1
Testing model 1 on training data
Accuracy = 60.12% (7515/12500) (classification)
Error: 0.39880000000003674
Alpha: 0.20523380989549153

Iteration 2:
Weights: [6.65336015e-05 1.00300900e-04 6.65336015e-05 ... 1.00300900e-04
 6.65336015e-05 6.65336015e-05]
Training model 2
Testing model 2 on training data
Accuracy = 57.472% (7184/12500) (classification)
Error: 0.44564097840580097
Alpha: 0.1091494417873163

Iteration 3:
Weights: [6.00094876e-05 1.12535544e-04 7.46493305e-05 ... 9.04656512e-05
 6.00094876e-05 6.00094876e-05]
Training model 3
Testing model 3 on training data
Accuracy = 50.424% (6303/12500) (classification)
Error: 0.4964012998408128
Alpha: 0.0071975246035184655

Iteration 4:
Weights: [6.04445312e-05 1.13351379e-04 7.51905069e-05 ... 8.98191864e-05
 5.95806616e-05 5.95806616e-05]
Training model 4
Testing model 4 on training data
Accuracy = 50.336% (6292/12500) (classification)
Error: 0.5002

After 20 iterations, the test accuracy of the ensemble classifier remains only as high as the best performing model. This was to be expected as the alpha value dropped steeply after the first three iterations, meaning future alpha values remained negligible. In order for AdaBoost to work, the first few iterations must remain high in accuracy in order to have an effect on the overall result. For this particular dataset using a linear kernel, the maximum accuracy seems to be saturated in the unboosted SVM. If the original boosting attempt failed due to the linear kernel underfitting the data, adding more iterations is unlikely to fix the problem. A different kernel should be chosen instead.