# Train SVM File
***

1. Import libraries
2. Load preprocessed data
3. Train SVM
4. Save model

In [1]:
# 1. import libraries
import numpy as np
import pandas as pd
import torch
# import svm classifier from sklearn
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
import pickle


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# constants

CV = 5

In [4]:
PREPROCESSED_DATA_PATH = '../Preprocessed Dataset/'
MODEL_PATH = '../Models/'
X = 'X_'
y =  'y_'
TRAIN = 'train_'
VAL = 'val_'
TEST = 'test_'
path = '../Preprocessed Dataset/'
NPY = '.npy'

In [8]:
# list of test accuracies
test_acc = []
# tune the hyperparameters
    # 1. Kernel type
    # 2. C

TRAIN_ = False

kernels = ['linear', 'poly', 'rbf', 'sigmoid']
C = [1] # np.logspace(-3, 3)

val_acc = np.zeros((CV, len(kernels), len(C)))

for iteration in range(CV) :
    
    print("K-Fold Cross Validation Iteration: ", (iteration+ 1))

    # Get path
    TRAIN_PATH = PREPROCESSED_DATA_PATH + X + TRAIN + str(iteration+1) + NPY
    VAL_PATH = PREPROCESSED_DATA_PATH + X + VAL + str(iteration+1) + NPY
    TEST_PATH = PREPROCESSED_DATA_PATH + X + TEST + str(iteration+1) + NPY


    # load data from numpy array
    train = np.load(TRAIN_PATH, allow_pickle=True)
    traindf = pd.DataFrame(train, columns = ["Tensor", "Target"])
    val = np.load(VAL_PATH, allow_pickle=True)
    valdf = pd.DataFrame(val, columns = ["Tensor", "Target"])
    test = np.load(TEST_PATH, allow_pickle=True)
    testdf = pd.DataFrame(test, columns = ["Tensor", "Target"])
    

    X_train = traindf.Tensor
    X_train = np.array([i.flatten().numpy() for i in X_train.to_list()])
    y_train = traindf.Target.to_numpy().astype(int)

    X_val = valdf.Tensor
    X_val = np.array([i.flatten().numpy() for i in X_val.to_list()])
    y_val = valdf.Target.to_numpy().astype(int)


    for kernel in kernels :
        print('\tKernel: ', kernel)
        for c in C :
            # create a svm classifier
            svm = SVC(kernel=kernel, C=c)

            # model name 
            filename = MODEL_PATH + 'svm-k_' + kernel + '-c_' + str(c) +'-cv_' + str(iteration) + '.sav'

            if TRAIN_ :
                # fit the model
                svm.fit(X_train, y_train)
                # save the model
                pickle.dump(svm, open(filename, 'wb'))
            
            else :
                # load the model
                svm = pickle.load(open(filename, 'rb'))
            
            # predict the model
            y_pred = svm.predict(X_val)
            # calculate the accuracy
            accuracy = np.sum(y_pred == y_val) / len(y_val)
            # append the validation accuracy to the list
            val_acc[iteration, kernels.index(kernel), C.index(c)] = accuracy

print("TRAINING COMPLETED")
# average over the k-folds
val_acc = np.mean(val_acc, axis=0)
# get the best hyperparameters
best_kernel = kernels[np.argmax(val_acc) // len(C)]
best_c = C[np.argmax(val_acc) % len(C)]

print("Best Kernel: ", best_kernel, "Best C: ", best_c)

for cv in range(CV) :
    # create a svm classifier with the best hyperparameters
    svm = SVC(kernel=best_kernel, C=best_c)
    # Load train data
    TRAIN_PATH = PREPROCESSED_DATA_PATH + X + TRAIN + str(cv+1) + NPY
    train = np.load(TRAIN_PATH, allow_pickle=True)
    traindf = pd.DataFrame(train, columns = ["Tensor", "Target"])

    X_train = traindf.Tensor
    X_train = np.array([i.flatten().numpy() for i in X_train.to_list()])
    y_train = traindf.Target.to_numpy().astype(int)

    X_test = testdf.Tensor
    X_test = np.array([i.flatten().numpy() for i in X_test.to_list()])
    y_test = testdf.Target.to_numpy().astype(int)

    # load the model
    filename = MODEL_PATH + 'svm-k_' + best_kernel + '-c_' + str(best_c) +'-cv_' + str(cv) + '.sav'
    svm = pickle.load(open(filename, 'rb'))

    # predict the model
    y_pred = svm.predict(X_test)
    
    # calculate the accuracy
    accuracy = np.sum(y_pred == y_test) / len(y_test)

    # append the test accuracy to the list
    test_acc.append(accuracy)

# average over the k-folds
test_acc = np.mean(test_acc)

# print the test accuracy
print("Test Accuracy: ", test_acc)

K-Fold Cross Validation Iteration:  1
	Kernel:  linear
	Kernel:  poly
	Kernel:  rbf
	Kernel:  sigmoid
K-Fold Cross Validation Iteration:  2
	Kernel:  linear
	Kernel:  poly
	Kernel:  rbf
	Kernel:  sigmoid
K-Fold Cross Validation Iteration:  3
	Kernel:  linear
	Kernel:  poly
	Kernel:  rbf
	Kernel:  sigmoid
K-Fold Cross Validation Iteration:  4
	Kernel:  linear
	Kernel:  poly
	Kernel:  rbf
	Kernel:  sigmoid
K-Fold Cross Validation Iteration:  5
	Kernel:  linear
	Kernel:  poly
	Kernel:  rbf
	Kernel:  sigmoid
TRAINING COMPLETED
Best Kernel:  linear Best C:  1
Test Accuracy:  0.38


### Log

5-fold cross validation with a single kernel takes 17 minutes and Test Accuracy: 0.38 

# SGD Classifier



In [7]:
# list of test accuracies
test_acc = []
# tune the hyperparameters
    # 1. Kernel type
    # 2. C

TRAIN_ = True

# kernels = ['linear', 'poly', 'rbf', 'sigmoid']
# C = [1] # np.logspace(-3, 3)
losses = ['hinge', 'squared_hinge']

val_acc = np.zeros((CV, len(losses)))

for iteration in range(CV) :
    
    print("K-Fold Cross Validation Iteration: ", (iteration+ 1))

    # Get path
    TRAIN_PATH = PREPROCESSED_DATA_PATH + X + TRAIN + str(iteration+1) + NPY
    VAL_PATH = PREPROCESSED_DATA_PATH + X + VAL + str(iteration+1) + NPY
    TEST_PATH = PREPROCESSED_DATA_PATH + X + TEST + str(iteration+1) + NPY
    print("\tPaths fetched!")

    # load data from numpy array
    train = np.load(TRAIN_PATH, allow_pickle=True)
    traindf = pd.DataFrame(train, columns = ["Tensor", "Target"])
    val = np.load(VAL_PATH, allow_pickle=True)
    valdf = pd.DataFrame(val, columns = ["Tensor", "Target"])
    test = np.load(TEST_PATH, allow_pickle=True)
    testdf = pd.DataFrame(test, columns = ["Tensor", "Target"])
    print("\tDataframe loaded")

    X_train = traindf.Tensor
    X_train = np.array([i.flatten().numpy() for i in X_train.to_list()])
    y_train = traindf.Target.to_numpy().astype(int)

    X_val = valdf.Tensor
    X_val = np.array([i.flatten().numpy() for i in X_val.to_list()])
    y_val = valdf.Target.to_numpy().astype(int)
    print("\tFeatures extracted")

    for loss in losses :
        print("\t\tLoss: ", loss)
        # create a svm classifier
        svm = SGDClassifier(loss=loss)

        # model name 
        filename = MODEL_PATH + 'sgd-' + loss +'-cv-' + str(iteration) + '.sav'

        if TRAIN_ :
            # fit the model
            svm.fit(X_train, y_train)
            # save the model
            pickle.dump(svm, open(filename, 'wb'))
        
        else :
            # load the model
            svm = pickle.load(open(filename, 'rb'))
        
        print("\t\tModel trained and saved")
        # predict the model
        y_pred = svm.predict(X_val)
        # calculate the accuracy
        accuracy = np.sum(y_pred == y_val) / len(y_val)
        # append the validation accuracy to the list
        val_acc[iteration, losses.index(loss)] = accuracy
        print("\t\tValidation Accuracy: ", accuracy)

print("TRAINING COMPLETED")
# average over the k-folds
val_acc = np.mean(val_acc, axis=0)

# save the validation accuracy to a file
np.save(MODEL_PATH + 'sgd-val-acc.npy', val_acc)

# get the best loss
best_loss = losses[np.argmax(val_acc) % len(losses)]


for cv in range(CV) :
    print("K-Fold Cross Validation Iteration: ", (cv+ 1))
    # create a svm classifier with the best hyperparameters
    svm = SGDClassifier(loss=best_loss)
    # Load train data
    TRAIN_PATH = PREPROCESSED_DATA_PATH + X + TRAIN + str(cv+1) + NPY
    train = np.load(TRAIN_PATH, allow_pickle=True)
    traindf = pd.DataFrame(train, columns = ["Tensor", "Target"])

    X_train = traindf.Tensor
    X_train = np.array([i.flatten().numpy() for i in X_train.to_list()])
    y_train = traindf.Target.to_numpy().astype(int)

    X_test = testdf.Tensor
    X_test = np.array([i.flatten().numpy() for i in X_test.to_list()])
    y_test = testdf.Target.to_numpy().astype(int)

    print("\tDataframe loaded and Features extracted")
    # load the model
    filename = MODEL_PATH + 'sgd-' + loss +'-cv-' + str(iteration) + '.sav'
    svm = pickle.load(open(filename, 'rb'))

    print("\tModel loaded")
    # predict the model
    y_pred = svm.predict(X_test)
    
    # calculate the accuracy
    accuracy = np.sum(y_pred == y_test) / len(y_test)

    # append the test accuracy to the list
    test_acc.append(accuracy)

# average over the k-folds
test_acc = np.mean(test_acc)

# print the test accuracy
print("Test Accuracy: ", test_acc)

K-Fold Cross Validation Iteration:  1
	Paths fetched!
	Dataframe loaded
	Features extracted
		Loss:  hinge
		Model trained and saved
		Validation Accuracy:  0.165
		Loss:  squared_hinge
		Model trained and saved
		Validation Accuracy:  0.195
K-Fold Cross Validation Iteration:  2
	Paths fetched!
	Dataframe loaded
	Features extracted
		Loss:  hinge
		Model trained and saved
		Validation Accuracy:  0.17
		Loss:  squared_hinge
		Model trained and saved
		Validation Accuracy:  0.155
K-Fold Cross Validation Iteration:  3
	Paths fetched!
	Dataframe loaded
	Features extracted
		Loss:  hinge
		Model trained and saved
		Validation Accuracy:  0.095
		Loss:  squared_hinge
		Model trained and saved
		Validation Accuracy:  0.09
K-Fold Cross Validation Iteration:  4
	Paths fetched!
	Dataframe loaded
	Features extracted
		Loss:  hinge
		Model trained and saved
		Validation Accuracy:  0.11
		Loss:  squared_hinge
		Model trained and saved
		Validation Accuracy:  0.14
K-Fold Cross Validation Iteration:  

In [8]:
# load validation accuracy matrix
val_acc = np.load(MODEL_PATH + 'sgd-val-acc.npy')
print(val_acc)

[0.121 0.133]
