In [None]:
import sklearn as sk
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import math

Load Data Set
--

In [None]:
# Load train datasets
# This data set has already removed classes other than Class 5 and 7
# The context manager closes the file handle as soon as parsing is finished.
with open("fashion-mnist_train.csv") as f:
    trainData = np.loadtxt(f, delimiter= ",")
# Separate target and features: column 0 is the target, the remaining columns
# are the inputs. Only the first 4000 rows are used for training.
xTrain = trainData[:4000, 1:]
yTrain = trainData[:4000, 0]
# Pre-process data by rescaling inputs by 255.
# Vectorized division keeps xTrain a NumPy array instead of building a nested
# Python list element by element; the resulting values are the same.
xTrain = xTrain / 255

# Load Test datasets
# This data set has already removed classes other than Class 5 and 7
with open("fashion-mnist_test.csv") as f:
    testData = np.loadtxt(f, delimiter= ",")
# Separate target and features (all rows are kept for the test set).
xTest = testData[:, 1:]
yTest = testData[:, 0]
# Pre-process data by rescaling inputs by 255
xTest = xTest / 255

Misc Functions
--

In [None]:
def plotGraph(a, fa, b = [], fb = [], name = 'untitiled', xvalrange = 100, yvalrange = 100 + 4, xAxisTitle = "", title = "", digits = 5, single = False, label = False):
    """Plot one or two error curves, mark their minima and save the figure.

    a, fa      -- x and y values of the first curve.
    b, fb      -- x and y values of the second curve (ignored when single is True).
    name       -- the figure is written to name + '.png'.
    xvalrange  -- upper limit of the x axis (lower limit is 0).
    yvalrange  -- upper limit of the y axis (lower limit is 0).
    xAxisTitle -- x axis label.
    title      -- prefix of the figure title (' vs. Error' is appended).
    digits     -- number of characters of the minimum value shown in the annotation.
    single     -- when True only (a, fa) is drawn and its minimum is annotated;
                  otherwise both curves are drawn and the minimum of fb is annotated.
    label      -- when True the minimum of fa is annotated as well.

    The y sequences must provide an .index() method (e.g. plain lists).
    """
    plt.clf()

    # Draw the curve(s) and remember which one carries the main annotation.
    if single:
        plt.plot(a, fa, linewidth=3.0, label='Dataset')
        markedX, markedY = a, fa
    else:
        plt.plot(a, fa, linewidth=3.0, label='Training Data')
        plt.plot(b, fb, linewidth=3.0, label='Test Data')
        markedX, markedY = b, fb

    # Main annotation: lowest point of the marked curve, text placed above it.
    lowest = min(markedY)
    lowestAt = markedX[markedY.index(lowest)]
    caption = 'Minimum \n(' + str(lowestAt) + ', ' + str(lowest)[0:digits] + ')'
    plt.annotate(caption, xy=(lowestAt, lowest), xytext=(lowestAt, lowest + 10), arrowprops=dict(facecolor='black', shrink=0.05))

    # Optional second annotation: lowest point of the first curve, text placed below it.
    if label:
        firstLowest = min(fa)
        firstLowestAt = a[fa.index(firstLowest)]
        firstCaption = 'Minimum \n(' + str(firstLowestAt) + ', ' + str(firstLowest)[0:digits] + ')'
        plt.annotate(firstCaption, xy=(firstLowestAt, firstLowest), xytext=(firstLowestAt, firstLowest - 5), arrowprops=dict(facecolor='blue', shrink=0.05))

    # Axes, title, legend, then write the image to disk.
    plt.xlabel(xAxisTitle)
    plt.ylabel('Error (%)')
    plt.xlim(0, xvalrange)
    plt.ylim(0, yvalrange)
    plt.title(title + ' vs. Error')
    plt.legend(loc = 'lower right')
    plt.savefig(name  + '.png')

In [None]:
def stats(pred, test, debug = True):
    """Report classification metrics and return the misclassification rate.

    pred  -- predicted labels.
    test  -- true labels, same length as pred.
    debug -- when True, print the mean absolute error, the accuracy and the
             scikit-learn classification report.

    Returns the fraction of samples whose prediction differs from the true
    label, as a Python float in [0, 1].

    Raises ValueError when pred and test do not have the same shape.

    The returned value used to be the mean absolute error of the labels. That
    quantity depends on the numeric encoding of the classes (for two classes
    encoded as 5 and 7 every mistake counts as 2), while the callers multiply
    the result by 100 and plot it as 'Error (%)'. The zero-one error rate is
    independent of the label encoding and equals the old value when the two
    classes are encoded as 0 and 1.
    """
    if debug:
        print("\n-----------------------------------Metrics-----------------------------------")
        print('Mean Absolute Error:', metrics.mean_absolute_error(test, pred))
        print('           Accuracy:',accuracy_score(test, pred))
        print("\n-----------------------------------Report------------------------------------")
        print(classification_report(test,pred))

    expected = np.asarray(test)
    predicted = np.asarray(pred)
    if expected.shape != predicted.shape:
        raise ValueError("pred and test must have the same shape, got "
                         + str(predicted.shape) + " and " + str(expected.shape))
    # Share of mismatching labels; float() so callers get a plain Python number.
    return float(np.mean(expected != predicted))

Logistic Regression
--

In [None]:
def executeLogisticRegression(xTrain = [], yTrain = [], xTest = [], yTest = [], c = 1.0, pen = 'l2', train = True, test = True):
    """Fit a logistic regression model and return its error(s) as a list.

    xTrain, yTrain -- data the model is fitted on.
    xTest, yTest   -- data used for the second error value.
    c              -- regularization strength; the model receives C = 1/c.
    pen            -- norm used in the penalization.
    train          -- when True, the error on (xTrain, yTrain) is appended first.
    test           -- when True, the error on (xTest, yTest) is appended next.

    Each error is whatever stats() returns for the corresponding predictions.
    """
    # C is the inverse of regularization strength: smaller values specify
    # stronger regularization, hence the reciprocal of c.
    model = LogisticRegression(C = 1/c, penalty = pen).fit(xTrain, yTrain)

    errors = []
    if train:
        trainPredictions = model.predict(xTrain)
        errors.append(stats(trainPredictions, yTrain))
    if test:
        testPredictions = model.predict(xTest)
        errors.append(stats(testPredictions, yTest))
    return errors

SVM
--

In [None]:
def executeSVM(xTrain = [], yTrain = [], xTest = [], yTest = [], c = 1.0, k = 'rbf', d = 3, g = 'scale', coef = 0.0, train = True, test = True):
    """Fit a support vector classifier and return its error(s) as a list.

    xTrain, yTrain -- data the model is fitted on.
    xTest, yTest   -- data used for the second error value.
    c              -- regularization strength; the model receives C = 1/c.
    k              -- kernel type ('linear', 'poly', 'rbf', 'sigmoid', 'precomputed').
    d              -- degree of the polynomial kernel ('poly'); ignored by other kernels.
    g              -- gamma: 'scale', 'auto' or a float; kernel coefficient for
                      'rbf', 'poly' and 'sigmoid'.
    coef           -- independent term of the kernel; only significant for
                      'poly' and 'sigmoid'.
    train          -- when True, the error on (xTrain, yTrain) is appended first.
    test           -- when True, the error on (xTest, yTest) is appended next.

    Each error is whatever stats() returns for the corresponding predictions.
    """
    # C is the regularization parameter: the strength of the regularization is
    # inversely proportional to C, so the reciprocal of c is passed. It must be
    # strictly positive.
    # degree now honours the d argument (it used to be hard-coded to 3, which
    # silently ignored the caller's value); the default d = 3 keeps existing
    # calls unchanged.
    clf = SVC(C = 1/c, kernel = k, degree = d, gamma = g, coef0 = coef )
    result = []
    clf = clf.fit(xTrain, yTrain)
    if train:
        result.append(stats(clf.predict(xTrain), yTrain))
    if test:
        result.append(stats(clf.predict(xTest), yTest))
    return result

K-fold cross-validation
--

In [None]:
def executeKfold(X, Y, k):
    """Split X and Y into k consecutive folds.

    X -- sliceable sequence of samples.
    Y -- sliceable sequence of targets, aligned with X.
    k -- number of folds; a whole number with 1 <= k <= len(X).

    Returns a flat list [X_0, Y_0, X_1, Y_1, ..., X_{k-1}, Y_{k-1}] where
    X_i / Y_i are slices of X / Y covering the i-th fold.

    Exactly k folds are produced and every sample is used: when len(X) is not
    divisible by k the first len(X) % k folds receive one extra sample. When
    len(X) is divisible by k the result is the same as equal-size chunking.

    Raises ValueError when k is not a whole number in [1, len(X)]. Without
    this check a k larger than len(X) gives a fold size of 0, which previously
    made the splitting loop run forever.
    """
    total = len(X)
    foldCount = int(k)
    if foldCount != k or foldCount < 1 or foldCount > total:
        raise ValueError("k must be a whole number between 1 and len(X) = "
                         + str(total) + ", got " + str(k))

    baseSize, extra = divmod(total, foldCount)
    result = []
    start = 0
    for fold in range(foldCount):
        # The first `extra` folds absorb the remainder, one sample each.
        stop = start + baseSize + (1 if fold < extra else 0)
        result.append(X[start:stop])
        result.append(Y[start:stop])
        start = stop
    return result

In [None]:
def createSets(data, index):
    """Build a training/validation split from a flat list of folds.

    data  -- flat list [X_0, Y_0, X_1, Y_1, ...] of fold samples and targets.
    index -- number of the fold held out for validation.

    Returns [trainX, trainY, valX, valY]: trainX / trainY are new lists made of
    the elements of every fold except `index`, in fold order; valX / valY are
    the held-out fold's own objects from `data` (not copies).
    If there is no fold other than `index`, only [valX, valY] is returned.
    """
    foldCount = len(data) // 2
    trainingFolds = [fold for fold in range(foldCount) if fold != index]

    result = []
    if trainingFolds:
        features = []
        targets = []
        for fold in trainingFolds:
            features.extend(list(data[2 * fold]))
            targets.extend(list(data[2 * fold + 1]))
        result = [features, targets]

    # The held-out fold goes last, samples first and targets second.
    result.append(data[2 * index])
    result.append(data[2 * index + 1])
    return result

Task One 
--

In [None]:
# Task One: sweep the logistic regression regularization strength over
# 0.001 * 3.5**i for i = 0..10 and plot training vs. test error (in percent)
# against the exponent i.
x = list(range(0, 11))
fa, fb = [], []
for i in x:
    # error is [training error, test error] for this strength.
    error = executeLogisticRegression(xTrain, yTrain, xTest, yTest, c = 0.001*(3.5**i))
    fa.append(error[0] * 100)
    fb.append(error[1] * 100)
plotGraph(
    x, fa, x, fb, "TaskOne",
    xvalrange = 10,
    yvalrange = 18,
    xAxisTitle = "Regularization Strength (0.001*3.5$^x$)",
    title = "Regularization Strength",
    digits = 4,
)

Task Two 
--

In [None]:
# Task Two: sweep the linear-kernel SVM regularization strength over
# 0.0045 * 3.2**i for i = 0..10 and plot training vs. test error (in percent)
# against the exponent i.
x = []
fa = []
fb = []
for i in range(0, 11):
    # error is [training error, test error] for this strength.
    error = executeSVM(xTrain, yTrain, xTest, yTest, k = 'linear', c =  0.0045*(3.2**i))
    x.append(i)
    fa.append(error[0] * 100)
    fb.append(error[1] * 100)
# The axis title states the formula actually used above (0.0045, not 0.004).
plotGraph(x, fa, x, fb, "TaskTwo", xvalrange = 10, yvalrange = 18, xAxisTitle = "Regularization Strength (0.0045*3.2$^x$)", title = "Regularization Strength", digits = 4)

Task Three 
--

In [None]:
# Task Three: pick the regularization strength of logistic regression and of a
# linear SVM by k-fold cross-validation on the training data, plot the mean
# validation error of both sweeps, then refit each model on the full training
# set with its own best strength and report the test error.
x = []
yLR = []
ySVM = []
optimalRegularizationLG = 0
optimalRegularizationSVM = 0
fold = 8

# The folds depend only on the training data, so one split serves both sweeps.
Validation = executeKfold(xTrain, yTrain, fold)

# Logistic regression sweep: strength = 0.001 * 3.5**i
minValidationError = np.inf
for i in range(0, 11):
    regularizationStrength = 0.001*(3.5**i)
    temp = 0
    for k in range(0, fold):
        kFoldData = createSets(Validation, k)
        error = executeLogisticRegression(kFoldData[0], kFoldData[1], kFoldData[2], kFoldData[3], c = regularizationStrength)
        # error[1] is the error on the held-out fold.
        temp += error[1]
    temp = temp/fold
    if temp < minValidationError:
        minValidationError = temp
        optimalRegularizationLG = regularizationStrength
    x.append(i)
    yLR.append(temp * 100)

# Linear SVM sweep: strength = 0.0045 * 3.2**i
minValidationError = np.inf
for i in range(0, 11):
    regularizationStrength = 0.0045*(3.2**i)
    temp = 0
    for k in range(0, fold):
        kFoldData = createSets(Validation, k)
        error = executeSVM(kFoldData[0], kFoldData[1], kFoldData[2], kFoldData[3], k = 'linear', c = regularizationStrength)
        temp += error[1]
    temp = temp/fold
    if temp < minValidationError:
        minValidationError = temp
        optimalRegularizationSVM = regularizationStrength
    ySVM.append(temp * 100)

# NOTE: plotGraph labels its first series 'Training Data' and its second
# 'Test Data'; here the first series is the SVM validation error and the
# second the logistic regression validation error.
plotGraph(x, ySVM, x, yLR, "TaskThree", xvalrange = 10, yvalrange = 18, xAxisTitle = "Regularization Strength Exponent", title = "Regularization Strength", digits = 4, label = True)
# Each model is refitted with the strength selected for that model (the
# logistic regression previously received the SVM's optimum by mistake).
result = executeLogisticRegression(xTrain, yTrain, xTest, yTest, c = optimalRegularizationLG, train = False)
print("Optimized Logistic Regression error = " + str(result[0]))
result = executeSVM(xTrain, yTrain, xTest, yTest, c = optimalRegularizationSVM, k = 'linear', train = False)
print("Optimized SVM error = " + str(result[0]))

Task Four 
--

In [None]:
# Task Four: for every gamma, choose the SVM regularization strength
# (0.0045 * 3.2**i, i = 0..10) with the lowest mean k-fold validation error,
# then refit on the full training set with that strength and plot training
# vs. test error (in percent) against gamma.
fa = []
fb = []
gammaValues = [0.0001, 0.01, 0.05, 0.1, 0.2, 0.3]
fold = 8
optmizedValues = {}
optimalRegularizationSVM = 0

Validation = executeKfold(xTrain, yTrain, fold)
# The train/validation split of fold k depends neither on gamma nor on the
# regularization strength, so each split is built once instead of once per
# (gamma, strength, fold) combination.
foldSets = [createSets(Validation, k) for k in range(0, fold)]

for gamma in gammaValues:
    minValidationError = np.inf
    for i in range(0, 11):
        regularizationStrength =  0.0045*(3.2**i)
        temp = 0
        for kFoldData in foldSets:
            error = executeSVM(kFoldData[0], kFoldData[1], kFoldData[2], kFoldData[3], c = regularizationStrength, g = gamma)
            # error[1] is the error on the held-out fold.
            temp += error[1]
        temp = temp/fold
        if temp < minValidationError:
            # New best strength for this gamma; record it right away.
            minValidationError = temp
            optimalRegularizationSVM = regularizationStrength
            optmizedValues[gamma] = optimalRegularizationSVM

for gamma, regularization in optmizedValues.items():
    error = executeSVM(xTrain, yTrain, xTest, yTest, g = gamma, c = regularization)
    fa.append(error[0] * 100)
    fb.append(error[1] * 100)
print(optmizedValues)
# Plot against the gammas that actually produced a result so the x values and
# the error lists always have the same length and order.
evaluatedGammas = list(optmizedValues)
plotGraph(evaluatedGammas, fa, evaluatedGammas, fb, name = "TaskFour", xvalrange = max(gammaValues), yvalrange =  max(max(fa), max(fb)) + 6, xAxisTitle = "γ", title = "Gamma", digits = 4)