In [1]:
import csv
import numpy as np 
import matplotlib.pyplot as plt
import os
import itertools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot
import numpy.linalg 
import numpy.random
from sklearn.utils import shuffle




def parseData(filename):
    """Load a CSV file and separate its last column from the rest.

    The file is read with ``pandas.read_csv`` and converted to a NumPy
    array. Every column except the last becomes the feature matrix; the
    last column is kept two-dimensional so it still has one row per sample.

    Args:
        filename: Path (or any source ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays shaped ``(rows, columns - 1)`` and
        ``(rows, 1)``.
    """
    table = pd.read_csv(filename).values
    n_rows, n_cols = table.shape
    label_col = n_cols - 1
    # np.array() copies the slices, so the results are independent arrays.
    features = np.array(table[:, :label_col])
    labels = np.array(table[:, label_col:])
    return features, labels


def splitData(X, y, trainSplit, valSplit, testSplit):
    """Cut the samples into consecutive train / validation / test parts.

    Rows are taken in their existing order (no shuffling happens here).

    Args:
        X: Two-dimensional feature array, one row per sample.
        y: Label array indexed by sample along its first axis.
        trainSplit: Fraction of the rows assigned to the training part.
        valSplit: Fraction of the rows assigned to the validation part.
        testSplit: Not used by the computation; the test part is simply
            every row left after the training and validation parts.

    Returns:
        ``(train_x, train_y, val_x, val_y, test_x, test_y)``.
    """
    n_samples = X.shape[0]
    train_end = int(trainSplit * n_samples)
    val_end = int((trainSplit + valSplit) * n_samples)

    train_rows = slice(0, train_end)
    val_rows = slice(train_end, val_end)
    test_rows = slice(val_end, None)

    return (
        X[train_rows, :],
        y[train_rows],
        X[val_rows, :],
        y[val_rows],
        X[test_rows, :],
        y[test_rows],
    )


def normalize(X):
    """Min-max scale every column of ``X`` into the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; such a column
    is mapped to all zeros instead of dividing by zero (which would fill it
    with NaN).

    Args:
        X: Two-dimensional array-like of numeric values (object-dtype
            arrays holding numbers are accepted and converted to float).

    Returns:
        A new float array with the same shape as ``X``.
    """
    X = np.asarray(X, dtype=float)
    if X.size == 0:
        # Nothing to scale; min/max are undefined on an empty array.
        return np.zeros(X.shape)

    col_min = X.min(axis=0)
    col_range = X.max(axis=0) - col_min
    # Substitute 1 for a zero range: the numerator is already 0 for every
    # entry of a constant column, so the column comes out as zeros.
    safe_range = np.where(col_range == 0, 1.0, col_range)
    return (X - col_min) / safe_range


def hingeLoss(C, w, b, X, y):
    """Compute the regularised hinge loss of every sample.

    For sample ``i`` the value is
    ``0.5 * (w . w) + C * max(0, 1 - y_i * (w . x_i + b))``.

    Args:
        C: Weight applied to the hinge term.
        w: Weight vector with one entry per feature.
        b: Bias; a scalar or a one-element array.
        X: Two-dimensional feature array, one row per sample.
        y: Labels, shaped ``(n,)`` or ``(n, 1)``; numeric values stored in
            an object-dtype array are converted to float.

    Returns:
        One-dimensional float array with one loss value per row of ``X``.
    """
    features = np.asarray(X, dtype=float)
    weights = np.asarray(w, dtype=float).reshape(-1)
    bias = np.asarray(b, dtype=float).reshape(-1)
    # Flatten the labels so a column vector does not broadcast against the
    # per-sample scores and no 1-element array has to be coerced to a scalar.
    labels = np.asarray(y, dtype=float).reshape(-1)

    reg_term = 0.5 * (weights @ weights)
    margins = labels * (features @ weights + bias)
    return reg_term + C * np.maximum(0.0, 1.0 - margins)


def fit(C, X, y, batchSize, learningRate, epochs):
    """Train a linear classifier with mini-batch sub-gradient descent.

    Weights start at zero. In every epoch the per-sample losses of the
    current model are recorded first, then the samples are visited in
    their given order in consecutive batches and ``w`` and ``b`` are moved
    against the gradient returned by ``gradient``.

    Args:
        C: Hinge-term weight forwarded to ``hingeLoss`` and ``gradient``.
        X: Two-dimensional feature array, one row per sample.
        y: Labels indexed by sample along the first axis; both ``(n,)`` and
            ``(n, 1)`` layouts are sliced correctly.
        batchSize: Number of samples per update step; must be at least 1.
        learningRate: Step size applied to each gradient.
        epochs: Number of passes over the data.

    Returns:
        ``(w, b, lossList)`` where ``lossList`` holds one ``hingeLoss``
        result per epoch, each computed before that epoch's updates.

    Raises:
        ValueError: If ``batchSize`` is smaller than 1.
    """
    if batchSize < 1:
        # A zero step makes range() fail and a negative one would silently
        # skip every batch, returning an untrained model.
        raise ValueError("batchSize must be a positive integer")

    w = np.zeros(X.shape[1])
    b = 0
    lossList = []
    n_samples = X.shape[0]

    for _ in range(epochs):
        lossList.append(hingeLoss(C, w, b, X, y))
        for start in range(0, n_samples, batchSize):
            stop = start + batchSize
            batchX = X[start:stop, :]
            # Slice along the first axis only so 1-D label vectors work too.
            batchy = y[start:stop]
            wGradient, bGradient = gradient(C, batchX, batchy, w, b)
            w = w - learningRate * wGradient
            b = b - learningRate * bGradient
    return w, b, lossList


def gradient(C, X, y, w, b):
    """Sum the hinge-term sub-gradients of a batch.

    Only samples whose margin ``y_i * (w . x_i + b)`` is below 1 contribute;
    each adds ``-C * y_i * x_i`` to the weight gradient and ``-C * y_i`` to
    the bias gradient.

    NOTE(review): the ``0.5 * (w . w)`` term that ``hingeLoss`` adds to the
    loss has no counterpart here, so the regulariser does not influence
    the updates - confirm whether that is intended.

    Args:
        C: Weight applied to the hinge term.
        X: Two-dimensional feature array, one row per sample.
        y: Labels, shaped ``(n,)`` or ``(n, 1)``; numeric values stored in
            an object-dtype array are converted to float.
        w: Weight vector with one entry per feature.
        b: Bias; a scalar or a one-element array.

    Returns:
        ``(wGradient, bGradient)``: a float vector with one entry per
        feature (all zeros when no sample contributes) and a float scalar.
    """
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y, dtype=float).reshape(-1)
    weights = np.asarray(w, dtype=float).reshape(-1)
    bias = np.asarray(b, dtype=float).reshape(-1)

    margins = labels * (features @ weights + bias)
    violators = margins < 1

    # (k,) @ (k, d) sums y_i * x_i over the k contributing samples and
    # yields a zero vector of length d when k == 0.
    wGradient = -C * (labels[violators] @ features[violators])
    bGradient = -C * labels[violators].sum()
    return wGradient, bGradient


def predict(X, w, b):
    """Label every row of ``X`` by the sign of its linear score.

    The score of a row ``x`` is ``w . x + b``. The result is ``np.sign`` of
    the scores, so a score of exactly zero produces the label 0.

    Args:
        X: Two-dimensional feature array, one row per sample.
        w: Weight vector with one entry per feature.
        b: Bias added to every score.

    Returns:
        Array holding the sign of each row's score.
    """
    raw_scores = w @ X.T
    return np.sign(raw_scores + b)


def score(predictions, y):
    """Return the fraction of predictions that equal their true label.

    Both inputs are flattened before they are compared, so any mix of
    ``(n,)`` and ``(n, 1)`` layouts is compared element by element rather
    than being broadcast into an ``n x n`` comparison matrix.

    Args:
        predictions: Predicted labels, one per sample.
        y: True labels, one per sample.

    Returns:
        Accuracy in [0, 1]; NaN when there are no samples.

    Raises:
        ValueError: If the two inputs hold a different number of labels.
    """
    predicted = np.asarray(predictions).ravel()
    actual = np.asarray(y).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            "predictions and y must contain the same number of labels"
        )
    if actual.size == 0:
        # Accuracy over zero samples is undefined; avoid the 0 / 0 warning.
        return float("nan")

    numCorrect = np.sum(predicted == actual)
    return numCorrect / actual.size


if __name__ == "__main__":
    X, y = parseData("handwriting_alzheimers.csv")
    # Drop the first feature column.
    # NOTE(review): presumably a non-numeric sample identifier - confirm.
    X = X[:, 1:]
    # Encode the class labels numerically: "P" -> 1, "H" -> -1.
    y = np.where(y == "P", 1, y)
    y = np.where(y == "H", -1, y)
    # np.where leaves the labels in an object array; cast to float so the
    # training arithmetic runs on a numeric dtype. Any label other than
    # "P"/"H" makes this cast fail here rather than later during training.
    y = y.astype(float)
    # NOTE(review): scaling uses the min/max of the whole data set, so the
    # validation and test rows influence the training features.
    X = normalize(X)
    # Because the data set has few samples, accuracy can be extremely poor
    # depending on how the shuffle happens to distribute them. One shuffle
    # already produces a random ordering, so repeating it adds nothing;
    # another technique is needed to increase the sample count.
    X, y = shuffle(X, y)

    train_x, train_y, val_x, val_y, test_x, test_y = splitData(X, y, 0.7, 0.15, 0.15)
    w, b, lossList = fit(1, train_x, train_y, 10, 0.001, 100)
    predictions = predict(test_x, w, b)
    acc = score(predictions, test_y)
    print(acc)

    

0.7777777777777778


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=96e6bd14-424c-411e-98e0-e7aeafdb7a8f' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>