In [1]:
import numpy as np
import math
from matplotlib import pyplot as plt

def load_data(file_path, delimiter, skiprows=0):
    """Load a delimited numeric text file into a numpy array.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file to read.
    delimiter : str
        Column separator passed straight to ``np.loadtxt``.
    skiprows : int, optional
        Number of leading lines (e.g. a header) to skip. Defaults to 0.

    Returns
    -------
    numpy.ndarray
        The parsed values, as returned by ``np.loadtxt``.
    """
    # The context manager guarantees the handle is closed, including when
    # np.loadtxt raises on malformed input.
    with open(file_path, "rb") as fh:
        return np.loadtxt(fh, delimiter=delimiter, skiprows=skiprows)

In [4]:
# Semicolon-separated file; the single header line is skipped.
data = load_data("winequality-red.csv", delimiter=";", skiprows=1)

In [5]:
# Collapse the last column into a binary label, in place: values <= 5 become 0,
# everything else becomes 1.
# NOTE: this overwrites `data`, so running the cell a second time without
# reloading turns every label into 0.
data[:, -1] = np.where(data[:, -1] <= 5, 0, 1)

In [6]:
# Feature matrix: every column strictly between the first and the last one.
# NOTE(review): the slice starts at 1, so column 0 is left out of the features;
# confirm that is intended (use `data[:, :-1]` to keep it).
X = data[:, 1:-1]
# Label vector: the last column, flattened to one dimension.
Y = data[:, -1:].ravel()

In [7]:
def evaluate_acc(y, y_hat):
    """Return the percentage of positions where ``y_hat`` matches ``y``.

    y     -- true labels; must expose ``.shape`` (its first dimension fixes how
             many positions are compared)
    y_hat -- labels to score against ``y``, indexable by position

    Both sequences must list the labels of the same examples in the same order.
    """
    total = y.shape[0]
    matches = sum(1 for idx in range(total) if y[idx] == y_hat[idx])
    return (matches / total) * 100

In [8]:
def k_fold(x, y, k, model):
    """Estimate a model's accuracy with k-fold cross-validation.

    The rows are split, in their existing order (no shuffling), into ``k``
    contiguous folds whose sizes differ by at most one. Each fold is held out
    once for validation while the model is re-fitted on all remaining rows, so
    every row is validated exactly once.

    Parameters
    ----------
    x : numpy.ndarray
        Feature data, one row per example.
    y : numpy.ndarray
        Class labels, aligned row-for-row with ``x``.
    k : int
        Number of folds; must satisfy ``1 <= k <= x.shape[0] // 2``.
        ``k == 1`` trains and validates on the full data set.
    model : object
        Anything exposing ``fit(x, y)`` and ``predict(x)``.

    Returns
    -------
    float or str
        The accuracy (in percent) averaged over the ``k`` rounds. When ``k`` is
        out of range, a message string describing the problem is returned
        instead of a number.
    """
    if k < 1:
        return "Must have at least 1 fold."
    elif k > (x.shape[0]//2):
        return "Too many folds."
    elif k == 1:
        print("1 fold selected - model will be trained and validated on same data set")
        model.fit(x, y)
        return evaluate_acc(y, model.predict(x))

    accuracy = 0

    # array_split hands out the remainder rows one per fold, so no row is
    # dropped; the k <= n // 2 guard above means no fold is ever empty.
    for held_out in np.array_split(np.arange(x.shape[0]), k):
        lower_row = held_out[0]
        upper_row = held_out[-1] + 1      # exclusive end of the held-out range

        # validation set (copied so the model cannot alter the caller's data)
        x_val = np.copy(x[lower_row:upper_row])
        y_val = np.copy(y[lower_row:upper_row])

        # training set: everything before and after the held-out range
        x_trn = np.concatenate((x[0:lower_row], x[upper_row:]))
        y_trn = np.concatenate((y[0:lower_row], y[upper_row:]))

        model.fit(x_trn, y_trn)

        # score the held-out rows
        y_hat = model.predict(x_val)
        accuracy += evaluate_acc(y_val, y_hat)

    return accuracy / k

In [37]:
class LogisticRegression:
    """Binary logistic regression fitted by full-batch gradient ascent.

    Each iteration adds ``alpha * X^T (y - sigmoid(X w))`` to the weights.
    Training stops as soon as the norm of that step is no longer above
    ``threshold``, or after ``max_iter`` iterations.

    Parameters
    ----------
    alpha : float
        Learning rate applied to the summed gradient.
    threshold : float
        Training stops once the norm of a weight update is <= this value.
    max_iter : int
        Upper bound on the number of updates per ``fit`` call (at least 1).
    verbose : bool
        When True, print the norm of every update step. The norms are always
        recorded in ``self.change`` regardless of this flag.
    """

    def __init__(self, alpha=0.0005, threshold=10, max_iter=10000, verbose=False):
        if max_iter < 1:
            raise ValueError("max_iter must be at least 1")
        self.alpha = alpha
        self.threshold = threshold
        self.stop = False   # not read anywhere in this class; kept for existing attribute access
        self.weights = None
        self.max_iter = max_iter
        self.verbose = verbose
        self.change = []    # norm of every update step of the latest fit

    def __intercept(self, X):
        """Prepend a column of ones so the first weight acts as the bias."""
        return np.c_[np.ones(len(X)), X]

    def __sigmoid(self, z):
        """Logistic function, evaluated without floating-point overflow."""
        z = np.asarray(z, dtype=float)
        # exp() only ever sees non-positive arguments, so it cannot overflow;
        # the two branches are algebraically the same 1 / (1 + exp(-z)).
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def __grad(self, X, Y):
        """Gradient of the log-likelihood summed over all rows of X."""
        residual = Y - self.__sigmoid(np.dot(X, self.weights))
        return np.dot(X.T, residual)

    def __update(self, X, Y):
        """Take one gradient step and record its norm in ``self.change``."""
        changeW = self.alpha * self.__grad(X, Y)
        step_norm = np.linalg.norm(changeW)
        if self.verbose:
            print(step_norm)
        self.change.append(step_norm)
        self.weights = self.weights + changeW

    def fit(self, X, Y):
        """Train on features ``X`` and 0/1 labels ``Y``.

        Weights are re-initialised to zero on every call. Prints a summary of
        the run and returns the learned weight vector (bias first).
        """
        self.change = []  # reset the step history before running a new fit
        padded_X = self.__intercept(X)
        targets = np.ravel(np.asarray(Y, dtype=float))
        self.weights = np.zeros(np.size(padded_X, 1))

        num_iter = 0
        while num_iter < self.max_iter:
            self.__update(padded_X, targets)
            num_iter += 1
            # Written as "not >" so a NaN step also ends training.
            if not (self.change[-1] > self.threshold):
                break
        else:
            # Only reached when the loop ran out of iterations without converging.
            print(f"Warning, reached max iterations of {self.max_iter}, stopping because we haven't converged yet")

        print(f"learning rate:{self.alpha} \n stop threshold:{self.threshold} \n number of iterations: {num_iter}")
        print(f"weights:{self.weights}")

        return self.weights

    def predict(self, X):
        """Return a list with one 0.0/1.0 label per row of ``X``.

        Raises AttributeError when called before ``fit``.
        """
        if self.weights is None:
            raise AttributeError("predict() called before fit(): the model has no weights yet")
        padded_X = self.__intercept(X)
        probabilities = self.__sigmoid(np.dot(padded_X, self.weights))
        return list(probabilities.round())

In [None]:
# Default hyper-parameters; the 5-fold result is this cell's displayed value.
lr = LogisticRegression()
k_fold(X, Y, k=5, model=lr)