In [150]:
import csv
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import warnings

#suppress warnings
warnings.filterwarnings('ignore')

def parseData(filename):
    csv_data = pd.read_csv(filename)
    numpy_data = csv_data.values
    rows, columns = numpy_data.shape
    X = numpy_data[:, :columns - 1]
    y = numpy_data[:, columns - 1:]
    X = np.array(X)
    y = np.array(y)
    return X, y

def splitData(X, y, trainSplit, valSplit, testSplit):
    trainStop = int(trainSplit * X.shape[0])
    valStop = int((trainSplit + valSplit) * X.shape[0])
    train_x = X[0:trainStop, :]
    train_y = y[0:trainStop]
    val_x = X[trainStop:valStop, :]
    val_y = y[trainStop:valStop]
    test_x = X[valStop:, :]
    test_y = y[valStop:]
    return train_x, train_y, val_x, val_y, test_x, test_y


def normalize(X):
    rangeX = np.zeros(X.shape[1])
    minX = np.zeros(X.shape[1])
    normX = np.zeros(X.shape)

    for i in range(X.shape[1]):
        minX[i] = min(X[:, i])
        rangeX[i] = max(X[:, i]) - minX[i]
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            normX[i][j] = (X[i][j] - minX[j]) / rangeX[j]

    return normX

def score(predictions, y):
    numCorrect = np.sum(predictions == y.T)
    accuracy = numCorrect / y.shape[0]
    return accuracy

class LogisticRegression(object):
    def __init__(self, max_iter, learning_rate):
        self.max_iter = max_iter
        self.learning_rate = learning_rate
    
    def initialize_weights(self, dim):
        # numpy.zeros_like function return an array of zeros with the same shape and type as a given array
        w = np.zeros_like(dim)
        b = 0
        return w, b
    
    def sigmoid(self, z):
        return (1/ (1+np.exp(-z)))

    def gradient(self, y, X, w): 
        #np.sum( list( list1[i]/2 for i in range(len(list1)) ) )
        N = len(X)

        array_X = []
        for i in range(len(X)):
            array_X.append(np.squeeze(np.asarray(X[i])))
            
        array_y = np.squeeze(np.asarray(y))
        X = array_X
        y = array_y
        
        return (-1/N)*np.sum( list( np.dot(y[n],X[n]) * self.sigmoid(np.dot(y[n], np.dot(w,X[n]))) for n in range(len(X)) ) )
    
    def fit(self, X, y):
        w = np.squeeze(np.asarray(X[0]))
        
        for _ in range(self.max_iter):
            g_t = self.gradient(y, X, w)
            v_t = -g_t
            w = w + (self.learning_rate*v_t)

        self.w = w   

    def get_params(self):
        ### TODO
        if self.w is None:
            print("Run fit first!")
            sys.exit(-1)
        return self.w

    def predict(self, X):
        ### here X should not be 2d, should be one of the entry
        if self.w is None:
            print("Run fit first!")
            sys.exit(-1)

        array_X = np.squeeze(np.asarray(X))            
        w = np.squeeze(np.asarray(self.w))
        X = array_X
        self.w = w
        
        preds = self.sigmoid(np.dot(X, self.w))
        return 1 if preds >= 0.5 else -1

    def score(self, X, y):
        if self.w is None:
            print("Run fit first!")
            sys.exit(-1)
        preds = list(self.predict(X[i]) for i in range(len(X)))
        accuracy = np.mean(preds == y)
        return accuracy
    
if __name__ == "__main__":
    breast_cancer_csv = "./data/breast-cancer.csv"
    spam_email_csv = "./data/spam_email_dataset.csv"
    water_potability_csv = "./data/water_potability.csv"
    hand_writing_csv = "./data/handwriting_alzheimers.csv"
    file_list = [breast_cancer_csv, hand_writing_csv, spam_email_csv, water_potability_csv]
    
    X, y = parseData(hand_writing_csv)
    X = X[:, 1:]
    y = np.where(y == "P", 1, y)
    y = np.where(y == "H", -1, y)
    X = normalize(X)
    #Due to the data have a low sample count, depending on the distribution of the shuffle
    #accuracy can be extremely poor
    #While shuffling help negate this, we need another technique to increase sample count
    X, y = shuffle(X, y)
    X, y = shuffle(X, y)
    X, y = shuffle(X, y)

    train_x, train_y, val_x, val_y, test_x, test_y = splitData(X, y, 0.8, 0, 0.2)
    
    model = LogisticRegression(100, 0.1)
    array_x = np.squeeze(np.asarray(train_x[0]))
    array_y = np.squeeze(np.asarray(train_y[0]))
    model.fit(train_x, train_y)
    print(model.score(test_x, test_y))


0.7037037037037037
