In [1]:
import numpy as np
import math
from matplotlib import pyplot as plt

def load_data(file_path, delimiter, skiprows=0):
    """loads a data file and returns a numpy array"""
    file = open(file_path, "rb")
    arr = np.loadtxt(file, delimiter=delimiter, skiprows=skiprows)
    return arr

In [2]:
data = load_data("breast-cancer-wisconsin.csv", ",")

In [3]:
for row in data:
    row[-1] = 0 if row[-1] == 2 else 1

In [4]:
X=data[:,1:10]
Y=np.ravel((data[:,-1:]))

In [5]:
# y = true labels
# y_hat = training labels
# return: accuracy of training labels (in percentage)
# Ensure that y and y_hat contain the labels for the same training examples.
def evaluate_acc(y, y_hat):
    score = 0
    for i in range(y.shape[0]):
        if y[i] == y_hat[i]:
            score += 1
    return (score / y.shape[0]) * 100

In [6]:
# y = class labels of training examples
# x = feature data of training examples
# 2 < k = number of folds to use in validation
# return: average of prediction error over the k rounds of execution
def k_fold(x, y, k, model):
    if k < 1:
        return "Must have at least 1 fold."
    elif k > (x.shape[0]//2):
        return "Too many folds."
    elif k == 1:
        print("1 fold selected - model will be trained and validated on same data set")
        model.fit(x, y)
        return evaluate_acc(y, model.predict(x))
    else:
        rows_per_fold = (x.shape[0] + 1)//k       # a few rows at the end of the training data will be unused
        accuracy = 0

        for exec_round in range(k):
            # determine held-out range
            lower_row = exec_round * rows_per_fold
            upper_row = ((exec_round + 1) * rows_per_fold) - 1
            
            # create validation set
            x_val = np.copy(x)[lower_row:upper_row]
            y_val = np.copy(y)[lower_row:upper_row]

            # create training set
            x_trn = np.concatenate((x[0:lower_row], x[upper_row:]))
            y_trn = np.concatenate((y[0:lower_row], y[upper_row:]))

            # train model
            model.fit(x_trn, y_trn)

            # run validation set through model
            y_hat = model.predict(x_val)
            accuracy += evaluate_acc(y_val, y_hat)

        return accuracy / k

In [38]:
class LogisticRegression:
    def __init__(self, alpha=0.001, threshold = 0.0005):
        self.alpha = alpha
        self.threshold = threshold
        self.stop = False
        self.weights = None
        self.max_iter = 10000
        self.change = []

    def __intercept(self, X):
        return np.c_[np.ones(len(X)), X]
    
    def __sigmoid(self, z):
        return 1/(1 + np.exp(-z))
    
    def __grad(self, X_i, y_i):
        z = np.dot(self.weights.T, X_i)
        return X_i*(y_i-self.__sigmoid(z))
    
    def __update(self, X, Y):
        changeW = np.zeros(np.size(X, 1))

        for i in range(len(X)):
            grad = self.__grad(X[i], Y[i])
            changeW = changeW + self.alpha*grad
        self.change.append(np.linalg.norm(changeW))
        self.weights = self.weights + changeW
    
    def fit(self, X, Y):
        self.change = [] # reset the gradients before running a new fit
        padded_X = self.__intercept(X)
        self.weights = np.zeros(np.size(padded_X,1))
        
        num_iter = 0
        while self.change == [] or self.change[-1] > self.threshold and num_iter < self.max_iter:
            self.__update(padded_X, Y)
            num_iter+=1
            
            if (num_iter == self.max_iter):
                print(f"Warning, reached max iterations of {self.max_iter}, stopping because we haven't converged yet")
                break

        print(f"learning rate:{self.alpha} \n stop threshold:{self.threshold} \n number of iterations: {num_iter}")
        print(f"weights:{self.weights}")
        
        return self.weights
    
    def predict(self, X):
        padded_X = self.__intercept(X)
        predictions = []
        
        for i in range(0, len(X)):
            Z = np.dot(self.weights.T, padded_X[i])
            pred = self.__sigmoid(Z).round()
            predictions.append(pred)
        
        return predictions

In [37]:
lr = LogisticRegression()
k_fold(X, Y, 5, lr)

learning rate:0.001 
 stop threshold:0.0002 
 number of iterations: 4009
weights:[-9.21846794e+00  4.57654155e-01  2.44140004e-01  1.59499609e-01
  2.60163201e-01 -1.16421552e-03  4.32458808e-01  3.78837694e-01
  1.33913577e-01  3.11444890e-01]
learning rate:0.001 
 stop threshold:0.0002 
 number of iterations: 5000
weights:[-12.00112852   0.68824535  -0.05098261   0.28361941   0.44429868
   0.03412249   0.34651163   0.56463723   0.41573122   0.95229659]
learning rate:0.001 
 stop threshold:0.0002 
 number of iterations: 3366
weights:[-9.5343657   0.473617   -0.08087272  0.3824713   0.25780921  0.1272465
  0.37188773  0.50335314  0.21237498  0.37232926]
learning rate:0.001 
 stop threshold:0.0002 
 number of iterations: 3525
weights:[-9.6568054   0.48951105  0.01338228  0.35384836  0.4118834   0.09600074
  0.39121251  0.35895796  0.17021498  0.47914982]
learning rate:0.001 
 stop threshold:0.0002 
 number of iterations: 2994
weights:[-9.31904696  0.51658909 -0.06245557  0.33931854  0.3

96.5925925925926

In [39]:
# lr = LogisticRegression(0.05)
# print(k_fold(X, Y, 5, lr))
# lr = LogisticRegression(0.01)
# print(k_fold(X, Y, 5, lr))
# lr = LogisticRegression(0.005)
# print(k_fold(X, Y, 5, lr))
# lr = LogisticRegression(0.001)
# print(k_fold(X, Y, 5, lr))
# lr = LogisticRegression(0.0005)
# print(k_fold(X, Y, 5, lr))

learning rate:0.05 
 stop threshold:0.0005 
 number of iterations: 10000
weights:[-238.6202961    13.30005716    2.10932043    4.45296995    5.93992835
    4.65260375    8.60426831    9.72560884    1.60836691   16.04879417]
learning rate:0.05 
 stop threshold:0.0005 
 number of iterations: 10000
weights:[-2.93950976e+02  1.67587172e+01 -3.14806181e+00  5.66048679e+00
  1.04643890e+01 -1.08123477e-01  6.44782010e+00  1.42651101e+01
  8.42807625e+00  2.68256680e+01]
learning rate:0.05 
 stop threshold:0.0005 
 number of iterations: 10000
weights:[-256.52149304   20.64212512   -0.62939669   12.34498756   10.73544487
    8.63026927   13.10171349   18.00154207    7.98362522    8.87807796]
learning rate:0.05 
 stop threshold:0.0005 
 number of iterations: 10000
weights:[-245.80275739   14.65693351    1.40950682    9.64672327   11.97198907
    5.27673178   10.11265646   11.76964129    6.14933515   10.4774469 ]
learning rate:0.05 
 stop threshold:0.0005 
 number of iterations: 10000
weights:[-