In [1]:
from data_handler import DataHandler
from linear_regression import LinearRegression
# from log_regression import LogisticalRegression
from calculations import Calculations

import numpy as np
import matplotlib.pyplot as plt

ITERATIONS = 2000
tr_log_loss = np.array(0)
val_log_loss = np.array(0)

TERMINATION_VALUE = 2**-16
LEARNING_RATE = 0.05

In [2]:
calculator = Calculations()
dh = DataHandler("spambase.data")
data = dh.parse_data_no_header()
data = dh.shuffle_data(data)
data_train, data_validation = dh.split_data(data)
data_train, data_validation

(array([[1.000e-01, 2.000e-01, 1.010e+00, ..., 6.690e+02, 1.351e+03,
         1.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.000e+00, 4.000e+00,
         0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 4.000e+00, 1.500e+01,
         0.000e+00],
        ...,
        [8.900e-01, 0.000e+00, 0.000e+00, ..., 9.000e+00, 7.600e+01,
         0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.100e+01, 4.300e+01,
         0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.500e+01, 5.000e+01,
         0.000e+00]], dtype=float32),
 array([[0.000e+00, 0.000e+00, 0.000e+00, ..., 1.000e+01, 9.700e+01,
         0.000e+00],
        [0.000e+00, 0.000e+00, 1.100e+00, ..., 7.900e+01, 3.160e+02,
         0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 2.000e+00, 1.600e+01,
         0.000e+00],
        ...,
        [4.300e-01, 4.000e-01, 3.700e-01, ..., 1.780e+02, 3.303e+03,
         1.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.200e+01,

In [47]:
training_data_x, training_data_y_1 = dh.getXY(
            data_train, -1, -1)
training_data_y = training_data_y_1.reshape((training_data_x.shape[0], -1))
mean, std = dh.zscores(training_data_x)
training_data_x = dh.zscore_data(mean, std, training_data_x)

sample_size = training_data_x.shape[0]
feature_size = training_data_x.shape[1]

training_data_x.shape, training_data_y.shape, sample_size, feature_size

((3067, 57), (3067, 1), 3067, 57)

In [4]:
validation_data_x, validation_data_y = dh.getXY(
            data_validation, -1, -1)
validation_data_y = validation_data_y.reshape((validation_data_x.shape[0], -1))
validation_data_x = dh.zscore_data(mean, std, validation_data_x)
validation_data_x.shape, validation_data_y.shape

((1533, 57), (1533, 1))

In [51]:
# Confidence Level: High
def compute_log_loss(Y, p):
    # cost = -1/m * np.sum( np.dot(np.log(A), Y.T) + np.dot(np.log(1-A), (1-Y.T)))
    # P = P.reshape((m,-1))
    logP = np.log(p, where=(p>0))
    logP_min = np.log((1.0 - p), where=((1-p)>0))
    
    cost = -(np.dot(Y, logP)+np.dot((1-Y), logP_min))
    return cost

def prediction(w, x, b):
    # Y_preds - Ouput size = prediction for each sample
    # 
    P_ = sigmoid(wx_b(w, x, b))
    Y_preds = np.zeros((1, sample_size))
    for k in range(sample_size-1):
        if P_[0, k] > 0.5:
            Y_preds[0, k] = 1
        else:
            Y_preds[0, k] = 0
    return Y_preds

## Also interpreted as Y_hat
def sigmoid(fxn):
    return (1/(1+(np.exp(-fxn))))

def wx_b(w, X, b):
    return np.dot(w.T, X) + b

# Confidence Level: Medium
def compute_weights(X, b, Y, Y_):
    diff = np.subtract(Y_, Y)
    return (1.0/sample_size)*(np.matmul(X, diff.T))

def compute_bias(Y_hat, Y):
    m = Y.shape[0]
    return (1.0/sample_size)*np.sum(Y_hat - Y)

def train_model(w,b,X,tY, vY):
    P = sigmoid(wx_b(w,X,b))
    dw = compute_weights(X, b, tY, P)
    db = compute_bias(P, tY)
    t_cost = compute_log_loss(tY, P)
    v_cost = compute_log_loss(tY, P)

    losses = {
        "TR": t_cost,
        "VAL": v_cost
    }
    return dw, db, losses

In [42]:
# Confidence Level: Low


In [52]:
# %%time

# assert(training_data_x.shape[0] == training_data_y.shape[0])
m, n = training_data_x.shape
tr_costs = []
val_costs = []
tX = training_data_x
tY = training_data_y
vX = validation_data_x
vY = validation_data_y
w = np.zeros((sample_size,1)) # Shape -> weight for each feature of data
b = 0
#Log Regression Calculation
for i in range(ITERATIONS):
    # Returns in order:
    ## Gradient Descent Weight
    ## Gradient Descent Bias
    ## Gradient Descent Probabilities/Y_hat
    dw, db, losses = train_model(w, b, tX, tY, vY)

    # Updating the parameters.
    # print(dw.shape)
    # assert(dw.shape == w.shape)
    w = w - LEARNING_RATE * dw
    b = b - LEARNING_RATE * db
    
    tr_costs.append(losses["TR"])
    val_costs.append(losses["VAL"])

    # print("Weight after iteration %i: " % (i), np.array(w))
    # print("Bias after iteration %i: %f" % (i, b))

    # Print costs when changed
    if i % 100 == 0:
        print("Training Mean Cost after iteration %i: %f" % (i, np.mean(tr_costs)))
        print("Validation Mean Cost after iteration %i: %f" % (i, np.mean(val_costs)))
        # print("Weight after iteration %i: " % (i), (w))
        # print("Bias after iteration %i: %f" % (i, b))

    if np.mean(val_costs) < TERMINATION_VALUE:
        print("Termination Condition Hit, Exiting {}".format(np.mean(val_costs)))
        break;

ValueError: shapes (1,57) and (3067,57) not aligned: 57 (dim 1) != 3067 (dim 0)

In [None]:
print(w, b)
plt.plot(P)
train_Y_preds = prediction(w, tX, b)
print(train_Y_preds)
train_acc = calculator.accuracy(tY, train_Y_preds)
print("Training Accuracy {} %".format(train_acc))

val_Y_preds = prediction(w, vX, b)
print(val_Y_preds)
val_acc = calculator.accuracy(vY, val_Y_preds)
print("Validation Accuracy {} %".format(val_acc))

In [None]:
tr_costs = np.squeeze(tr_costs)
val_costs = np.squeeze(val_costs)

In [None]:
plt.plot(tr_costs)
plt.title("Log Regression Costs")

plt.ylabel("cost")
plt.xlabel("iterations")

In [None]:
plt.plot(val_costs)
plt.title("Log Regression Costs")

plt.ylabel("cost")
plt.xlabel("iterations")