In [1]:
from copy import deepcopy
from numpy.linalg import inv, norm
from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn import linear_model
import numpy as np
import pandas as pd
import time

In [2]:
#Getting and standardizing dataset
dataset = datasets.load_breast_cancer()
dataset_X = dataset.data[:, np.newaxis, 2]
dataset_X_sd = (dataset_X - np.mean(dataset_X, axis=0))/np.std(dataset_X, axis=0)#Standardization is necssary
#to avoid numerical stability issues 
dataset_Y = dataset.target

## Using sklearn

In [3]:
tic = time.clock()
n = len(dataset_Y)
log_reg = linear_model.LogisticRegression()
log_reg.fit (dataset_X_sd, dataset_Y)
toc = time.clock()
print(toc-tic, "seconds")
beta = [log_reg.intercept_, log_reg.coef_]

dataset_Y_hat = log_reg.predict_proba(dataset_X_sd)[:,1]

epsilon = 1e-10
error_cross_entropy = 0
for i in range(n):
    if dataset_Y[i] == 0:
        error_cross_entropy -= np.log(1 - dataset_Y_hat[i] + epsilon)
    else:
        error_cross_entropy -= np.log(dataset_Y_hat[i] + epsilon)
error_ss = np.sum((dataset_Y - dataset_Y_hat)**2)

print("It took ", toc-tic, "seconds to run")
print("beta = ", beta[0], beta[1])
print("Error (cross entropy) = ", error_cross_entropy)
print("Error (sum of squares) = ", error_ss)

0.002360887023496673 seconds
It took  0.002360887023496673 seconds to run
beta =  [0.61894342] [[-3.57333075]]
Error (cross entropy) =  152.94402381477659
Error (sum of squares) =  46.972067614299064


## My Derivations

In [4]:
#Preliminary preparations
dataset_X_all = np.concatenate((np.ones((dataset_X_sd.shape[0], 1)), dataset_X_sd), axis=1)
m = dataset_X_all.shape[1]
dataset_Y = dataset_Y.reshape(n, 1)

In [5]:
#Newton-Raphson Cross Entropy Loss Minimization --> Note: Not always stable
tic = time.clock()

iter_max = 10
beta = np.random.randn(m, 1)

tolerance = 1e-10
for t in range(iter_max):
    beta_old = deepcopy(beta)
    px = 1 / (1 + np.exp(-dataset_X_all @ beta_old))
    one_minus_px = np.ones(px.shape) - px
    eta = inv((px*dataset_X_all).T @ (one_minus_px*dataset_X_all))
    beta += eta @ dataset_X_all.T @ (dataset_Y - px)
    
    #print("beta=", beta[0], beta[1])
    if norm(beta-beta_old) < tolerance:
        print("Achieved convergence at iteration {}".format(t))
        break

dataset_Y_hat = 1 / (1 + np.exp(-dataset_X_all @ beta))

epsilon = 1e-10
error_cross_entropy = 0
for i in range(n):
    if dataset_Y[i] == 0:
        error_cross_entropy -= np.log(1 - dataset_Y_hat[i] + epsilon)
    else:
        error_cross_entropy -= np.log(dataset_Y_hat[i] + epsilon)
error_ss = np.sum((dataset_Y - dataset_Y_hat)**2)

toc = time.clock()

print("It took ", toc-tic, "seconds to run")
print("beta = ", beta[0], beta[1])
print("Error (cross entropy) = ", error_cross_entropy)
print("Error (sum of squares) = ", error_ss)

Achieved convergence at iteration 7
It took  0.00573984731666366 seconds to run
beta =  [0.63169989] [-3.98118813]
Error (cross entropy) =  [152.24219665]
Error (sum of squares) =  46.91191116341832


In [6]:
#Gradient Descent Cross Entropy Loss Minimization -- Matrix Version (faster than scalar version)
tic = time.clock()

iter_max = 1000
beta = np.random.randn(m,1)
eta = 0.01

tolerance = 1e-10
for t in range(iter_max):
    beta_old = deepcopy(beta)
    px = 1 / (1 + np.exp(-dataset_X_all @ beta_old))
    beta += eta * dataset_X_all.T @ (dataset_Y - px)
    
    #print("beta = ", beta[0], beta[1])
    if norm(beta-beta_old) < tolerance:
        print("Achieved convergence at iteration {}".format(t))
        break

epsilon = 1e-10
error_cross_entropy = 0
for i in range(n):
    if dataset_Y[i] == 0:
        error_cross_entropy -= np.log(1 - dataset_Y_hat[i] + epsilon)
    else:
        error_cross_entropy -= np.log(dataset_Y_hat[i] + epsilon)
error_ss = np.sum((dataset_Y - dataset_Y_hat)**2)

toc = time.clock()
print("It took ", toc-tic, "seconds to run")
print("beta = ", beta[0], beta[1])
print("Error (cross entropy) = ", error_cross_entropy)
print("Error (sum of squares) = ", error_ss)

Achieved convergence at iteration 258
It took  0.012988829243393951 seconds to run
beta =  [0.63169989] [-3.98118813]
Error (cross entropy) =  [152.24219665]
Error (sum of squares) =  46.91191116341832


In [7]:
#Gradient Descent Cross Entropy Loss Minimization -- Scalar Version (slower than matrix version)
tic = time.clock()

iter_max = 1000
beta = np.random.randn(m,1)
eta = 0.01

tolerance = 1e-10
for t in range(iter_max):
    beta_old = deepcopy(beta)
    
    for j in range(m):
        s = 0
        for i in range(n):
            px = 1 / (1 + np.exp(-dataset_X_all[i,:] @ beta_old))
            s += (dataset_Y[i] - px)* dataset_X_all[i,j]
        beta[j] += eta*s
    
    #print("beta = ", beta[0], beta[1])
    if norm(beta-beta_old) < tolerance:
        print("Achieved convergence at iteration {}".format(t))
        break

epsilon = 1e-10
error_cross_entropy = 0
for i in range(n):
    if dataset_Y[i] == 0:
        error_cross_entropy -= np.log(1 - dataset_Y_hat[i] + epsilon)
    else:
        error_cross_entropy -= np.log(dataset_Y_hat[i] + epsilon)
error_ss = np.sum((dataset_Y - dataset_Y_hat)**2)

toc = time.clock()
print("It took ", toc-tic, "seconds to run")
print("beta = ", beta[0], beta[1])
print("Error (cross entropy) = ", error_cross_entropy)
print("Error (sum of squares) = ", error_ss)

Achieved convergence at iteration 257
It took  2.462316276688373 seconds to run
beta =  [0.63169989] [-3.98118813]
Error (cross entropy) =  [152.24219665]
Error (sum of squares) =  46.91191116341832


In [8]:
#Gradient Descent Sum of Least Squares Minimization -- Matrix Version
tic = time.clock()

iter_max = 1000
beta = np.random.randn(m,1)
eta = 0.01

tolerance = 1e-10
for t in range(iter_max):
    beta_old = deepcopy(beta)
    px = 1 / (1 + np.exp(-dataset_X_all @ beta_old))
    one_minus_px = np.ones(px.shape) - px
    beta += eta * dataset_X_all.T @ ((dataset_Y - px) * px * one_minus_px)
    
    #print("beta = ", beta[0], beta[1])
    if norm(beta-beta_old) < tolerance:
        print("Achieved convergence at iteration {}".format(t))
        break

epsilon = 1e-10
error_cross_entropy = 0
for i in range(n):
    if dataset_Y[i] == 0:
        error_cross_entropy -= np.log(1 - dataset_Y_hat[i] + epsilon)
    else:
        error_cross_entropy -= np.log(dataset_Y_hat[i] + epsilon)
error_ss = np.sum((dataset_Y - dataset_Y_hat)**2)

toc = time.clock()
print("It took ", toc-tic, "seconds to run")
print("beta = ", beta[0], beta[1])
print("Error (cross entropy) = ", error_cross_entropy)
print("Error (sum of squares) = ", error_ss)

It took  0.04402880471798021 seconds to run
beta =  [0.67575093] [-3.90046724]
Error (cross entropy) =  [152.24219665]
Error (sum of squares) =  46.91191116341832
