In [1]:
import pandas as pd
import numpy as np
import math
import time
from numpy.linalg import inv
from sklearn.model_selection import train_test_split

In [2]:
# Sources for IWLS:
# https://datascience.stackexchange.com/questions/75166/iterative-reweighted-least-squares-in-python
# https://github.com/jaeho3690/LogisiticRegression/blob/main/LogisticRegressionIRLS.ipynb

# Sources for GD & SGD:
# https://stackoverflow.com/questions/47795918/logistic-regression-gradient-descent
# https://medium.com/analytics-vidhya/gradient-descent-and-stochastic-gradient-descent-from-scratch-python-1cd93d4def49
# https://github.com/Darshansol9/GD-SGD_FromScratch_Python/blob/master/Code.ipynb

# Sources for ADAM:
# https://medium.com/analytics-vidhya/derivative-of-log-loss-function-for-logistic-regression-9b832f025c2d
# https://github.com/jiexunsee/Adam-Optimizer-from-scratch/blob/master/ADAM.ipynb
# https://machinelearningmastery.com/adam-optimization-from-scratch/

# Source for cross-validation:
# https://github.com/jaeho3690/LogisiticRegression/blob/main/LogisticRegressionIRLS.ipynb

In [3]:
class Optimizers:
    
    def sigmoid(self,x):
        # Activation function used to map any real value between 0 and 1
        return 1/(1+np.exp(-x))

    def cost_function(self,params,X,y):
        # Computes the cost function for all the training samples
        fx = self.sigmoid(X.dot(params))
        cost = -np.sum(y * np.log(fx) + (1 - y)* np.log(1-fx))
        return cost
    
    def cost_derivative(self,params,X,y):
        m=X.shape[0]
        dfx=(1/m)*X.T.dot(self.sigmoid(X.dot(params))-y)
        return dfx
        
    
    def gradient_descent(self,params,X,y,iterations,alpha):
        for i in range(iterations):
            params = params + alpha * (X.T.dot(y - self.sigmoid(X.dot(params))))
        return params
    
    def stochastic_gradient_descent(self,params,X,y,iterations,alpha,sample_size=1):
        assert sample_size <= X.shape[0]
        df_X = pd.DataFrame(X)
        df_y = pd.DataFrame(y)
        for i in range(iterations):
            n_samples = math.ceil(df_X.shape[0]/sample_size)
            shuffled = df_X.sample(frac=1)
            samples = np.array_split(shuffled, n_samples)
            for sample in samples:
                X_st = np.array(sample)
                y_st = np.array(y[sample.index])
                # y_st = np.expand_dims(y_st, axis=-1)
                params = params + alpha * (X_st.T.dot(y_st - self.sigmoid(X_st.dot(params))))
        return params
    
    def irls(self,X,y,iterations=1000):
        """
        param: X - design matrix
        param: y - target vector comprising Boolean value
        """
        w = np.zeros((X.shape[1],1))
        for i in range(iterations):
            y_ = self.sigmoid(np.matmul(X,w))
            R = np.diag(np.ravel(y_*(1-y_)))
            grad = np.matmul(X.T,(y_-y))
            hessian = np.matmul(np.matmul(X.T,R),X)+0.001*np.eye(X.shape[1])
            w -= np.matmul(np.linalg.inv(hessian),grad)
        return w
    
    def adam(self,X,y,b1,b2,iterations,alpha,eps):
        m,n = X.shape
        W,b = np.random.randn(n,1),np.random.randn(1)
        VW,Vb = np.zeros((n,1)),np.zeros(1)
        SW,Sb = np.zeros((n,1)),np.zeros(1)
        
        y = y.reshape(len(y),1)
        
        for i in range(iterations):
            Z = X.dot(W)+b
            # sigmoid
            A = self.sigmoid(-Z)
            
            # binary classification cost
            j = (-y*np.log(A)- (1-y)*np.log(1-A)).sum()*(1/m)  # constant value (1/m)

            # derivative respect to j
            dA = (A-y)/(A*(1-A))
            dZ = A-y

            dW = X.transpose().dot(dZ)
            db = dZ.sum()
            
            # momentum
            VW = b1*VW + (1-b1)*dW
            Vb = b1*Vb + (1-b1)*db
            
            # rmsprop
            SW = b2*SW + (1-b2)*dW**2
            Sb = b2*Sb + (1-b2)*db**2
            
            # update weight
            W -= alpha*VW/(np.sqrt(SW)+eps)
            b -= alpha*Vb/(np.sqrt(Sb)+eps)
        return W,b,A  # pred_labels = np.round(A)

## Test

In [4]:
credit_train_x = pd.read_csv("../datasets/preprocessed/credit_train_x.csv")
credit_train_y = pd.read_csv("../datasets/preprocessed/credit_train_y.csv")

credit_train_x = np.array(credit_train_x)
credit_train_y = np.array(credit_train_y)

In [5]:
X_final, y_final = credit_train_x, credit_train_y

In [6]:
# Defining the parameters such as iterations
iterations = 10
# Creating the instance of the class
optimizer = Optimizers()

# Iterative-Reweighted Least Squares
start = time.time()
params_optimal_irls  = optimizer.irls(X_final,y_final,iterations=iterations)
end = time.time()
params_irls = np.zeros((X_final.shape[1],1))
initial_cost_irls = optimizer.cost_function(params_irls, X_final, y_final)
final_cost_irls = optimizer.cost_function(params_optimal_irls, X_final, y_final)

print('Iterative-Reweighted Least Squares\n')
print(f'Time eclapsed for IRLS {end-start} secs')
print('Initial cost ',initial_cost_irls)
print('Final cost ',final_cost_irls)
print('\n\n')

# Gradient descent
# Defining the parameters such as alpha, iterations
iterations = 2000
learning_rate = 2e-5

params_gd = np.random.randn(X_final.shape[1])
params_gd = params_gd[:,np.newaxis]

start = time.time()
intial_cost_gd = optimizer.cost_function(params_gd,X_final,y_final)
params_optimal_gd  = optimizer.gradient_descent(params_gd,X_final,y_final,iterations,learning_rate)
end = time.time()
final_cost_gd = optimizer.cost_function(params_optimal_gd, X_final, y_final)

print('Gradient Descent\n')
print(f'Time eclapsed for GD is {end-start} secs')
print('Initial cost ',intial_cost_gd)
print('Final cost ',final_cost_gd)
print('\n\n')

# Stochastic Gradient Descent
iterations = 2000
learning_rate = 2e-5
params_sgd = np.random.randn(X_final.shape[1])
params_sgd = params_sgd[:,np.newaxis]
intial_cost_sgd = optimizer.cost_function(params_sgd, X_final, y_final)

start = time.time()
params_optimal_sgd  = optimizer.stochastic_gradient_descent(params_sgd, X_final, y_final, iterations, learning_rate)
end = time.time()
final_cost_sgd = optimizer.cost_function(params_optimal_sgd, X_final, y_final)

print('Stochastic Gradient Descent\n')
print(f'Time eclapsed for SGD {end-start} secs')
print('Initial cost ',intial_cost_sgd)
print('Final cost ',final_cost_sgd)
print('\n\n')

# ADAM
b1=0.9
b2=0.999
iterations=20000
alpha=1e-6
eps=1e-8

start = time.time()
params_optimal_adam,b,A = optimizer.adam(X_final,y_final,b1=0.9,b2=0.999,iterations=20000,alpha=1e-6,eps=1e-8)
end = time.time()

print('ADAM\n')
print(f'Time eclapsed for ADAM {end-start} secs')
print('\n\n')

Iterative-Reweighted Least Squares

Time eclapsed for IRLS 0.05204916000366211 secs
Initial cost  519.860385419959
Final cost  329.8412760667467



Gradient Descent

Time eclapsed for GD is 0.047278642654418945 secs
Initial cost  1490.6749175945429
Final cost  454.35621275904



Stochastic Gradient Descent

Time eclapsed for SGD 108.01680064201355 secs
Initial cost  978.2061742215765
Final cost  440.4543264725253



ADAM

Time eclapsed for ADAM 1.9808518886566162 secs



