In [2]:
import pandas as pd
import numpy as np
import math
import time
from numpy.linalg import inv
from typing import Tuple

In [3]:
# Sources for IWLS:
# https://datascience.stackexchange.com/questions/75166/iterative-reweighted-least-squares-in-python
# https://github.com/jaeho3690/LogisiticRegression/blob/main/LogisticRegressionIRLS.ipynb

# Sources for GD & SGD:
# https://stackoverflow.com/questions/47795918/logistic-regression-gradient-descent
# https://medium.com/analytics-vidhya/gradient-descent-and-stochastic-gradient-descent-from-scratch-python-1cd93d4def49
# https://github.com/Darshansol9/GD-SGD_FromScratch_Python/blob/master/Code.ipynb

# Sources for ADAM:
# https://medium.com/analytics-vidhya/derivative-of-log-loss-function-for-logistic-regression-9b832f025c2d
# https://github.com/jiexunsee/Adam-Optimizer-from-scratch/blob/master/ADAM.ipynb
# https://machinelearningmastery.com/adam-optimization-from-scratch/

# Source for cross-validation:
# https://github.com/jaeho3690/LogisiticRegression/blob/main/LogisticRegressionIRLS.ipynb

In [5]:
class Optimizers:
    
    def sigmoid(self,x:float)->float:
        """ Activation function used to map any real value between 0 and 1 """
        # Activation function used to map any real value between 0 and 1
        return 1/(1+np.exp(-x))

    def cost_function(self,X:np.matrix,y:np.array,params:np.array,b:np.array=None)->float:
        """
        Computes the cost function for all the training samples 
        
        param: X - design matrix
        param: y - target vector comprising boolean value
        param: params - array of weights
        param: b - intercept (optional)
        """
        # Computes the cost function for all the training samples
        fx = self.sigmoid(X.dot(params))
        cost = -np.sum(y * np.log(fx) + (1 - y)* np.log(1-fx))
        return cost
    
    def gradient_descent(self,X:np.matrix,y:np.array,params:np.array,iterations:int,alpha:float)->np.array:
        """
        Performs gradient descent optimization.
        
        Works assuming that weights vector contains
        intercept and the corresponding one column 
        has been added to the design matrix before 
        it is given to the method
        
        param: X - design matrix
        param: y - target vector comprising boolean value
        param: params - array of weights
        param: b - intercept (optional)
        param: iterations - number of iterations
        param: alpha - learning rate
        """
        for i in range(iterations):
            params = params + alpha * (X.T.dot(y - self.sigmoid(X.dot(params))))
        return params
    
    def stochastic_gradient_descent(self,X:np.matrix,y:np.array,params:np.array,iterations:int,alpha:float,sample_size:int=1)->np.array:
        """
        Performs stochastic gradient descent optimization.
        
        Works assuming that weights vector contains
        intercept and the corresponding one column 
        has been added to the design matrix before 
        it is given to the method
        
        param: X - design matrix
        param: y - target vector comprising boolean value
        param: params - array of weights
        param: b - intercept (optional)
        param: iterations - number of iterations
        param: alpha - learning rate
        param: sample_size - batch size
        """
        assert sample_size <= X.shape[0]
        df_X = pd.DataFrame(X)
        df_y = pd.DataFrame(y)
        for i in range(iterations):
            n_samples = math.ceil(df_X.shape[0]/sample_size)
            shuffled = df_X.sample(frac=1)
            samples = np.array_split(shuffled, n_samples)
            for sample in samples:
                X_st = np.array(sample)
                y_st = np.array(y[sample.index])
                # y_st = np.expand_dims(y_st, axis=-1)
                params = params + alpha * (X_st.T.dot(y_st - self.sigmoid(X_st.dot(params))))
        return params
    
    def irls(self,X:np.matrix,y:np.array,iterations:int=1000)->np.array:
        """
        Performs Iterative-Reweighted Least Squares optimization.
        
        Works assuming that weights vector contains
        intercept and the corresponding one column 
        has been added to the design matrix before 
        it is given to the method
        
        param: X - design matrix
        param: y - target vector comprising boolean value
        param: iterations - number of iterations
        """
        w = np.zeros((X.shape[1],1))
        for i in range(iterations):
            y_ = self.sigmoid(np.matmul(X,w))
            R = np.diag(np.ravel(y_*(1-y_)))
            grad = np.matmul(X.T,(y_-y))
            hessian = np.matmul(np.matmul(X.T,R),X)+0.001*np.eye(X.shape[1])
            w -= np.matmul(np.linalg.inv(hessian),grad)
        return w
    
    def adam(self,X:np.matrix,y:np.array,b1:float,b2:float,iterations:int,alpha:float,eps:float)->np.array:
        """
        Performs stochastic gradient descent optimization.
        
        Works assuming that weights vector does not 
        contain intercept - it is a separate variable (named b)
        and the design matrix does not include additional 
        one column
        
        param: X - design matrix
        param: y - target vector comprising boolean value
        param: params - array of weights
        param: iterations - number of iterations
        param: alpha - learning rate
        param: sample_size - batch size
        param: b - intercept (optional)
        param: b1, b2 - initial decay rates used when estimating 
            the first and second moments of the gradient
        """
        m,n = X.shape
        W,b = np.random.randn(n,1),np.random.randn(1)
        VW,Vb = np.zeros((n,1)),np.zeros(1)
        SW,Sb = np.zeros((n,1)),np.zeros(1)
        
        y = y.reshape(len(y),1)
        
        for i in range(iterations):
            Z = X.dot(W)+b
            # sigmoid
            A = self.sigmoid(-Z)
            
            # binary classification cost
            j = (-y*np.log(A)- (1-y)*np.log(1-A)).sum()*(1/m)  # constant value (1/m)

            # derivative respect to j
            dA = (A-y)/(A*(1-A))
            dZ = A-y

            dW = X.transpose().dot(dZ)
            db = dZ.sum()
            
            # momentum
            VW = b1*VW + (1-b1)*dW
            Vb = b1*Vb + (1-b1)*db
            
            # rmsprop
            SW = b2*SW + (1-b2)*dW**2
            Sb = b2*Sb + (1-b2)*db**2
            
            # update weight
            W -= alpha*VW/(np.sqrt(SW)+eps)
            b -= alpha*Vb/(np.sqrt(Sb)+eps)
        return W,b,A  # pred_labels = np.round(A)

## Test

In [6]:
credit_train_x = pd.read_csv("../datasets/preprocessed/credit_train_x.csv")
credit_train_y = pd.read_csv("../datasets/preprocessed/credit_train_y.csv")

credit_train_x = np.array(credit_train_x)
credit_train_y = np.array(credit_train_y)

In [7]:
X_final, y_final = credit_train_x, credit_train_y

In [8]:
# Defining the parameters such as iterations
iterations = 10
# Creating the instance of the class
optimizer = Optimizers()

# Iterative-Reweighted Least Squares
start = time.time()
params_optimal_irls  = optimizer.irls(X_final,y_final,iterations=iterations)
end = time.time()
params_irls = np.zeros((X_final.shape[1],1))
initial_cost_irls = optimizer.cost_function(X_final, y_final,params_irls)
final_cost_irls = optimizer.cost_function(X_final, y_final,params_optimal_irls)

print('Iterative-Reweighted Least Squares\n')
print(f'Time eclapsed for IRLS {end-start} secs')
print('Initial cost ',initial_cost_irls)
print('Final cost ',final_cost_irls)
print('\n\n')

# Gradient descent
# Defining the parameters such as alpha, iterations
iterations = 2000
learning_rate = 2e-5

params_gd = np.random.randn(X_final.shape[1])
params_gd = params_gd[:,np.newaxis]

start = time.time()
intial_cost_gd = optimizer.cost_function(X_final,y_final,params_gd)
params_optimal_gd  = optimizer.gradient_descent(X_final,y_final,params_gd,iterations,learning_rate)
end = time.time()
final_cost_gd = optimizer.cost_function(X_final,y_final,params_optimal_gd)

print('Gradient Descent\n')
print(f'Time eclapsed for GD is {end-start} secs')
print('Initial cost ',intial_cost_gd)
print('Final cost ',final_cost_gd)
print('\n\n')

# Stochastic Gradient Descent
iterations = 2000
learning_rate = 2e-5
params_sgd = np.random.randn(X_final.shape[1])
params_sgd = params_sgd[:,np.newaxis]
intial_cost_sgd = optimizer.cost_function(X_final, y_final, params_sgd)

start = time.time()
params_optimal_sgd  = optimizer.stochastic_gradient_descent(X_final, y_final, params_sgd, iterations, learning_rate)
end = time.time()
final_cost_sgd = optimizer.cost_function(X_final, y_final, params_optimal_sgd)

print('Stochastic Gradient Descent\n')
print(f'Time eclapsed for SGD {end-start} secs')
print('Initial cost ',intial_cost_sgd)
print('Final cost ',final_cost_sgd)
print('\n\n')

# ADAM
b1=0.9
b2=0.999
iterations=20000
alpha=1e-6
eps=1e-8

start = time.time()
params_optimal_adam,b,A = optimizer.adam(X_final,y_final,b1=0.9,b2=0.999,iterations=20000,alpha=1e-6,eps=1e-8)
end = time.time()

print('ADAM\n')
print(f'Time eclapsed for ADAM {end-start} secs')
print('\n\n')

Iterative-Reweighted Least Squares

Time eclapsed for IRLS 0.11946773529052734 secs
Initial cost  519.860385419959
Final cost  329.8412760667467



Gradient Descent

Time eclapsed for GD is 0.1935269832611084 secs
Initial cost  1221.595277155707
Final cost  456.18796219883654



Stochastic Gradient Descent

Time eclapsed for SGD 399.4033818244934 secs
Initial cost  1091.960263018058
Final cost  412.82213784954996



ADAM

Time eclapsed for ADAM 6.4455108642578125 secs



