In [1]:
from math import floor

from typing import List
import warnings

from tqdm.notebook import tqdm

import logging
from io import StringIO

import numpy as np
import pandas as pd

### Define signature of a generic activation function class.

In [2]:
from abc import ABC, abstractmethod
class ActivationFunction(ABC):
    def __init__(self):
        pass
    @abstractmethod
    def function(self):
        pass
    @abstractmethod
    def vectorized_function(self):
        pass
    @abstractmethod
    def derivative(self):
        pass
    @abstractmethod
    def __call__(self):
        pass

### Define all activation functions

In [3]:
class Sigmoid(ActivationFunction):
    def __call__(self, x):
        return self.vectorized_function(x)
    def function(self, x):
        x = np.clip(x, -700, 700)
        return 1/(1+np.exp(-x))
    def vectorized_function(self, x):
        return np.vectorize(self.function)(x)
    def derivative(self, x):
        return self.vectorized_function(x)*(1-self.vectorized_function(x))
    def vectorized_derivative(self, x):
        return self.vectorized_function(x)*(1-self.vectorized_function(x))

class Tanh(ActivationFunction):
    def __call__(self, x):
        return self.vectorized_function(x)
    def function(self, x):
        x = np.clip(x, -100, 100)
        # print(x)
        expr1, expr2 = np.exp(x), np.exp(-x)
        return (expr1-expr2)/(expr1+expr2)
        
    def vectorized_function(self, x):
        return np.vectorize(self.function)(x)
    def derivative(self, x):
        expr1, expr2 = np.exp(x), np.exp(-x)
        return 4/((expr1+expr2)**2)
    def vectorized_derivative(self, x):
        return self.derivative(x)

class ReLU(ActivationFunction):
    def __call__(self, x):
        return self.vectorized_function(x)
    def function(self, x):
        return max(0, x)
    def derivative(self, x):
        if max(0,x) == 0:
            return 0
        return 1
    def vectorized_function(self, x):
        return np.vectorize(self.function)(x)
    def vectorized_derivative(self, x):
        return np.vectorize(self.derivative)(x)

class LeakyReLU(ActivationFunction):
    def __call__(self, x):
        return self.vectorized_function(x)
    def function(self, x):
        return max(0.01*x, x)
    def derivative(self, x):
        if max(0.01*x, x) == x:
            return 1
        return 0.01
    def vectorized_function(self, x):
        return np.vectorize(self.function)(x)
    def vectorized_derivative(self, x):
        return np.vectorize(self.derivative)(x)

class Linear:
    def __call__(self, x):
        return x
    def vectorized_derivative(self, x):
        return np.ones_like(x)

### Import existing loss functions and code new ones

In [4]:
def huberLoss(y_true, y_pred, delta=10):
    err = y_true - y_pred
    n_samples = y_true.shape[0]
    abs_err = np.abs(err)
    delta_sq = 0.5*(delta ** 2)
    huber_loss_vectorized = np.vectorize(lambda x: (x**2)*0.5 if x <= delta else delta*x - delta)
    huber_loss_vec = huber_loss_vectorized(abs_err)
    return np.sum(huber_loss_vec)/n_samples
    # return np.sum(huber_loss_vectorized(abs_err))

In [66]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, log_loss

class Crossentropy:
    def __call__(self, y_true, y_pred):
        return self.function(y_true, y_pred)
    def function(self, y_true, y_pred):
        return log_loss(y_true, y_pred, labels = np.arange(y_pred.shape[1]))
    def derivative(self, y_true, y_pred):
        # return an n x 1 matrix
        epsilon = 1e-6
        n_samples = y_true.shape[0]
        y_pred_capped = np.clip(y_pred, epsilon, 1-epsilon) # cap predicted probabilities to avoid floating point issues after taking reciprocal
        y_pred_inv = 1/y_pred_capped # 1/y_hat
        n_classes = y_pred.shape[1]
        y_true_proba = np.eye(n_classes)[y_true] # for ease of computing the derivative, basically a one-hot encoding
        derivative_loss_arr = -np.multiply(y_true_proba, y_pred_inv)# -[y/y_hat, (1-y)/(1-y_hat)]
        derivative_loss_arr = np.sum(derivative_loss_arr, axis=1) # sum up, i.e., y/y_hat + (1-y)/(1-y_hat)  
        return derivative_loss_arr/n_samples

class MSE:
    def __call__(self, y_true, y_pred):
        return self.function(y_true, y_pred)
    def function(self, y_true, y_pred):
        return mean_squared_error(y_true, y_pred)
    def derivative(self, y_true, y_pred):
        n_samples = y_true.shape[0]
        y_true_reshaped = y_true.copy()
        y_true_reshaped = y_true_reshaped.reshape(-1, 1)
        return (-2*(y_true_reshaped - y_pred))/n_samples

class MAE:
    def __call__(self, y_true, y_pred):
        return self.function(y_true, y_pred)
    def function(self, y_true, y_pred):
        return mean_absolute_error(y_true, y_pred)
    def derivative(self, y_true, y_pred):
        n_samples = y_true.shape[0]
        y_true_reshaped = y_true.copy()
        y_true_reshaped = y_true_reshaped.reshape(-1, 1)
        return np.where(y_true > y_pred, 1, -1)/n_samples

class HuberLoss:
    def __init__(self, delta=10):
        self.delta=delta
    def __call__(self, y_true, y_pred):
        return self.function(y_true, y_pred)
    def function(self, y_true, y_pred):
        return huberLoss(y_true, y_pred, self.delta)
    def derivative(self, y_true, y_pred):
        n_samples = y_true.shape[0]
        y_true_reshaped = y_true.copy()
        y_true_reshaped = y_true_reshaped.reshape(-1, 1)
        err = y_true_reshaped - y_pred
        huber_loss_derivative_vectorized = np.vectorize(lambda x: x if np.abs(x) <= self.delta else -self.delta if x < 0 else self.delta)
        return huber_loss_derivative_vectorized(err)/n_samples

#### Test on binary classification

In [None]:
y_true, y_pred = np.random.randint(0, 2, size=(3,)), np.random.uniform(0,1,(3,1))
print(y_true,"\n\n", y_pred)
y_pred_proba = np.hstack([y_pred, 1-y_pred])
print(y_pred_proba)

In [None]:
ce = Crossentropy()
print(f"Loss = {round(ce(y_true, y_pred_proba), 2)}, Derivative = {ce.derivative(y_true, y_pred_proba)}.")

#### Test on Multi-class classification

In [None]:
y_true_multiclass, y_pred_multiclass_preprocessed = np.random.randint(0, 3, size=(5,)), np.random.uniform(0,1,(5, 3))
y_pred_multiclass = y_pred_multiclass_preprocessed / y_pred_multiclass_preprocessed.sum(axis=1, keepdims=True)
print(f"{y_true_multiclass}\n\n{y_pred_multiclass}\n")

In [None]:
print(f"Loss = {round(ce(y_true_multiclass, y_pred_multiclass), 2)}, Derivative = {ce.derivative(y_true_multiclass, y_pred_multiclass), 2}.")

Rest assured, the return value is a numpy.ndarray

#### Test on Regression using Mean Squared Error

In [None]:
yt, yp = np.random.randint(0,10,(4,)), np.random.randint(0,10,(4,))
print(f"{yt}\n\n{yp}")

In [None]:
mse_loss_fn = MSE()
print(f"Loss = {mse_loss_fn(yt, yp)}\nderivative = \n{mse_loss_fn.derivative(yt, yp)}\n")

#### Test on Regression using Mean Absolute Error

In [None]:
mae_loss_fn = MAE()
print(f"Loss = {mae_loss_fn(yt, yp)}\nderivative = \n{mae_loss_fn.derivative(yt, yp)}\n")

#### Test on Regression using Huber Loss

In [None]:
huber_loss_fn = HuberLoss(delta=8)
print(f"Delta = {huber_loss_fn.delta}, Loss = {huber_loss_fn(yt, yp)}\nderivative = \n{huber_loss_fn.derivative(yt, yp)}\n")

### Define Layer and Sequential Model

In [197]:
class ListHandler(logging.Handler):
    def __init__(self):
        super().__init__()
        self.log = []

    def emit(self, record):
        self.log.append(self.format(record))

class Layer():
    
    __valid_activations = {'sigmoid': Sigmoid, 'tanh': Tanh, 'relu': ReLU, 'leaky_relu': LeakyReLU, 'linear': Linear}
    
    def __init__(self, activation, in_dim, out_dim, learning_rate=0.01):
        if activation.lower() not in list(Layer.__valid_activations.keys()):
            raise Exception(f"Valid activations are {Layer.__valid_activations}.")
        self.activation = Layer.__valid_activations[activation.lower()]()
        self.learning_rate = learning_rate
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.weights = np.random.uniform(-1,1, size=(in_dim, out_dim))
        self.batch_size = None

        logger_name = f'MyClass_{id(self)}'  # Generate a unique name for the logger
        self.logger = logging.getLogger(logger_name)
        self.logger.setLevel(logging.DEBUG)
        self.log_handler = ListHandler()
        self.log_handler.setLevel(logging.DEBUG)
        self.logger.addHandler(self.log_handler)

        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - \n%(message)s')
        self.log_handler.setFormatter(formatter)

    def forward_compute(self, X, compute_gradient=False, print_logs=False):
        '''
        compute_gradient: bool , is True if updates are to be performed. is False if only a prediction is to be made in a single forward pass,
        '''
        output_prime = X.dot(self.weights) # of order n x out_dim
        output_val = self.activation(output_prime) # of order n x out_dim

        if compute_gradient:
            # computing stuff for eventual backpropagation
            self._activation_gradient = self.activation.vectorized_derivative(output_prime) # of order n x out_dim
            self._input = X.copy()
        if print_logs:
            self.logger.info(f"\tFORWARD COMPUTE: Output from this layer = \n\t{output_val[:5]}\n")
        
        return output_val

    def backprop_compute(self, prev_grad_multipliers):
        gradient_mat = 1
        
        # To Do: find a way of multiplying pre v_grad_multipliers with this layer's gradient multiplier matrix.
        '''
        Needs: 
         1. current-layer's activation gradient matrix(as a function of this layer's input, i.e. prev layers output)
         2. current-layer's input matrix
        '''
        output_prime = self._input.dot(self.weights) # of order n x out_dim
        output_val = self.activation(output_prime) # of order n x out_dim
        
        activ_prev_layer_output_element_wise_product = np.multiply(prev_grad_multipliers, self._activation_gradient) # of order n x out_dim
        weights_gradient = self._input.T.dot(activ_prev_layer_output_element_wise_product)
        
        self.logger.info(f"\tInput to this layer = \n\t{self._input[:5]}\n")
        # self.logger.info(f"\tOutput from this layer = \n\t{output_val[:5]}\n")
        
        self.weights -= self.learning_rate * weights_gradient
        

        send_mat_to_prev_layer = activ_prev_layer_output_element_wise_product.dot(self.weights.T) # of order n x in_dim, 
                                                                      # which is basically n x out_dim for the previous layer.
        return send_mat_to_prev_layer

    def print_logs(self):
        for log_record in self.log_handler.log:
            print(log_record)

In [198]:
class ModelSequential():
    __valid_loss_functions = {
        'crossentropy': Crossentropy, 
        'mse': MSE,
        'mae': MAE,
        'huber': HuberLoss
    }

    __valid_gd_types = ['batch', 'mini_batch', 'stochastic']

    # def __init__(self, layers_arr: List, metrics_to_track, early_stopping=False):
    def __init__(self, layers_arr: List[Layer]):
        self._layers = layers_arr
        self.loss_arr = []

    def compile(self, loss_function, n_iter = 100):
        if loss_function.lower() not in ModelSequential.__valid_loss_functions:
            raise Exception(f"Loss functions should be any of {ModelSequential.__valid_loss_functions.keys()}.")
        
        self.loss_function = ModelSequential.__valid_loss_functions[loss_function.lower()]()
        self.n_iter = n_iter

    def reweight_layers(self):
        n_layers = len(self._layers)
        for i in range(n_layers):
            in_dim, out_dim = self._layers[i].in_dim, self._layers[i].out_dim
            self._layers[i].weights = np.random.uniform(-1,1, size=(in_dim, out_dim))
            print(f"Weights for layer {i} set to \n{self._layers[i].weights}\n\n")

    def batch_fit(self, X, y):
        '''
        for a given samples of X(features),y(target) , fit the model for n_iter iterations.
        X: features, all numeric columns, n x m matrix
        y: target variable, all numeric columns, n x c matrix , c: no. of classes/no. of numerical variables.
        '''

        self.loss_arr.append(self.loss_function(y_true=y, y_pred=self.predict(X)))
        
        pbar_iterations = tqdm(self.n_iter)
        pbar_iterations.set_description("#Iterations: ")
        
        for iter_ in range(self.n_iter):
            n_layers = len(self._layers)
            input_for_next_layer = X.copy()


            # forward compute
            for i in range(n_layers):
                try:
                    if i == 1:
                        input_for_next_layer = self._layers[i].forward_compute(input_for_next_layer, compute_gradient=True, print_logs=True)
                    else:
                        input_for_next_layer = self._layers[i].forward_compute(input_for_next_layer, compute_gradient=True)
                except ValueError as ve:
                    raise Exception(f"Forward compute failed at layer {i} with the following exception:\n{ve}")
                except RuntimeWarning as rw:
                    raise Exception(f"Forward compute failed at layer {i} with the following exception:\n {rw}")

            y_pred = self.predict(X)
            input_for_next_layer = self.loss_function.derivative(y_true=y, y_pred=y_pred)

        
            # backprop
            for i in range(n_layers-1, -1, -1):
                with warnings.catch_warnings():
                    warnings.filterwarnings("error")
                    try:
                        input_for_next_layer = self._layers[i].backprop_compute(input_for_next_layer)
                    except ValueError as ve:
                        return Exception(f"Backpropagation failed at layer {i} with the following exception:\n{ve}")
                    except RuntimeWarning as rw:
                        raise Exception(f"Forward compute failed at layer {i} with the following exception:\n {rw}")
            
            self.loss_arr.append(self.loss_function(y_true=y, y_pred=self.predict(X)))

            pbar_iterations.update(1)
        pbar_iterations.close()
    
    def fit(self, X, y, gd_type='mini_batch', batch_size=None):
        if gd_type.lower() not in list(ModelSequential.__valid_gd_types):
            raise Exception(f"Valid Gradient descent types are {ModelSequential.__valid_gd_types}.")

        n_samples = X.shape[0]
        
        if gd_type.lower() == 'batch':
            batch_size = n_samples
        elif gd_type.lower() == 'mini_batch':
            if type(batch_size) != 'int' or type(batch_size) != 'int64' or batch_size >= n_samples:
                batch_size=64
        else:
            batch_size = 1
        print(f"Using batch size as {batch_size}...\n\n")
        n_batches = floor(n_samples/batch_size)

        # iterate over each batch_size sized batch
        for batch_iter in range(n_batches):            
            X_new, y_new = X[batch_iter * batch_size:(batch_iter + 1) * batch_size, :].copy(), y[batch_iter * batch_size:(batch_iter + 1) * batch_size].copy()
            self.batch_fit(X_new, y_new)

    def print_layer_logs(self):
        for i in range(n_layers):
            print(f"Handling layer {i}...")
            self._layers[i].print_logs()
    
    def predict(self, X):
        n_layers = len(self._layers)
        input_for_next_layer = X.copy()

        # forward compute
        for i in range(n_layers):
            with warnings.catch_warnings():
                try:
                    input_for_next_layer = self._layers[i].forward_compute(input_for_next_layer)
                except ValueError as ve:
                    raise Exception(f"Forward compute failed at layer {i} with the following exception:\n{ve}")
                except RuntimeWarning as rw:
                    raise Exception(f"Forward compute failed at layer {i} with the following exception:\n {rw}")
        return input_for_next_layer

### Test

In [34]:
# from sklearn.datasets import load_diabetes

# X, y = load_diabetes(return_X_y=True)
# print(X.shape, y.shape)

(442, 10) (442,)


In [199]:
l1, l2, l3 = Layer('sigmoid', X.shape[1], 3, learning_rate=0.05), Layer('leaky_relu', 3, 2, learning_rate=0.05), Layer('linear', 2, 1, learning_rate=0.05)

In [200]:
# Load the weights_list from the file using NumPy
loaded_weights = np.load('model_weights.npy', allow_pickle=True)

# Separate the loaded weights into individual arrays
loaded_layer1_weights = loaded_weights[0][0]
loaded_layer2_weights = loaded_weights[1][0]
loaded_layer3_weights = loaded_weights[2][0]

# Now you have loaded the weights into separate NumPy arrays
# loaded_layer1_weights, loaded_layer2_weights, loaded_layer3_weights
l1.weights = loaded_layer1_weights
l2.weights = loaded_layer2_weights
l3.weights = loaded_layer3_weights

In [201]:
print(loaded_layer3_weights)

[[0.65107906]
 [0.5028274 ]]


In [202]:
model = ModelSequential([l1, l2, l3])

In [203]:
# model.compile(loss_function='huber', n_iter=10)
model.compile(loss_function='mse', n_iter=10)

In [204]:
init_pred = model.predict(X)
print(model.loss_function(y_true=y, y_pred=init_pred))

29029.82125825195


In [205]:
# model.reweight_layers()

n_layers = len(model._layers)
for i in range(n_layers):
    print(f"Layer {i}\n{model._layers[i].weights}\n\n")


model.fit(X, y, batch_size=10)

Layer 0
[[ 0.38481784 -0.2873984   0.2569083 ]
 [ 0.63190603  0.0207603  -0.01701885]
 [-0.14716172 -0.3139873   0.13053173]
 [-0.5871992   0.58255076 -0.06661969]
 [-0.22049424 -0.00403428 -0.14160568]
 [-0.3763931  -0.22384077 -0.27747136]
 [-0.04296637 -0.07706636  0.31943142]
 [-0.02129191 -0.07783592 -0.5941539 ]
 [-0.17432415 -0.03620058  0.60362446]
 [ 0.40245163  0.31902552 -0.0473246 ]]


Layer 1
[[-0.3344916   0.68226814]
 [-0.31428093  0.6829982 ]
 [-1.0913304  -0.75725216]]


Layer 2
[[0.65107906]
 [0.5028274 ]]


Using batch size as 64...




0it [00:00, ?it/s]

Exception: Forward compute failed at layer 2 with the following exception:
 overflow encountered in subtract

In [206]:
model._layers[1].print_logs()

2023-11-27 02:31:40,924 - MyClass_5715041056 - INFO - 
	FORWARD COMPUTE: Output from this layer = 
	[[-0.00881138  0.30266174]
 [-0.00869384  0.29330342]
 [-0.00883482  0.3031546 ]
 [-0.00852007  0.30267831]
 [-0.00859889  0.29966052]]

2023-11-27 02:31:40,927 - MyClass_5715041056 - INFO - 
	Input to this layer = 
	[[0.50973872 0.49716206 0.50799134]
 [0.49390647 0.49379042 0.50304404]
 [0.5187596  0.49035977 0.50933283]
 [0.48544146 0.49940304 0.48809936]
 [0.48653979 0.50100949 0.49452305]]

2023-11-27 02:31:40,940 - MyClass_5715041056 - INFO - 
	FORWARD COMPUTE: Output from this layer = 
	[[-1.04327706e-02  6.14867711e+01]
 [-1.08765252e-02  6.40228321e+01]
 [-9.92640710e-03  5.84480870e+01]
 [-8.17846153e-03  4.83310876e+01]
 [-7.86080376e-03  4.62850753e+01]]

2023-11-27 02:31:40,942 - MyClass_5715041056 - INFO - 
	Input to this layer = 
	[[0.64604683 0.63444812 0.63942382]
 [0.6660295  0.66547955 0.66767432]
 [0.6217095  0.59455525 0.60880264]
 [0.49750106 0.51147867 0.50006544]


In [196]:
model._layers[2].print_logs()

2023-11-27 02:29:52,596 - MyClass_5714031376 - INFO - 
	Input to this layer = 
	[[-0.00881138  0.30266174]
 [-0.00869384  0.29330342]
 [-0.00883482  0.3031546 ]
 [-0.00852007  0.30267831]
 [-0.00859889  0.29966052]]

2023-11-27 02:29:52,611 - MyClass_5714031376 - INFO - 
	Input to this layer = 
	[[-1.04327706e-02  6.14867711e+01]
 [-1.08765252e-02  6.40228321e+01]
 [-9.92640710e-03  5.84480870e+01]
 [-8.17846153e-03  4.83310876e+01]
 [-7.86080376e-03  4.62850753e+01]]

2023-11-27 02:29:52,622 - MyClass_5714031376 - INFO - 
	Input to this layer = 
	[[-1.78000825e-306  2.12850461e-300]
 [-1.80534142e-002  2.15879761e+004]
 [-1.78000825e-306  2.12850461e-300]
 [-1.80534142e-002  2.15879761e+004]
 [-1.80534142e-002  2.15879761e+004]]

2023-11-27 02:29:52,633 - MyClass_5714031376 - INFO - 
	Input to this layer = 
	[[-8.05125001e-298  9.62784463e-288]
 [-8.16583584e+006  9.76486864e+016]
 [-8.05125001e-298  9.62784463e-288]
 [-8.16583584e+006  9.76486864e+016]
 [-8.16583584e+006  9.76486864e

In [64]:
y_pred = model.predict(X)
model.loss_function(y_true=y, y_pred=y_pred)

ValueError: Input contains infinity or a value too large for dtype('float64').

In [65]:
n_layers = len(model._layers)
for i in range(n_layers):
    print(f"Layer {i}\n{model._layers[i].weights}\n\n")

Layer 0
[[-385171.84 -379742.34 -393323.38]
 [-289175.16 -289028.84 -289466.56]
 [-361098.84 -361032.   -366622.97]
 [-458592.4  -464064.62 -465368.4 ]
 [-478988.56 -480969.22 -488474.25]
 [-496577.3  -498434.34 -503756.25]
 [ 367055.38  366053.84  362682.28]
 [-577394.7  -577423.8  -580373.44]
 [-462128.   -461964.94 -467120.38]
 [-438094.75 -436584.1  -439147.1 ]]


Layer 1
[[-2.7219453e+08  3.2549562e+16]
 [-2.7219453e+08  3.2549562e+16]
 [-2.7219453e+08  3.2549562e+16]]


Layer 2
[[1.4531762e+33]
 [         -inf]]




In [64]:
print(model.loss_arr)

[25925.177240833218, 21382.184231005536, 16928294.951077983, 1.0939799763713171e+19, 1.4035903790250388e+67, 1.1190986420038096e+269]


### Print all layer's weights

In [None]:
n_layers = len(model._layers)
for i in range(n_layers):
    print(f"Layer {i}\n{model._layers[i].weights}\n\n")

In [None]:
print(dir(model))

In [None]:
import sys
print(sys.getsizeof(model))