In [1]:
from typing import List
import warnings

from tqdm.notebook import tqdm

import logging
from io import StringIO

import numpy as np
import pandas as pd

### Define signature of a generic activation function class.

In [2]:
from abc import ABC, abstractmethod
class ActivationFunction(ABC):
    def __init__(self):
        pass
    @abstractmethod
    def function(self):
        pass
    @abstractmethod
    def vectorized_function(self):
        pass
    @abstractmethod
    def derivative(self):
        pass
    @abstractmethod
    def __call__(self):
        pass

### Define all activation functions

In [3]:
class Sigmoid(ActivationFunction):
    def __call__(self, x):
        return self.vectorized_function(x)
    def function(self, x):
        x = np.clip(x, -700, 700)
        return 1/(1+np.exp(-x))
    def vectorized_function(self, x):
        return np.vectorize(self.function)(x)
    def derivative(self, x):
        return self.vectorized_function(x)*(1-self.vectorized_function(x))
    def vectorized_derivative(self, x):
        return self.vectorized_function(x)*(1-self.vectorized_function(x))

class Tanh(ActivationFunction):
    def __call__(self, x):
        return self.vectorized_function(x)
    def function(self, x):
        x = np.clip(x, -100, 100)
        # print(x)
        expr1, expr2 = np.exp(x), np.exp(-x)
        return (expr1-expr2)/(expr1+expr2)
        
    def vectorized_function(self, x):
        return np.vectorize(self.function)(x)
    def derivative(self, x):
        expr1, expr2 = np.exp(x), np.exp(-x)
        return 4/((expr1+expr2)**2)
    def vectorized_derivative(self, x):
        return self.derivative(x)

class ReLU(ActivationFunction):
    def __call__(self, x):
        return self.vectorized_function(x)
    def function(self, x):
        return max(0, x)
    def derivative(self, x):
        if max(0,x) == 0:
            return 0
        return 1
    def vectorized_function(self, x):
        return np.vectorize(self.function)(x)
    def vectorized_derivative(self, x):
        return np.vectorize(self.derivative)(x)

class LeakyReLU(ActivationFunction):
    def __call__(self, x):
        return self.vectorized_function(x)
    def function(self, x):
        return max(0.01*x, x)
    def derivative(self, x):
        if max(0.01*x, x) == x:
            return 1
        return 0.01
    def vectorized_function(self, x):
        return np.vectorize(self.function)(x)
    def vectorized_derivative(self, x):
        return np.vectorize(self.derivative)(x)

class Linear:
    def __call__(self, x):
        return x
    def vectorized_derivative(self, x):
        return np.ones_like(x)

### Import existing loss functions and code new ones

In [4]:
def huberLoss(y_true, y_pred, delta=10):
    err = y_true - y_pred
    n_samples = y_true.shape[0]
    abs_err = np.abs(err)
    delta_sq = 0.5*(delta ** 2)
    huber_loss_vectorized = np.vectorize(lambda x: (x**2)*0.5 if x <= delta else delta*x - delta)
    huber_loss_vec = huber_loss_vectorized(abs_err)
    return np.sum(huber_loss_vec)/n_samples
    # return np.sum(huber_loss_vectorized(abs_err))

In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, log_loss

class Crossentropy:
    def __call__(self, y_true, y_pred):
        return self.function(y_true, y_pred)
    def function(self, y_true, y_pred):
        return log_loss(y_true, y_pred, labels = np.arange(y_pred.shape[1]))
    def derivative(self, y_true, y_pred):
        # return an n x 1 matrix
        epsilon = 1e-6
        n_samples = y_true.shape[0]
        y_pred_capped = np.clip(y_pred, epsilon, 1-epsilon) # cap predicted probabilities to avoid floating point issues after taking reciprocal
        y_pred_inv = 1/y_pred_capped # 1/y_hat
        n_classes = y_pred.shape[1]
        y_true_proba = np.eye(n_classes)[y_true] # for ease of computing the derivative, basically a one-hot encoding
        derivative_loss_arr = -np.multiply(y_true_proba, y_pred_inv)# -[y/y_hat, (1-y)/(1-y_hat)]
        derivative_loss_arr = np.sum(derivative_loss_arr, axis=1) # sum up, i.e., y/y_hat + (1-y)/(1-y_hat)  
        return derivative_loss_arr/n_samples

class MSE:
    def __call__(self, y_true, y_pred):
        return self.function(y_true, y_pred)
    def function(self, y_true, y_pred):
        return mean_squared_error(y_true, y_pred)
    def derivative(self, y_true, y_pred):
        n_samples = y_true.shape[0]
        y_true_reshaped = y_true.copy()
        y_true_reshaped = y_true_reshaped.reshape(-1, 1)
        return -2*(y_true_reshaped - y_pred)/n_samples

class MAE:
    def __call__(self, y_true, y_pred):
        return self.function(y_true, y_pred)
    def function(self, y_true, y_pred):
        return mean_absolute_error(y_true, y_pred)
    def derivative(self, y_true, y_pred):
        n_samples = y_true.shape[0]
        y_true_reshaped = y_true.copy()
        y_true_reshaped = y_true_reshaped.reshape(-1, 1)
        return np.where(y_true > y_pred, 1, -1)/n_samples

class HuberLoss:
    def __init__(self, delta=10):
        self.delta=delta
    def __call__(self, y_true, y_pred):
        return self.function(y_true, y_pred)
    def function(self, y_true, y_pred):
        return huberLoss(y_true, y_pred, self.delta)
    def derivative(self, y_true, y_pred):
        n_samples = y_true.shape[0]
        y_true_reshaped = y_true.copy()
        y_true_reshaped = y_true_reshaped.reshape(-1, 1)
        err = y_true_reshaped - y_pred
        huber_loss_derivative_vectorized = np.vectorize(lambda x: x if np.abs(x) <= self.delta else -self.delta if x < 0 else self.delta)
        return huber_loss_derivative_vectorized(err)/n_samples



#### Test on binary classification

In [None]:
y_true, y_pred = np.random.randint(0, 2, size=(3,)), np.random.uniform(0,1,(3,1))
print(y_true,"\n\n", y_pred)
y_pred_proba = np.hstack([y_pred, 1-y_pred])
print(y_pred_proba)

In [None]:
ce = Crossentropy()
print(f"Loss = {round(ce(y_true, y_pred_proba), 2)}, Derivative = {ce.derivative(y_true, y_pred_proba)}.")

#### Test on Multi-class classification

In [None]:
y_true_multiclass, y_pred_multiclass_preprocessed = np.random.randint(0, 3, size=(5,)), np.random.uniform(0,1,(5, 3))
y_pred_multiclass = y_pred_multiclass_preprocessed / y_pred_multiclass_preprocessed.sum(axis=1, keepdims=True)
print(f"{y_true_multiclass}\n\n{y_pred_multiclass}\n")

In [None]:
print(f"Loss = {round(ce(y_true_multiclass, y_pred_multiclass), 2)}, Derivative = {ce.derivative(y_true_multiclass, y_pred_multiclass), 2}.")

Rest assured, the return value is a numpy.ndarray

#### Test on Regression using Mean Squared Error

In [None]:
yt, yp = np.random.randint(0,10,(4,)), np.random.randint(0,10,(4,))
print(f"{yt}\n\n{yp}")

In [None]:
mse_loss_fn = MSE()
print(f"Loss = {mse_loss_fn(yt, yp)}\nderivative = \n{mse_loss_fn.derivative(yt, yp)}\n")

#### Test on Regression using Mean Absolute Error

In [None]:
mae_loss_fn = MAE()
print(f"Loss = {mae_loss_fn(yt, yp)}\nderivative = \n{mae_loss_fn.derivative(yt, yp)}\n")

#### Test on Regression using Huber Loss

In [None]:
huber_loss_fn = HuberLoss(delta=8)
print(f"Delta = {huber_loss_fn.delta}, Loss = {huber_loss_fn(yt, yp)}\nderivative = \n{huber_loss_fn.derivative(yt, yp)}\n")

### Define Layer and Sequential Model

In [23]:
class ListHandler(logging.Handler):
    def __init__(self):
        super().__init__()
        self.log = []

    def emit(self, record):
        self.log.append(self.format(record))

class Layer():
    
    __valid_activations = {'sigmoid': Sigmoid, 'tanh': Tanh, 'relu': ReLU, 'leaky_relu': LeakyReLU, 'linear': Linear}
    
    def __init__(self, activation, in_dim, out_dim, learning_rate=0.01):
        if activation.lower() not in list(Layer.__valid_activations.keys()):
            raise Exception(f"Valid activations are {Layer.__valid_activations}.")
        self.activation = Layer.__valid_activations[activation.lower()]()
        self.learning_rate = learning_rate
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.weights = np.random.uniform(-1,1, size=(in_dim, out_dim))

        logger_name = f'MyClass_{id(self)}'  # Generate a unique name for the logger
        self.logger = logging.getLogger(logger_name)
        self.logger.setLevel(logging.DEBUG)
        self.log_handler = ListHandler()
        self.log_handler.setLevel(logging.DEBUG)
        self.logger.addHandler(self.log_handler)

        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - \n%(message)s')
        self.log_handler.setFormatter(formatter)

    def forward_compute(self, X, compute_gradient=False):
        output_prime = X.dot(self.weights) # of order n x out_dim
        output_val = self.activation(output_prime) # of order n x out_dim

        if compute_gradient:
            # computing stuff for eventual backpropagation
            self._activation_gradient = self.activation.vectorized_derivative(output_prime) # of order n x out_dim
            self._input = X
        
        return output_val

    def backprop_compute(self, prev_grad_multipliers):
        gradient_mat = 1
        
        # To Do: find a way of multiplying pre v_grad_multipliers with this layer's gradient multiplier matrix.
        '''
        Needs: 
         1. current-layer's activation gradient matrix(as a function of this layer's input, i.e. prev layers output)
         2. current-layer's input matrix
        '''
        self.learning_rate /= 2
        activ_prev_layer_output_element_wise_product = np.multiply(prev_grad_multipliers, self._activation_gradient) # of order n x out_dim
        weights_gradient = self._input.T.dot(activ_prev_layer_output_element_wise_product)
        self.weights -= self.learning_rate * weights_gradient
        self.logger.info(f"\tWeight Gradient = \n\t{self.learning_rate * weights_gradient}\n")
        self.logger.info(f"\tWeights = \n\t{self.weights}\n\n")

        send_mat_to_prev_layer = activ_prev_layer_output_element_wise_product.dot(self.weights.T) # of order n x in_dim, 
                                                                      # which is basically n x out_dim for the previous layer.
        return send_mat_to_prev_layer

    def print_logs(self):
        for log_record in self.log_handler.log:
            print(log_record)

In [7]:
class ModelSequential():
    __valid_loss_functions = {
        'crossentropy': Crossentropy, 
        'mse': MSE,
        'mae': MAE,
        'huber': HuberLoss
    }

    # def __init__(self, layers_arr: List, metrics_to_track, early_stopping=False):
    def __init__(self, layers_arr: List[Layer]):
        self._layers = layers_arr

    def compile(self, loss_function, n_iter = 100):
        if loss_function.lower() not in ModelSequential.__valid_loss_functions:
            raise Exception(f"Loss functions should be any of {ModelSequential.__valid_loss_functions.keys()}.")
        
        self.loss_function = ModelSequential.__valid_loss_functions[loss_function.lower()]()
        self.n_iter = n_iter

    def reweight_layers(self):
        n_layers = len(self._layers)
        for i in range(n_layers):
            in_dim, out_dim = self._layers[i].in_dim, self._layers[i].out_dim
            self._layers[i].weights = np.random.uniform(-1,1, size=(in_dim, out_dim))
            print(f"Weights for layer {i} set to \n{self._layers[i].weights}\n\n")
    def fit(self, X, y):
        pbar_iterations = tqdm(self.n_iter)
        pbar_iterations.set_description("#Iterations: ")
        for iter_ in range(self.n_iter):
            n_layers = len(self._layers)
            input_for_next_layer = X.copy()


            # forward compute
            for i in range(n_layers):
                try:
                    input_for_next_layer = self._layers[i].forward_compute(input_for_next_layer, compute_gradient=True)
                except ValueError as ve:
                    raise Exception(f"Forward compute failed at layer {i} with the following exception:\n{ve}")
                except RuntimeWarning as rw:
                    raise Exception(f"Forward compute failed at layer {i} with the following exception:\n {rw}")

            y_pred = self.predict(X)
            input_for_next_layer = self.loss_function.derivative(y_true=y, y_pred=y_pred)

        
            # backprop
            for i in range(n_layers-1, -1, -1):
                with warnings.catch_warnings():
                    warnings.filterwarnings("error")
                    try:
                        input_for_next_layer = self._layers[i].backprop_compute(input_for_next_layer)
                    except ValueError as ve:
                        return Exception(f"Backpropagation failed at layer {i} with the following exception:\n{ve}")
                    except RuntimeWarning as rw:
                        raise Exception(f"Forward compute failed at layer {i} with the following exception:\n {rw}")

            pbar_iterations.update(1)
        pbar_iterations.close()

    def print_layer_logs(self):
        for i in range(n_layers):
            print(f"Handling layer {i}...")
            self._layers[i].print_logs()
    
    def predict(self, X):
        n_layers = len(self._layers)
        input_for_next_layer = X.copy()

        # forward compute
        for i in range(n_layers):
            with warnings.catch_warnings():
                try:
                    input_for_next_layer = self._layers[i].forward_compute(input_for_next_layer, compute_gradient=True)
                except ValueError as ve:
                    raise Exception(f"Forward compute failed at layer {i} with the following exception:\n{ve}")
                except RuntimeWarning as rw:
                    raise Exception(f"Forward compute failed at layer {i} with the following exception:\n {rw}")
        return input_for_next_layer

### Test

In [8]:
import sys

def get_y(x):
    return x[:,0] + (x[:,1]**2) + np.abs(x[:, 2])

x_10K = np.random.randint(-100,100,size=(10000, 3))
y_10K = get_y(x_10K)

# Column-wise normalization using L2 normalization (Euclidean norm)
x_10K_normalized = x_10K / np.linalg.norm(x_10K, axis=0)  # Normalizing along axis 0 (columns)

x_100K = np.random.randint(-100,100,size=(100000, 3))
y_100K = get_y(x_10K)

x_1M = np.random.randint(-100,100,size=(1000000, 3))
y_1M = get_y(x_1M)

x_10M = np.random.randint(-100,100,size=(10000000, 3))
y_10M = get_y(x_10M)

print(sys.getsizeof(x_10K), sys.getsizeof(x_100K), sys.getsizeof(x_1M), sys.getsizeof(x_10M))
print()
print(sys.getsizeof(y_10K), sys.getsizeof(y_100K), sys.getsizeof(y_1M), sys.getsizeof(x_10M))

240120 2400120 24000120 240000120

80104 80104 8000104 240000120


In [24]:
l1, l2, l3 = Layer('sigmoid', x_10K.shape[1], 3, learning_rate=0.01), Layer('leaky_relu', 3, 2, learning_rate=0.01), Layer('linear', 2, 1, learning_rate=0.01)

In [25]:
model = ModelSequential([l1, l2, l3])

In [26]:
# model.compile(loss_function='huber', n_iter=10)
model.compile(loss_function='mse', n_iter=10)

In [27]:
init_pred_10k = model.predict(x_10K_normalized)
print(model.loss_function(y_true=y_10K, y_pred=init_pred_10k))

20181473.281987637


In [28]:
# model.reweight_layers()

n_layers = len(model._layers)
for i in range(n_layers):
    print(f"Layer {i}\n{model._layers[i].weights}\n\n")


model.fit(x_10K_normalized, y_10K)

Layer 0
[[ 0.83362112  0.92637197  0.23902398]
 [ 0.21962276 -0.66480166 -0.76965921]
 [-0.87562595 -0.88478715 -0.03543006]]


Layer 1
[[-0.14251452  0.83318855]
 [-0.60185958  0.85707514]
 [ 0.58464671  0.92996537]]


Layer 2
[[0.84554119]
 [0.15461206]]




0it [00:00, ?it/s]

In [29]:
model.print_layer_logs()

Handling layer 0...
2023-11-24 13:54:03,256 - MyClass_5473425728 - INFO - 
	Weight Gradient = 
	[[-46.34617646 -46.3503554  -46.35194443]
 [ 24.3774111   24.37423523  24.3758658 ]
 [ 16.44985882  16.45107686  16.45310844]]

2023-11-24 13:54:03,256 - MyClass_5473425728 - INFO - 
	Weights = 
	[[ 47.17979758  47.27672737  46.59096841]
 [-24.15778834 -25.0390369  -25.14552501]
 [-17.32548477 -17.33586401 -16.48853849]]


2023-11-24 13:54:04,357 - MyClass_5473425728 - INFO - 
	Weight Gradient = 
	[[-1.06480716e+12 -1.06317238e+12 -1.06633856e+12]
 [ 4.95363521e+11  4.92117551e+11  4.93035422e+11]
 [ 3.77533049e+11  3.77101899e+11  3.80432617e+11]]

2023-11-24 13:54:04,357 - MyClass_5473425728 - INFO - 
	Weights = 
	[[ 1.06480716e+12  1.06317238e+12  1.06633856e+12]
 [-4.95363521e+11 -4.92117551e+11 -4.93035422e+11]
 [-3.77533049e+11 -3.77101899e+11 -3.80432617e+11]]


2023-11-24 13:54:05,471 - MyClass_5473425728 - INFO - 
	Weight Gradient = 
	[[-5.70383432e-252  1.14091698e-252 -1.35570294e

In [30]:
y_pred_10k = model.predict(x_10K_normalized)
model.loss_function(y_true=y_10K, y_pred=y_pred_10k)

ValueError: Input contains NaN.

In [None]:
n_layers = len(model._layers)
for i in range(n_layers):
    print(f"Layer {i}\n{model._layers[i].weights}\n\n")

### Print all layer's weights

In [None]:
n_layers = len(model._layers)
for i in range(n_layers):
    print(f"Layer {i}\n{model._layers[i].weights}\n\n")

In [None]:
print(dir(model))

In [None]:
import sys
print(sys.getsizeof(model))