In [None]:
import numpy as np

**Multi Layer Perceptron (MLP) from scratch using Backpropogation**

**Layer**
Initialized as layer = Layer(input_dim, output_dim).  
We define *weights* and *biases* as the learnable parameters that gets updated by the optimization method.  
The forward pass computes $Z = weight^T.X + bias$
The update_weights function would update the *weights* and *biases* with their gradient.

In [None]:
class Layer:
    def __init__(self, input_dim, output_dim):
        self.weights = np.random.randn(input_dim, output_dim) * np.sqrt(2 / input_dim)
        self.bias = np.zeros(output_dim)
        self.input = None
        self.output = None
    
    def forward(self, X):
        self.input = X
        self.output = np.dot(X, self.weights) + self.bias
        return self.output
    
    def update_weights(self, dW, db, learning_rate):
        self.weights -= learning_rate * dW
        self.bias -= learning_rate * db

**Activation Layer**  
Supports both *Sigmoid* and *ReLU* activation.  

In [None]:
class Activation(object):
    def __init__(self, activation_type):
        # Initialize the activation function based on the activation type
        if activation_type == "sigmoid":
            self.activation_func = self.sigmoid
            self.activation_prime = self.sigmoid_prime
        elif activation_type == "relu":
            self.activation_func = self.relu
            self.activation_prime = self.relu_prime

    def sigmoid(self, x):
        # Compute the sigmoid activation function
        return 1 / (1 + np.exp(-x))

    def sigmoid_prime(self, x):
        # Compute the derivative of the sigmoid activation function
        sigmoid = self.sigmoid(x)
        return sigmoid * (1 - sigmoid)

    def relu(self, x):
        # Compute the rectified linear unit (ReLU) activation function
        return np.maximum(0, x)

    def relu_prime(self, x):
        # Compute the derivative of the rectified linear unit (ReLU) activation function
        return (x > 0).astype(int)

    def forward(self, input):
        # Compute the forward pass through the activation function
        self.input = input
        self.output = self.activation_func(input)
        return self.output

    def backward(self, grad_output):
        # Compute the gradient of the activation function with respect to its input
        return grad_output * self.activation_prime(self.input)


**Loss**  
Implemented both *mean standard error* and *categorical cross entropy* activation.  
Forward pass computes the loss wrt *y_true* and *y_pred*.  
Backword pass computes the gradient.

In [None]:
class Loss(object):
    def __init__(self, loss_type):
        # Initialize the loss function based on the loss type
        if loss_type == "mse":
            self.loss_func = self.mse_loss
            self.loss_prime = self.mse_loss_prime
        elif loss_type == "cross_entropy":
            self.loss_func = self.cross_entropy_loss
            self.loss_prime = self.cross_entropy_loss_prime

    def mse_loss(self, y_true, y_pred):
        # Compute the mean squared error (MSE) loss function
        return np.mean((y_true - y_pred) ** 2)

    def mse_loss_prime(self, y_true, y_pred):
        # Compute the derivative of the mean squared error (MSE) loss function with respect to its input
        return -2 * (y_true - y_pred)

    def cross_entropy_loss(self, y_true, y_pred):
        # Compute the cross-entropy loss function
        return -np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    def cross_entropy_loss_prime(self, y_true, y_pred):
        # Compute the derivative of the cross-entropy loss function with respect to its input
        return -y_true / y_pred + (1 - y_true) / (1 - y_pred)

    def forward(self, y_true, y_pred):
        # Compute the forward pass through the loss function
        self.y_true = y_true
        self.y_pred = y_pred
        self.output = self.loss_func(y_true, y_pred)
        return self.output

    def backward(self, y_true, y_pred):
        # Compute the gradient of the loss function with respect to its input
        return self.loss_prime(y_true, y_pred)

**MLP Class**  
Supports arbitrary number of hidden layers. Pass a list *hidden_layers = [16, 32, 64]* to intialize the 3 hidden layers with 16, 32 and 64 neurons per layer.  
Forward pass computes the activation from each neuron in the layer.    
Backword pass computes the loss at the final layer and the gradients for weight and biases. It then backpropogates the gradient to update the weight and biases.  

In [None]:
class MLP:
    def __init__(self, input_dim, hidden_layers, output_dim, activation_type, loss_type):
        self.layers = []
        self.num_layers = len(hidden_layers) + 1
        
        # Initialize input layer
        self.layers.append(Layer(input_dim, hidden_layers[0]))
        
        # Initialize hidden layers
        for i in range(self.num_layers - 2):
            self.layers.append(Layer(hidden_layers[i], hidden_layers[i+1]))
        
        # Initialize output layer
        self.layers.append(Layer(hidden_layers[-1], output_dim))
        
        # Initialize activation function
        self.activation = Activation(activation_type)
        
        # Initialize loss function
        self.loss = Loss(loss_type)
    
    def forward(self, X):
        A = X
        for layer in self.layers:
            Z = layer.forward(A)
            A = self.activation.forward(Z)
        return A
    
    def backward(self, X, y, y_pred, learning_rate):
        dA_prev = self.loss.backward(y, y_pred)
        for layer in reversed(self.layers):
            dZ = self.activation.backward(layer.output)
            dA = np.dot(dA_prev, layer.weights.T)
            dW = np.dot(layer.input.T, dA_prev * dZ) / X.shape[0]
            db = np.mean(dA_prev * dZ, axis=0)
            layer.update_weights(dW, db, learning_rate)
            dA_prev = dA
    
    def train(self, X, y, learning_rate, epochs):
        for i in range(epochs):
            y_pred = self.forward(X)
            self.backward(X, y, y_pred, learning_rate)
    
    def predict(self, X):
        return self.forward(X)

In [None]:
import numpy as np
# from mlp import MLP

# Generate a random dataset of size (1000, 10)
X_train = np.random.rand(1000, 10)
y_train = np.random.rand(1000, 1)

# Define the parameters of the MLP
input_dim = 10
hidden_layers = [10] * 10
output_dim = 1
activation_type = 'sigmoid'
loss_type = 'cross_entropy'
learning_rate = 0.001
epochs = 1000

# Create an instance of the MLP class
mlp = MLP(input_dim, hidden_layers, output_dim, activation_type, loss_type)

# Train the MLP on the dataset
mlp.train(X_train, y_train, learning_rate, epochs)

# Make predictions on the same dataset
y_pred = mlp.predict(X_train)

In [None]:
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

wine_dataset = load_diabetes()
X, y = wine_dataset.data, wine_dataset.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
X_train.shape, y_train.shape

((353, 10), (353,))

In [79]:
input_dim = X_train.shape[1]
hidden_layers = [8, 16, 32, 64, 128, 64, 32, 16, 8]
output_dim = 1
activation_type = 'sigmoid'
loss_type = 'mse'
learning_rate = 0.005
epochs = 10000

In [None]:
mlp = MLP(input_dim, hidden_layers, output_dim, activation_type, loss_type)

# Train the MLP on the dataset
mlp.train(X_train, y_train.reshape(-1, output_dim), learning_rate, epochs)

# Make predictions on the same dataset
y_pred = mlp.predict(X_train)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

print(mean_squared_error(y_train.reshape(-1, output_dim), y_pred, squared=False))

169.56369613520278
