**Neural Network Implementation (inspired by):**
- "Neural Network from scratch in Python" by Omar Aflak: [Link](https://towardsdatascience.com/math-neural-network-from-scratch-in-python-d6da9f29ce65)



This Python program implements a neural network with adjustments inspired by the code
from the above-mentioned source. The original code served as a foundational reference,
and modifications have been made to suit specific requirements and preferences.

In [28]:
!pip install tensorflow

In [29]:
# Abstract base class to handle input, output, forward prop., backward prop.
class Layer:
  """Abstract base class shared by every layer of the network.

  A layer remembers the last input it received and the output it produced
  for it. Subclasses must override both propagation methods; the versions
  here only raise ``NotImplementedError``.
  """

  def __init__(self):
    self.input = None   # last value passed to forward_propagation
    self.output = None  # last value returned by forward_propagation

  def forward_propagation(self, input_data):
    """Return the layer output for ``input_data`` (must be overridden)."""
    raise NotImplementedError

  def backward_propagation(self, output_error, learning_rate):
    """Return dE/dX given dE/dY in ``output_error`` (must be overridden).

    Implementations may also update their own parameters using
    ``learning_rate``.
    """
    raise NotImplementedError

In [44]:
import numpy as np

# Fully Connected (FC) - inherits from base class Layer
class FCLayer(Layer):
  """Fully connected layer computing ``Y = XW + B``.

  ``weights`` has shape (input_size, output_size) and ``bias`` has shape
  (1, output_size). Both are drawn uniformly from [-0.5, 0.5) so that the
  neurons do not start out identical.
  """

  def __init__(self, input_size, output_size):
    """Create the layer.

    input_size: number of input neurons
    output_size: number of output neurons
    """
    super().__init__()
    self.weights = np.random.rand(input_size, output_size) - 0.5
    self.bias = np.random.rand(1, output_size) - 0.5

  def forward_propagation(self, input_data):
    """Store ``input_data`` and return ``input_data @ weights + bias``."""
    self.input = input_data
    self.output = np.dot(self.input, self.weights) + self.bias
    return self.output

  def backward_propagation(self, output_error, learning_rate):
    """Update weights and bias by one gradient step and return dE/dX.

    output_error: dE/dY, one row per sample, shaped like the layer output
    learning_rate: step size applied to both parameter updates
    """
    # dE/dX = dE/dY . W^T -- computed before the weights are modified.
    input_error = np.dot(output_error, self.weights.T)
    # dE/dW = X^T . dE/dY
    weights_error = np.dot(self.input.T, output_error)
    # dE/dB = dE/dY summed over the sample axis. For a single-row error this
    # is the error itself; summing keeps the bias shape (1, output_size)
    # when several rows are propagated at once.
    bias_error = np.sum(output_error, axis=0, keepdims=True)

    self.weights -= learning_rate * weights_error
    self.bias -= learning_rate * bias_error
    return input_error

## Coding the Activation Layer ##

In [45]:
# Activation class - inherits from base class Layer
class ActivationLayer(Layer):
  """Layer that applies an activation function to its input: y = f(x).

  It has no trainable parameters.
  """

  def __init__(self, activation, activation_prime):
    """Create the layer.

    activation: the function f applied in the forward pass
    activation_prime: its derivative f', used in the backward pass
    """
    super().__init__()
    self.activation = activation
    self.activation_prime = activation_prime

  def forward_propagation(self, input_data):
    """Store ``input_data`` and return ``activation(input_data)``."""
    self.input = input_data
    self.output = self.activation(self.input)
    return self.output

  def backward_propagation(self, output_error, learning_rate):
    """Return the input error for a given output error.

    Computes dE/dX = f'(x) * dE/dY, where x is the input stored by the last
    forward pass and dE/dY is ``output_error``. ``learning_rate`` is unused;
    it is accepted only to match the Layer interface.
    """
    return self.activation_prime(self.input) * output_error

## Activation and Loss Functions ##

In [46]:
# Activation function and its deriv. -  put in sep. file

def tanh(x):
  """Hyperbolic tangent activation.

  Mathematically: tanh(x) = (e^x - e^-x) / (e^x + e^-x).
  Delegates to ``np.tanh``.
  """
  activated = np.tanh(x)
  return activated

def tanh_prime(x):
  """Derivative of the tanh activation.

  Mathematically: tanh'(x) = 1 - tanh(x)^2.
  """
  squashed = np.tanh(x)
  return 1 - squashed ** 2

In [53]:
# 2 python functions (loss and deriv. of loss) used when creating the network -  put in separate file

def mse(y_true, y_pred):
  """Mean Squared Error between actual and predicted values.

  Mathematically: MSE = 1/n * Σ(y_true - y_pred)^2, where n is the number
  of elements.

  y_true, y_pred: arrays or plain (nested) sequences of matching or
    broadcastable shape
  Returns the mean of the squared differences as a single number.
  """
  # np.subtract also accepts plain Python sequences, which `-` does not.
  residual = np.subtract(y_true, y_pred)
  return np.mean(np.power(residual, 2))

def mse_prime(y_true, y_pred):
  """Derivative of the Mean Squared Error with respect to ``y_pred``.

  Used to start backpropagation. The result is element-wise:
  d(MSE)/d(y_pred_i) = 2/n * (y_pred_i - y_true_i), where n is the number
  of elements in ``y_true``.

  y_true, y_pred: arrays or plain (nested) sequences of matching or
    broadcastable shape
  Returns an array of gradients, one per element.
  """
  # np.subtract / np.size accept plain sequences as well as arrays.
  residual = np.subtract(y_pred, y_true)
  return 2 * residual / np.size(y_true)

## Network Class ##


In [54]:
class Network:
  """Sequential stack of layers trained one sample at a time.

  Layers are applied in the order they were added. A loss function and its
  derivative must be supplied with ``use`` before calling ``fit``.
  """

  def __init__(self):
    self.layers = []  # Layers in the network, in forward order
    self.loss = None  # Loss function
    self.loss_prime = None  # Derivative of the loss function

  def add(self, layer):
    """Append a layer (e.g. an FCLayer or ActivationLayer) to the network."""
    self.layers.append(layer)

  def use(self, loss, loss_prime):
    """Set the loss function and its derivative.

    loss: callable (y_true, y_pred) -> loss value
    loss_prime: callable (y_true, y_pred) -> gradient w.r.t. y_pred
    """
    self.loss = loss
    self.loss_prime = loss_prime

  def _forward(self, sample):
    """Push one sample through every layer and return the final output."""
    output = sample
    for layer in self.layers:
      output = layer.forward_propagation(output)
    return output

  def predict(self, input_data):
    """Return a list with the network output for each sample in input_data."""
    return [self._forward(sample) for sample in input_data]

  def predit(self, input_data):
    """Misspelled alias of ``predict``, kept so existing callers still work."""
    return self.predict(input_data)

  def fit(self, x_train, y_train, epochs, learning_rate):
    """Train the network and print the average loss after every epoch.

    x_train: training inputs, one sample per entry
    y_train: training targets (labels), at least one per input sample
    epochs: number of times the whole training set is passed through
    learning_rate: step size handed to every layer's backward pass

    Raises ValueError if x_train is empty or y_train has fewer entries than
    x_train.
    """
    training_samples = len(x_train)
    if training_samples == 0:
      # Otherwise the per-epoch average below divides by zero.
      raise ValueError("x_train must contain at least one sample")
    if len(y_train) < training_samples:
      raise ValueError(
          "y_train has %d samples but x_train has %d"
          % (len(y_train), training_samples))

    for epoch in range(epochs):
      err = 0  # Accumulated loss for the current epoch

      # zip stops at the end of x_train, so surplus targets are ignored.
      for sample, target in zip(x_train, y_train):
        output = self._forward(sample)

        # Loss for the current sample (for display purposes only)
        err += self.loss(target, output)

        # Backward pass: each layer receives dE/dY, updates its parameters
        # and hands dE/dX to the layer before it.
        error = self.loss_prime(target, output)
        for layer in reversed(self.layers):
          error = layer.backward_propagation(error, learning_rate)

      # Average loss over all samples for this epoch
      err /= training_samples
      print('epoch %d/%d   error=%f' % (epoch + 1, epochs, err))

# Building Neural Networks #


## XOR Neural Network ##
A simple sanity check to tell whether the network is learning anything at all.

In [55]:
# XOR truth table. Every sample is a (1, 2) row vector, so the inputs have
# shape (4, 1, 2) and the targets have shape (4, 1, 1).
x_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).reshape(4, 1, 2)
y_train = np.array([0, 1, 1, 0]).reshape(4, 1, 1)

# Build a 2 -> 3 -> 1 network trained against the MSE loss
net = Network()
net.use(mse, mse_prime)

# Hidden layer: 2 inputs, 3 outputs (weights of shape (2, 3), bias (1, 3))
net.add(FCLayer(2, 3))
# Element-wise tanh: y = tanh(x)
net.add(ActivationLayer(tanh, tanh_prime))
# Output layer: 3 inputs, 1 output (weights of shape (3, 1), bias (1, 1))
net.add(FCLayer(3, 1))
# Element-wise tanh on the single output
net.add(ActivationLayer(tanh, tanh_prime))

# Train for 1000 epochs with a learning rate of 0.1
net.fit(x_train, y_train, epochs=1000, learning_rate=0.1)

# Evaluate on the same four samples
out = net.predit(x_train)
print(out)

# NOTE: I don’t think I need to emphasize many things. Just be careful
# with the training data, you should always have the sample dimension first.
# For example here, the input shape is (4,1,2).

epoch 1/1000   error=0.379323
epoch 2/1000   error=0.310109
epoch 3/1000   error=0.298394
epoch 4/1000   error=0.294640
epoch 5/1000   error=0.292911
epoch 6/1000   error=0.291889
epoch 7/1000   error=0.291176
epoch 8/1000   error=0.290623
epoch 9/1000   error=0.290164
epoch 10/1000   error=0.289768
epoch 11/1000   error=0.289417
epoch 12/1000   error=0.289100
epoch 13/1000   error=0.288812
epoch 14/1000   error=0.288548
epoch 15/1000   error=0.288305
epoch 16/1000   error=0.288080
epoch 17/1000   error=0.287872
epoch 18/1000   error=0.287679
epoch 19/1000   error=0.287500
epoch 20/1000   error=0.287334
epoch 21/1000   error=0.287179
epoch 22/1000   error=0.287035
epoch 23/1000   error=0.286901
epoch 24/1000   error=0.286776
epoch 25/1000   error=0.286660
epoch 26/1000   error=0.286552
epoch 27/1000   error=0.286451
epoch 28/1000   error=0.286357
epoch 29/1000   error=0.286270
epoch 30/1000   error=0.286188
epoch 31/1000   error=0.286113
epoch 32/1000   error=0.286042
epoch 33/1000   e

## MNIST NN ##

**Note:**
We did not implement a convolutional layer, but this is not a problem: all we need to do is reshape the data so that it fits into a fully connected layer.

In [57]:
!pip install keras



In [None]:
# TODO: implement mini-batch gradient descent


In [66]:
import keras
import tensorflow as tf

# Load the MNIST train/test split
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Flatten every 28x28 image into a (1, 784) row vector, cast to float32 and
# divide by 255 so pixel values go from [0, 255] to [0, 1] (faster convergence)
x_train = x_train.reshape(x_train.shape[0], 1, 28*28).astype('float32') / 255
x_test = x_test.reshape(x_test.shape[0], 1, 28*28).astype('float32') / 255

# One-hot encode the labels: each digit becomes a binary vector,
# e.g. digit '3' becomes [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

# 784 -> 100 -> 50 -> 10 network, tanh after every fully connected layer,
# trained against the MSE loss
net = Network()
net.use(mse, mse_prime)
# Input layer: 784 values (flattened 28*28 image) -> 100 neurons
net.add(FCLayer(28*28, 100))
net.add(ActivationLayer(tanh, tanh_prime))
# Hidden layer: 100 -> 50 neurons
net.add(FCLayer(100, 50))
net.add(ActivationLayer(tanh, tanh_prime))
# Output layer: 50 -> 10 neurons, one per digit
net.add(FCLayer(50, 10))
net.add(ActivationLayer(tanh, tanh_prime))

# Mini-batch gradient descent is not implemented, so train on only the
# first 1000 samples
net.fit(x_train[0:1000], y_train[0:1000], epochs=35, learning_rate=0.1)

# Compare predictions with the true labels for the first 3 test samples
out = net.predit(x_test[:3])
print(f"\npredicted values:\n{out}\n")
print(f"true values:\n{y_test[:3]}\n")

epoch 1/35   error=0.236787
epoch 2/35   error=0.105192
epoch 3/35   error=0.081818
epoch 4/35   error=0.068472
epoch 5/35   error=0.059351
epoch 6/35   error=0.052295
epoch 7/35   error=0.046089
epoch 8/35   error=0.040928
epoch 9/35   error=0.036414
epoch 10/35   error=0.032838
epoch 11/35   error=0.029981
epoch 12/35   error=0.027767
epoch 13/35   error=0.025839
epoch 14/35   error=0.023879
epoch 15/35   error=0.022149
epoch 16/35   error=0.020705
epoch 17/35   error=0.019360
epoch 18/35   error=0.017992
epoch 19/35   error=0.016651
epoch 20/35   error=0.015557
epoch 21/35   error=0.014572
epoch 22/35   error=0.013660
epoch 23/35   error=0.012933
epoch 24/35   error=0.012216
epoch 25/35   error=0.011505
epoch 26/35   error=0.010914
epoch 27/35   error=0.010346
epoch 28/35   error=0.009857
epoch 29/35   error=0.009385
epoch 30/35   error=0.008924
epoch 31/35   error=0.008539
epoch 32/35   error=0.008212
epoch 33/35   error=0.007874
epoch 34/35   error=0.007584
epoch 35/35   error=0.0