Imports

In [1]:
# Imports kept to a bare minimum
import torch

MLP Layers_Dense Class

In [2]:
class Layers_Dense:
  """One fully connected layer of a multi-layer perceptron.

  The layer only computes the affine map ``inputs @ weights + biases``; no
  activation function is applied, so the caller decides which one (if any)
  to use. Gradients are computed by hand in ``backwards`` and stored on the
  instance as ``dweights``, ``dbiases`` and ``dinputs``.
  """

  def __init__(self,n_inputs,n_neurons):
    """Create randomly initialised parameters.

    Args:
      n_inputs: number of features coming from the previous layer.
      n_neurons: number of units (outputs) in this layer.
    """
    # Shape (n_inputs, n_neurons) so a (batch, n_inputs) input can be right-multiplied by it
    self.weights = torch.rand(n_inputs, n_neurons)
    # A single row of shape (1, n_neurons); it broadcasts over the batch when added
    self.biases = torch.rand(1, n_neurons)

  def forward(self,inputs):
    """Compute and return the pre-activation outputs for ``inputs``.

    ``inputs`` is expected to be 2-D, (batch, n_inputs), because ``backwards``
    transposes it. Both the inputs and the result are cached on the instance.
    """
    # The inputs are kept because the weight gradient needs them later
    self.inputs = inputs
    # One matrix product handles the whole batch at once
    self.outputs = self.inputs @ self.weights + self.biases
    return self.outputs

  def backwards(self, dvalues):
    """Store the gradients of the loss given ``dvalues`` (dL/dZ for this layer).

    Must be called after ``forward``. Nothing is returned; the results are
    left in ``self.dweights``, ``self.dbiases`` and ``self.dinputs``.
    """
    # dL/dW = dZ/dW * dL/dZ, i.e. the cached inputs (transposed) times dvalues
    self.dweights = self.inputs.T @ dvalues
    # dZ/db is 1, so dL/db is dvalues summed over the batch dimension
    self.dbiases = dvalues.sum(dim=0, keepdim=True)
    # dL/d(inputs) = dL/dZ * dZ/d(inputs); this is what the previous layer receives
    self.dinputs = dvalues @ self.weights.T

Activation Functions

In [3]:
def relu(inputs):
    """ReLU activation: keep values that are >= 0 and replace negative values with 0."""
    # Clamping from below at zero is exactly max(inputs, 0), applied element-wise
    return torch.clamp(inputs, min=0.0)

def relu_deriv(inputs):
    """Derivative of ReLU: 1.0 where the input is > 0, and 0.0 where it is <= 0."""
    positive_mask = torch.gt(inputs, 0)
    # Convert the boolean mask to float32 so it can be multiplied into gradients
    return positive_mask.to(torch.float32)

def sigmoid(inputs):
  """Sigmoid activation, 1 / (1 + e^(-x)), applied element-wise.

  Used for the final layer: it is a simple nonlinear function that squashes
  values into the range between 0 and 1.
  """
  decay = torch.exp(-inputs)
  return 1/(1+decay)

def sigmoid_deriv(inputs):
  """Derivative of sigmoid at ``inputs``: sigmoid(x) * (1 - sigmoid(x))."""
  # Evaluate the sigmoid once and reuse it for both factors
  activation = sigmoid(inputs)
  return activation*(1-activation)

Toy Dataset

In [4]:
# For the sake of keeping things concise, this example will train the model on a simple problem
# We have 3 bits A B and C. When C is 1, A and B behave like an OR gate. When C is 0, they behave like an AND gate

# Training inputs: every combination of the three bits, one example per row, columns are (A, B, C)
X = torch.tensor([
    [0., 0., 0.],
    [0., 0., 1.],
    [1., 0., 0.],
    [1., 0., 1.],
    [0., 1., 0.],
    [0., 1., 1.],
    [1., 1., 0.],
    [1., 1., 1.],
])
# Targets (ground-truth labels): the expected output for each row of X, in the same order
y = torch.tensor([
    [0.],  # A=0, B=0, C=0 -> AND: 0 AND 0 = 0
    [0.],  # A=0, B=0, C=1 -> OR:  0 OR 0 = 0
    [0.],  # A=1, B=0, C=0 -> AND: 1 AND 0 = 0
    [1.],  # A=1, B=0, C=1 -> OR:  1 OR 0 = 1
    [0.],  # A=0, B=1, C=0 -> AND: 0 AND 1 = 0
    [1.],  # A=0, B=1, C=1 -> OR:  0 OR 1 = 1
    [1.],  # A=1, B=1, C=0 -> AND: 1 AND 1 = 1
    [1.],  # A=1, B=1, C=1 -> OR:  1 OR 1 = 1
])


Training Loop

In [None]:
# The layers live in plain variables rather than a container class, for illustrative purposes.
# Each (inputs, neurons) pair feeds the next, giving a 3 -> 3 -> 2 -> 2 -> 1 network.
layer1, layer2, layer3, layer4 = [
    Layers_Dense(n_in, n_out)
    for n_in, n_out in ((3, 3), (3, 2), (2, 2), (2, 1))
]

# Loss function
# We use binary cross-entropy because the network makes a two-class (0 or 1) prediction. For more than two classes, use categorical cross-entropy

def binary_cross_entropy_loss(y_true, y_pred):
    """Element-wise binary cross-entropy between targets and predicted probabilities.

    Args:
        y_true: tensor of target values (0 or 1 in this notebook).
        y_pred: tensor of predicted probabilities.

    Returns:
        A tensor of per-element losses; no averaging is done here.
    """
    epsilon = 1e-7
    # Keep probabilities strictly inside (0, 1) so neither log below sees 0
    clipped = y_pred.clamp(min=epsilon, max=1 - epsilon)
    positive_term = y_true * clipped.log()
    negative_term = (1 - y_true) * (1 - clipped).log()
    return -(positive_term + negative_term)

def bce_deriv(y_true, y_pred):
    """Derivative of binary cross-entropy with respect to the prediction (dL/dA).

    Computes (y_pred - y_true) / (y_pred * (1 - y_pred)) element-wise, after
    clipping y_pred the same way the loss does.
    """
    epsilon = 1e-7
    # Clipping keeps the denominator away from zero
    clipped = y_pred.clamp(min=epsilon, max=1 - epsilon)
    numerator = clipped - y_true
    denominator = clipped * (1 - clipped)
    return numerator / denominator

# An epoch is one full pass in which the model trains on every example
# Note: You usually won't need 100 epochs
# The learning rate never changes, so it is set once here instead of on every step
learningRate = 0.05
for j in range(0,100):
  epoch_loss = []
  # Visit every row of X rather than hard-coding the number of examples
  for example in range(len(X)):
    # Forward pass for layer1; unsqueeze(0) turns the single example into a batch of one
    logits1 = layer1.forward(X[example].unsqueeze(0))
    outputs1 = relu(logits1)

    # Forward pass for layer2
    logits2 = layer2.forward(outputs1)
    outputs2 = relu(logits2)

    # Forward pass for layer3
    logits3 = layer3.forward(outputs2)
    outputs3 = relu(logits3)

    # Forward pass for layer4 (the output layer); its logits are kept for the sigmoid derivative
    logits4 = layer4.forward(outputs3)
    outputs4 = sigmoid(logits4)

    # Loss for this example, recorded so the epoch average can be printed
    y_true_single = y[example].unsqueeze(0)
    loss = binary_cross_entropy_loss(y_true_single,outputs4)
    epoch_loss.append(loss.item())

    # Backpropagation starts from dL/dA of the BCE loss
    dloss = bce_deriv(y_true_single,outputs4) # dL/dA
    dactivation4 = sigmoid_deriv(logits4) # dA/dZ
    layer4.backwards(dloss*dactivation4) # passing in dL/dZ
    # Each earlier layer receives the next layer's dinputs (dL/dA) times its own ReLU derivative (dA/dZ)
    dactivation3 = relu_deriv(logits3)
    layer3.backwards(layer4.dinputs*dactivation3)
    dactivation2 = relu_deriv(logits2)
    layer2.backwards(layer3.dinputs*dactivation2)
    dactivation1 = relu_deriv(logits1)
    layer1.backwards(layer2.dinputs*dactivation1)

    # Gradient-descent step. All gradients were computed above with the old weights,
    # so the order in which the layers are updated does not matter
    for layer in (layer1, layer2, layer3, layer4):
      layer.weights -= learningRate * layer.dweights
      layer.biases -= learningRate * layer.dbiases
  print(f"Epoch: {j} Loss: {torch.mean(torch.tensor(epoch_loss))}")

Testing

In [7]:
def modelforward(layer1,layer2,layer3,layer4,X):
  """Run X through the four layers and return the final sigmoid outputs.

  The first three layers are followed by ReLU and the last by sigmoid. Each
  layer's ``forward`` call also caches its inputs and outputs on that layer.
  """
  activations = X
  # The three hidden layers all use the same layer -> ReLU pattern
  for hidden in (layer1, layer2, layer3):
    activations = relu(hidden.forward(activations))
  return sigmoid(layer4.forward(activations))
# Example test output. X holds every possible input, so this prints the model's prediction for each case
print(torch.unsqueeze(modelforward(layer1,layer2,layer3,layer4,X), 0))
# MLPs are used as building blocks in larger ML architectures (GNNs, SNNs, Transformers, etc.). Once you understand how they work, you can move on to the larger models in the repo.
# The code I wrote for the other models is more heavily abstracted (using PyTorch's autograd and other methods).

tensor([[[0.0263],
         [0.0171],
         [0.0163],
         [0.9965],
         [0.0192],
         [0.9971],
         [0.9969],
         [1.0000]]])
