# Intro and Notes

Alright guys! This right here is my actual attempt at a neural network. I've actually planned this thing out properly this time, and am coming at it with way more planning. To start, here's the target architecture:
2 node in
2 node hidden
2 node out

Nice and simple.

As well, the dataset is super simple: The input is a 2 number input. The output is [difference of the 2 inputs, sum of 2 inputs]. This simple architecture should be able to learn it

In [8]:
# Using jax for autograd, so we're using its version of numpy as well as normal numpy
import jax
from jax import grad
import jax.numpy as jnp

import numpy as np

import random

# Generating Data

In [9]:
# Initially, both arrays are standard python arrays to make appending easy.
# We'll convert them to numpy arrays after we're done generating data
x_dat = []
y_dat = []

for i in range(1000):
  one = random.random() * 10 + 11
  two = random.random() * 10

  inDat = [one, two]
  outDat = [one - two, one + two]

  x_dat.append(inDat)
  y_dat.append(outDat)

# Converting both arrays to numpy
x_dat = jnp.array(x_dat)
y_dat = jnp.array(y_dat)

# Defining helper code

In [10]:
def sigmoid(x):
  return 1 / (1 + jnp.exp(-1 * x))

In [11]:
def mse(exp, hat):
  return (1 / len(exp)) * jnp.sum( (exp - hat) ** 2 )

In [12]:
LEARNRATE = 10

# Defining our model

In [13]:
class Neuron:
  # Num of inputs is effectively the number of nodes in the last layer;
  # this isn't universally true, but for dense layers it is
  def __init__(self, numOfInputs):
    # Generating random initial weights
    self.weights = jnp.array(np.random.rand(numOfInputs,))

    # Generating a random bias
    self.bias = random.random()

    # To make training easier, neurons can save the last input given to them.
    # Then, when backpropogating, the gradient calculation can happen
    # without the layer remembering the input. Makes the code a lot simpler
    self.savedInput = None
  
  # This just forward propogates an input
  def forward(self, inputs, save=False, useSave=False):
    # If the neuron is told to save the input, store it in the savedInput
    # variable. It can then be used for gradient calculation. Otherwise,
    # do nothing
    if save:
      self.savedInput = inputs
    
    # If the neuron is told to use the saved input from the last forward
    # propogation, it overwrites the input
    if useSave:
      inputs = self.savedInput

    inToActive = jnp.dot(inputs, self.weights)
    inToActive += self.bias

    activation = sigmoid(inToActive)
    return activation
  
  # This function calculates the gradient of this neuron's activation with respect
  # to activations in the last layer. Really useful for backpropogation
  def calcActivGrad(self):
    # Using jax to calculate gradient. Pretty awesome
    activ = lambda W, input, bias : sigmoid(jnp.dot(W, input) + bias)
    inputGradient = grad(activ, argnums=1)(self.weights, self.savedInput, self.bias)

    return inputGradient
  
  # This function takes in the derivative of the cost function with respect
  # to this neuron's activation. Then, it multiplies that with the gradient
  # of the activation with respect to the weights, and then multiplies by -1
  # and the learning rate. This is then added to the current weights. A similar
  # process is done for the bias, but that is not a vector obviously
  def learn(self, derivToCost):
    # Using jax to calculate gradient. Pretty awesome
    activ = lambda W, input, bias : sigmoid(jnp.dot(W, input) + bias)
    
    # Calculating both gradients for this neuron's activation
    W_grad = grad(activ, argnums=1)(self.weights, self.savedInput, self.bias)
    biasGrad = grad(activ, argnums=2)(self.weights, self.savedInput, self.bias)

    # Both need to multiplied by -1 to find the direction of descent rather
    # than ascent, then multiplied by cost derivative to find which way and 
    # by how much to move to optimize the cost function,
    # and then multiply by the learning rate to find
    # the actual change we want
    W_grad = W_grad * -1 * LEARNRATE * derivToCost
    biasGrad = biasGrad * -1 * LEARNRATE * derivToCost

    # Updating weights and biases
    self.weights = self.weights + W_grad
    self.bias = self.bias + biasGrad

class Layer:
  def __init__(self, numOfNodes, numOfInputs):
    self.neurons = []
    self.numOfInputs = numOfInputs
    # Creating all our neurons
    for i in range(numOfNodes):
      self.neurons.append(Neuron(numOfInputs))

  # Forward propogating our model
  def forward(self, input, save=False, useSave=False):
    result = []
    for neuron in self.neurons:
      result.append(neuron.forward(input, save, useSave))
    
    return jnp.array(result)
  
  # Backpropogating our model. This function takes in the gradient of the cost
  # function with respect to the current layers activations. This is used to
  # backpropogate again to the previous layer, and also to update this layer's
  # weights
  def backward(self, costGrade):
    # First, we want to calculate the cost gradient for the previous layers'
    # activations; this can be done by extracting the index indicating
    # the derivative with respect to each neuron from the input gradient,
    # and then multipying that with each neuron's activation gradient function.
    # Sum the result, and that's the gradient to backpropogate.
    # Next, simply pass each index into each neuron, and have it learn
    previousLayerCostGradient = []
    for i in range(len(costGrade)):
      activationToNeuronGrade = self.neurons[i].calcActivGrad()
      prevLayerCostGrade = activationToNeuronGrade * costGrade[i]
      previousLayerCostGradient.append(prevLayerCostGrade)

      # Now that we have all our gradient data, tell the neuron to learn
      self.neurons[i].learn(costGrade[i])
    
    previousLayerCostGradient = jnp.array(previousLayerCostGradient)

    # Currently, the array stores a bunch of gradients, so sum them up for one
    # final vector
    previousLayerCostGradient = jnp.sum(previousLayerCostGradient, axis=0)

    return previousLayerCostGradient

class Model:
  def __init__(self, layers):
    self.layers = layers
  
  def forward(self, input):
    layerOut = self.layers[0].forward(input)
    for i in range(1, len(self.layers)):
      layerOut = self.layers[i].forward(layerOut)
    return layerOut
  
  def train(self, input, output):
    # Calculating the output of our model
    layerOut = self.layers[0].forward(input, save=True)
    for i in range(1, len(self.layers)):
      layerOut = self.layers[i].forward(layerOut, save=True)
    
    # Calculating the gradient of our cost with respect to the model output
    # i.e. the last layer's activations
    activToCost = lambda activs, output: mse(activs, output)

    # Storing the gradient of our cost with respect to the output layer's activs
    # here. This will be updated with the gradient for other layers as we
    # backpropogate
    activGrad = grad(activToCost, argnums=0)(layerOut, output)

    # Backpropping through our model
    for i in range(len(self.layers) - 1, 1, -1):
      activGrad = self.layers[i].backward(activGrad)

In [14]:
myLayerStack = [Layer(4, 4), Layer(2, 4), Layer(4, 2)]
myModel = Model(myLayerStack)
input = jnp.array([1.0, 2.0, 3.0, 4.0])
output = jnp.array([0.0, 0.0, 1.0, 1.0])
print(myModel.forward(input))
for i in range(1000):
  myModel.train(input, output)
  print(myModel.forward(input))

[0.8361708  0.89165866 0.69467735 0.689644  ]
[0.5804007  0.7236806  0.76962805 0.8012852 ]
[0.31675696 0.39079243 0.8116576  0.84811723]
[0.23749945 0.26296884 0.8385585  0.87408715]
[0.19715892 0.21065065 0.8574116  0.8909108 ]
[0.17177604 0.18029389 0.87146497 0.90285313]
[0.1539678  0.15989418 0.88241136 0.9118523 ]
[0.14060917 0.14499672 0.8912209  0.91892487]
[0.13012224 0.13351405 0.89849204 0.9246596 ]
[0.12161375 0.1243207  0.90461475 0.9294226 ]
[0.11453543 0.1167494  0.90985495 0.9334551 ]
[0.10852997 0.11037625 0.91440094 0.9369225 ]
[0.10335332 0.10491744 0.9183895  0.93994284]
[0.09883241 0.10017494 0.9219231  0.9426024 ]
[0.09484072 0.09600579 0.92507994 0.9449663 ]
[0.09128329 0.09230393 0.92792076 0.9470843 ]
[0.08808738 0.08898875 0.93049353 0.9489954 ]
[0.08519614 0.08599786 0.93283695 0.9507304 ]
[0.08256447 0.08328196 0.9349822  0.9523143 ]
[0.08015604 0.08080174 0.93695503 0.95376724]
[0.07794129 0.07852521 0.9387767  0.955106  ]
[0.07589575 0.07642614 0.94046515 

KeyboardInterrupt: ignored