## Goal:
* To understand how deep learning works on a deep level
* To understand how to make code more efficient using vectorization
* To implement a neural network from scratch and understand all the math along the way

chain rule: $\frac{d}{dx} f(g(x)) = f'(g(x)) \cdot g'(x)$

In [1]:
# Number of units in each layer; index 0 is the input layer.
n = [2, 3, 3, 1]
print("layer 0 / input layer size", n[0])
# Remaining layers share one message format, so build their lines together.
print("\n".join(f"layer {i} size {n[i]}" for i in range(1, len(n))))

layer 0 / input layer size 2
layer 1 size 3
layer 2 size 3
layer 3 size 1


In [2]:
import numpy as np

In [3]:
# Weight matrix of layer l has shape (units in layer l, units in layer l-1).
# Draw order (W1, W2, W3, then b1, b2, b3) is kept so seeded runs match.
W1, W2, W3 = (np.random.randn(n[layer], n[layer - 1]) for layer in (1, 2, 3))
# One bias per unit, stored as a column vector so it broadcasts over examples.
b1, b2, b3 = (np.random.randn(n[layer], 1) for layer in (1, 2, 3))

In [4]:
# Report the shape of every parameter, weights first and then biases.
print("\n".join(
    f"Weights for layer {i} shape: {w.shape}"
    for i, w in enumerate((W1, W2, W3), start=1)
))
print("\n".join(
    f"bias for layer {i} shape: {b.shape}"
    for i, b in enumerate((b1, b2, b3), start=1)
))

Weights for layer 1 shape: (3, 2)
Weights for layer 2 shape: (3, 3)
Weights for layer 3 shape: (1, 3)
bias for layer 1 shape: (3, 1)
bias for layer 2 shape: (3, 1)
bias for layer 3 shape: (1, 1)


In [5]:
# Inspect the randomly initialised layer-1 weight matrix.
W1

array([[-0.06262043, -1.48846792],
       [ 0.61068388, -0.9170953 ],
       [ 0.41309267,  1.24040929]])

In [6]:
# Training inputs: one row per example, two feature columns.
X = np.array([
    [150, 70], [254, 73], [312, 68], [120, 60], [154, 61],
    [212, 65], [216, 67], [145, 67], [184, 64], [130, 69],
])

print(X.shape)

(10, 2)


In [7]:
# Transpose so that each column is one example: shape (features, examples).
A0 = np.transpose(X)
print(A0.shape)

(2, 10)


In [8]:
# Binary labels, one per training example.
y = np.array([0, 1, 1, 0, 0, 1, 1, 0, 1, 0])
m = 10  # number of training examples

# Lay the labels out as (output units, examples) to match the network output.
Y = y.reshape(n[3], m)
Y.shape

(1, 10)

In [9]:
def sigmoid(arr):
    """Element-wise logistic sigmoid, 1 / (1 + exp(-arr)).

    Evaluated in a numerically stable way: the exponential is only ever
    taken of a non-positive number, so inputs with a large magnitude cannot
    overflow ``np.exp`` (the naive form overflows for very negative inputs).

    Args:
        arr: scalar or array-like of pre-activation values.

    Returns:
        NumPy array with the same shape as ``arr`` and values in [0, 1].
    """
    arr = np.asarray(arr)
    decay = np.exp(-np.abs(arr))  # always within [0, 1], never overflows
    # For arr >= 0 use 1 / (1 + e^-arr); for arr < 0 use the algebraically
    # equal e^arr / (1 + e^arr). Both are expressed through `decay`.
    return np.where(arr >= 0, 1 / (1 + decay), decay / (1 + decay))

In [10]:
# Sanity check: very negative inputs map close to 0, very positive ones close to 1.
sigmoid(np.array([1, 2, 3, -40, 100]))

array([7.31058579e-01, 8.80797078e-01, 9.52574127e-01, 4.24835426e-18,
       1.00000000e+00])

In [11]:
m = 10  # number of training examples (columns of A0)

# layer 1: affine transform of the inputs followed by the sigmoid
Z1 = np.matmul(W1, A0) + b1  # b1 broadcasts across the m columns
assert Z1.shape == (n[1], m)  # one column of pre-activations per example
A1 = sigmoid(Z1)

# layer 2: same pattern, fed by the layer-1 activations
Z2 = np.matmul(W2, A1) + b2
assert Z2.shape == (n[2], m)
A2 = sigmoid(Z2)

# layer 3: output layer
Z3 = np.matmul(W3, A2) + b3
assert Z3.shape == (n[3], m)
A3 = sigmoid(Z3)

In [12]:
# The activation of the last layer is the network's prediction.
y_hat = A3
print(y_hat.shape)
print(y_hat)

(1, 10)
[[0.90827825 0.90827825 0.90827825 0.90827825 0.90827825 0.90827825
  0.90827825 0.90827825 0.90827825 0.90827825]]


### organized version:

In [15]:
import numpy as np

# 1. network architecture
L = 3  # index of the output layer
n = [2, 3, 3, 1]  # units per layer, input layer first

# 2. weights and biases
# W_l has shape (n[l], n[l-1]); b_l is a column vector with n[l] entries.
# Draw order (all weights, then all biases) is kept so seeded runs match.
W1, W2, W3 = (np.random.randn(n[layer], n[layer - 1]) for layer in (1, 2, 3))
b1, b2, b3 = (np.random.randn(n[layer], 1) for layer in (1, 2, 3))

# 3. training data and labels
def prepare_data():
    """Build the toy training set.

    Returns:
        A0: inputs transposed to shape (features, examples).
        Y: labels reshaped to (n[L], m), using the module-level ``n`` and ``L``.
    """
    features = np.array([
        [150, 70], [254, 73], [312, 68], [120, 60], [154, 61],
        [212, 65], [216, 67], [145, 67], [184, 64], [130, 69],
    ])
    labels = np.array([0, 1, 1, 0, 0, 1, 1, 0, 1, 0])
    m = 10  # number of training examples

    return features.T, labels.reshape(n[L], m)

# 4. activation function
def sigmoid(arr):
    """Element-wise logistic sigmoid, 1 / (1 + exp(-arr)).

    Evaluated in a numerically stable way: the exponential is only ever
    taken of a non-positive number, so inputs with a large magnitude cannot
    overflow ``np.exp`` (the naive form overflows for very negative inputs).

    Args:
        arr: scalar or array-like of pre-activation values.

    Returns:
        NumPy array with the same shape as ``arr`` and values in [0, 1].
    """
    arr = np.asarray(arr)
    decay = np.exp(-np.abs(arr))  # always within [0, 1], never overflows
    # For arr >= 0 use 1 / (1 + e^-arr); for arr < 0 use the algebraically
    # equal e^arr / (1 + e^arr). Both are expressed through `decay`.
    return np.where(arr >= 0, 1 / (1 + decay), decay / (1 + decay))

def feed_forward(A0):
    """Run one forward pass through the three-layer network.

    Uses the module-level weights ``W1..W3`` and biases ``b1..b3``.

    Args:
        A0: input activations, one column per example.

    Returns:
        The layer-3 activations, i.e. the predictions ``y_hat``.
    """
    # Each layer: affine transform of the previous activations, then sigmoid.
    A1 = sigmoid(W1 @ A0 + b1)
    A2 = sigmoid(W2 @ A1 + b2)
    A3 = sigmoid(W3 @ A2 + b3)
    return A3

# Build the dataset, run one forward pass with the random weights, and show the predictions.
A0, Y = prepare_data()
y_hat = feed_forward(A0)
print(y_hat)

[[0.11668854 0.11668854 0.11668854 0.11668854 0.11668854 0.11668854
  0.11668854 0.11668854 0.11668854 0.11668854]]


In [20]:
""" Cost function examples:
    Mean squared error
    Root mean squared error
    Mean absolute error....
"""
# This notebook uses the binary cross-entropy loss instead, since the labels are 0/1.

hey


In [21]:
def cost(y_hat, y):
    """Binary cross-entropy cost averaged over the training examples.

    Args:
        y_hat: predictions, shape (output units, m), values in [0, 1].
        y: labels of the same shape.

    Returns:
        Scalar cost: the per-unit mean loss over the m examples, summed
        across output units.
    """
    # Keep predictions strictly inside (0, 1): a saturated sigmoid can emit
    # exactly 0 or 1, and log(0) would turn the cost into inf/NaN.
    eps = 1e-15
    y_hat = np.clip(np.asarray(y_hat, dtype=float), eps, 1 - eps)

    losses = -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

    # m is the number of examples (columns), not the total element count;
    # otherwise the mean is wrong as soon as there is more than one output unit.
    m = y_hat.shape[1]

    summed_losses = (1 / m) * np.sum(losses, axis=1)

    # unnecessary for a single output unit, but needed with several
    return np.sum(summed_losses)

In [22]:
# Cost of the untrained network's predictions against the labels.
cost(y_hat, Y)

1.1361621764786987

In [23]:
import numpy as np

L = 3  # index of the output layer
n = [2, 3, 3, 1]  # units per layer, input layer first
# W_l has shape (n[l], n[l-1]); b_l is a column vector with n[l] entries.
# Draw order (all weights, then all biases) is kept so seeded runs match.
W1, W2, W3 = (np.random.randn(n[layer], n[layer - 1]) for layer in (1, 2, 3))
b1, b2, b3 = (np.random.randn(n[layer], 1) for layer in (1, 2, 3))

def prepare_data():
  """Build the toy training set.

  Returns:
    A0: inputs transposed to shape (features, examples).
    Y: labels reshaped to (n[L], m), using the module-level ``n`` and ``L``.
    m: number of training examples.
  """
  features = np.array([
      [150, 70], [254, 73], [312, 68], [120, 60], [154, 61],
      [212, 65], [216, 67], [145, 67], [184, 64], [130, 69],
  ])
  labels = np.array([0, 1, 1, 0, 0, 1, 1, 0, 1, 0])
  m = 10

  return features.T, labels.reshape(n[L], m), m

def cost(y_hat, y):
  """
  Binary cross-entropy cost averaged over the training examples.

  y_hat should be a n^L x m matrix
  y should be a n^L x m matrix

  Returns a scalar: the per-unit mean loss over the m examples, summed
  across the n^L output units.
  """
  # Keep predictions strictly inside (0, 1): a saturated sigmoid can emit
  # exactly 0 or 1, and log(0) would turn the cost into inf/NaN.
  eps = 1e-15
  y_hat = np.clip(np.asarray(y_hat, dtype=float), eps, 1 - eps)

  # 1. losses is a n^L x m
  losses = -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

  # m is the number of examples (columns), not the total element count;
  # otherwise the mean is wrong as soon as n^L > 1.
  m = y_hat.shape[1]

  # 2. summing across axis = 1 means we sum across rows,
  #   giving one mean loss per output unit
  summed_losses = (1 / m) * np.sum(losses, axis=1)

  # 3. unnecessary for a single output unit, but needed with several
  return np.sum(summed_losses)

def g(z):
  """Sigmoid activation, 1 / (1 + exp(-z)), applied element-wise.

  Evaluated in a numerically stable way: the exponential is only ever
  taken of a non-positive number, so inputs with a large magnitude cannot
  overflow ``np.exp`` (the naive form overflows for very negative inputs).

  Returns a NumPy array with the same shape as ``z`` and values in [0, 1].
  """
  z = np.asarray(z)
  decay = np.exp(-np.abs(z))  # always within [0, 1], never overflows
  # For z >= 0 use 1 / (1 + e^-z); for z < 0 use the equal e^z / (1 + e^z).
  return np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

def feed_forward(A0):
  """Run one forward pass and keep the activations backprop will need.

  Uses the module-level weights ``W1..W3`` and biases ``b1..b3``.

  Returns:
    A3: output-layer activations (the predictions).
    cache: dict holding the activations "A0", "A1" and "A2".
  """
  # Each layer: affine transform of the previous activations, then g.
  A1 = g(W1 @ A0 + b1)
  A2 = g(W2 @ A1 + b2)
  A3 = g(W3 @ A2 + b3)

  return A3, {"A0": A0, "A1": A1, "A2": A2}

In [24]:
# Load the inputs (A0), labels (Y) and the number of training examples (m).
A0, Y, m = prepare_data()

In [25]:
def backprop_layer_3(y_hat, Y, m, A2, W3):
  """Backpropagate the cost through the output layer (layer 3).

  Shape checks are derived from the arguments themselves rather than from a
  module-level layer-size list, so the function works for any layer sizes.

  Args:
    y_hat: network output A3, shape (n3, m).
    Y: labels, shape (n3, m).
    m: number of training examples.
    A2: layer-2 activations, shape (n2, m).
    W3: layer-3 weights, shape (n3, n2).

  Returns:
    dC_dW3: gradient w.r.t. W3, shape (n3, n2).
    dC_db3: gradient w.r.t. b3, shape (n3, 1).
    dC_dA2: gradient w.r.t. A2, passed on to layer 2, shape (n2, m).
  """
  A3 = y_hat
  n3, n2 = W3.shape

  # step 1. dC/dZ3: for a sigmoid output with binary cross-entropy averaged
  #   over m examples this collapses to (1/m) * (A3 - Y)
  dC_dZ3 = (1/m) * (A3 - Y)
  assert dC_dZ3.shape == (n3, m)

  # step 2. dC/dW3 = dC/dZ3 * dZ3/dW3, where dZ3/dW3 = A2
  #   matrix-multiplying by A2^T also sums the contributions of all examples
  assert A2.shape == (n2, m)
  dC_dW3 = dC_dZ3 @ A2.T
  assert dC_dW3.shape == (n3, n2)

  # step 3. dC/db3: dZ3/db3 = 1, so sum dC/dZ3 over the examples
  dC_db3 = np.sum(dC_dZ3, axis=1, keepdims=True)
  assert dC_db3.shape == (n3, 1)

  # step 4. propagator dC/dA2 = dC/dZ3 * dZ3/dA2, where dZ3/dA2 = W3
  dC_dA2 = W3.T @ dC_dZ3
  assert dC_dA2.shape == (n2, m)

  return dC_dW3, dC_db3, dC_dA2

In [26]:
# Forward pass first: backprop needs the predictions and the cached A2.
y_hat, cache = feed_forward(A0)
A2_cached = cache["A2"]
dC_dW3, dC_db3, dC_dA2 = backprop_layer_3(y_hat, Y, m, A2=A2_cached, W3=W3)
del A2_cached  # temporary only; keep the notebook namespace unchanged

In [27]:
def backprop_layer_2(propagator_dC_dA2, A1, A2, W2):
  """Backpropagate the cost through hidden layer 2.

  Shape checks are derived from the arguments themselves, so the function
  does not depend on module-level ``n`` or ``m``.

  Args:
    propagator_dC_dA2: dC/dA2 handed down from layer 3, shape (n2, m).
    A1: layer-1 activations, shape (n1, m).
    A2: layer-2 activations, shape (n2, m).
    W2: layer-2 weights, shape (n2, n1).

  Returns:
    dC_dW2: gradient w.r.t. W2, shape (n2, n1).
    dC_db2: gradient w.r.t. b2, shape (n2, 1).
    dC_dA1: gradient w.r.t. A1, passed on to layer 1, shape (n1, m).
  """
  n2, n1 = W2.shape
  m = A2.shape[1]

  # step 1. calculate dC/dZ2 = dC/dA2 * dA2/dZ2
  #   sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)),
  #   and since a = sigmoid(z), sigmoid'(z) = a * (1 - a)
  dA2_dZ2 = A2 * (1 - A2)
  dC_dZ2 = propagator_dC_dA2 * dA2_dZ2
  assert dC_dZ2.shape == (n2, m)

  # step 2. calculate dC/dW2 = dC/dZ2 * dZ2/dW2, where dZ2/dW2 = A1
  assert A1.shape == (n1, m)
  dC_dW2 = dC_dZ2 @ A1.T
  assert dC_dW2.shape == (n2, n1)

  # step 3. calculate dC/db2 by summing dC/dZ2 over the examples.
  #   It must be dC/dZ2 here: summing dC/dW2 has the right shape but is
  #   not the bias gradient.
  dC_db2 = np.sum(dC_dZ2, axis=1, keepdims=True)
  assert dC_db2.shape == (n2, 1)

  # step 4. calculate propagator dC/dA1 = dC/dZ2 * dZ2/dA1, where dZ2/dA1 = W2
  dC_dA1 = W2.T @ dC_dZ2
  assert dC_dA1.shape == (n1, m)  # same shape as A1

  return dC_dW2, dC_db2, dC_dA1

def backprop_layer_1(propagator_dC_dA1, A1, A0, W1):
  """Backpropagate the cost through hidden layer 1.

  Shape checks are derived from the arguments themselves, so the function
  does not depend on module-level ``n`` or ``m``.

  Args:
    propagator_dC_dA1: dC/dA1 handed down from layer 2, shape (n1, m).
    A1: layer-1 activations, shape (n1, m).
    A0: network inputs, shape (n0, m).
    W1: layer-1 weights, shape (n1, n0); used here for shape checks only.

  Returns:
    dC_dW1: gradient w.r.t. W1, shape (n1, n0).
    dC_db1: gradient w.r.t. b1, shape (n1, 1).
  """
  n1, n0 = W1.shape
  m = A0.shape[1]

  # step 1. calculate dC/dZ1 = dC/dA1 * dA1/dZ1
  #   sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)),
  #   and since a = sigmoid(z), sigmoid'(z) = a * (1 - a)
  dA1_dZ1 = A1 * (1 - A1)
  dC_dZ1 = propagator_dC_dA1 * dA1_dZ1
  assert dC_dZ1.shape == (n1, m)

  # step 2. calculate dC/dW1 = dC/dZ1 * dZ1/dW1, where dZ1/dW1 = A0
  assert A0.shape == (n0, m)
  dC_dW1 = dC_dZ1 @ A0.T
  assert dC_dW1.shape == (n1, n0)

  # step 3. calculate dC/db1 by summing dC/dZ1 over the examples.
  #   It must be dC/dZ1 here: summing dC/dW1 has the right shape but is
  #   not the bias gradient.
  dC_db1 = np.sum(dC_dZ1, axis=1, keepdims=True)
  assert dC_db1.shape == (n1, 1)

  return dC_dW1, dC_db1