In [1]:
%matplotlib inline

In [2]:
import numpy as np
from utils import sigmoid, relu

In [3]:
# Scratch check: draw a 5x4 array of samples from the standard normal
# distribution. No seed is set in the visible cells, so the values shown in
# the output below will differ on every run.
np.random.randn(5, 4)

array([[ 0.56500168, -0.25971907, -0.24950269, -1.66116313],
       [ 0.01429791, -0.47222707,  0.13103123,  0.91423348],
       [ 0.48889291,  0.67402905,  1.24669556,  0.44188237],
       [-1.39882227,  1.31102149, -0.83196813,  0.38986131],
       [ 0.42116466, -0.15580217,  0.99854994, -1.04392225]])

In [4]:
# forward with X and parameters
# compute cost with AL and Y
# backprop with AL and cached values
# adjust parameters

def backward(AL, Y, caches):
    """
    Back-propagate through [linear -> ReLU] * (L - 1) -> linear -> sigmoid.

    :param AL: output of the final (sigmoid) layer, as returned by forward()
    :param Y: labels; must broadcast element-wise against AL
    :param caches: list of (linear_cache, activation_cache) tuples in forward
                   order, i.e. caches[l - 1] belongs to layer l
    :return gradients: Python dictionary holding, for every layer l in 1..L,
                         - "dA" + str(l - 1): gradient w.r.t. the layer's input
                         - "dW" + str(l):     gradient w.r.t. the layer's weights
                         - "db" + str(l):     gradient w.r.t. the layer's biases
    :raises ValueError: if caches is empty
    """
    L = len(caches)  # number of layers
    if L == 0:
        raise ValueError("caches must contain at least one layer")

    gradients = {}

    # Derivative of the cross-entropy cost with respect to AL. AL is clipped
    # away from exactly 0 and 1 so that neither division can hit zero.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)
    dA_prev = -(np.divide(Y, AL_safe) - np.divide(1 - Y, 1 - AL_safe))

    # Walk the layers from the output back to the input. Only the last layer
    # used a sigmoid in the forward pass; every other layer used ReLU.
    for l in reversed(range(1, L + 1)):
        linear_cache, activation_cache = caches[l - 1]
        activation = "sigmoid" if l == L else "ReLU"
        dZ = backward_activation(dA_prev, activation_cache, activation)
        # backward_activity returns its results in the order (dW, dA, db).
        dW, dA_prev, db = backward_activity(dZ, linear_cache)
        gradients["dA" + str(l - 1)] = dA_prev
        gradients["dW" + str(l)] = dW
        gradients["db" + str(l)] = db
    return gradients
    

def backward_activity(dZ, linear_cache):
    """
    Backward pass for the linear step Z = W . A + b of a single layer.

    :param dZ: gradient of the cost with respect to Z; one column per sample
    :param linear_cache: tuple (A, W, b) in the order stored by forward_activity
    :return: tuple (dW, dA, db), in that order
               - dW: gradient w.r.t. W, same shape as W
               - dA: gradient w.r.t. the layer input A, same shape as A
               - db: gradient w.r.t. b, one row per unit, shape (units, 1)
    """
    # forward_activity caches (A, W, b); b itself is not needed for the gradients.
    A, W, _ = linear_cache
    m = A.shape[1]  # number of samples (columns of A)

    dW = np.dot(dZ, A.T) / m
    # Sum over the sample axis only, so each unit keeps its own bias gradient.
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA = np.dot(W.T, dZ)

    return dW, dA, db


def backward_activation(dA, activation_cache, activation="ReLU"):
    """
    Backward pass through a layer's activation function.

    :param dA: gradient of the cost with respect to the layer's activation
    :param activation_cache: value cached by the matching forward activation
    :param activation: "ReLU" or "sigmoid"
    :return dZ: gradient of the cost with respect to the pre-activation Z
    :raises ValueError: if activation is neither "ReLU" nor "sigmoid"
    """
    # NOTE(review): relu_backwards / sigmoid_backwards are not imported in the
    # visible cells (only sigmoid and relu come from utils) - confirm that
    # they are defined or imported before this function is called.
    if activation == "ReLU":
        return relu_backwards(dA, activation_cache)
    if activation == "sigmoid":
        return sigmoid_backwards(dA, activation_cache)
    raise ValueError("Unsupported activation: " + repr(activation))


def calc_cost(AL, Y):
    """
    Binary cross-entropy cost averaged over the samples.

    cost = -(1 / m) * sum(Y * log(AL) + (1 - Y) * log(1 - AL))

    :param AL: predicted probabilities, one column per sample
    :param Y: labels, same shape as AL; m is taken from Y.shape[1]
    :return cost: the cost with singleton dimensions squeezed away
    """
    m = Y.shape[1]

    # Keep predictions strictly inside (0, 1) so log() never sees 0.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)

    # Element-wise loss of every sample; both terms are summed together.
    losses = Y * np.log(AL_safe) + (1 - Y) * np.log(1 - AL_safe)
    cost = -np.sum(losses) / m

    cost = np.squeeze(cost)
    return cost


def forward(X, parameters):
    """
    Forward pass: layers 1..L-1 use ReLU, layer L uses sigmoid.

    L is taken as half the number of entries in parameters (one "W" and one
    "b" per layer).

    :param X: input data fed to the first layer
    :param parameters: dictionary containing "W1", "b1", ..., "WL", "bL"
    :return: tuple (AL, caches)
               - AL: activation of the final layer, used as the input for
                     back-propagation
               - caches: one (linear_cache, activation_cache) tuple per layer,
                         in forward order, saved for back-propagation
    """
    num_layers = len(parameters) // 2
    caches = []

    def run_layer(A_in, index, activation):
        # Linear step followed by the activation for layer `index`; the pair
        # of caches is recorded so the backward pass can reuse the values.
        W = parameters["W{}".format(index)]
        b = parameters["b{}".format(index)]
        Z, linear_cache = forward_activity(A_in, W, b)
        A_out, activation_cache = forward_activation(Z, W, b, activation)
        caches.append((linear_cache, activation_cache))
        return A_out

    activations = X
    for index in range(1, num_layers):
        activations = run_layer(activations, index, "ReLU")
    AL = run_layer(activations, num_layers, "sigmoid")

    return AL, caches


def forward_activation(Z, W, b, activation="ReLU"):
    """
    Apply a layer's activation function to its pre-activation Z.

    :param Z: pre-activation values of the layer
    :param W: not used here; kept so existing call sites keep working
    :param b: not used here; kept so existing call sites keep working
    :param activation: "ReLU" or "sigmoid"
    :return: tuple (A, activation_cache) as produced by relu() / sigmoid()
    :raises ValueError: if activation is neither "ReLU" nor "sigmoid"
    """
    if activation == "ReLU":
        A, activation_cache = relu(Z)
    elif activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError("Unsupported activation: " + repr(activation))
    return A, activation_cache


def forward_activity(A, W, b):
    """
    Linear step of one layer: Z = W . A + b.

    :param A: activations feeding into the layer (or the input data)
    :param W: weights of the layer
    :param b: biases of the layer, added to the product of W and A
    :return: tuple (Z, linear_cache) where linear_cache is (A, W, b)
    """
    pre_activation = np.dot(W, A) + b

    # Z must have one row per unit of this layer and one column per sample.
    expected_shape = (W.shape[0], A.shape[1])
    assert pre_activation.shape == expected_shape, "Not getting a proper dimension for Z (activity)"

    return pre_activation, (A, W, b)


def initialize_parameters(layers):
    """
    Build the starting weights and biases for each layer of the network.

    Weights are drawn with np.random.randn; biases start at zero.

    :param layers: a Python array where the index is the layer and the value is the number of units in that layer
    :return parameters: Python dictionary containing "W1", "b1", ..., "WL", "bL"
                         - Wl: weights for l with shape: (layers[l], layers[l - 1])
                         - bl: biases  for l with shape: (layers[l], 1)
    """
    parameters = {}

    # Index 0 is the input layer, so it gets no weights or biases of its own.
    for layer in range(1, len(layers)):
        units, units_prev = layers[layer], layers[layer - 1]
        parameters.update({
            "W{}".format(layer): np.random.randn(units, units_prev),
            "b{}".format(layer): np.zeros((units, 1)),
        })

    return parameters


def train(X, Y, layers, learning_rate, num_iterations=2500, print_every=1):
    """
    Train the network with plain gradient descent.

    Each iteration runs forward(), calc_cost(), backward() and
    update_parameters() once.

    :param X: input data passed to forward()
    :param Y: labels passed to calc_cost() and backward()
    :param layers: layer sizes passed to initialize_parameters()
    :param learning_rate: step size passed to update_parameters()
    :param num_iterations: number of gradient-descent steps
    :param print_every: print the cost every this many iterations; a falsy
                        value (0 or None) disables printing
    :return parameters: the parameters after the last update
    """
    parameters = initialize_parameters(layers)
    for i in range(num_iterations):
        AL, caches = forward(X, parameters)
        cost = calc_cost(AL, Y)
        if print_every and i % print_every == 0:
            print("Cost at iteration " + str(i) + ": " + str(cost))
        # backward() returns one dictionary holding every layer's gradients.
        gradients = backward(AL, Y, caches)
        # The same dictionary carries both the "dW{l}" and the "db{l}" entries.
        parameters = update_parameters(gradients, gradients, parameters, learning_rate)
    return parameters


def update_parameters(dW, db, parameters, learning_rate=0.001):
    """
    Apply one gradient-descent step to every layer's weights and biases.

    :param dW: dictionary with a "dW" + str(l) entry per layer, or a single
               array that is applied to every layer's weights
    :param db: dictionary with a "db" + str(l) entry per layer, or a single
               array that is applied to every layer's biases
    :param parameters: dictionary containing "W1", "b1", ..., "WL", "bL";
                       its entries are replaced in place
    :param learning_rate: step size of the update
    :return parameters: the same dictionary, after the update
    """
    L = len(parameters) // 2

    # range(1, L + 1) so that the output layer L is updated as well.
    for l in range(1, L + 1):
        layer_dW = dW["dW" + str(l)] if isinstance(dW, dict) else dW
        layer_db = db["db" + str(l)] if isinstance(db, dict) else db
        parameters["W" + str(l)] = parameters["W" + str(l)] - learning_rate * layer_dW
        parameters["b" + str(l)] = parameters["b" + str(l)] - learning_rate * layer_db
    return parameters