In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def initialize_parameters(layer_dims):
    """
    Objective:
    Build the starting weights and biases for every layer of the network

    Arguments:
    layer_dims -- List of layer sizes; entry 0 is the size of the input layer

    Returns:
    parameters -- Dictionary with keys "W1", "b1", "W2", "b2", ..., one pair per layer after the input
        Wl -- Array of shape (layer_dims[l], layer_dims[l-1]), drawn with np.random.randn and scaled by 0.01
        bl -- Array of zeros of shape (layer_dims[l], 1)
    """
    parameters = {}

    # Pair every layer size with the size of the layer feeding into it; numbering starts at 1
    size_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for index, (units_in, units_out) in enumerate(size_pairs, start=1):
        parameters["W{}".format(index)] = np.random.randn(units_out, units_in) * 0.01
        parameters["b{}".format(index)] = np.zeros((units_out, 1))

    return parameters

In [3]:
def forward_linear(A_prev, W, b):
    """
    Objective:
    Compute the affine (pre-activation) step of a single layer

    Arguments:
    A_prev -- Activations from previous layer of shape (number of units of previous layer, sample size)
    W -- Weights matrix of shape (number of units of current layer, number of units of previous layer)
    b -- Bias vector of shape (number of units of current layer, 1)

    Returns:
    Z -- Pre-activation value of the layer, W @ A_prev + b
    cache -- Tuple (A_prev, W, b) holding the inputs of this step for the backward pass
    """
    cache = (A_prev, W, b)

    weighted_inputs = W @ A_prev
    Z = weighted_inputs + b  # b is added to every sample column

    return Z, cache

In [4]:
def forward_activation(A_prev, W, b, activation_function):
    """
    Objective:
    Run one full layer forward: the linear step followed by the chosen non-linearity

    Arguments:
    A_prev -- Activations from previous layer of shape (number of units of previous layer, sample size)
    W -- Weights matrix of shape (number of units of current layer, number of units of previous layer)
    b -- Bias vector of shape (number of units of current layer, 1)
    activation_function -- Name of the non-linearity, given as a string.
                           "relu", "sigmoid" and "tanh" are recognised.
                           Any other value prints a warning and the linear output is passed through unchanged

    Returns:
    A -- Activation value of the layer
    cache -- Tuple (linear_cache, activation_cache) kept for the backward pass.
             linear_cache is the cache returned by forward_linear, activation_cache is Z
    """
    nonlinearities = (
        ("relu", lambda z: z * (z > 0)),
        ("sigmoid", lambda z: 1 / (1 + np.exp(-z))),
        ("tanh", np.tanh),
    )
    # First entry whose name equals the requested one, or None when nothing matches
    activate = next((fn for name, fn in nonlinearities if activation_function == name), None)

    if activate is None:
        print("***Warning: Error in activation_function input. Relaying a linear output***")
        activate = lambda z: z

    Z, linear_cache = forward_linear(A_prev, W, b)
    A = activate(Z)

    return A, (linear_cache, Z)

In [5]:
def forward_prop(X, parameters, activation_function_layer, activation_function_output):
    """
    Objective:
    Push the input through every layer of the network

    Arguments:
    X -- Data matrix of shape (number of features, sample size)
    parameters -- Output of initialize_parameters(layer_dims)
    activation_function_layer -- Activation used by every layer except the last one.
                                 "relu", "sigmoid" and "tanh" are recognised.
                                 Any other value triggers a warning and a linear pass-through
    activation_function_output -- Activation used by the last layer.
                                  "relu", "sigmoid" and "tanh" are recognised.
                                  Any other value triggers a warning and a linear pass-through

    Returns:
    AL -- Activation value of the last layer
    caches -- List with one cache per layer, in layer order (index 0 is layer 1),
              each as returned by forward_activation(A_prev, W, b, activation_function)
    """
    depth = len(parameters) // 2  # parameters holds one W and one b per layer

    def run_layer(inputs, index, kind):
        # Look up the weights and bias of layer `index` and apply that layer to `inputs`
        weights = parameters["W" + str(index)]
        bias = parameters["b" + str(index)]
        return forward_activation(inputs, weights, bias, kind)

    caches = []
    current = X

    # Layers 1 .. depth-1 share the hidden-layer activation
    for index in range(1, depth):
        current, layer_cache = run_layer(current, index, activation_function_layer)
        caches.append(layer_cache)

    # The last layer uses the output activation
    AL, layer_cache = run_layer(current, depth, activation_function_output)
    caches.append(layer_cache)

    return AL, caches

In [6]:
def compute_cost(AL, Y, cost_function):
    """
    Objective:
    Implement cost function

    Arguments:
    AL -- Last activation value of shape (1, sample size)
    Y -- True value of shape (1, sample size)
    cost_function -- The type of function to be used. Stored as a string.
                     Currently, "cross_entropy" or "diff_squared" are accepted.
                     Any other string will output a warning and default to a "diff_squared" function

    Returns:
    cost -- Scalar cost averaged over the samples (Y.shape[1] of them)
    """

    m = Y.shape[1]

    if cost_function == "cross_entropy":
        # A saturated output (AL exactly 0 or 1) would make log() return -inf and
        # 0 * -inf evaluate to NaN. Clipping keeps the cost finite and leaves
        # every value strictly inside (eps, 1 - eps) untouched.
        eps = 1e-15
        AL_safe = np.clip(AL, eps, 1 - eps)
        return -1/m * np.sum(Y*np.log(AL_safe) + (1-Y)*np.log(1-AL_safe))

    if cost_function != "diff_squared":
        print("***Warning: Error in cost_function input. Using a difference squared cost function***")

    # Half mean squared error; also the fallback for unknown cost names
    return 1/m * 1/2 * np.sum((Y-AL)**2)

In [7]:
def backward_linear(dZ, cache):
    """
    Objective:
    Back-propagate through the linear step of a layer l

    Arguments:
    dZ -- Gradient of cost with respect to pre-activation value of current layer l of
          shape(number of units in current layer, sample size)
    cache -- Tuple (A_prev, W, b) saved by the forward linear step; b is not needed here

    Returns:
    dA_prev -- Gradient of cost with respect to activation value of previous layer l-1
    dW -- Gradient of cost with respect to W of current layer l, averaged over the samples
    db -- Gradient of cost with respect to b of current layer l, averaged over the samples
    """
    A_prev, W, _ = cache
    inv_m = 1 / A_prev.shape[1]  # one over the sample size

    db = inv_m * np.sum(dZ, axis=1, keepdims=True)
    dW = (inv_m * dZ) @ A_prev.T
    dA_prev = W.T @ dZ

    return dA_prev, dW, db

In [8]:
def backward_activation(dA, cache, activation_function):
    """
    Objective:
    Back-propagate through one full layer l: first the activation, then the linear step

    Arguments:
    dA -- Gradient of cost with respect to activation value of current layer l of
          shape(number of units in current layer, sample size)
    cache -- Tuple (linear_cache, activation_cache) for layer l; activation_cache is Z
    activation_function -- Name of the activation that was used in the forward pass, given as a string.
                           "relu", "sigmoid" and "tanh" are recognised.
                           Any other value prints a message and is treated as a linear activation

    Returns:
    dA_prev -- Gradient of cost with respect to activation value of previous layer l-1
    dW -- Gradient of cost with respect to weights matrix of current layer l
    db -- Gradient of cost with respect to bias vector of current layer l
    """
    linear_cache, Z = cache

    def sigmoid_slope(z):
        s = 1/(1 + np.exp(-z))
        return s * (1 - s)

    derivatives = (
        ("relu", lambda z: (z > 0)),
        ("sigmoid", sigmoid_slope),
        ("tanh", lambda z: 1 / np.cosh(z)**2),
    )
    # First entry whose name equals the requested one, or None when nothing matches
    slope = next((fn for name, fn in derivatives if activation_function == name), None)

    if slope is None:
        print("***Error in activation_function input. Relaying a linear output***")
        slope = lambda z: np.ones(z.shape)

    # Chain rule: dZ = dA * g'(Z), element-wise
    dZ = dA * slope(Z)

    return backward_linear(dZ, linear_cache)

In [9]:
def backward_prop(AL, Y, caches, cost_function, activation_function_layer, activation_function_output):
    """
    Objective:
    Implement backward propagation

    Arguments:
    AL -- Last activation value of shape (1, sample size)
    Y -- True value of shape (1, sample size); reshaped to AL.shape internally
    caches -- List of all caches containing every cache from forward_activation(A_prev, W, b, activation_function).
    There are L of them indexed from 0 to L-1
    cost_function -- The type of function to be used
                     Currently, "cross_entropy" or "diff_squared" are accepted.
                     Any other string will output a warning and default to a "diff_squared" function
    activation_function_layer -- Type of activation function hidden layers take.
                                 Currently, "relu", "sigmoid", or "tanh" are accepted.
                                 Any other string will output a warning and default to a linear function
    activation_function_output -- Type of function output layer takes
                                 Currently, "relu", "sigmoid", or "tanh" are accepted.
                                 Any other string will output a warning and default to a linear function

    Returns:
    grads -- A dictionary with keys "dA0", ..., "dA(L-1)", "dW1", ..., "dWL", "db1", ..., "dbL"
    """
    grads = {}
    L = len(caches) #Number of layers
    Y = Y.reshape(AL.shape) #Ensuring that Y and AL have the same shape

    #Calculating dAL
    if cost_function == "cross_entropy":
        # AL exactly 0 or 1 (a saturated output) would divide by zero here and
        # fill the gradients with inf/NaN. Clipping keeps the derivative finite
        # and leaves values strictly inside (eps, 1 - eps) untouched.
        eps = 1e-15
        AL_safe = np.clip(AL, eps, 1 - eps)
        dAL = -(np.divide(Y, AL_safe) - np.divide(1-Y, 1-AL_safe))
    else:
        if cost_function != "diff_squared":
            print("***Error in cost_function input. Using a difference squared cost function***")
        dAL = -(Y-AL)

    #Storing grads in layer L
    current_cache = caches[L-1]
    dA_prev, dW, db = backward_activation(dAL, current_cache, activation_function_output)
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = dA_prev, dW, db

    #Storing grads in layers L-1, ..., 1
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        # caches[l] belongs to layer l+1, whose activation gradient was stored as "dA(l+1)"
        dAl = grads["dA" + str(l+1)]
        dA_prev, dW, db = backward_activation(dAl, current_cache, activation_function_layer)
        grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = dA_prev, dW, db

    return grads

In [10]:
def update_parameters(parameters, grads, learning_rate):
    """
    Objective:
    Take one gradient descent step on every weight matrix and bias vector

    Arguments:
    parameters -- Current parameter values, updated by this call.
                  Dictionary containing values for "W1", "b1", "W2", "b2", ..., "WL", "bL"
    grads -- Gradients provided by backward_prop(AL, Y, caches, cost_function,
                                                 activation_function_layer, activation_function_output)
    learning_rate -- Hyperparameter positive real number used to dictate how fast gradient descent goes

    Returns:
    parameters -- The same dictionary object, with every entry replaced by its updated value
    """
    layer_count = len(parameters) // 2  # one W and one b per layer

    for index in range(1, layer_count + 1):
        for prefix in ("W", "b"):
            key = prefix + str(index)
            # The gradient of "W3" is stored under "dW3", and likewise for biases
            step = learning_rate*grads["d" + key]
            parameters[key] = parameters[key] - step

    return parameters

In [11]:
def NN_model_calibrate(X_train, Y_train, layers_dims, learning_rate, num_iterations, print_cost_flag,
                       cost_function, activation_function_layer, activation_function_output):
    """
    Objective:
    Train a neural network with full-batch gradient descent

    Arguments:
    X_train -- Input training data of shape (number of features, sample size)
    Y_train -- Output training data of shape (1, sample size)
    layers_dims -- List containing the dimension of each layer in the neural network
    learning_rate -- Hyperparameter positive real number used to dictate how fast gradient descent goes
    num_iterations -- Number of gradient descent steps, each using the whole training dataset
    print_cost_flag -- Boolean flag; when true the cost is printed every 100 iterations,
                       recorded every 10 iterations and plotted at the end
    cost_function -- The type of function to be used
                     Currently, "cross_entropy" or "diff_squared" are accepted.
                     Any other string will output a warning and default to a "diff_squared" function
    activation_function_layer -- Type of activation function hidden layers take.
                                 Currently, "relu", "sigmoid", or "tanh" are accepted.
                                 Any other string will output a warning and default to a linear function
    activation_function_output -- Type of function output layer takes
                                 Currently, "relu", "sigmoid", or "tanh" are accepted.
                                 Any other string will output a warning and default to a linear function

    Returns:
    parameters -- Dictionary of calibrated parameters "W1", "b1", "W2", "b2", ..., "WL", "bL"
                  Wl -- Weight matrix of shape (layer_dims[l], layer_dims[l-1])
                  bl -- Bias vector of shape (layer_dims[l], 1)
    """
    parameters = initialize_parameters(layers_dims)
    cost_history = []

    for iteration in range(num_iterations):
        # Forward pass, cost, backward pass, then one gradient descent step
        AL, caches = forward_prop(X_train, parameters, activation_function_layer, activation_function_output)
        cost = compute_cost(AL, Y_train, cost_function)
        grads = backward_prop(AL, Y_train, caches, cost_function,
                              activation_function_layer, activation_function_output)
        parameters = update_parameters(parameters, grads, learning_rate)

        if not print_cost_flag:
            continue
        if iteration % 100 == 0:
            print("Cost after iteration %i: %f" %(iteration, cost))
        if iteration % 10 == 0:
            cost_history.append(cost)

    if print_cost_flag:
        # One point per recorded cost, i.e. per ten iterations
        plt.plot(np.squeeze(cost_history))
        plt.ylabel("Cost")
        plt.xlabel("# iterations (tens)")
        plt.title("Learning rate = " + str(learning_rate))
        plt.show()

    return parameters

In [12]:
def NN_predict(X, parameters, activation_function_layer, activation_function_output, binary_flag):
    """
    Objective:
    Run a calibrated network on new input and optionally threshold the result

    Arguments:
    X -- Input data of shape (number of features, sample size)
    parameters -- Dictionary of calibrated parameters "W1", "b1", "W2", "b2", ..., "WL", "bL"
                  Wl -- Weight matrix of shape (layer_dims[l], layer_dims[l-1])
                  bl -- Bias vector of shape (layer_dims[l], 1)
    activation_function_layer -- Type of activation function hidden layers take.
                                 Currently, "relu", "sigmoid", or "tanh" are accepted.
                                 Any other string will output a warning and default to a linear function
    activation_function_output -- Type of function output layer takes
                                 Currently, "relu", "sigmoid", or "tanh" are accepted.
                                 Any other string will output a warning and default to a linear function
    binary_flag -- Boolean variable that indicates whether or not the output is binary

    Returns:
    AL -- Estimate value of Y. With binary_flag set, 1 where the network output is at least 0.5
          and 0 elsewhere; otherwise the raw network output
    """
    raw_output, _ = forward_prop(X, parameters, activation_function_layer, activation_function_output)

    if not binary_flag:
        return raw_output

    # Multiplying the boolean mask by 1 turns it into an integer array of 0s and 1s
    at_least_half = raw_output >= 0.5
    return at_least_half * 1

In [13]:
"""AND Model"""

# Truth table of AND: each column is one sample, in the order (0,0), (0,1), (1,0), (1,1)
X_train = np.array([[0,0,1,1],[0,1,0,1]])
Y_train = np.array([[0,0,0,1]])
layers_dims = [2, 1]  # two inputs feeding one output unit, no hidden layer
learning_rate = 0.005
num_iterations = 10000
print_cost_flag = False
cost_function = "cross_entropy"
activation_function_layer = "relu"  # not used by this layout: there is no hidden layer
activation_function_output = "sigmoid"

parameters = NN_model_calibrate(X_train, Y_train, layers_dims, learning_rate, num_iterations, print_cost_flag,
                                cost_function, activation_function_layer, activation_function_output)
print("Parameters = ", parameters)

# Evaluate the calibrated model on each of the four possible inputs
binary_flag = True
X00 = np.array([[0],[0]])
Y00 = NN_predict(X00, parameters, activation_function_layer, activation_function_output, binary_flag)
print("Y00 = ", Y00)

X01 = np.array([[0],[1]])
Y01 = NN_predict(X01, parameters, activation_function_layer, activation_function_output, binary_flag)
print("Y01 = ", Y01)

# Input (1, 0): first feature on, second off (previously a copy of X01)
X10 = np.array([[1],[0]])
Y10 = NN_predict(X10, parameters, activation_function_layer, activation_function_output, binary_flag)
print("Y10 = ", Y10)

X11 = np.array([[1],[1]])
Y11 = NN_predict(X11, parameters, activation_function_layer, activation_function_output, binary_flag)
print("Y11 = ", Y11)

Parameters =  {'W1': array([[2.02734246, 2.02831425]]), 'b1': array([[-3.30883997]])}
Y00 =  [[0]]
Y01 =  [[0]]
Y10 =  [[0]]
Y11 =  [[1]]


In [14]:
"""OR Model"""

# Truth table of OR: each column is one sample, in the order (0,0), (0,1), (1,0), (1,1)
X_train = np.array([[0,0,1,1],[0,1,0,1]])
Y_train = np.array([[0,1,1,1]])
layers_dims = [2, 1]  # two inputs feeding one output unit, no hidden layer
learning_rate = 0.005
num_iterations = 10000
print_cost_flag = False
cost_function = "cross_entropy"
activation_function_layer = "relu"  # not used by this layout: there is no hidden layer
activation_function_output = "sigmoid"

parameters = NN_model_calibrate(X_train, Y_train, layers_dims, learning_rate, num_iterations, print_cost_flag,
                                cost_function, activation_function_layer, activation_function_output)
print("Parameters = ", parameters)

# Evaluate the calibrated model on each of the four possible inputs
binary_flag = True
X00 = np.array([[0],[0]])
Y00 = NN_predict(X00, parameters, activation_function_layer, activation_function_output, binary_flag)
print("Y00 = ", Y00)

X01 = np.array([[0],[1]])
Y01 = NN_predict(X01, parameters, activation_function_layer, activation_function_output, binary_flag)
print("Y01 = ", Y01)

# Input (1, 0): first feature on, second off (previously a copy of X01)
X10 = np.array([[1],[0]])
Y10 = NN_predict(X10, parameters, activation_function_layer, activation_function_output, binary_flag)
print("Y10 = ", Y10)

X11 = np.array([[1],[1]])
Y11 = NN_predict(X11, parameters, activation_function_layer, activation_function_output, binary_flag)
print("Y11 = ", Y11)

Parameters =  {'W1': array([[2.82822156, 2.82395335]]), 'b1': array([[-0.78707081]])}
Y00 =  [[0]]
Y01 =  [[1]]
Y10 =  [[1]]
Y11 =  [[1]]


In [15]:
"""NOT Model"""

# Both possible inputs as columns, with the negated value as the target
X_train = np.array([[0,1]])
Y_train = np.array([[1,0]])

# Network layout and optimisation settings
layers_dims = [1, 1]
learning_rate, num_iterations = 0.005, 10000
print_cost_flag = False
cost_function = "diff_squared"
activation_function_layer, activation_function_output = "relu", "sigmoid"

parameters = NN_model_calibrate(
    X_train=X_train,
    Y_train=Y_train,
    layers_dims=layers_dims,
    learning_rate=learning_rate,
    num_iterations=num_iterations,
    print_cost_flag=print_cost_flag,
    cost_function=cost_function,
    activation_function_layer=activation_function_layer,
    activation_function_output=activation_function_output,
)
print("Parameters = ", parameters)

# Evaluate the calibrated model on each of the two possible inputs
binary_flag = True
X0, X1 = np.array([[0]]), np.array([[1]])

Y0 = NN_predict(
    X=X0,
    parameters=parameters,
    activation_function_layer=activation_function_layer,
    activation_function_output=activation_function_output,
    binary_flag=binary_flag,
)
print("Y0 = ", Y0)

Y1 = NN_predict(
    X=X1,
    parameters=parameters,
    activation_function_layer=activation_function_layer,
    activation_function_output=activation_function_output,
    binary_flag=binary_flag,
)
print("Y1 = ", Y1)

Parameters =  {'W1': array([[-1.81428473]]), 'b1': array([[0.71184837]])}
Y0 =  [[1]]
Y1 =  [[0]]


In [16]:
"""XOR Model"""

# Truth table of XOR: each column is one sample, in the order (0,0), (0,1), (1,0), (1,1)
X_train = np.array([[0,0,1,1],[0,1,0,1]])
Y_train = np.array([[0,1,1,0]])
layers_dims = [2, 5, 5, 1]  # two inputs, two hidden layers of five units, one output unit
learning_rate = 0.05
num_iterations = 30000
print_cost_flag = False
cost_function = "cross_entropy"
activation_function_layer = "relu"
activation_function_output = "sigmoid"

parameters = NN_model_calibrate(X_train, Y_train, layers_dims, learning_rate, num_iterations, print_cost_flag,
                                cost_function, activation_function_layer, activation_function_output)
print("Parameters = ", parameters)

# Evaluate the calibrated model on each of the four possible inputs
binary_flag = True
X00 = np.array([[0],[0]])
Y00 = NN_predict(X00, parameters, activation_function_layer, activation_function_output, binary_flag)
print("Y00 = ", Y00)

X01 = np.array([[0],[1]])
Y01 = NN_predict(X01, parameters, activation_function_layer, activation_function_output, binary_flag)
print("Y01 = ", Y01)

# Input (1, 0): first feature on, second off (previously a copy of X01)
X10 = np.array([[1],[0]])
Y10 = NN_predict(X10, parameters, activation_function_layer, activation_function_output, binary_flag)
print("Y10 = ", Y10)

X11 = np.array([[1],[1]])
Y11 = NN_predict(X11, parameters, activation_function_layer, activation_function_output, binary_flag)
print("Y11 = ", Y11)

Parameters =  {'W1': array([[ 1.08911354e-03, -2.25041911e-03],
       [-9.48832994e-03, -3.31441871e-03],
       [-1.26187902e+00,  2.24141944e+00],
       [ 1.20840536e+00, -6.83778469e-01],
       [ 1.89597603e+00, -1.07522844e+00]]), 'b1': array([[-1.08913740e-03],
       [ 0.00000000e+00],
       [ 1.26189989e+00],
       [ 6.83778714e-01],
       [ 1.07523488e+00]]), 'W2': array([[ 4.36731974e-03, -8.71796046e-03, -1.14009631e+00,
         6.31943029e-01,  9.58537734e-01],
       [-6.10493434e-03, -9.90758197e-03,  8.88376091e-01,
         4.70022629e-01,  7.55460014e-01],
       [-1.09330075e-02,  1.65656158e-02,  1.92248493e+00,
        -1.03803614e+00, -1.63055034e+00],
       [-5.44842386e-03,  1.81619737e-02, -1.54802113e+00,
         8.30766662e-01,  1.31892947e+00],
       [-3.63803934e-03,  1.97866514e-02, -3.84622355e-03,
        -3.91245962e-03, -1.33013486e-03]]), 'b2': array([[-0.02414349],
       [ 0.76071894],
       [ 0.03699499],
       [-0.03285765],
       [ 0. 