In [44]:
# importing packages

import numpy as np
import matplotlib.pyplot as plt
import math
import copy

<div style="background-color: #a83d36; color: #FFFFFF; padding: 5px; text-align: center; border-radius: 15px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);">
  <h1 style="margin: 0; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-weight: bold; font-size: 32px;">PARAMETER INITIALIZATION</h1>


In [45]:
def initialize_parameters_zeros(layers_dims):
    """
    Build an all-zero parameter set for a fully connected network.

    Arguments:
    layers_dims -- sequence of layer sizes, input layer first

    Returns:
    parameters -- dictionary with keys 'W1', 'b1', ..., one pair per layer after the input:
                  Wl is a zero matrix of shape (layers_dims[l], layers_dims[l-1])
                  bl is a zero column of shape (layers_dims[l], 1)
    """
    parameters = {}

    # Pair each layer size with the size of the layer feeding it; numbering starts at 1.
    size_pairs = zip(layers_dims[:-1], layers_dims[1:])
    for layer, (n_prev, n_curr) in enumerate(size_pairs, start=1):
        parameters['W{}'.format(layer)] = np.zeros((n_curr, n_prev))
        parameters['b{}'.format(layer)] = np.zeros((n_curr, 1))

    return parameters


In [46]:
def initialize_parameters_random(layers_dims):
    """
    Build a parameter set whose weights are standard-normal draws scaled by 10.

    Arguments:
    layers_dims -- sequence of layer sizes, input layer first

    Returns:
    parameters -- dictionary with keys 'W1', 'b1', ..., one pair per layer after the input:
                  Wl has shape (layers_dims[l], layers_dims[l-1]) and holds randn() * 10
                  bl is a zero column of shape (layers_dims[l], 1)
    """
    # The global NumPy RNG is reseeded on every call, so repeated calls give the same weights.
    np.random.seed(3)
    parameters = {}

    for layer in range(1, len(layers_dims)):
        n_curr = layers_dims[layer]
        n_prev = layers_dims[layer - 1]
        # NOTE(review): the factor 10 gives very large initial weights -- presumably
        # deliberate, for comparing initializers; confirm before reusing elsewhere.
        parameters['W{}'.format(layer)] = 10 * np.random.randn(n_curr, n_prev)
        parameters['b{}'.format(layer)] = np.zeros((n_curr, 1))

    return parameters


In [4]:
def initialize_parameters_he(layers_dims):
    """
    Build a parameter set using He initialization.

    Arguments:
    layers_dims -- sequence of layer sizes, input layer first

    Returns:
    parameters -- dictionary with keys 'W1', 'b1', ..., one pair per layer after the input:
                  Wl has shape (layers_dims[l], layers_dims[l-1]) and holds
                  randn() * sqrt(2 / layers_dims[l-1])
                  bl is a zero column of shape (layers_dims[l], 1)
    """
    # The global NumPy RNG is reseeded on every call, so repeated calls give the same weights.
    np.random.seed(3)
    parameters = {}

    size_pairs = zip(layers_dims[:-1], layers_dims[1:])
    for layer, (fan_in, fan_out) in enumerate(size_pairs, start=1):
        draw = np.random.randn(fan_out, fan_in)
        # He scaling: the standard deviation depends on the number of inputs to the layer.
        parameters['W' + str(layer)] = draw * np.sqrt(2 / fan_in)
        parameters['b' + str(layer)] = np.zeros((fan_out, 1))

    return parameters


<div style="background-color: #a83d36; color: #FFFFFF; padding: 5px; text-align: center; border-radius: 15px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);">
  <h1 style="margin: 0; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-weight: bold; font-size: 32px;">ACTIVATION FUNCTION</h1>


In [10]:
def relu(Z):
    """
    Apply the ReLU activation element-wise.

    Arguments:
    Z -- pre-activation values, numpy array of any shape

    Returns:
    A -- max(0, Z) element-wise, same shape as Z
    cache -- Z itself (the same object), kept for the backward pass
    """
    rectified = np.maximum(0, Z)
    return rectified, Z


In [11]:
def sigmoid(Z):
    """
    Apply the logistic sigmoid element-wise.

    Arguments:
    Z -- pre-activation values, numpy array of any shape

    Returns:
    A -- 1 / (1 + exp(-Z)) element-wise, same shape as Z
    cache -- Z itself (the same object), kept for the backward pass
    """
    denominator = 1 + np.exp(-Z)
    return 1 / denominator, Z

<div style="background-color: #a83d36; color: #FFFFFF; padding: 5px; text-align: center; border-radius: 15px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);">
  <h1 style="margin: 0; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-weight: bold; font-size: 32px;">FORWARD PROPAGATION</h1>


In [1]:
def linear_forward(A, W, b):
    """
    Compute the affine part of one layer: Z = W . A + b.

    Arguments:
    A -- activations from the previous layer (size of previous layer, number of examples)
    W -- weight matrix (size of current layer, size of previous layer)
    b -- bias column (size of current layer, 1), broadcast across the examples

    Returns:
    Z -- pre-activation values fed to the activation function
    cache -- the tuple (A, W, b), kept for the backward pass
    """
    weighted_input = np.dot(W, A)
    pre_activation = weighted_input + b
    return pre_activation, (A, W, b)


In [43]:
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    A_prev -- activations from previous layer (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function (post-activation value)
    cache -- a tuple containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    # Resolve the activation first so an unsupported name fails with a clear message
    # instead of an UnboundLocalError on `linear_cache` further down.
    if activation == "sigmoid":
        activation_fn = sigmoid
    elif activation == "relu":
        activation_fn = relu
    else:
        raise ValueError(
            'Unsupported activation {!r}; expected "sigmoid" or "relu"'.format(activation)
        )

    # The linear step is identical for both activations.
    Z, linear_cache = linear_forward(A_prev, W, b)
    A, activation_cache = activation_fn(Z)

    cache = (linear_cache, activation_cache)

    return A, cache


In [4]:
def L_model_forward(X, parameters):
    """
    Run the full forward pass: [LINEAR->RELU] for every layer but the last, then LINEAR->SIGMOID.

    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- dictionary holding 'W1', 'b1', ..., 'WL', 'bL'

    Returns:
    AL -- output of the final (sigmoid) layer
    caches -- list with one cache per layer, in layer order, as returned by
              linear_activation_forward(); the sigmoid layer's cache is last
    """
    # Each layer contributes one W and one b entry.
    num_layers = len(parameters) // 2
    caches = []
    activation = X

    # Hidden layers 1 .. L-1 use ReLU.
    for layer in range(1, num_layers):
        weights = parameters['W' + str(layer)]
        bias = parameters['b' + str(layer)]
        activation, layer_cache = linear_activation_forward(activation, weights, bias, 'relu')
        caches.append(layer_cache)

    # Output layer L uses sigmoid.
    out_weights = parameters['W' + str(num_layers)]
    out_bias = parameters['b' + str(num_layers)]
    AL, out_cache = linear_activation_forward(activation, out_weights, out_bias, 'sigmoid')
    caches.append(out_cache)

    return AL, caches


In [None]:
def forward_propagation_with_dropout(X, parameters, keep_prob=0.5):
    """
    Implements forward propagation with inverted dropout for an L-layer neural network:
    [LINEAR->RELU->DROPOUT]*(L-1) -> LINEAR->SIGMOID.

    Arguments:
    X -- input data, numpy array of shape (input size, number of examples)
    parameters -- python dictionary containing parameters "W1", "b1", ..., "WL", "bL"
                  Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                  bl -- bias vector of shape (layer_dims[l], 1)
    keep_prob -- probability of keeping a neuron active during dropout, scalar

    Returns:
    AL -- last post-activation value
    caches -- list with one tuple per layer, in layer order:
                (Z, D) for each hidden layer, where D is the 0/1 dropout mask applied to that layer
                (Z, AL) for the output layer
              NOTE(review): backward_propagation_with_dropout() in this file documents a
              dictionary cache, so this list is not directly consumable by it -- confirm
              the intended pairing.
    """

    # Reseeding on every call means the same dropout masks are drawn each time this runs.
    # NOTE(review): presumably kept for reproducible comparisons; confirm before training with it.
    np.random.seed(1)
    caches = []
    L = len(parameters) // 2  # number of layers in the neural network
    A = X

    # Hidden layers: linear step, ReLU, then dropout.
    for l in range(1, L):
        Z = np.dot(parameters['W' + str(l)], A) + parameters['b' + str(l)]
        # relu() returns (activation, cache); only the activation is needed here.
        A, _ = relu(Z)

        # Inverted dropout: zero units with probability 1 - keep_prob, then rescale the
        # survivors by 1 / keep_prob.
        D = np.random.rand(A.shape[0], A.shape[1])
        D = (D < keep_prob).astype(int)
        A = A * D
        A = A / keep_prob

        # Cache the values for backward propagation
        caches.append((Z, D))

    # Final layer with sigmoid activation (no dropout on the output).
    Z = np.dot(parameters['W' + str(L)], A) + parameters['b' + str(L)]
    # sigmoid() also returns (activation, cache).
    AL, _ = sigmoid(Z)

    # Cache the values for backward propagation
    caches.append((Z, AL))

    return AL, caches



<div style="background-color: #a83d36; color: #FFFFFF; padding: 5px; text-align: center; border-radius: 15px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);">
  <h1 style="margin: 0; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-weight: bold; font-size: 32px;"> COMPUTE COST</h1>

In [34]:
def compute_cost(AL, Y):
    """
    Compute the cross-entropy cost.

    Arguments:
    AL -- probability vector corresponding to the label predictions, shape (1, number of examples)
    Y -- true "label" vector, shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost, -(1/m) * sum(Y*log(AL) + (1-Y)*log(1-AL)), as a 0-d value
    """

    m = Y.shape[1]

    # Sum BOTH log-likelihood terms before scaling. Without the parentheses the -1/m factor
    # binds only to the first dot product and the second term is added unscaled.
    log_likelihood = np.dot(Y, np.log(AL).T) + np.dot(1 - Y, np.log(1 - AL).T)
    cost = (-1 / m) * log_likelihood

    # Collapse the (1, 1) result of the dot products to a scalar-like value (e.g. [[17]] -> 17).
    cost = np.squeeze(cost)
    return cost


In [20]:
def compute_cost_with_regularization(AL, Y, parameters, lambd):
    """
    Compute the cost plus an L2 penalty on every weight matrix.

    Arguments:
    AL -- post-activation, output of forward propagation, shape (output size, number of examples)
    Y -- "true" labels vector, shape (output size, number of examples)
    parameters -- python dictionary holding 'W1', 'b1', ..., 'WL', 'bL'
    lambd -- regularization parameter

    Returns:
    cost -- compute_cost(AL, Y) + lambd / (2 * m) * sum of squared entries of W1..WL
    """
    num_examples = Y.shape[1]
    num_layers = len(parameters) // 2  # one W and one b per layer
    data_cost = compute_cost(AL, Y)

    # Only the weight matrices are penalized; the biases are left out.
    squared_weight_total = sum(
        np.sum(np.square(parameters["W" + str(layer)]))
        for layer in range(1, num_layers + 1)
    )
    penalty = squared_weight_total * (lambd / (2 * num_examples))

    return data_cost + penalty



<div style="background-color: #a83d36; color: #FFFFFF; padding: 5px; text-align: center; border-radius: 15px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);">
  <h1 style="margin: 0; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-weight: bold; font-size: 32px;"> BACKWARD  PROPAGATION</h1>

In [8]:
def relu_backward(dA, cache):
    """
    Backward pass through a ReLU unit.

    Arguments:
    dA -- gradient with respect to the ReLU output, same shape as the output
    cache -- the pre-activation values Z saved during the forward pass

    Returns:
    dZ -- gradient with respect to Z: a copy of dA with zeros wherever Z <= 0
    """
    # Work on a copy so the caller's dA is left untouched.
    grad = np.array(dA, copy=True)
    inactive = cache <= 0
    grad[inactive] = 0  # ReLU passes no gradient where its input was non-positive
    return grad


In [9]:
def sigmoid_backward(dA, cache):
    """
    Backward pass through a sigmoid unit.

    Arguments:
    dA -- gradient with respect to the sigmoid output, same shape as the output
    cache -- the pre-activation values Z saved during the forward pass

    Returns:
    dZ -- gradient with respect to Z, i.e. dA * sigmoid(Z) * (1 - sigmoid(Z))
    """
    # Recompute the activation from Z rather than caching it.
    activation = 1 / (1 + np.exp(-cache))
    local_slope = activation * (1 - activation)
    return dA * local_slope

In [7]:
def linear_backward(dZ, cache):
    """
    Backward pass through the affine part of a single layer.

    Arguments:
    dZ -- gradient of the cost with respect to this layer's linear output
    cache -- tuple (A_prev, W, b) saved by the forward pass of this layer

    Returns:
    dA_prev -- gradient with respect to the previous layer's activations, W.T . dZ
    dW -- gradient with respect to W, (1/m) * dZ . A_prev.T
    db -- gradient with respect to b, (1/m) * row-wise sum of dZ (kept as a column)
    """
    prev_activations, weights, _ = cache  # the bias value itself is not needed here
    inv_m = 1 / prev_activations.shape[1]  # m = number of columns of A_prev

    grad_weights = inv_m * np.dot(dZ, prev_activations.T)
    grad_bias = inv_m * np.sum(dZ, axis=1, keepdims=True)
    grad_prev = np.dot(weights.T, dZ)

    return grad_prev, grad_weights, grad_bias

In [12]:
def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for current layer l
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    # Resolve the activation first so an unsupported name fails with a clear message
    # instead of an UnboundLocalError on `dA_prev` at the return statement.
    if activation == "relu":
        activation_backward = relu_backward
    elif activation == "sigmoid":
        activation_backward = sigmoid_backward
    else:
        raise ValueError(
            'Unsupported activation {!r}; expected "sigmoid" or "relu"'.format(activation)
        )

    linear_cache, activation_cache = cache

    # Undo the activation, then the affine step (the latter is the same for both activations).
    dZ = activation_backward(dA, activation_cache)
    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db

In [13]:

def L_model_backward(AL, Y, caches):
    """
    Run the full backward pass for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID network.

    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- true "label" vector; reshaped here to AL's shape
    caches -- list of per-layer caches from linear_activation_forward(), in layer order:
                caches[0 .. L-2] belong to the "relu" layers
                caches[L-1] belongs to the "sigmoid" layer

    Returns:
    grads -- dictionary with the gradients:
             grads["dA" + str(l)] for l = 0 .. L-1
             grads["dW" + str(l)] and grads["db" + str(l)] for l = 1 .. L
    """
    grads = {}
    num_layers = len(caches)
    num_examples = AL.shape[1]  # not used below; kept so a non-2-D AL still fails at this point
    Y = Y.reshape(AL.shape)

    # Derivative of the cross-entropy loss with respect to the network output.
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Output layer (sigmoid): its cache is the last list entry.
    dA_prev, dW, db = linear_activation_backward(dAL, caches[num_layers - 1], activation='sigmoid')
    grads["dA" + str(num_layers - 1)] = dA_prev
    grads["dW" + str(num_layers)] = dW
    grads["db" + str(num_layers)] = db

    # Hidden layers L-1 .. 1 (relu); layer k uses caches[k - 1] and the dA stored for layer k.
    for layer in range(num_layers - 1, 0, -1):
        dA_prev, dW, db = linear_activation_backward(
            grads["dA" + str(layer)], caches[layer - 1], activation="relu"
        )
        grads["dA" + str(layer - 1)] = dA_prev
        grads["dW" + str(layer)] = dW
        grads["db" + str(layer)] = db

    return grads

In [47]:
def backward_propagation_with_dropout(X, Y, cache, keep_prob):
    """
    Implements backward propagation for a [LINEAR->RELU->DROPOUT]*(L-1) -> LINEAR->SIGMOID
    network trained with the cross-entropy cost.

    Arguments:
    X -- input data, numpy array of shape (input size, number of examples)
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat), of shape (1, number of examples)
    cache -- dictionary containing "Z1", "A1", ..., "ZL", "AL", "W1", "b1", ..., "WL", "bL"
             and the dropout masks "D1", ..., "D(L-1)" of the hidden layers
             (assumes each hidden "Al" is the activation actually fed to layer l+1, i.e. after
             masking and rescaling -- confirm against the forward pass that builds the cache)
    keep_prob -- probability of keeping a neuron active during dropout, scalar

    Returns:
    gradients -- A dictionary with the gradients
                 gradients["dA" + str(l)] for l = 0 .. L-1
                 gradients["dW" + str(l)] and gradients["db" + str(l)] for l = 1 .. L

    Raises:
    ValueError -- if cache holds no weight matrices
    """

    m = X.shape[1]
    gradients = {}

    # There is exactly one weight matrix per layer, so counting the "W" keys gives the depth
    # no matter how many other entries (Z, A, b, D) the cache carries.
    L = sum(1 for key in cache if isinstance(key, str) and key.startswith('W'))
    if L == 0:
        raise ValueError('cache contains no "W<l>" entries; cannot infer the number of layers')

    # Sigmoid output combined with cross-entropy gives dZ_L = A_L - Y directly.
    dZ = cache['A' + str(L)] - Y

    for l in range(L, 0, -1):
        W = cache['W' + str(l)]
        # The input to layer 1 is X itself; deeper layers read the previous activation.
        A_prev = X if l == 1 else cache['A' + str(l - 1)]

        gradients['dW' + str(l)] = 1. / m * np.dot(dZ, A_prev.T)
        gradients['db' + str(l)] = 1. / m * np.sum(dZ, axis=1, keepdims=True)

        dA_prev = np.dot(W.T, dZ)
        if l > 1:
            # Replay the dropout applied to layer l-1 in the forward pass: same mask, same
            # 1 / keep_prob rescaling.
            dA_prev = dA_prev * cache['D' + str(l - 1)] / keep_prob
            # ReLU derivative: gradient flows only where the pre-activation was positive.
            dZ = dA_prev * (cache['Z' + str(l - 1)] > 0)
        gradients['dA' + str(l - 1)] = dA_prev

    return gradients


In [21]:
def backward_propagation_with_regularization(X, Y, cache, lambd):
    """
    Implements backward propagation with L2 regularization for a
    [LINEAR->RELU]*(L-1) -> LINEAR->SIGMOID network trained with the cross-entropy cost.

    Arguments:
    X -- input data, numpy array of shape (input size, number of examples)
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat), of shape (1, number of examples)
    cache -- dictionary containing "Z1", "A1", ..., "ZL", "AL" and "W1", "b1", ..., "WL", "bL"
    lambd -- regularization hyperparameter, scalar

    Returns:
    gradients -- A dictionary with the gradients
                 gradients["dA" + str(l)] for l = 0 .. L-1
                 gradients["dW" + str(l)] and gradients["db" + str(l)] for l = 1 .. L

    Raises:
    ValueError -- if cache holds no weight matrices
    """

    m = X.shape[1]
    gradients = {}

    # There is exactly one weight matrix per layer, so counting the "W" keys gives the depth
    # no matter how many other entries the cache carries.
    L = sum(1 for key in cache if isinstance(key, str) and key.startswith('W'))
    if L == 0:
        raise ValueError('cache contains no "W<l>" entries; cannot infer the number of layers')

    # Sigmoid output combined with cross-entropy gives dZ_L = A_L - Y directly.
    dZ = cache['A' + str(L)] - Y

    for l in range(L, 0, -1):
        W = cache['W' + str(l)]
        # The input to layer 1 is X itself; deeper layers read the previous activation.
        A_prev = X if l == 1 else cache['A' + str(l - 1)]

        # The L2 penalty lambd / (2m) * ||W||^2 contributes (lambd / m) * W to dW only;
        # biases are not regularized.
        gradients['dW' + str(l)] = 1. / m * np.dot(dZ, A_prev.T) + (lambd / m) * W
        gradients['db' + str(l)] = 1. / m * np.sum(dZ, axis=1, keepdims=True)

        dA_prev = np.dot(W.T, dZ)
        gradients['dA' + str(l - 1)] = dA_prev

        if l > 1:
            # ReLU derivative: gradient flows only where the pre-activation was positive.
            dZ = dA_prev * (cache['Z' + str(l - 1)] > 0)

    return gradients


<div style="background-color: #a83d36; color: #FFFFFF; padding: 5px; text-align: center; border-radius: 15px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);">
  <h1 style="margin: 0; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-weight: bold; font-size: 32px;"> OPTIMIZER INITIALIZATION</h1>

In [26]:
def initialize_velocity(parameters):
    """
    Create zeroed velocity buffers for momentum-based gradient descent.

    Arguments:
    parameters -- dictionary holding 'W1', 'b1', ..., 'WL', 'bL'

    Returns:
    v -- dictionary with keys 'dW1', 'db1', ..., 'dWL', 'dbL'; each entry is a zero array
         with the shape of the matching parameter
    """
    num_layers = len(parameters) // 2  # one W and one b per layer
    v = {}

    for layer in range(1, num_layers + 1):
        for name in ('W', 'b'):
            param_key = name + str(layer)
            v['d' + param_key] = np.zeros(parameters[param_key].shape)

    return v


In [27]:
def initialize_adam(parameters):
    """
    Create the zeroed moment buffers used by the Adam update.

    Arguments:
    parameters -- dictionary holding 'W1', 'b1', ..., 'WL', 'bL'

    Returns:
    v -- dictionary 'dW1', 'db1', ..., 'dWL', 'dbL' of zero arrays (running average of gradients)
    s -- dictionary with the same keys of zero arrays (running average of squared gradients)
    """
    num_layers = len(parameters) // 2  # one W and one b per layer
    v, s = {}, {}

    for layer in range(1, num_layers + 1):
        for name in ("W", "b"):
            param_key = name + str(layer)
            buffer_shape = parameters[param_key].shape
            # v and s get separate arrays so updating one never aliases the other.
            v["d" + param_key] = np.zeros(buffer_shape)
            s["d" + param_key] = np.zeros(buffer_shape)

    return v, s




<div style="background-color: #a83d36; color: #FFFFFF; padding: 5px; text-align: center; border-radius: 15px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);">
  <h1 style="margin: 0; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-weight: bold; font-size: 32px;">PARAMETER UPDATE</h1>

In [30]:
def update_parameters(params, grads, learning_rate):
    """
    Update parameters using gradient descent

    Arguments:
    params -- python dictionary containing your parameters ("W1", "b1", ..., "WL", "bL")
    grads -- python dictionary containing your gradients, output of L_model_backward
    learning_rate -- step size applied to every gradient

    Returns:
    parameters -- python dictionary containing your updated parameters
                  parameters["W" + str(l)] = ...
                  parameters["b" + str(l)] = ...
                  This is a deep copy; the input `params` is left unmodified.
    """
    parameters = copy.deepcopy(params)
    L = len(parameters) // 2 # number of layers in the neural network

    # Layers are numbered 1..L. The previous range(L+1) combined with l+1 reached layer L+1,
    # which does not exist and raised KeyError.
    for l in range(1, L + 1):
        parameters["W" + str(l)] = parameters["W" + str(l)] - learning_rate * grads["dW" + str(l)]
        parameters["b" + str(l)] = parameters["b" + str(l)] - learning_rate * grads["db" + str(l)]

    return parameters

In [31]:
def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    """
    Apply one gradient-descent-with-momentum step.

    Arguments:
    parameters -- dictionary holding 'W1', 'b1', ..., 'WL', 'bL'
    grads -- dictionary holding 'dW1', 'db1', ..., 'dWL', 'dbL'
    v -- dictionary of velocities with the same keys as grads
    beta -- the momentum hyperparameter
    learning_rate -- the learning rate

    Returns:
    parameters -- the same dictionary object, with every entry replaced by its updated value
    v -- the same dictionary object, with every velocity replaced by its updated value
    """
    num_layers = len(parameters) // 2  # one W and one b per layer

    for layer in range(1, num_layers + 1):
        for name in ("W", "b"):
            param_key = name + str(layer)
            grad_key = "d" + param_key

            # Exponentially weighted average of the gradients.
            v[grad_key] = (beta * v[grad_key]) + ((1 - beta) * grads[grad_key])

            # Step along the velocity rather than the raw gradient.
            parameters[param_key] = parameters[param_key] - learning_rate * v[grad_key]

    return parameters, v


In [32]:
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01,
                                beta1=0.9, beta2=0.999, epsilon=1e-8):
    """
    Apply one Adam optimization step.

    Arguments:
    parameters -- dictionary holding 'W1', 'b1', ..., 'WL', 'bL'
    grads -- dictionary holding 'dW1', 'db1', ..., 'dWL', 'dbL'
    v -- running average of the gradients (first moment), same keys as grads
    s -- running average of the squared gradients (second moment), same keys as grads
    t -- Adam step counter used in the bias correction
    learning_rate -- the learning rate
    beta1 -- decay rate of the first-moment average
    beta2 -- decay rate of the second-moment average
    epsilon -- small constant added to the denominator to avoid division by zero

    Returns:
    parameters -- the same dictionary object, with updated entries
    v -- the same dictionary object, with updated first moments
    s -- the same dictionary object, with updated second moments
    v_corrected -- new dictionary of bias-corrected first moments
    s_corrected -- new dictionary of bias-corrected second moments
    """
    num_layers = len(parameters) // 2  # one W and one b per layer
    v_corrected = {}
    s_corrected = {}

    for layer in range(1, num_layers + 1):
        for name in ("W", "b"):
            param_key = name + str(layer)
            grad_key = "d" + param_key
            gradient = grads[grad_key]

            # First moment: running average of the gradient, then bias correction.
            v[grad_key] = (beta1 * v[grad_key]) + ((1 - beta1) * gradient)
            v_corrected[grad_key] = v[grad_key] / (1 - beta1 ** t)

            # Second moment: running average of the squared gradient, then bias correction.
            s[grad_key] = (beta2 * s[grad_key]) + ((1 - beta2) * (np.square(gradient)))
            s_corrected[grad_key] = s[grad_key] / (1 - beta2 ** t)

            # Scale the step per element by the root of the corrected second moment.
            denominator = np.sqrt(s_corrected[grad_key]) + epsilon
            parameters[param_key] = parameters[param_key] - learning_rate * v_corrected[grad_key] / (
                denominator)

    return parameters, v, s, v_corrected, s_corrected


<div style="background-color: #a83d36; color: #FFFFFF; padding: 5px; text-align: center; border-radius: 15px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);">
  <h1 style="margin: 0; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-weight: bold; font-size: 32px;"> MINI BATCH</h1>

In [None]:
def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):
    """
    Creates a list of random mini-batches from the input data.

    Arguments:
    X -- input data, numpy array of shape (input size, number of examples)
    Y -- true "label" vector, numpy array of shape (output size, number of examples)
    mini_batch_size -- size of each mini-batch, positive integer
    seed -- random seed for reproducibility (reseeds NumPy's global RNG)

    Returns:
    mini_batches -- list of mini-batches, each mini-batch is a tuple (mini_batch_X, mini_batch_Y);
                    every batch holds mini_batch_size columns except possibly the last one,
                    which holds the remainder

    Raises:
    ValueError -- if mini_batch_size is smaller than 1
    """
    if mini_batch_size < 1:
        raise ValueError("mini_batch_size must be a positive integer, got {!r}".format(mini_batch_size))

    np.random.seed(seed)
    m = X.shape[1]                  # number of training examples

    # Step 1: Shuffle (X, Y) with the SAME column permutation so examples stay paired with
    # their labels. Y keeps all of its rows; the earlier reshape to (1, m) rejected labels
    # with more than one output row.
    permutation = np.random.permutation(m)
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]

    # Step 2: Partition. Stepping by mini_batch_size yields every complete batch and then
    # the shorter trailing batch when m is not a multiple of mini_batch_size.
    mini_batches = [
        (shuffled_X[:, start:start + mini_batch_size],
         shuffled_Y[:, start:start + mini_batch_size])
        for start in range(0, m, mini_batch_size)
    ]

    return mini_batches

<div style="background-color: #a83d36; color: #FFFFFF; padding: 5px; text-align: center; border-radius: 15px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);">
  <h1 style="margin: 0; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-weight: bold; font-size: 32px;"> LEARNING RATE DECAY FUNCTIONS</h1>

In [35]:
def update_lr(learning_rate0, epoch_num, decay_rate):
    """
    Decay the learning rate as learning_rate0 / (1 + decay_rate * epoch_num).

    Arguments:
    learning_rate0 -- initial learning rate
    epoch_num -- current epoch number
    decay_rate -- rate of decay

    Returns:
    learning_rate -- updated learning rate
    """
    # Inverse-time schedule: the shrink factor falls as the epoch count grows.
    shrink_factor = 1 / (1 + decay_rate * epoch_num)
    return shrink_factor * learning_rate0



In [36]:
def schedule_lr_decay(learning_rate0, epoch_num, decay_rate, time_interval=1000):
    """
    Decay the learning rate in steps: the rate only changes once per time_interval epochs.

    Computes learning_rate0 / (1 + decay_rate * floor(epoch_num / time_interval)).

    Arguments:
    learning_rate0 -- initial learning rate
    epoch_num -- current epoch number
    decay_rate -- rate of decay
    time_interval -- number of epochs between consecutive decay steps (default is 1000)

    Returns:
    learning_rate -- updated learning rate
    """
    # Number of full intervals completed so far; stays constant within an interval.
    completed_intervals = np.floor(epoch_num / time_interval)
    shrink_factor = 1 / (1 + decay_rate * completed_intervals)
    return shrink_factor * learning_rate0

<div style="background-color: #a83d36; color: #FFFFFF; padding: 5px; text-align: center; border-radius: 15px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.5);">
  <h1 style="margin: 0; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-weight: bold; font-size: 32px;"> BUILDING NEURAL NETWORK MODEL</h1>



In [18]:
def simple_model(X, Y, learning_rate=0.01, num_iterations=15000, print_cost=True, initialization="he"):
    """
    Implements a three-layer neural network: Linear -> Relu -> Linear -> Relu -> Linear -> Sigmoid.

    Arguments:
    X -- input data, numpy array of shape (input size, number of examples)
    Y -- true "label" vector, numpy array of shape (output size, number of examples)
    learning_rate -- learning rate of the optimization
    num_iterations -- number of iterations of the optimization loop
    print_cost -- if True, it prints the cost every 1000 iterations
    initialization -- flag to choose which initialization to use ("zeros", "random", "he")

    Returns:
    parameters -- parameters learnt by the model

    Raises:
    ValueError -- if initialization is not one of "zeros", "random", "he"

    Side effects:
    Plots the cost sampled every 1000 iterations with matplotlib.
    """

    layers_dims = [X.shape[0], 10, 5, 1]  # hidden layers of 10 and 5 units, one output unit

    # Initialization of parameters. An unknown flag is rejected here instead of surfacing
    # later as an UnboundLocalError on `parameters`.
    initializers = {
        "zeros": initialize_parameters_zeros,
        "random": initialize_parameters_random,
        "he": initialize_parameters_he,
    }
    if initialization not in initializers:
        raise ValueError(
            'Unknown initialization {!r}; expected "zeros", "random" or "he"'.format(initialization)
        )
    parameters = initializers[initialization](layers_dims)

    costs = []  # cost sampled every 1000 iterations, for the plot

    # Loop (gradient descent)
    for i in range(num_iterations):

        # Forward propagation
        al, caches = L_model_forward(X, parameters)

        # Compute cost
        cost = compute_cost(al, Y)

        # Backward propagation starts from the network OUTPUT `al`, not from the input X.
        grads = L_model_backward(al, Y, caches)

        # Update parameters
        parameters = update_parameters(parameters, grads, learning_rate)

        # Record the cost every 1000 iterations so the plot has data even when printing is off.
        if i % 1000 == 0:
            costs.append(cost)
            if print_cost:
                print("Cost after iteration {}: {}".format(i, cost))

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (per thousands)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters


In [None]:
def model_with_dropout_and_regularization(X, Y, learning_rate = 0.3, num_iterations = 30000, print_cost = True, initialization="he", lambd = 0, keep_prob = 1):
    """
    Trains a neural network with full-batch gradient descent, optionally using
    either L2 regularization (lambd != 0) or dropout (keep_prob < 1).

    The layer sizes are fixed to [input size, 20, 3, 1].

    Arguments:
    X -- input data, numpy array of shape (input size, number of examples)
    Y -- true "label" vector, numpy array of shape (output size, number of examples)
    learning_rate -- learning rate of the optimization
    num_iterations -- number of iterations of the optimization loop
    print_cost -- True to print the cost every 10000 iterations
    initialization -- type of weight initialization: "zeros", "random", or "he"
    lambd -- regularization hyperparameter, scalar (0 disables regularization)
    keep_prob -- probability of keeping a neuron active during dropout, scalar in (0, 1]
                 (1 disables dropout)

    Returns:
    parameters -- parameters learned by the model, which can be used for prediction

    Raises:
    ValueError -- if initialization is unknown or keep_prob is outside (0, 1]
    AssertionError -- if both regularization and dropout are requested
    """

    # keep_prob values above 1 would match neither forward-propagation branch and
    # leave the activations undefined, so reject invalid probabilities up front.
    if not 0 < keep_prob <= 1:
        raise ValueError("keep_prob must be in the interval (0, 1], got {!r}".format(keep_prob))

    # it is possible to use both L2 regularization and dropout,
    # but in this code block we will only explore one at a time.
    # The condition does not change between iterations, so it is checked once.
    assert (lambd == 0 or keep_prob == 1)

    costs = []  # cost sampled every 1000 iterations, used for the plot below
    layers_dims = [X.shape[0], 20, 3, 1]

    # Initialization of parameters
    if initialization == "zeros":
        parameters = initialize_parameters_zeros(layers_dims)
    elif initialization == "random":
        parameters = initialize_parameters_random(layers_dims)
    elif initialization == "he":
        parameters = initialize_parameters_he(layers_dims)
    else:
        raise ValueError(
            "initialization must be 'zeros', 'random' or 'he', got {!r}".format(initialization)
        )

    for i in range(0, num_iterations):

        # Forward propagation (keep_prob was validated, so "not 1" means "< 1")
        if keep_prob == 1:
            al, cache = L_model_forward(X, parameters)
        else:
            al, cache = forward_propagation_with_dropout(X, parameters, keep_prob)

        # Cost function
        if lambd == 0:
            cost = compute_cost(al, Y)
        else:
            cost = compute_cost_with_regularization(al, Y, parameters, lambd)

        # Backward propagation: plain, L2-regularized, or dropout
        if lambd == 0 and keep_prob == 1:
            grads = L_model_backward(X, Y, cache)
        elif lambd != 0:
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        else:
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)

        # Update parameters.
        parameters = update_parameters(parameters, grads, learning_rate)

        # Print the loss every 10000 iterations
        if print_cost and i % 10000 == 0:
            print("Cost after iteration {}: {}".format(i, cost))
        # Record the loss every 1000 iterations regardless of print_cost so the
        # plot is populated for silent runs too.
        if i % 1000 == 0:
            costs.append(cost)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (x1,000)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

In [33]:
def model_with_optimizer(X, Y, layers_dims, optimizer="adam", learning_rate = 0.0007,initialization="he", mini_batch_size = 64, beta = 0.9,
          beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8, num_epochs = 5001, print_cost = True, lambd = 0, keep_prob = 1):
    """
    Trains a neural network with mini-batch gradient descent, momentum, or Adam.

    Arguments:
    X -- input data, numpy array of shape (input size, number of examples)
    Y -- true "label" vector, numpy array of shape (output size, number of examples)
    layers_dims -- list containing the size of each layer (including input layer)
    optimizer -- optimization algorithm to be used: "gd", "momentum", or "adam"
    learning_rate -- learning rate of the optimization
    initialization -- type of weight initialization: "zeros", "random", or "he"
    mini_batch_size -- size of mini-batches for mini-batch gradient descent
    beta -- momentum hyperparameter
    beta1 -- exponential decay hyperparameter for the past gradients estimates in Adam
    beta2 -- exponential decay hyperparameter for the past squared gradients estimates in Adam
    epsilon -- hyperparameter preventing division by zero in Adam updates
    num_epochs -- number of epochs of the optimization loop
    print_cost -- True to print the cost every 1000 epochs
    lambd -- regularization hyperparameter, scalar (0 disables regularization)
    keep_prob -- probability of keeping a neuron active during dropout, scalar in (0, 1]
                 (1 disables dropout)

    Returns:
    parameters -- parameters learned by the model. They can then be used to predict.

    Raises:
    ValueError -- if optimizer or initialization is unknown, if keep_prob is outside
                  (0, 1], or if regularization and dropout are requested together
    """

    # Validate the configuration before doing any work. Previously an unknown
    # optimizer silently skipped every parameter update, and keep_prob > 1 left
    # the forward-propagation result undefined.
    if optimizer not in ("gd", "momentum", "adam"):
        raise ValueError("optimizer must be 'gd', 'momentum' or 'adam', got {!r}".format(optimizer))
    if not 0 < keep_prob <= 1:
        raise ValueError("keep_prob must be in the interval (0, 1], got {!r}".format(keep_prob))
    if lambd != 0 and keep_prob != 1:
        # The backward pass below handles either L2 regularization or dropout, not
        # both; combining them would pair a dropout forward pass with a backward
        # pass that does not take keep_prob.
        raise ValueError("use either L2 regularization (lambd) or dropout (keep_prob), not both")

    costs = []                       # to keep track of the cost
    t = 0                            # initializing the counter required for Adam update
    seed = 10                        # starting seed handed to random_mini_batches; incremented every epoch
    m = X.shape[1]                   # number of training examples

    # Initialize parameters
    if initialization == "zeros":
        parameters = initialize_parameters_zeros(layers_dims)
    elif initialization == "random":
        parameters = initialize_parameters_random(layers_dims)
    elif initialization == "he":
        parameters = initialize_parameters_he(layers_dims)
    else:
        raise ValueError(
            "initialization must be 'zeros', 'random' or 'he', got {!r}".format(initialization)
        )

    # Initialize the optimizer state (plain gradient descent needs none)
    if optimizer == "momentum":
        v = initialize_velocity(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)

    # Optimization loop
    for i in range(num_epochs):

        # Define the random minibatches. We increment the seed to reshuffle differently the dataset after each epoch
        seed = seed + 1
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
        cost_total = 0

        for minibatch in minibatches:

            # Select a minibatch
            (minibatch_X, minibatch_Y) = minibatch

            # Forward propagation on the current mini-batch only. Using the full X
            # here would produce activations whose number of columns does not
            # match minibatch_Y in the cost and backward steps.
            if keep_prob == 1:
                al, cache = L_model_forward(minibatch_X, parameters)
            else:
                al, cache = forward_propagation_with_dropout(minibatch_X, parameters, keep_prob)

            # Cost function
            if lambd == 0:
                cost_total += compute_cost(al, minibatch_Y)
            else:
                cost_total += compute_cost_with_regularization(al, minibatch_Y, parameters, lambd)

            # Backward propagation: plain, L2-regularized, or dropout
            if lambd == 0 and keep_prob == 1:
                grads = L_model_backward(minibatch_X, minibatch_Y, cache)
            elif lambd != 0:
                grads = backward_propagation_with_regularization(minibatch_X, minibatch_Y, cache, lambd)
            else:
                grads = backward_propagation_with_dropout(minibatch_X, minibatch_Y, cache, keep_prob)

            # Update parameters
            if optimizer == "gd":
                parameters = update_parameters(parameters, grads, learning_rate)
            elif optimizer == "momentum":
                parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            else:
                t = t + 1 # Adam counter
                parameters, v, s, _, _ = update_parameters_with_adam(parameters, grads, v, s,
                                                                     t, learning_rate, beta1, beta2,  epsilon)

        # NOTE(review): this divides the summed mini-batch costs by the number of
        # examples, which is a true average only if compute_cost returns a summed
        # (not already averaged) cost -- confirm against compute_cost.
        cost_avg = cost_total / m

        # Print the cost every 1000 epoch
        if print_cost and i % 1000 == 0:
            print ("Cost after epoch %i: %f" %(i, cost_avg))
        # Record the cost every 100 epochs regardless of print_cost so the plot
        # is populated for silent runs too.
        if i % 100 == 0:
            costs.append(cost_avg)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()

    return parameters

In [38]:
def model_with_learningrate_decay(X, Y, layers_dims, optimizer, learning_rate = 0.0007, initialization="he", mini_batch_size = 64, beta = 0.9,
          beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8, num_epochs = 5000, print_cost = True, decay=None, decay_rate=1,lambd = 0, keep_prob = 1):
    """
    Trains a neural network with mini-batch gradient descent, momentum, or Adam,
    optionally decaying the learning rate after every epoch.

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- true "label" vector, of shape (output size, number of examples)
    layers_dims -- python list, containing the size of each layer
    optimizer -- optimization algorithm to be used: "gd", "momentum", or "adam"
    learning_rate -- the initial learning rate, scalar.
    initialization -- type of weight initialization: "zeros", "random", or "he"
    mini_batch_size -- the size of a mini batch
    beta -- Momentum hyperparameter
    beta1 -- Exponential decay hyperparameter for the past gradients estimates
    beta2 -- Exponential decay hyperparameter for the past squared gradients estimates
    epsilon -- hyperparameter preventing division by zero in Adam updates
    num_epochs -- number of epochs
    print_cost -- True to print the cost every 1000 epochs
    decay -- learning-rate schedule: 'update_lr' or 'schedule_lr_decay' select the
             helper of the same name; any other value leaves the learning rate unchanged
    decay_rate -- decay hyperparameter forwarded to the selected schedule helper
    lambd -- regularization hyperparameter, scalar (0 disables regularization)
    keep_prob -- probability of keeping a neuron active during dropout, scalar in (0, 1]
                 (1 disables dropout)

    Returns:
    parameters -- python dictionary containing your updated parameters

    Raises:
    ValueError -- if optimizer or initialization is unknown, if keep_prob is outside
                  (0, 1], or if regularization and dropout are requested together
    """

    # Validate the configuration before doing any work. Previously an unknown
    # optimizer silently skipped every parameter update, and keep_prob > 1 left
    # the forward-propagation result undefined.
    if optimizer not in ("gd", "momentum", "adam"):
        raise ValueError("optimizer must be 'gd', 'momentum' or 'adam', got {!r}".format(optimizer))
    if not 0 < keep_prob <= 1:
        raise ValueError("keep_prob must be in the interval (0, 1], got {!r}".format(keep_prob))
    if lambd != 0 and keep_prob != 1:
        # The backward pass below handles either L2 regularization or dropout, not
        # both; combining them would pair a dropout forward pass with a backward
        # pass that does not take keep_prob.
        raise ValueError("use either L2 regularization (lambd) or dropout (keep_prob), not both")

    costs = []                       # to keep track of the cost
    t = 0                            # initializing the counter required for Adam update
    seed = 10                        # starting seed handed to random_mini_batches; incremented every epoch
    m = X.shape[1]                   # number of training examples
    learning_rate0 = learning_rate   # the original learning rate

    # Initialize parameters
    if initialization == "zeros":
        parameters = initialize_parameters_zeros(layers_dims)
    elif initialization == "random":
        parameters = initialize_parameters_random(layers_dims)
    elif initialization == "he":
        parameters = initialize_parameters_he(layers_dims)
    else:
        raise ValueError(
            "initialization must be 'zeros', 'random' or 'he', got {!r}".format(initialization)
        )

    # Initialize the optimizer state (plain gradient descent needs none)
    if optimizer == "momentum":
        v = initialize_velocity(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)

    # Optimization loop
    for i in range(num_epochs):

        # Define the random minibatches. We increment the seed to reshuffle differently the dataset after each epoch
        seed = seed + 1
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
        cost_total = 0

        for minibatch in minibatches:

            # Select a minibatch
            (minibatch_X, minibatch_Y) = minibatch

            # Forward propagation on the current mini-batch only. Using the full X
            # here would produce activations whose number of columns does not
            # match minibatch_Y in the cost and backward steps.
            if keep_prob == 1:
                al, cache = L_model_forward(minibatch_X, parameters)
            else:
                al, cache = forward_propagation_with_dropout(minibatch_X, parameters, keep_prob)

            # Cost function
            if lambd == 0:
                cost_total += compute_cost(al, minibatch_Y)
            else:
                cost_total += compute_cost_with_regularization(al, minibatch_Y, parameters, lambd)

            # Backward propagation: plain, L2-regularized, or dropout
            if lambd == 0 and keep_prob == 1:
                grads = L_model_backward(minibatch_X, minibatch_Y, cache)
            elif lambd != 0:
                grads = backward_propagation_with_regularization(minibatch_X, minibatch_Y, cache, lambd)
            else:
                grads = backward_propagation_with_dropout(minibatch_X, minibatch_Y, cache, keep_prob)

            # Update parameters
            if optimizer == "gd":
                parameters = update_parameters(parameters, grads, learning_rate)
            elif optimizer == "momentum":
                parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            else:
                t = t + 1 # Adam counter
                parameters, v, s, _, _ = update_parameters_with_adam(parameters, grads, v, s,
                                                                     t, learning_rate, beta1, beta2,  epsilon)

        # NOTE(review): this divides the summed mini-batch costs by the number of
        # examples, which is a true average only if compute_cost returns a summed
        # (not already averaged) cost -- confirm against compute_cost.
        cost_avg = cost_total / m

        # Learning-rate decay: the new rate is always derived from the original
        # rate and the epoch index, and takes effect from the next epoch.
        if decay=='update_lr':
            learning_rate = update_lr(learning_rate0, i, decay_rate)
        elif decay=='schedule_lr_decay':
            learning_rate = schedule_lr_decay(learning_rate0, i, decay_rate)

        # Print the cost every 1000 epoch
        if print_cost and i % 1000 == 0:
            print ("Cost after epoch %i: %f" %(i, cost_avg))
            if decay:
                print("learning rate after epoch %i: %f"%(i, learning_rate))
        # Record the cost every 100 epochs regardless of print_cost so the plot
        # is populated for silent runs too.
        if i % 100 == 0:
            costs.append(cost_avg)

    # plot the cost; the title shows the learning rate in effect at the end of training
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()

    return parameters