In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import math
import sklearn
import sklearn.datasets

from w2_opt_utils_v1a import load_params_and_grads, initialize_parameters, forward_propagation, backward_propagation
from w2_opt_utils_v1a import compute_cost, predict, predict_dec, plot_decision_boundary, load_dataset
from copy import deepcopy


In [None]:
def update_parameters_with_gd(parameters, grads, learning_rate):
    """
    Apply one step of plain gradient descent to every layer.

    Arguments:
    parameters -- python dictionary holding "W1", "b1", ..., "WL", "bL"
    grads -- python dictionary holding "dW1", "db1", ..., "dWL", "dbL"
    learning_rate -- step size multiplied into each gradient

    Returns:
    parameters -- the same dictionary object, with each entry replaced by
                  its updated value
    """
    # Each layer contributes one weight and one bias entry.
    num_layers = len(parameters) // 2

    for idx in range(1, num_layers + 1):
        for name in ("W", "b"):
            key = f"{name}{idx}"
            # Rebind the entry rather than updating the array in place, so
            # the caller's original array objects are left untouched.
            parameters[key] = parameters[key] - learning_rate * grads["d" + key]

    return parameters

def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):
    """
    Shuffle the examples (columns) of X and Y together and split them into
    mini-batches.

    Arguments:
    X -- input data, indexed as (features, examples); the number of examples
         is taken from X.shape[1]
    Y -- labels whose columns line up with X's columns; after shuffling it is
         reshaped to (1, number of examples)
    mini_batch_size -- number of examples per full mini-batch
    seed -- value passed to np.random.seed so the shuffle is reproducible
            (note: this reseeds NumPy's global random state)

    Returns:
    mini_batches -- list of (mini_batch_X, mini_batch_Y) tuples. Every batch
                    has mini_batch_size columns except possibly the last one,
                    which holds the remaining m % mini_batch_size examples.
    """
    np.random.seed(seed)
    m = X.shape[1]

    # Apply the same column permutation to X and Y so pairs stay aligned.
    permutation = np.random.permutation(m)
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))

    num_complete_minibatches = m // mini_batch_size
    mini_batches = []

    for k in range(num_complete_minibatches):
        start = k * mini_batch_size
        end = start + mini_batch_size
        mini_batches.append((shuffled_X[:, start:end], shuffled_Y[:, start:end]))

    if m % mini_batch_size != 0:
        # Derive the tail offset from the batch count, not from the loop
        # variable: when m < mini_batch_size the loop body never runs and the
        # loop variable would be undefined.
        tail_start = num_complete_minibatches * mini_batch_size
        mini_batches.append((shuffled_X[:, tail_start:], shuffled_Y[:, tail_start:]))

    return mini_batches

def initialize_velocity(parameters):
    """
    Build the zero-initialised velocity dictionary used by momentum.

    Arguments:
    parameters -- python dictionary holding "W1", "b1", ..., "WL", "bL";
                  the first two dimensions of each array give the shape of
                  the matching velocity entry

    Returns:
    v -- python dictionary with keys "dW1", "db1", ..., "dWL", "dbL", each a
         zero array shaped like the corresponding parameter
    """
    num_layers = len(parameters) // 2
    v = {}

    for idx in range(1, num_layers + 1):
        for name in ("W", "b"):
            param = parameters[f"{name}{idx}"]
            rows, cols = param.shape[0], param.shape[1]
            v[f"d{name}{idx}"] = np.zeros((rows, cols))

    return v

def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    """
    Apply one step of gradient descent with momentum to every layer.

    Arguments:
    parameters -- python dictionary holding "W1", "b1", ..., "WL", "bL"
    grads -- python dictionary holding "dW1", "db1", ..., "dWL", "dbL"
    v -- python dictionary of current velocities, keyed like grads
    beta -- weight given to the previous velocity; (1 - beta) weights the
            new gradient
    learning_rate -- step size multiplied into each velocity

    Returns:
    parameters -- the same dictionary object with updated entries
    v -- the same dictionary object with updated velocities
    """
    layer_count = len(parameters) // 2

    for idx in range(1, layer_count + 1):
        grad_keys = (f"dW{idx}", f"db{idx}")

        # First refresh both velocities for this layer ...
        for g_key in grad_keys:
            v[g_key] = beta * v[g_key] + (1-beta) * grads[g_key]

        # ... then step the matching parameters ("dW3" -> "W3", "db3" -> "b3").
        for g_key in grad_keys:
            p_key = g_key[1:]
            parameters[p_key] = parameters[p_key] - learning_rate * v[g_key]

    return parameters, v

def initialize_adam(parameters) :
    """
    Build the zero-initialised moment dictionaries used by Adam.

    Arguments:
    parameters -- python dictionary holding "W1", "b1", ..., "WL", "bL";
                  the first two dimensions of each array give the shape of
                  the matching moment entries

    Returns:
    v -- python dictionary with keys "dW1", "db1", ..., "dWL", "dbL", all zeros
    s -- python dictionary with the same keys and shapes, all zeros
         (separate arrays from those in v)
    """
    num_layers = len(parameters) // 2
    v, s = {}, {}

    for idx in range(1, num_layers + 1):
        for name in ("W", "b"):
            param = parameters[f"{name}{idx}"]
            dims = (param.shape[0], param.shape[1])
            moment_key = f"d{name}{idx}"
            # Allocate independently so v and s never share storage.
            v[moment_key] = np.zeros(dims)
            s[moment_key] = np.zeros(dims)

    return v, s


# GRADED FUNCTION: update_parameters_with_adam

def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate = 0.01,
                                beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8):
    """
    Update parameters using Adam

    Arguments:
    parameters -- python dictionary containing your parameters:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    t -- Adam variable, counts the number of taken steps; the bias-correction
         denominators 1 - beta**t are zero when t == 0, so pass t >= 1
    learning_rate -- the learning rate, scalar.
    beta1 -- Exponential decay hyperparameter for the first moment estimates
    beta2 -- Exponential decay hyperparameter for the second moment estimates
    epsilon -- hyperparameter preventing division by zero in Adam updates;
               added to sqrt(s_corrected), outside the square root

    Returns:
    parameters -- python dictionary containing your updated parameters
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    v_corrected -- bias-corrected first moment estimates, python dictionary
    s_corrected -- bias-corrected second moment estimates, python dictionary
    """

    L = len(parameters) // 2                 # number of layers in the neural networks
    v_corrected = {}                         # bias-corrected first moment estimates
    s_corrected = {}                         # bias-corrected second moment estimates

    # The bias-correction denominators depend only on t, not on the layer.
    first_moment_correction = 1 - np.power(beta1, t)
    second_moment_correction = 1 - np.power(beta2, t)

    # Perform Adam update on all parameters
    for l in range(1, L + 1):
        for param_key, grad_key in ((f"W{l}", f"dW{l}"), (f"b{l}", f"db{l}")):
            gradient = grads[grad_key]

            # Moving average of the gradients.
            v[grad_key] = beta1 * v[grad_key] + (1 - beta1) * gradient
            # Moving average of the squared gradients.
            s[grad_key] = beta2 * s[grad_key] + (1 - beta2) * gradient * gradient

            # Bias-corrected moment estimates.
            v_corrected[grad_key] = v[grad_key] / first_moment_correction
            s_corrected[grad_key] = s[grad_key] / second_moment_correction

            # Adam step: epsilon is added to the square root, not placed
            # inside it, so it acts as a floor on the denominator itself.
            parameters[param_key] = parameters[param_key] - learning_rate * (
                v_corrected[grad_key] / (np.sqrt(s_corrected[grad_key]) + epsilon)
            )

    return parameters, v, s, v_corrected, s_corrected