# In [1]:
"""Function definitions used throughout this project.
   This file collects them so that new programmers can reuse them.
"""
def NL_initialize_parameters(layer_dims):
    """
    Create the initial weights and biases of a fully connected network.

    Argument:
    layer_dims -- sequence of layer sizes, input layer first and output layer last

    Returns:
    parameters -- python dictionary holding, for every layer l = 1 .. len(layer_dims)-1:

                    "W" + str(l) -- weight matrix of shape (layer_dims[l], layer_dims[l-1]),
                                    standard-normal samples divided by sqrt(layer_dims[l-1])
                    "b" + str(l) -- bias vector of shape (layer_dims[l], 1), all zeros

    Note: the global NumPy random generator is re-seeded with 1 on every call,
    so repeated calls with the same layer_dims return identical weights.
    """
    np.random.seed(1)
    parameters = {}

    for layer in range(1, len(layer_dims)):
        fan_out = layer_dims[layer]
        fan_in = layer_dims[layer - 1]
        weights = np.random.randn(fan_out, fan_in) / np.sqrt(fan_in)
        parameters['W{}'.format(layer)] = weights
        parameters['b{}'.format(layer)] = np.zeros((fan_out, 1))

    return parameters
# The Forward pass
def NL_forwardpass(X, parameters):
    """
    Implement forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation

    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- dictionary with entries "W1", "b1", ..., "WL", "bL"
                  (for example the output of NL_initialize_parameters())

    Returns:
    AL -- last post-activation value, shape (number of output units, number of examples)
    caches -- list with one (linear_cache, activation_cache) tuple per layer;
              the tuple of layer l is stored at index l-1 and contains
                linear_cache     -- {"A_prev" + str(l): layer input, "W" + str(l): W, "b" + str(l): b}
                activation_cache -- {"Z" + str(l): pre-activation value}
    """
    caches = []
    A = X
    L = len(parameters) // 2  # every layer contributes one "W" and one "b" entry

    # Hidden layers 1 .. L-1: linear step followed by ReLU.
    for l in range(1, L):
        W = parameters['W' + str(l)]
        b = parameters['b' + str(l)]
        Z = W.dot(A) + b

        linear_cache = {"A_prev" + str(l): A, "W" + str(l): W, "b" + str(l): b}
        activation_cache = {"Z" + str(l): Z}
        caches.append((linear_cache, activation_cache))

        A = np.maximum(0, Z)

    # Output layer L: linear step followed by the sigmoid.
    W = parameters['W' + str(L)]
    b = parameters['b' + str(L)]
    ZL = W.dot(A) + b
    AL = 1/(1+np.exp(-ZL))

    linear_cache = {"A_prev" + str(L): A, "W" + str(L): W, "b" + str(L): b}
    activation_cache = {"Z" + str(L): ZL}
    caches.append((linear_cache, activation_cache))

    # Sanity check: one row per output unit, one column per example.
    # (Previously hard-coded to a single output unit.)
    assert AL.shape == (W.shape[0], X.shape[1])

    return AL, caches
# Calculate the cost, i.e. the value of the error function
def compute_cost(AL, Y):
    """
    Compute the squared-error cost 0.5 * sum((AL - Y)**2).

    Arguments:
    AL -- network output, as returned by the forward pass
    Y -- target values; when Y has as many elements as AL but a different
         shape (for example a column vector or a flat vector), it is reshaped
         to AL's shape first so that the subtraction is element-wise instead
         of broadcasting into a larger matrix

    Returns:
    cost -- the summed (not averaged) half squared error over all entries

    NOTE(review): NL_backwardpass differentiates the cross-entropy loss rather
    than this squared error, so this value looks like a monitoring metric and
    not the exact objective being minimised -- confirm this is intended.
    """
    AL = np.asarray(AL)
    Y = np.asarray(Y)

    # Same alignment NL_backwardpass applies to Y; shapes that merely
    # broadcast (different sizes) are left alone to keep the old behaviour.
    if Y.shape != AL.shape and Y.size == AL.size:
        Y = Y.reshape(AL.shape)

    return 0.5*np.sum((AL-Y)**2)
# The backward pass 
def _linear_backward(dZ, linear_cache, layer):
    """Back-propagate dZ through the linear step Z = W.A_prev + b of one layer.

    Returns (dA_prev, dW, db); dW and db are averaged over the examples
    (the columns of A_prev), dA_prev is not.
    """
    suffix = str(layer)
    A_prev = linear_cache["A_prev" + suffix]
    W = linear_cache["W" + suffix]
    m = A_prev.shape[1]

    dW = 1./m * np.dot(dZ, A_prev.T)
    db = 1./m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db


def NL_backwardpass(AL, X, Y, caches):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group

    Arguments:
    AL -- output of the forward propagation (NL_forwardpass())
    X -- input data; not used by the computation, kept so existing callers keep working
    Y -- target values, reshaped to AL's shape before use
    caches -- list of (linear_cache, activation_cache) tuples produced by NL_forwardpass();
              the tuple of layer l sits at index l-1

    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ...   for l = 0 .. L-1
             grads["dW" + str(l)] = ...   for l = 1 .. L
             grads["db" + str(l)] = ...   for l = 1 .. L
    """
    grads = {}
    L = len(caches)
    Y = Y.reshape(AL.shape)

    ## L-th layer gradients (Sigmoid -> Linear).
    # With dAL = -(Y/AL - (1-Y)/(1-AL)) and the sigmoid derivative AL*(1-AL),
    # the product dAL * AL * (1-AL) simplifies to AL - Y.  Using the simplified
    # form avoids the 0/0 and x/0 divisions (NaN/inf gradients) that the
    # explicit formula produces once the sigmoid saturates to exactly 0 or 1.
    linear_cache, _ = caches[L-1]   # The index of caches is in the range of 0 to L-1.
    dZL = AL - Y
    dA_prev, dW, db = _linear_backward(dZL, linear_cache, L)
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = dA_prev, dW, db

    ## The value of l is decreased from L-1 to 1.
    for l in reversed(range(1, L)):
        ## l-th layer gradients (ReLU -> Linear).
        linear_cache, activation_cache = caches[l-1]

        # dZ (ReLU backward): the gradient passes only where the unit was active.
        Z = activation_cache["Z" + str(l)]
        dZ = np.array(grads["dA" + str(l)], copy=True)
        dZ[Z <= 0] = 0
        assert (dZ.shape == Z.shape)  # check shape

        # dA (Linear backward)
        dA_prev, dW, db = _linear_backward(dZ, linear_cache, l)
        assert (dA_prev.shape == linear_cache["A_prev" + str(l)].shape)  # check shape
        assert (dW.shape == linear_cache["W" + str(l)].shape)
        assert (db.shape == linear_cache["b" + str(l)].shape)

        ## Save grads.(the l-th layer)
        grads["dA" + str(l-1)], grads["dW" + str(l)], grads["db" + str(l)] = dA_prev, dW, db

    return grads
# Update parameters
def update_parameters(parameters, grads, learning_rate = 1.2):
    """
    Apply one gradient-descent step to every weight matrix and bias vector.

    Arguments:
    parameters -- python dictionary with entries "W1", "b1", ..., "WL", "bL"
    grads -- python dictionary with the matching gradients "dW1", "db1", ..., "dWL", "dbL"
    learning_rate -- step size applied to every gradient

    Returns:
    parameters -- the same dictionary object, whose entries have been replaced by
                  parameters[key] - learning_rate * grads["d" + key]
    """
    layer_count = len(parameters) // 2  # one "W" and one "b" entry per layer

    for layer in range(1, layer_count + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            step = learning_rate * grads["d" + key]
            parameters[key] = parameters[key] - step

    return parameters

"""initializing the parameter -> forward pass -> compute cost -> backward pass -> update parameter
    repeat the aforementioned steps until the last iteration."""
def NL_nn_model(X, Y, layers_dims, num_iterations = 5000, learning_rate=0.08, print_cost = False):
    """
    Train the network with plain gradient descent and plot the recorded cost curve.

    Arguments:
    X -- dataset of shape (layers_dims[0], number of examples)
    Y -- target values, one column per example
    layers_dims -- sizes of all layers, input layer first and output layer last
    num_iterations -- Number of iterations in gradient descent loop
    learning_rate -- step size passed to update_parameters()
    print_cost -- if True, print the cost every 500 iterations

    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.

    Side effects:
    Re-seeds the global NumPy random generator with 1 and, when at least one
    iteration ran, shows a matplotlib figure of the cost (log scale) against
    the iteration number.
    """
    record_every = 500  # sampling interval of the cost curve, in iterations
    recorded_iterations = []
    costs = []
    np.random.seed(1)

    # Initialize W, b
    parameters = NL_initialize_parameters(layers_dims)

    cost = None
    last_iteration = None
    for i in range(0, num_iterations):
        # Forward pass
        AL, caches = NL_forwardpass(X, parameters)
        # Compute cost
        cost = compute_cost(AL, Y)

        # Backward pass
        grads = NL_backwardpass(AL, X, Y, caches)

        # Update parameters
        parameters = update_parameters(parameters, grads, learning_rate)

        # Record (and optionally print) the cost every `record_every` iterations
        if i % record_every == 0:
            recorded_iterations.append(i)
            costs.append(cost)
            if print_cost:
                print("Cost after iteration {}: {}".format(i, cost))
        last_iteration = i

    if last_iteration is None:
        # No iteration ran (num_iterations <= 0): there is no cost to plot,
        # and `cost` was never computed, so return the initial parameters.
        return parameters

    # Add the cost of the last iteration unless it was already recorded above.
    if recorded_iterations[-1] != last_iteration:
        recorded_iterations.append(last_iteration)
        costs.append(cost)

    plt.figure(num=1, figsize=(8,5))
    # Plot against the real iteration numbers so the x-axis label is accurate.
    plt.semilogy(recorded_iterations, costs)
    plt.xlabel("Iterations")
    plt.ylabel("Cost")
    plt.title("Learning Rate = " + str(learning_rate))
    plt.show()

    return parameters
"""
    This function does not return AL together with the parameters: returning both at once
    yields a tuple, so the caller would have to index the result instead of using it as the parameter dictionary.
"""
# Predict
def predict(X, y, parameters):
    """
    Run the trained network on X, print its deviation from y and return its raw output.

    Arguments:
    X -- data set of examples to run through the network
    y -- reference values compared against the network output
    parameters -- parameters of the trained model

    Returns:
    probas -- output activations of the last layer for the given dataset X
              (the values are returned as-is; no thresholding is applied)
    """
    # Forward propagation
    probas, _ = NL_forwardpass(X, parameters)

    # Printed under the label "Accuracy", but the value is the element-wise
    # relative error |probas - y| / probas.
    print("Accuracy: "+str(abs(probas-y)/probas))

    return probas

def output(X,parameters):
    """
    Run a forward pass and return only the activations of the last layer.

    Arguments:
    X -- data set of examples to run through the network
    parameters -- parameters of the trained model

    Returns:
    the first element of NL_forwardpass(X, parameters); the caches are discarded
    """
    last_activations, _caches = NL_forwardpass(X, parameters)
    return last_activations