In [1]:
# These are all the equations needed to build the neural network.
# Input: X, a matrix of shape (n_x, m), where n_x is the number of features and m the number of examples.
# Output: y-hat, a matrix of shape (n_y, m), where n_y is the number of output units.
"""These are the main functions needed to build the model:
init_parameters_deep_normal(layers_dims) or init_parameters_deep_he(layers_dims)
forward_propagation_deep(X, parameters, activation1, activation_final)
compute_cost(AL, Y)
back_propagation_deep(AL, Y, caches, activation1, activation_final)
update_parameters_graddesc(parameters, grads, learning_rate)
"""
# Hyperparameters: nodes per layer, number of layers (L), iterations, learning rate, activation functions
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def sigmoid(z):
    """Apply the logistic sigmoid element-wise.

    Returns:
        (activation, cache): the sigmoid of ``z`` and ``z`` itself, which is
        kept so the backward pass can recompute the derivative.
    """
    activation = 1 / (1 + np.exp(-z))
    return activation, z
def relu(z):
    """Apply the rectified linear unit element-wise.

    Returns:
        (activation, cache): ``max(0, z)`` and ``z`` itself, which is kept
        for the backward pass.
    """
    return np.maximum(0, z), z
def tanh(z):
    """Apply the hyperbolic tangent element-wise.

    Uses ``np.tanh`` rather than the explicit exponential ratio: for large
    ``|z|`` the ratio evaluates to inf/inf and yields NaN, whereas ``np.tanh``
    saturates cleanly at +/-1. This also matches ``tanh_backward``, which
    already relies on ``np.tanh``.

    Returns:
        (activation, cache): ``tanh(z)`` and ``z`` itself, which is kept for
        the backward pass.
    """
    result = np.tanh(z)
    cache = z
    return result, cache

In [3]:
def init_parameters_deep_normal(layers_dims):
    """Initialise weights with small Gaussian noise and biases with zeros.

    Args:
        layers_dims: sequence with the number of nodes in every layer, from
            the input layer up to the output (y-hat) layer.

    Returns:
        dict with ``W{l}`` of shape (layers_dims[l], layers_dims[l-1]) drawn
        from a standard normal scaled by 0.01, and ``b{l}`` of shape
        (layers_dims[l], 1) filled with zeros, for every layer l >= 1.
    """
    parameters = {}
    fan_pairs = zip(layers_dims[:-1], layers_dims[1:])
    for layer, (n_in, n_out) in enumerate(fan_pairs, start=1):
        parameters[f"W{layer}"] = np.random.randn(n_out, n_in) * 0.01
        parameters[f"b{layer}"] = np.zeros((n_out, 1))
    return parameters

In [4]:
def init_parameters_deep_he(layers_dims):
    """He initialisation, intended to mitigate vanishing/exploding gradients.

    Works best with relu / leaky relu activations.

    Args:
        layers_dims: sequence with the number of nodes in every layer, from
            the input layer up to the output layer.

    Returns:
        dict with ``W{l}`` drawn from a standard normal scaled by
        ``sqrt(2 / layers_dims[l-1])`` and zero-filled ``b{l}`` of shape
        (layers_dims[l], 1), for every layer l >= 1.
    """
    parameters = {}
    fan_pairs = zip(layers_dims[:-1], layers_dims[1:])
    for layer, (n_in, n_out) in enumerate(fan_pairs, start=1):
        scale = np.sqrt(2 / n_in)
        parameters[f"W{layer}"] = np.random.randn(n_out, n_in) * scale
        parameters[f"b{layer}"] = np.zeros((n_out, 1))
    return parameters

In [5]:
def forward_linear(A_prev, W, b):
    """Compute the linear part of a layer: ``Z = W . A_prev + b``.

    Returns:
        (Z, cache): the pre-activation ``Z`` and the tuple
        ``(A_prev, W, b)`` needed by back propagation.
    """
    pre_activation = np.dot(W, A_prev) + b
    return pre_activation, (A_prev, W, b)

In [6]:
# Example: build parameters for a network with layer sizes 12288 -> 20 -> 7 -> 5 -> 1
# (four weight/bias pairs) using the small-random-weight initialiser.
parameters=init_parameters_deep_normal((12288,20,7,5,1))

In [7]:
def forward_activation_linear(A_prev, W, b, activation):
    """Run one full layer forward: linear step followed by an activation.

    Args:
        A_prev: activations coming from the previous layer.
        W, b: weights and bias of the current layer.
        activation: one of "relu", "sigmoid" or "tanh".

    Returns:
        (A, cache): the activations of the current layer and the tuple
        ``(linear_cache, activation_cache)``, where ``linear_cache`` is
        ``(A_prev, W, b)`` and ``activation_cache`` is the current ``Z``.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """
    Z, linear_cache = forward_linear(A_prev, W, b)
    if activation not in ("relu", "sigmoid", "tanh"):
        raise ValueError("activation function undefined")
    activate = {"relu": relu, "sigmoid": sigmoid, "tanh": tanh}[activation]
    A, activation_cache = activate(Z)
    # one row per unit of this layer, one column per example
    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    return A, (linear_cache, activation_cache)

In [8]:
def forward_propagation_deep(X, parameters, activation1, activation_final):
    """Forward pass through the whole network.

    Layers 1..L-1 use ``activation1``; layer L uses ``activation_final``.

    Args:
        X: input data fed to the first layer.
        parameters: dict holding ``W{l}`` and ``b{l}`` for every layer.
        activation1: activation name for the hidden layers.
        activation_final: activation name for the output layer.

    Returns:
        (AL, caches): the network output (y-hat) and a list with one
        ``(linear_cache, activation_cache)`` entry per layer, in layer order.
    """
    num_layers = len(parameters) // 2  # each layer contributes one W and one b
    caches = []
    activations = X
    for layer in range(1, num_layers):
        activations, layer_cache = forward_activation_linear(
            activations, parameters[f"W{layer}"], parameters[f"b{layer}"], activation1
        )
        caches.append(layer_cache)
    AL, layer_cache = forward_activation_linear(
        activations, parameters[f"W{num_layers}"], parameters[f"b{num_layers}"], activation_final
    )
    caches.append(layer_cache)
    return AL, caches

In [9]:
def forward_propagation_deep_dropout(X,parameters,activation1,activation_final,keep_prob):
    """Forward pass with inverted dropout applied to layers 1..L-1.

    Args:
        X: input data fed to the first layer.
        parameters: dict holding ``W{l}`` and ``b{l}`` for every layer.
        activation1: activation name for the hidden layers.
        activation_final: activation name for the output layer.
        keep_prob: probability of keeping a hidden unit, in (0, 1].

    Returns:
        (AL, caches, D_collection): the network output, the per-layer caches,
        and the list of dropout masks where ``D_collection[l-1]`` is the mask
        applied to the activations of hidden layer ``l``.

    Raises:
        ValueError: if ``keep_prob`` is outside (0, 1].
    """
    if not 0 < keep_prob <= 1:
        raise ValueError("keep_prob must be in the interval (0, 1]")
    L=len(parameters)//2
    A_prev=X
    caches=[]
    D_collection=[]
    for l in range(1,L):
        A, cache=forward_activation_linear(A_prev, parameters[f"W{l}"], parameters[f"b{l}"],activation1)
        # The mask must come from a UNIFORM [0, 1) draw so that each unit is
        # kept with probability keep_prob; a normal draw (randn) would keep a
        # different fraction and make the 1/keep_prob rescaling incorrect.
        D=(np.random.rand(A.shape[0],A.shape[1])<keep_prob).astype(int)
        # inverted dropout: zero the dropped units, then rescale the survivors
        # so the expected activation is unchanged
        A=A*D/keep_prob
        D_collection.append(D)
        caches.append(cache)
        A_prev=A
    AL, cache=forward_activation_linear(A_prev, parameters[f"W{L}"], parameters[f"b{L}"],activation_final)
    caches.append(cache)
    return AL, caches, D_collection

In [10]:
def compute_cost(AL,Y):
    """Cross-entropy (logistic regression) cost.

    Args:
        AL: network output (y-hat); the number of examples is ``AL.shape[1]``.
        Y: target values.

    Returns:
        The cost J, squeezed to drop size-1 dimensions.
    """
    m = AL.shape[1]
    positive_term = np.dot(Y, np.log(AL).T)
    negative_term = np.dot((1 - Y), np.log(1 - AL).T)
    return np.squeeze((positive_term + negative_term) / -m)
def cost_backward(AL,Y):
    """Derivative of the cross-entropy cost with respect to ``AL``.

    ``Y`` is first reshaped to ``AL.shape`` so both operands line up.

    Returns:
        dAL, with the same shape as ``AL``.
    """
    targets = Y.reshape(AL.shape)
    positive_term = np.divide(targets, AL)
    negative_term = np.divide((1 - targets), (1 - AL))
    return -(positive_term - negative_term)

In [11]:
def compute_cost_l2(AL, Y, lambd,parameters,layers_dims):
    """Cross-entropy cost with an L2 weight penalty.

    The penalty added to the plain cost is
    ``lambd / (2 * m) * sum over layers of sum(W ** 2)``.

    Args:
        AL: network output; the number of examples is ``AL.shape[1]``.
        Y: target values.
        lambd: L2 regularisation strength.
        parameters: dict holding ``W{l}`` for every layer.
        layers_dims: node counts per layer (its length is layers + 1).

    Returns:
        The regularised cost.
    """
    m = AL.shape[1]
    cross_entropy = compute_cost(AL, Y)
    squared_weights = sum(
        np.sum(np.square(parameters[f"W{layer}"]))
        for layer in range(1, len(layers_dims))
    )
    penalty = lambd / (2 * m) * squared_weights
    return cross_entropy + penalty

In [12]:
#sigmoid, relu, tanh backward
#calculate dcost/dz
def sigmoid_backward(dA,cache):
    """Backward step through a sigmoid activation.

    Args:
        dA: gradient of the cost with respect to the activation output.
        cache: the pre-activation ``Z`` stored during the forward pass.

    Returns:
        dZ, the gradient of the cost with respect to ``Z``.
    """
    s = 1 / (1 + np.exp(-cache))
    return s * (1 - s) * dA
def relu_backward(dA,cache):
    """Backward step through a relu activation.

    Args:
        dA: gradient of the cost with respect to the activation output.
        cache: the pre-activation ``Z`` stored during the forward pass.

    Returns:
        dZ: a copy of ``dA`` with entries zeroed wherever ``Z <= 0``.
    """
    dZ = np.array(dA, copy=True)  # copy so the caller's dA is left untouched
    inactive = cache <= 0
    dZ[inactive] = 0
    return dZ
def tanh_backward(dA,cache):
    """Backward step through a tanh activation.

    Args:
        dA: gradient of the cost with respect to the activation output.
        cache: the pre-activation ``Z`` stored during the forward pass.

    Returns:
        dZ, computed as ``(1 - tanh(Z) ** 2) * dA``.
    """
    activation = np.tanh(cache)
    local_gradient = 1 - np.power(activation, 2)
    return local_gradient * dA

In [13]:
#make dA_prev, db, dw
def back_linear(dZ,linear_cache):
    """Backward step through the linear part of a layer.

    Args:
        dZ: gradient of the cost with respect to this layer's ``Z``.
        linear_cache: ``(A_prev, W, b)`` stored during the forward pass.

    Returns:
        (dA_prev, dW, db): gradients with respect to the previous layer's
        activations, this layer's weights and this layer's bias. ``dW`` and
        ``db`` are averaged over the ``A_prev.shape[1]`` examples.
    """
    A_prev, W, _ = linear_cache
    num_examples = A_prev.shape[1]
    weight_grad = np.dot(dZ, A_prev.T) / num_examples
    bias_grad = np.sum(dZ, axis=1, keepdims=True) / num_examples
    input_grad = np.dot(W.T, dZ)
    return input_grad, weight_grad, bias_grad

In [14]:
#make dA_prev, db, dw
def back_linear_l2(dZ,linear_cache,lambd):
    """Backward step through the linear part of a layer with L2 regularisation.

    Identical to the unregularised step except that ``W * lambd / m`` is
    added to ``dW``.

    Args:
        dZ: gradient of the cost with respect to this layer's ``Z``.
        linear_cache: ``(A_prev, W, b)`` stored during the forward pass.
        lambd: L2 regularisation strength.

    Returns:
        (dA_prev, dW, db).
    """
    A_prev, W, _ = linear_cache
    num_examples = A_prev.shape[1]
    weight_decay = W * lambd / num_examples
    weight_grad = np.dot(dZ, A_prev.T) / num_examples
    weight_grad += weight_decay
    bias_grad = np.sum(dZ, axis=1, keepdims=True) / num_examples
    input_grad = np.dot(W.T, dZ)
    return input_grad, weight_grad, bias_grad

In [15]:
def back_linear_dropout(dZ, linear_cache, D,keep_prob):
    """Backward step through the linear part of a layer whose input was dropped out.

    Args:
        dZ: gradient of the cost with respect to this layer's ``Z``.
        linear_cache: ``(A_prev, W, b)`` stored during the forward pass.
        D: dropout mask that is multiplied into ``dA_prev``.
        keep_prob: keep probability; ``dA_prev`` is divided by it.

    Returns:
        (dA_prev, dW, db), where ``dA_prev`` has been masked by ``D`` and
        rescaled by ``1 / keep_prob``.
    """
    A_prev, W, _ = linear_cache
    num_examples = A_prev.shape[1]
    weight_grad = np.dot(dZ, A_prev.T) / num_examples
    bias_grad = np.sum(dZ, axis=1, keepdims=True) / num_examples
    input_grad = np.dot(W.T, dZ)
    input_grad *= D          # no gradient flows through dropped units
    input_grad /= keep_prob  # same rescaling as in the forward pass
    return input_grad, weight_grad, bias_grad

In [16]:
#make dJ/dA[L-1], dWL, dbL of activation function
def back_activation_linear(dA, cache, activation):
    """Backward step through one full layer (activation, then linear part).

    Args:
        dA: gradient of the cost with respect to this layer's activations.
        cache: ``(linear_cache, activation_cache)`` from the forward pass.
        activation: one of "sigmoid", "relu" or "tanh".

    Returns:
        (dA_prev, dW, db).

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """
    linear_cache, activation_cache = cache
    if activation not in ("sigmoid", "relu", "tanh"):
        raise ValueError("Activation function not recognized!")
    activation_backward = {
        "sigmoid": sigmoid_backward,
        "relu": relu_backward,
        "tanh": tanh_backward,
    }[activation]
    dZ = activation_backward(dA, activation_cache)
    dA_prev, dW, db = back_linear(dZ, linear_cache)
    return dA_prev, dW, db

In [17]:
def back_activation_linear_dropout(dA, cache, D, activation,keep_prob):
    """Backward step through one full layer, applying dropout to ``dA_prev``.

    Args:
        dA: gradient of the cost with respect to this layer's activations.
        cache: ``(linear_cache, activation_cache)`` from the forward pass.
        D: dropout mask forwarded to the linear backward step.
        activation: one of "sigmoid", "relu" or "tanh".
        keep_prob: keep probability forwarded to the linear backward step.

    Returns:
        (dA_prev, dW, db).

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """
    linear_cache, activation_cache = cache
    if activation not in ("sigmoid", "relu", "tanh"):
        raise ValueError("Activation function not recognized!")
    activation_backward = {
        "sigmoid": sigmoid_backward,
        "relu": relu_backward,
        "tanh": tanh_backward,
    }[activation]
    dZ = activation_backward(dA, activation_cache)
    dA_prev, dW, db = back_linear_dropout(dZ, linear_cache, D, keep_prob)
    return dA_prev, dW, db

In [18]:
#make dJ/dA[L-1], dWL, dbL of activation function
def back_activation_linear_l2(dA, cache, activation,lambd):
    """Backward step through one full layer with L2-regularised ``dW``.

    Args:
        dA: gradient of the cost with respect to this layer's activations.
        cache: ``(linear_cache, activation_cache)`` from the forward pass.
        activation: one of "sigmoid", "relu" or "tanh".
        lambd: L2 regularisation strength forwarded to the linear step.

    Returns:
        (dA_prev, dW, db).

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """
    linear_cache, activation_cache = cache
    if activation not in ("sigmoid", "relu", "tanh"):
        raise ValueError("Activation function not recognized!")
    activation_backward = {
        "sigmoid": sigmoid_backward,
        "relu": relu_backward,
        "tanh": tanh_backward,
    }[activation]
    dZ = activation_backward(dA, activation_cache)
    dA_prev, dW, db = back_linear_l2(dZ, linear_cache, lambd)
    return dA_prev, dW, db

In [19]:
def back_propagation_deep(AL, Y, caches, activation1, activation_final):
    """Backward pass through the whole network.

    Args:
        AL: network output from the forward pass.
        Y: target values; reshaped to ``AL.shape`` before use.
        caches: per-layer caches returned by the forward pass.
        activation1: activation name used by layers 1..L-1.
        activation_final: activation name used by layer L.

    Returns:
        grads: dict with ``dA{l-1}``, ``dW{l}`` and ``db{l}`` for every
        layer l in 1..L.
    """
    grads = {}
    num_layers = len(caches)
    targets = Y.reshape(AL.shape)

    # Seed the recursion with the gradient of the cost w.r.t. the output.
    dAL = cost_backward(AL, targets)

    # Output layer uses its own activation.
    dA_prev, dW, db = back_activation_linear(dAL, caches[-1], activation_final)
    grads[f"dA{num_layers - 1}"] = dA_prev
    grads[f"dW{num_layers}"] = dW
    grads[f"db{num_layers}"] = db

    # Hidden layers, walking from layer L-1 down to layer 1.
    for layer in range(num_layers - 1, 0, -1):
        dA_prev, dW, db = back_activation_linear(
            grads[f"dA{layer}"], caches[layer - 1], activation1
        )
        grads[f"dA{layer - 1}"] = dA_prev
        grads[f"dW{layer}"] = dW
        grads[f"db{layer}"] = db

    return grads

In [20]:
def back_propagation_deep_l2(AL, Y, caches, activation1,activation_final,lambd,parameters):
    """Backward pass through the whole network with L2 regularisation.

    Args:
        AL: network output from the forward pass.
        Y: target values; reshaped to ``AL.shape`` before use.
        caches: per-layer caches returned by the forward pass.
        activation1: activation name used by layers 1..L-1.
        activation_final: activation name used by layer L.
        lambd: L2 regularisation strength.
        parameters: parameter dict; only its length is used, to derive the
            number of layers.

    Returns:
        grads: dict with ``dA{l-1}``, ``dW{l}`` and ``db{l}`` for every
        layer l in 1..L.
    """
    grads = {}
    num_layers = len(parameters) // 2  # one W and one b per layer
    targets = Y.reshape(AL.shape)

    dAL = cost_backward(AL, targets)

    # Output layer uses its own activation.
    dA_prev, dW, db = back_activation_linear_l2(dAL, caches[-1], activation_final, lambd)
    grads[f"dA{num_layers - 1}"] = dA_prev
    grads[f"dW{num_layers}"] = dW
    grads[f"db{num_layers}"] = db

    # Hidden layers, walking from layer L-1 down to layer 1.
    for layer in range(num_layers - 1, 0, -1):
        dA_prev, dW, db = back_activation_linear_l2(
            grads[f"dA{layer}"], caches[layer - 1], activation1, lambd
        )
        grads[f"dA{layer - 1}"] = dA_prev
        grads[f"dW{layer}"] = dW
        grads[f"db{layer}"] = db
    return grads

In [21]:
def back_propagation_deep_dropout(AL, Y, caches, D_collection, activation1,activation_final,keep_prob):
    """Backward pass through a network trained with inverted dropout.

    Args:
        AL: network output from the forward pass.
        Y: target values; reshaped to ``AL.shape`` before use.
        caches: per-layer caches returned by the dropout forward pass.
        D_collection: dropout masks from the forward pass, where
            ``D_collection[l-1]`` is the mask applied to hidden layer ``l``.
        activation1: activation name used by layers 1..L-1.
        activation_final: activation name used by layer L.
        keep_prob: keep probability used in the forward pass.

    Returns:
        grads: dict with ``dA{l-1}``, ``dW{l}`` and ``db{l}`` for every
        layer l in 1..L.
    """
    grads = {}
    L = len(caches) # the number of layers
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL

    # Initializing the backpropagation
    dAL = cost_backward(AL,Y)

    current_cache = caches[-1]
    if L > 1:
        # dA{L-1} belongs to a dropped-out hidden layer, so it must be masked
        # AND divided by keep_prob, mirroring the forward pass. The previous
        # code applied only the mask, leaving this gradient mis-scaled.
        output_grads = back_activation_linear_dropout(dAL, current_cache, D_collection[-1], activation_final, keep_prob)
    else:
        # Single-layer network: there is no hidden layer and therefore no
        # mask (D_collection is empty), so use the plain backward step.
        output_grads = back_activation_linear(dAL, current_cache, activation_final)
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = output_grads

    # Loop from l=L-2 to l=0
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        if l!=0:
            # dA{l} is the gradient of hidden layer l, whose mask is D_collection[l-1]
            dA_prev_temp, dW_temp, db_temp = back_activation_linear_dropout(grads["dA" + str(l+1)], current_cache, D_collection[l-1], activation1,keep_prob)
        else:
            # dA0 is the gradient w.r.t. the input X, which is never dropped out
            dA_prev_temp, dW_temp, db_temp = back_activation_linear(grads["dA" + str(l + 1)], current_cache, activation1)
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

In [22]:
def update_parameters_graddesc(parameters,grads,learning_rate):
    """Plain gradient-descent step: ``theta -= learning_rate * d_theta``.

    The arrays stored in ``parameters`` are updated in place.

    Args:
        parameters: dict holding ``W{l}`` and ``b{l}`` for every layer.
        grads: dict holding the matching ``dW{l}`` and ``db{l}``.
        learning_rate: step size.

    Returns:
        The same ``parameters`` dict.
    """
    num_layers = len(parameters) // 2  # the input layer has no parameters
    for layer in range(1, num_layers + 1):
        for name in ("W", "b"):
            parameters[f"{name}{layer}"] -= learning_rate * grads[f"d{name}{layer}"]
    return parameters

In [None]:
def init_parameters_momentum(parameters):
    """Create the zero-initialised velocity dict used by momentum.

    Returns:
        v: dict with a zero array per ``W{l}`` (same first two dimensions as
        the weight matrix) and per ``b{l}`` (shape ``(rows, 1)``).
    """
    velocity = {}
    num_layers = len(parameters) // 2
    for layer in range(1, num_layers + 1):
        weights = parameters[f"W{layer}"]
        velocity[f"W{layer}"] = np.zeros((weights.shape[0], weights.shape[1]))
        biases = parameters[f"b{layer}"]
        velocity[f"b{layer}"] = np.zeros((biases.shape[0], 1))
    return velocity

In [None]:
def update_parameters_momentum(parameters, grads, learning_rate, v, beta=0.9):
    """Gradient-descent step with momentum.

    For every parameter the velocity becomes
    ``beta * v + (1 - beta) * grad`` and the parameter is then decreased by
    ``learning_rate * v``. Parameter arrays are updated in place and the
    entries of ``v`` are replaced.

    Returns:
        (parameters, v): the same two dicts that were passed in.
    """
    num_layers = len(parameters) // 2
    for layer in range(1, num_layers + 1):
        w_key, b_key = f"W{layer}", f"b{layer}"
        v[w_key] = beta * v[w_key] + (1 - beta) * grads[f"dW{layer}"]
        v[b_key] = beta * v[b_key] + (1 - beta) * grads[f"db{layer}"]
        parameters[w_key] -= learning_rate * v[w_key]
        parameters[b_key] -= learning_rate * v[b_key]
    return parameters, v

In [None]:
def init_parameters_adam(parameters):
    """Create the zero-initialised moment estimates used by Adam.

    Returns:
        (v, s): two dicts, each with a zero array per ``W{l}`` (same first
        two dimensions as the weight matrix) and per ``b{l}`` (shape
        ``(rows, 1)``).
    """
    first_moment = {}
    second_moment = {}
    num_layers = len(parameters) // 2
    for layer in range(1, num_layers + 1):
        weights = parameters[f"W{layer}"]
        weight_shape = (weights.shape[0], weights.shape[1])
        first_moment[f"W{layer}"] = np.zeros(weight_shape)
        second_moment[f"W{layer}"] = np.zeros(weight_shape)
        bias_shape = (parameters[f"b{layer}"].shape[0], 1)
        first_moment[f"b{layer}"] = np.zeros(bias_shape)
        second_moment[f"b{layer}"] = np.zeros(bias_shape)
    return first_moment, second_moment

In [None]:
def update_parameters_adam(parameters, grads, learning_rate, v, s,t, beta1=0.9, beta2=0.999,epsilon_s=1e-8):
    """Adam optimisation step.

    For every parameter:
      * ``v`` (first moment) becomes ``beta1 * v + (1 - beta1) * grad``;
      * ``s`` (second moment) becomes ``beta2 * s + (1 - beta2) * grad ** 2``;
      * both are bias-corrected by dividing by ``1 - beta ** t``;
      * the parameter decreases by
        ``learning_rate * v_corrected / (sqrt(s_corrected) + epsilon_s)``.

    Parameter arrays are updated in place; entries of ``v`` and ``s`` are
    replaced.

    Args:
        parameters, grads: parameter and gradient dicts.
        learning_rate: step size.
        v, s: moment dicts as produced by the Adam initialiser.
        t: step counter used for the bias correction.
        beta1, beta2: decay rates of the first and second moments.
        epsilon_s: small constant added to the denominator.

    Returns:
        (parameters, v, s): the same three dicts that were passed in.
    """
    num_layers = len(parameters) // 2
    for layer in range(1, num_layers + 1):
        w_key, b_key = f"W{layer}", f"b{layer}"
        dW, db = grads[f"dW{layer}"], grads[f"db{layer}"]

        # first moment and its bias-corrected version
        v[w_key] = beta1 * v[w_key] + (1 - beta1) * dW
        v[b_key] = beta1 * v[b_key] + (1 - beta1) * db
        v_hat_w = v[w_key] / (1 - np.power(beta1, t))
        v_hat_b = v[b_key] / (1 - np.power(beta1, t))

        # second moment and its bias-corrected version
        s[w_key] = beta2 * s[w_key] + (1 - beta2) * np.power(dW, 2)
        s[b_key] = beta2 * s[b_key] + (1 - beta2) * np.power(db, 2)
        s_hat_w = s[w_key] / (1 - np.power(beta2, t))
        s_hat_b = s[b_key] / (1 - np.power(beta2, t))

        parameters[w_key] -= learning_rate * v_hat_w / (np.sqrt(s_hat_w) + epsilon_s)
        parameters[b_key] -= learning_rate * v_hat_b / (np.sqrt(s_hat_b) + epsilon_s)
    return parameters, v, s

In [23]:
def dictionary_to_vector(parameters):
    """Flatten a parameter dict into a single column vector.

    The vector is laid out as W1, b1, W2, b2, ... with every array
    reshaped to a column.

    Returns:
        (vectors, parameterscount): the stacked column vector and a dict
        mapping each ``W{l}`` / ``b{l}`` key to its number of elements.
    """
    num_layers = len(parameters) // 2
    columns = []
    parameterscount = {}
    for layer in range(1, num_layers + 1):
        for key in (f"W{layer}", f"b{layer}"):
            column = np.reshape(parameters[key], (-1, 1))
            columns.append(column)
            parameterscount[key] = column.shape[0]
    vectors = np.concatenate(columns, axis=0)
    return vectors, parameterscount

In [24]:
def graddict_to_vector(grad):
    """Flatten the dW/db entries of a gradient dict into one column vector.

    The vector is laid out as dW1, db1, dW2, db2, ... with every array
    reshaped to a column. Any other entries (such as ``dA{l}``) are ignored.

    The number of layers is taken from the number of ``dW`` keys. Deriving it
    from ``len(grad) // 3`` only works when exactly one ``dA`` entry per
    layer is present, and silently drops layers otherwise.

    Args:
        grad: dict holding ``dW{l}`` and ``db{l}`` for l = 1..L.

    Returns:
        (vectors, gradcount): the stacked column vector and a dict mapping
        each ``dW{l}`` / ``db{l}`` key to its number of elements.

    Raises:
        ValueError: if ``grad`` contains no ``dW`` entries.
    """
    num_layers = sum(1 for key in grad if str(key).startswith("dW"))
    if num_layers == 0:
        raise ValueError("grad contains no 'dW' entries to flatten")
    columns = []
    gradcount = {}
    for layer in range(1, num_layers + 1):
        for key in (f"dW{layer}", f"db{layer}"):
            column = np.reshape(grad[key], (-1, 1))
            columns.append(column)
            gradcount[key] = column.shape[0]
    vectors = np.concatenate(columns, axis=0)
    return vectors, gradcount

In [34]:
def vector_to_dictionary(vectors,layers_dims,parameterscount):
    """Rebuild a parameter dict from a flattened column vector.

    The vector is read in the order W1, b1, W2, b2, ... and each slice is
    reshaped according to ``layers_dims``.

    Args:
        vectors: flattened parameters.
        layers_dims: node counts per layer (its length is layers + 1).
        parameterscount: accepted for interface compatibility; not used.

    Returns:
        dict with ``W{l}`` of shape (layers_dims[l], layers_dims[l-1]) and
        ``b{l}`` of shape (layers_dims[l], 1).
    """
    parameters = {}
    offset = 0
    for layer in range(1, len(layers_dims)):
        n_out, n_in = layers_dims[layer], layers_dims[layer - 1]
        weights_end = offset + n_in * n_out
        biases_end = weights_end + n_out
        parameters[f"W{layer}"] = vectors[offset:weights_end].reshape((n_out, n_in))
        parameters[f"b{layer}"] = vectors[weights_end:biases_end].reshape((n_out, 1))
        offset = biases_end
    return parameters

In [None]:
def gradient_checking(X,Y,layers_dims,parameters_dict,grads_dict,epsilon,activation1,activation_final):
    """Compare back-propagated gradients with a numerical approximation.

    Every parameter theta_i is nudged by +/- ``epsilon`` and the gradient is
    approximated with the centred difference
    ``(J(theta + eps) - J(theta - eps)) / (2 * eps)``. The relative difference
    ``||grad - approx|| / (||grad|| + ||approx||)`` is printed and returned.

    Fixes over the previous version:
      * the perturbation is applied to the PARAMETER vector (it was applied
        to a copy of the gradient vector);
      * the cost is evaluated at the perturbed parameters (it was always
        evaluated at the unperturbed ones);
      * theta_minus subtracts epsilon (it used to add it);
      * the relative difference is a ratio, not ``numerator - denominator``;
      * the per-parameter debug print has been removed.

    The cost is computed with the plain forward pass and unregularised cost,
    so the check is only meaningful for gradients computed without dropout
    or L2 regularisation.

    Args:
        X, Y: data and targets used to evaluate the cost.
        layers_dims: node counts per layer, used to rebuild the parameters.
        parameters_dict: parameters at which ``grads_dict`` was computed.
        grads_dict: gradients produced by back propagation.
        epsilon: size of the perturbation.
        activation1, activation_final: activation names for the forward pass.

    Returns:
        The relative difference (0.0 when both gradient norms are zero).
    """
    grads_vector,_=graddict_to_vector(grads_dict)
    parameters_vector,parameterscount=dictionary_to_vector(parameters_dict)
    parameters_members=parameters_vector.shape[0]
    gradapprox=np.zeros(grads_vector.shape)

    def cost_at(theta):
        # cost of the network whose parameters are the flattened vector theta
        AL,_=forward_propagation_deep(X, vector_to_dictionary(theta,layers_dims,parameterscount), activation1, activation_final)
        return compute_cost(AL,Y)

    for i in range(parameters_members):
        theta_plus=np.copy(parameters_vector)
        theta_plus[i][0]+=epsilon
        theta_minus=np.copy(parameters_vector)
        theta_minus[i][0]-=epsilon
        gradapprox[i]=(cost_at(theta_plus)-cost_at(theta_minus))/(2*epsilon)

    numerator=np.linalg.norm(grads_vector-gradapprox)
    denominator=(np.linalg.norm(grads_vector))+np.linalg.norm(gradapprox)
    # both norms zero means both gradients are exactly zero: they agree
    difference=numerator/denominator if denominator!=0 else 0.0
    if difference<=2e-7:
        print("Your gradient is correct! difference: "+str(difference))
    else:
        print("There might be a mistake! difference: "+str(difference))
    return difference

In [None]:
def model(X,Y,layers_dims,activation1="relu",activation_final="sigmoid",num_iteration=3000,learning_rate=0.075,printcost=False, init="he", keep_prob=1.0, lambd=0,epsilon=0,epsilon_s=1e-8,grad_desc="adam",beta1=0.9,beta2=0.999):
    """Train an L-layer neural network and plot the recorded costs.

    Args:
        X: training data, shape (features, datapoints).
        Y: targets, shape (1, datapoints).
        layers_dims: number of nodes in every layer, input layer first.
        activation1: activation for layers 1..L-1.
        activation_final: activation for layer L.
        num_iteration: number of training iterations.
        learning_rate: step size of the parameter update.
        printcost: if True, print the cost every 100 iterations.
        init: "he" for He initialisation; anything else uses the
            small-random-normal initialiser.
        keep_prob: dropout keep probability; values below 1 enable dropout
            when ``lambd`` is 0.
        lambd: L2 regularisation strength; any non-zero value selects L2
            training and takes precedence over dropout.
        epsilon: if non-zero, run gradient checking with this perturbation
            every 1000 iterations.
        epsilon_s: numerical-stability constant for Adam.
        grad_desc: "normal", "momentum" or "adam".
        beta1: momentum coefficient (also Adam's first-moment decay).
        beta2: Adam's second-moment decay.

    Returns:
        The trained parameter dict.

    Raises:
        ValueError: if ``grad_desc`` is unknown, or if ``lambd`` is 0 and
            ``keep_prob`` is outside (0, 1].
    """
    # Validate up front instead of after a full forward/backward pass.
    if grad_desc not in ("normal", "momentum", "adam"):
        raise ValueError("Error in gradient descent mode")
    use_l2 = lambd != 0
    if not use_l2 and not 0 < keep_prob <= 1:
        # previously keep_prob > 1 skipped training silently and returned
        # the untrained parameters
        raise ValueError("keep_prob must be in the interval (0, 1]")
    use_dropout = (not use_l2) and keep_prob < 1.0

    if init=="he":
        parameters=init_parameters_deep_he(layers_dims)
    else:
        # the previous code called an undefined name here (NameError)
        parameters=init_parameters_deep_normal(layers_dims)

    if grad_desc=="momentum":
        v=init_parameters_momentum(parameters)
    elif grad_desc=="adam":
        v,s=init_parameters_adam(parameters)
        t=1

    costs=[]
    for i in range(0,num_iteration):
        # forward pass, cost and backward pass for the selected regularisation
        if use_l2:
            AL,caches=forward_propagation_deep(X,parameters,activation1,activation_final)
            cost=compute_cost_l2(AL, Y, lambd,parameters,layers_dims)
            grads=back_propagation_deep_l2(AL, Y, caches, activation1,activation_final,lambd,parameters)
        elif use_dropout:
            AL,caches, D_collection=forward_propagation_deep_dropout(X,parameters,activation1,activation_final,keep_prob)
            cost=compute_cost(AL, Y)
            grads=back_propagation_deep_dropout(AL, Y, caches, D_collection, activation1,activation_final,keep_prob)
        else:
            AL,caches=forward_propagation_deep(X, parameters, activation1, activation_final)
            cost=compute_cost(AL,Y)
            grads=back_propagation_deep(AL,Y,caches,activation1,activation_final)

        # Gradient checking must run BEFORE the update: the update mutates
        # `parameters` in place, and `grads` is only valid for the parameters
        # it was computed from.
        # NOTE(review): the check uses the plain forward pass and cost, so it
        # is only expected to agree when neither L2 nor dropout is active.
        if epsilon!=0 and i%1000==0:
            gradient_checking(X,Y,layers_dims,parameters,grads,epsilon,activation1,activation_final)

        if grad_desc=="normal":
            parameters=update_parameters_graddesc(parameters,grads,learning_rate)
        elif grad_desc=="momentum":
            parameters, v= update_parameters_momentum(parameters, grads, learning_rate, v, beta1)
        else:
            parameters, v, s= update_parameters_adam(parameters, grads, learning_rate, v, s, t, beta1, beta2,epsilon_s)
            t+=1

        # Record the cost every 100 iterations regardless of printcost so the
        # plot below is never empty; only the printing is optional.
        if i%100==0:
            costs.append(cost)
            if printcost:
                print(f"Cost after {i} iteration: {cost}")

    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

In [None]:
# Quick sanity check of relu on a small vector; prints the (result, cache) tuple it returns.
trial=np.array([1.1,0,0.9])
print(relu(trial))

In [None]:
def testing(X,Y,parameters,activation1,activation_final):
    """Measure the classification accuracy of a trained network.

    The first output row is thresholded at 0.5 to obtain 0/1 predictions,
    which are compared with ``Y``.

    Args:
        X: input data to evaluate.
        Y: target values. When it holds exactly one value per example it is
            reshaped to (1, m), so a (m,) or (m, 1) array cannot silently
            broadcast into an (m, m) comparison.
        parameters: dict of previously trained parameters.
        activation1: activation function for layers 1..L-1.
        activation_final: activation function for layer L.

    Returns:
        accuracy: the fraction of predictions equal to ``Y``.
    """
    AL,_=forward_propagation_deep(X, parameters, activation1, activation_final)
    m=int(AL.shape[1])
    # vectorised threshold on the first output row, kept as shape (1, m)
    output=(AL[0:1,:]>0.5).astype(float)
    Y=np.asarray(Y)
    if Y.size==m:
        Y=Y.reshape(1,m)
    accuracy=np.sum(output==Y)/m
    return accuracy

In [None]:
def predict(parameters,activation1,activation_final):
    """Placeholder: the visible body only prints ``parameters``.

    ``activation1`` and ``activation_final`` are not used in the visible body.
    NOTE(review): no prediction is computed here — presumably unfinished; confirm.
    """
    print(parameters)