# In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import scipy.io
import sklearn
import sklearn.datasets

def load_params_and_grads(seed=1):
    """Build a reproducible set of random parameters and gradients.

    Args:
        seed: Value handed to ``np.random.seed`` before any sampling.

    Returns:
        Tuple ``(W1, b1, W2, b2, dW1, db1, dW2, db2)`` of standard-normal
        arrays. ``W1``/``dW1`` are (2, 3), ``b1``/``db1`` are (2, 1),
        ``W2``/``dW2`` are (3, 3) and ``b2``/``db2`` are (3, 1).
    """
    np.random.seed(seed)
    W1 = np.random.randn(2, 3)
    b1 = np.random.randn(2, 1)
    W2 = np.random.randn(3, 3)
    # Bias of the second layer: one entry per row of W2.
    b2 = np.random.randn(3, 1)

    dW1 = np.random.randn(2, 3)
    db1 = np.random.randn(2, 1)
    dW2 = np.random.randn(3, 3)
    db2 = np.random.randn(3, 1)
    return W1, b1, W2, b2, dW1, db1, dW2, db2

def relu(X):
    """Rectified linear unit: element-wise maximum of 0 and ``X``."""
    floor = 0
    return np.maximum(floor, X)

def sigmoid(X):
    """Logistic function ``1 / (1 + exp(-X))``, applied element-wise."""
    denominator = 1 + np.exp(-X)
    return 1. / denominator

def init_params(layer_dims):
    """Create randomly initialised weights and biases for every layer.

    The NumPy global RNG is re-seeded with 3 on every call, so the result
    depends only on ``layer_dims``.

    Args:
        layer_dims: Sequence of layer sizes, input layer first.

    Returns:
        Dict with ``"W<l>"`` of shape (layer_dims[l], layer_dims[l-1]),
        drawn from a standard normal and scaled by sqrt(2 / layer_dims[l-1]),
        and ``"b<l>"`` of shape (layer_dims[l], 1), drawn from a standard
        normal, for l = 1 .. len(layer_dims) - 1.
    """
    np.random.seed(3)
    params = {}
    consecutive_sizes = zip(layer_dims[:-1], layer_dims[1:])
    for idx, (fan_in, fan_out) in enumerate(consecutive_sizes, start=1):
        # Draw the weights before the bias so the RNG stream is consumed
        # layer by layer in W, b order.
        scale = np.sqrt(2 / fan_in)
        params[f"W{idx}"] = np.random.randn(fan_out, fan_in) * scale
        params[f"b{idx}"] = np.random.randn(fan_out, 1)
    return params

def forward_propagation(X, params):
    """Run the forward pass: linear -> ReLU -> linear -> ReLU -> linear -> sigmoid.

    Args:
        X: Input array; it is left-multiplied by ``params["W1"]``.
        params: Dict holding "W1", "b1", "W2", "b2", "W3" and "b3".

    Returns:
        Tuple ``(a3, cache)`` where ``a3`` is the sigmoid output of the last
        layer and ``cache`` is the tuple
        ``(z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)``.
    """
    W1, b1, W2, b2, W3, b3 = (
        params[key] for key in ("W1", "b1", "W2", "b2", "W3", "b3")
    )

    # First hidden layer.
    z1 = np.dot(W1, X) + b1
    a1 = relu(z1)

    # Second hidden layer.
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)

    # Output layer.
    z3 = np.dot(W3, a2) + b3
    a3 = sigmoid(z3)

    return a3, (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)

def backward(X, Y, cache):
    """Back-propagate the mean cross-entropy cost through the 3-layer network.

    The 1/m averaging factor is applied exactly once, in ``dz3``; every
    later quantity inherits it through the chain rule, so no further
    division by ``m`` takes place.

    Args:
        X: Input array; the number of examples ``m`` is read from
            ``X.shape[1]``.
        Y: Labels, broadcast-compatible with the network output ``a3``.
        cache: Tuple ``(z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)``
            in the layout produced by ``forward_propagation``.

    Returns:
        Dict with the gradients "dz3", "dW3", "db3", "da2", "dz2", "dW2",
        "db2", "da1", "dz1", "dW1" and "db1".
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache

    # Sigmoid output + cross-entropy: dJ/dz3 = (a3 - Y) / m.
    dz3 = 1. / m * (a3 - Y)
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims=True)

    # ReLU derivative is 1 where the activation is positive, 0 elsewhere.
    da2 = np.dot(W3.T, dz3)
    dz2 = np.multiply(da2, np.int64(a2 > 0))
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)

    da1 = np.dot(W2.T, dz2)
    dz1 = np.multiply(da1, np.int64(a1 > 0))
    dW1 = np.dot(dz1, X.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                 "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                 "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}
    return gradients

def compute_cost(a3, Y):
    """Return the binary cross-entropy between ``a3`` and ``Y``, averaged over examples.

    Args:
        a3: Predicted probabilities.
        Y: Target labels; the number of examples is read from ``Y.shape[1]``.

    Returns:
        Sum of the per-element losses divided by ``Y.shape[1]``.
    """
    num_examples = Y.shape[1]
    positive_term = -np.log(a3) * Y
    negative_term = -np.log(1 - a3) * (1 - Y)
    total = np.sum(positive_term + negative_term)
    return 1. / num_examples * total

def predict(X, y, params):
    """Predict hard 0/1 labels for every example in ``X``.

    Args:
        X: Input array passed to ``forward_propagation``.
        y: Unused; kept so existing call sites keep working.
        params: Parameter dict passed to ``forward_propagation``.

    Returns:
        Integer array of shape (1, X.shape[1]) holding 1 where the first row
        of the network output is strictly greater than 0.5, otherwise 0.
    """
    a3, _ = forward_propagation(X, params)
    # Builtin ``int`` replaces the ``np.int`` alias, which recent NumPy
    # releases no longer provide. Only row 0 of the output is thresholded.
    return (a3[0:1, :] > 0.5).astype(int)

def predict_dec(params, X):
    """Return a boolean array: True where the network output exceeds 0.5.

    Args:
        params: Parameter dict passed to ``forward_propagation``.
        X: Input array passed to ``forward_propagation``.
    """
    a3, _ = forward_propagation(X, params)
    return a3 > 0.5

def plot_decision_boundary(model, X, y):
    """Plot the decision regions of ``model`` together with the data points.

    Args:
        model: Callable taking an (n_points, 2) array of grid coordinates
            and returning one prediction per point.
        X: Array whose row 0 holds the x1 coordinates and row 1 the x2
            coordinates of the data points.
        y: Labels used to colour the scatter plot; flattened before use.
    """
    x_min, x_max = X[0, :].min(), X[0, :].max()
    y_min, y_max = X[1, :].min(), X[1, :].max()

    # Grid resolution: distance between neighbouring evaluation points.
    h = 0.01
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Evaluate the model on every grid point, then restore the grid shape.
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    # Flatten the labels so ``c`` has exactly one entry per plotted point.
    plt.scatter(X[0, :], X[1, :], c=np.ravel(y), cmap=plt.cm.Spectral)
    plt.show()
    
def load_dataset():
    """Generate, plot and return the two-moons training set.

    Seeds NumPy's global RNG with 3, draws 300 noisy samples with
    ``sklearn.datasets.make_moons`` and shows them in a scatter plot.

    Returns:
        Tuple ``(train_X, train_Y)``: the transposed sample array and the
        labels reshaped to a single row of shape (1, n_samples).
    """
    np.random.seed(3)
    points, labels = sklearn.datasets.make_moons(n_samples=300, noise=.2)

    plt.scatter(points[:, 0], points[:, 1], c=labels, s=40, cmap=plt.cm.Spectral)
    plt.show()

    n_samples = labels.shape[0]
    return points.T, labels.reshape((1, n_samples))

def update_params_with_gd(params, grads, learning_rate):
    """Apply one plain gradient-descent step to every layer.

    Args:
        params: Dict with "W<l>" and "b<l>" for l = 1 .. L; its entries are
            replaced in place.
        grads: Dict with the matching gradients "dW<l>" and "db<l>".
        learning_rate: Step size.

    Returns:
        The same ``params`` dict, with each entry replaced by
        ``param - learning_rate * grad``.
    """
    num_layers = len(params) // 2
    for l in range(1, num_layers + 1):
        for name in ("W", "b"):
            key = name + str(l)
            # Each parameter is updated from its own current value and the
            # gradient stored under the "d"-prefixed key.
            params[key] = params[key] - learning_rate * grads["d" + key]
    return params

def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    """Shuffle the examples and split them into mini-batches.

    Args:
        X: Input array with one example per column.
        Y: Label array with one example per column, aligned with ``X``.
        mini_batch_size: Number of examples per batch; must be at least 1.
        seed: Value handed to ``np.random.seed`` so the shuffle is
            reproducible.

    Returns:
        List of ``(mini_batch_X, mini_batch_Y)`` tuples. Every batch holds
        ``mini_batch_size`` columns except possibly the last one, which
        holds the remainder when the number of examples is not a multiple
        of ``mini_batch_size``.

    Raises:
        ValueError: If ``mini_batch_size`` is smaller than 1.
    """
    if mini_batch_size < 1:
        raise ValueError("mini_batch_size must be at least 1")

    np.random.seed(seed)
    m = X.shape[1]

    # The same permutation is applied to X and Y so columns stay paired.
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]

    # Stepping through the columns yields all full batches followed by the
    # shorter remainder batch, because slicing clips at the array end.
    mini_batches = []
    for start in range(0, m, mini_batch_size):
        stop = start + mini_batch_size
        mini_batches.append((shuffled_X[:, start:stop],
                             shuffled_Y[:, start:stop]))
    return mini_batches

# momentum
def init_velocity(params):
    """Create zero-filled momentum buffers matching every parameter.

    Args:
        params: Dict with "W<l>" and "b<l>" for l = 1 .. L.

    Returns:
        Dict with "dW<l>" and "db<l>" zero arrays shaped like the
        corresponding parameter.
    """
    num_layers = len(params) // 2
    v = {}
    for idx in range(1, num_layers + 1):
        for name in ("W", "b"):
            key = name + str(idx)
            v["d" + key] = np.zeros(params[key].shape)
    return v

def update_params_with_momentum(params, grads, v, beta, learning_rate):
    """Apply one gradient-descent-with-momentum step to every layer.

    Args:
        params: Dict with "W<l>" and "b<l>"; entries are replaced in place.
        grads: Dict with the gradients "dW<l>" and "db<l>".
        v: Dict with the velocities "dW<l>" and "db<l>"; entries are
            replaced in place.
        beta: Weight given to the previous velocity.
        learning_rate: Step size.

    Returns:
        Tuple ``(params, v)`` with the updated dicts.
    """
    num_layers = len(params) // 2
    for idx in range(1, num_layers + 1):
        for name in ("W", "b"):
            param_key = name + str(idx)
            grad_key = "d" + param_key
            # Exponentially weighted average of past gradients.
            velocity = beta * v[grad_key] + (1 - beta) * grads[grad_key]
            v[grad_key] = velocity
            params[param_key] = params[param_key] - learning_rate * velocity
    return params, v

def init_adam(params):
    """Create zero-filled first- and second-moment buffers for Adam.

    Args:
        params: Dict with "W<l>" and "b<l>" for l = 1 .. L.

    Returns:
        Tuple ``(v, s)`` of dicts, each with "dW<l>" and "db<l>" zero arrays
        shaped like the corresponding parameter.
    """
    num_layers = len(params) // 2
    v, s = {}, {}
    for idx in range(1, num_layers + 1):
        for name in ("W", "b"):
            param_key = name + str(idx)
            grad_key = "d" + param_key
            shape = params[param_key].shape
            # Separate allocations: v and s must never share storage.
            v[grad_key] = np.zeros(shape)
            s[grad_key] = np.zeros(shape)
    return v, s

def update_params_with_adam(params, grads, v, s, t, learning_rate=0.01,
                            beta1=0.9, beta2=0.999, epsilon=1e-8):
    """Apply one Adam step to every layer.

    Args:
        params: Dict with "W<l>" and "b<l>"; entries are replaced in place.
        grads: Dict with the gradients "dW<l>" and "db<l>".
        v: First-moment estimates keyed like ``grads``; replaced in place.
        s: Second-moment estimates keyed like ``grads``; replaced in place.
        t: Number of Adam steps taken so far, counting this one. It must be
            at least 1: with ``t == 0`` both bias-correction denominators
            are zero.
        learning_rate: Step size.
        beta1: Decay rate of the first-moment estimate.
        beta2: Decay rate of the second-moment estimate.
        epsilon: Small constant added to the denominator for numerical
            stability.

    Returns:
        Tuple ``(params, v, s)`` with the updated dicts.
    """
    num_layers = len(params) // 2

    # The bias-correction factors depend only on t, so compute them once.
    first_moment_fix = 1 - beta1 ** t
    second_moment_fix = 1 - beta2 ** t

    for l in range(1, num_layers + 1):
        for name in ("W", "b"):
            param_key = name + str(l)
            grad_key = "d" + param_key
            grad = grads[grad_key]

            # Moving averages of the gradient and of its square.
            v[grad_key] = beta1 * v[grad_key] + (1 - beta1) * grad
            s[grad_key] = beta2 * s[grad_key] + (1 - beta2) * grad ** 2

            # Compensate for the zero initialisation of v and s.
            v_corrected = v[grad_key] / first_moment_fix
            s_corrected = s[grad_key] / second_moment_fix

            # Epsilon is added after the square root, as in the Adam
            # update rule: step = v_hat / (sqrt(s_hat) + epsilon).
            step = v_corrected / (np.sqrt(s_corrected) + epsilon)
            params[param_key] = params[param_key] - learning_rate * step
    return params, v, s

def model(X, Y, layers_dims, optimizer, learning_rate=0.0007,
          mini_batch_size=64, beta=0.9, beta1=0.9, beta2=0.999,
          epsilon=1e-8, num_epochs=10000, print_cost=True):
    """Train the 3-layer network with mini-batches and the chosen optimizer.

    Args:
        X: Training inputs, one example per column.
        Y: Training labels, one example per column.
        layers_dims: Layer sizes, input layer first. ``forward_propagation``
            reads W1..W3, so four entries are expected.
        optimizer: One of 'gd', 'momentum' or 'adam'.
        learning_rate: Step size for the parameter updates.
        mini_batch_size: Number of examples per mini-batch.
        beta: Momentum coefficient (used by 'momentum' only).
        beta1: First-moment decay rate (used by 'adam' only).
        beta2: Second-moment decay rate (used by 'adam' only).
        epsilon: Numerical-stability constant (used by 'adam' only).
        num_epochs: Number of passes over the training set.
        print_cost: When true, print the cost every 1000 epochs and record
            it every 100 epochs for the final plot.

    Returns:
        The trained parameter dict.

    Raises:
        ValueError: If ``optimizer`` is not one of the supported names.
    """
    if optimizer not in ('gd', 'momentum', 'adam'):
        raise ValueError(
            "optimizer must be 'gd', 'momentum' or 'adam', got %r" % (optimizer,))

    costs = []
    t = 0      # Adam step counter, used for bias correction.
    seed = 10  # Bumped every epoch so each epoch reshuffles differently.

    params = init_params(layers_dims)
    if optimizer == 'momentum':
        v = init_velocity(params)
    elif optimizer == 'adam':
        v, s = init_adam(params)

    for i in range(num_epochs):
        seed += 1
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)

        for minibatch_X, minibatch_Y in minibatches:
            a3, caches = forward_propagation(minibatch_X, params)
            cost = compute_cost(a3, minibatch_Y)
            grads = backward(minibatch_X, minibatch_Y, caches)

            if optimizer == 'gd':
                params = update_params_with_gd(params, grads, learning_rate)
            elif optimizer == 'momentum':
                params, v = update_params_with_momentum(
                    params, grads, v, beta, learning_rate)
            else:
                t += 1
                params, v, s = update_params_with_adam(
                    params, grads, v, s, t, learning_rate, beta1, beta2, epsilon)

        # ``cost`` here is the cost of the last mini-batch of the epoch.
        if print_cost and i % 1000 == 0:
            print("cost after epoch %i: %f" % (i, cost))
        if print_cost and i % 100 == 0:
            costs.append(cost)

    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel("epochs(per 100)")
    plt.title("learning rate= " + str(learning_rate))
    plt.show()
    return params

if __name__ == '__main__':
    # Script entry point: train on the two-moons data with Adam and plot
    # the resulting decision boundary.
    train_X, train_Y = load_dataset()
    layers_dims = [train_X.shape[0], 5, 2, 1]

    # Alternative optimizers (swap in for the Adam call below):
    # gradient descent
    # params = model(train_X, train_Y, layers_dims, optimizer='gd')
    # plt.title('Model with gradient descent optimization')

    # momentum
    # params = model(train_X, train_Y, layers_dims, beta=0.9, optimizer='momentum')
    # plt.title('Model with Momentum optimization')

    # adam
    params = model(train_X, train_Y, layers_dims, optimizer='adam')
    plt.title('Model with Adam optimization')

    predictions = predict(train_X, train_Y, params)

    # Fix the viewport before drawing the decision regions.
    axes = plt.gca()
    axes.set_xlim([-1.5, 2.5])
    axes.set_ylim([-1, 1.5])

    def decision_fn(points):
        # The plotting helper passes (n_points, 2); the network expects
        # one example per column, hence the transpose.
        return predict_dec(params, points.T)

    plot_decision_boundary(decision_fn, train_X, train_Y)