## Deep Neural Network from Scratch - One step at a time

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import collections

np.random.seed(42)

### Load the dataset

In [2]:
def load_dataset(path):
    '''
    Load the feature and label arrays from an HDF5 file.

    The two datasets are selected by position in the file's key listing
    (second key for the features, third key for the labels). Both are copied
    into in-memory NumPy arrays so that the file can be closed before the
    function returns.

    Parameters
    ----------
    path : str
        Path of the HDF5 file to read.

    Returns
    -------
    tuple
        (X_data, y_data) as NumPy arrays.
    '''

    # The context manager closes the file even if one of the lookups fails,
    # so the handle is no longer leaked.
    with h5py.File(path, 'r') as f:
        keys = list(f.keys())
        x_key = keys[1]
        y_key = keys[2]

        # Materialise the data while the file is still open: a dataset
        # handle is only readable for as long as its file is.
        X_data = np.array(f[x_key])
        y_data = np.array(f[y_key])

    return (X_data, y_data)

In [3]:
# Read the training and test splits from their HDF5 files.
X_train, y_train = load_dataset(path='train_catvnoncat.h5')
X_test, y_test = load_dataset(path='test_catvnoncat.h5')

# Show the shape of each array, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(209, 64, 64, 3)
(209,)
(50, 64, 64, 3)
(50,)


In [4]:
# Flatten every example into a single vector, then transpose so that each
# column holds one example: the result is (n_features, n_examples).
X_train = np.asarray(X_train).reshape(X_train.shape[0], -1).T
X_test = np.asarray(X_test).reshape(X_test.shape[0], -1).T

print(X_train.shape, X_test.shape, sep="\n")

(12288, 209)
(12288, 50)


In [5]:
# Store the labels as a single row, one column per example.
y_train = np.asarray(y_train).reshape(1, -1)
y_test = np.asarray(y_test).reshape(1, -1)

print(y_train.shape, y_test.shape, sep="\n")

(1, 209)
(1, 50)


In [6]:
# Scale the features by the largest value found in the training set.
# The same factor is reused for the test set so that both splits go through
# exactly the same transform (the test data must not supply its own
# statistics).
feature_scale = X_train.max()
X_train = X_train / feature_scale
X_test = X_test / feature_scale

### Helper functions

In [88]:
def sigmoid(z):
    '''
    Element-wise logistic sigmoid: 1 / (1 + exp(-z)).
    '''

    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [89]:
def relu(z):
    '''
    Rectified linear unit: element-wise maximum of 0 and z.
    '''

    floor = 0
    rectified = np.maximum(floor, z)
    return rectified

In [90]:
def leaky_relu(z):
    '''
    Leaky ReLU: element-wise maximum of 0.01 * z and z.
    '''

    negative_slope = 0.01
    leaked = negative_slope * z
    return np.maximum(leaked, z)

In [91]:
def tanh(z):
    '''
    Element-wise hyperbolic tangent.

    Delegates to np.tanh rather than evaluating
    (exp(z) - exp(-z)) / (exp(z) + exp(-z)) directly: for large |z| the
    exponentials overflow to inf and that quotient becomes inf / inf = NaN,
    whereas np.tanh saturates cleanly at -1 / 1.
    '''

    return np.tanh(z)

In [92]:
def sigmoid_back(z):
    '''
    Derivative of the sigmoid evaluated at z: sigmoid(z) * (1 - sigmoid(z)).
    '''

    s = sigmoid(z)
    return s * (1 - s)

In [93]:
def relu_back(z):
    '''
    Derivative of ReLU evaluated at z: 0 where z <= 0, 1 elsewhere.
    '''

    non_positive = z <= 0
    return np.where(non_positive, 0, 1)

In [94]:
def leaky_relu_back(z):
    '''
    Derivative of leaky ReLU evaluated at z: 0.01 where z <= 0, 1 elsewhere.
    '''

    non_positive = z <= 0
    return np.where(non_positive, 0.01, 1)

In [95]:
def tanh_back(z):
    '''
    Derivative of tanh evaluated at z: 1 - tanh(z)^2.
    '''

    t = tanh(z)
    return 1 - t ** 2

In [96]:
def forward_prop(X, n_layers, layers_dict, training=True):
    '''
    Run a forward pass through layers 1 .. n_layers - 1.

    Parameters
    ----------
    X : array of shape (n_features, m)
        Input batch, one example per column.
    n_layers : int
        Number of layers including the input layer.
    layers_dict : dict
        Maps str(layer index) to a dict holding "params" (with "W" and "b")
        and "activation" (with the name of the "forward" function).
    training : bool
        When True, each layer's A and Z are stored under its "cache" entry
        (needed by backprop). When False, layers_dict is left untouched.

    Returns
    -------
    tuple
        (layers_dict, A) where A is the activation of the last layer. If
        there is no layer beyond the input (n_layers == 1), A is X itself.
    '''

    m = X.shape[1]

    # Running activation: the input acts as the "activation" feeding layer 1.
    A = X

    for curr_layer in range(1, n_layers):

        layer = layers_dict[str(curr_layer)]
        A_prev = A

        W = layer["params"]["W"]
        b = layer["params"]["b"]

        activation = layer["activation"]["forward"]

        assert(W.shape[1] == A_prev.shape[0])
        assert(W.shape[0] == b.shape[0])

        Z = np.dot(W, A_prev) + b
        # NOTE(review): eval() turns the activation name into a callable and
        # will execute any expression it is given, so activation names must
        # never come from untrusted input.
        A = eval(activation)(Z) # Calls the layer's corresponding activation function

        assert(A.shape[1] == m)

        if training:
            # Stored for backprop, which reads both A and Z of every layer.
            layer["cache"] = {
                "A" : A,
                "Z" : Z,
            }

    return layers_dict, A

In [118]:
def compute_cost_and_dA(y, A):
    '''
    Binary cross-entropy cost and its gradient with respect to A.

    Parameters
    ----------
    y : array of shape (1, m)
        Ground-truth labels (0 or 1).
    A : array of shape (1, m)
        Predicted probabilities.

    Returns
    -------
    tuple
        (J, dA): J is the mean cost as a 0-d value, dA has the shape of A.

    A is clipped away from exactly 0 and 1 before use. Without the clip a
    saturated prediction yields log(0) or a division by zero, which turns
    the cost and the gradient into NaN / inf.
    '''

    m = y.shape[1]

    A = np.asarray(A)
    # Use the machine epsilon of A's own float type so that 1 - eps is
    # still strictly below 1 after rounding.
    float_type = A.dtype if np.issubdtype(A.dtype, np.floating) else np.float64
    eps = np.finfo(float_type).eps
    A_safe = np.clip(A, eps, 1 - eps)

    J = ( -1 / m ) * np.sum( np.multiply(y, np.log(A_safe)) + ( np.multiply((1 - y), np.log(1 - A_safe))) )
    J = np.squeeze(J)

    dA = -( np.divide(y, A_safe) ) + ( np.divide((1 - y), (1 - A_safe)))

    assert(dA.shape == A.shape)
    assert(J.shape == ())

    return J, dA

In [98]:
def backprop(X, dA, layers_dict, n_layers):
    '''
    Backward pass: walk the layers from last to first and store, for each
    layer, the gradients dW, db and the gradient passed down to the layer
    below (kept under the key "dA").

    X is the network input, dA the gradient of the cost with respect to the
    last layer's activation. Each layer entry must already hold its forward
    "cache" (A and Z). Returns the same layers_dict with every layer's
    "derivative" entry filled in.
    '''

    m = dA.shape[1]
    inv_m = 1 / m

    # Gradient w.r.t. the activation of the layer currently being processed.
    upstream = dA

    for idx in reversed(range(1, n_layers)):

        layer = layers_dict[str(idx)]

        # Activation feeding this layer: the raw input for the first layer.
        if idx == 1:
            fed_in = X
        else:
            fed_in = layers_dict[str(idx - 1)]["cache"]["A"]

        Z = layer["cache"]["Z"]
        W = layer["params"]["W"]
        g_prime = eval(layer["activation"]["backward"])

        # dZ[l] = dA[l] * g[l]'(Z[l])
        dZ = upstream * g_prime(Z)

        # dW[l] = (1/m) dZ[l] . A[l-1].T
        dW = inv_m * np.dot(dZ, fed_in.T)

        # db[l] = (1/m) * row-wise sum of dZ[l]
        db = inv_m * np.sum(dZ, axis=1, keepdims=True)

        # dA[l-1] = W[l].T . dZ[l]; consumed by the next loop iteration
        upstream = np.dot(W.T, dZ)

        layer["derivative"] = {
            "dW" : dW,
            "db" : db,
            "dA" : upstream
        }

    return layers_dict
        

In [130]:
def initialize_dictionary(n_layers, layer_dims, activations=None):
    '''
    Build the per-layer state for layers 1 .. n_layers - 1.

    Each entry, keyed by str(layer index), holds:
      - "activation": None, or the names of the forward function and of its
        derivative (forward name + "_back")
      - "cache" and "derivative": None until filled by the forward and
        backward passes
      - "params": W drawn from a standard normal scaled by 0.05, with shape
        (layer_dims[l], layer_dims[l - 1]), and b as zeros of shape
        (layer_dims[l], 1)

    When activations is given, it must have exactly n_layers entries.
    '''

    network = collections.defaultdict()

    if activations is not None:
        assert(len(activations) == n_layers)

    for idx in range(1, n_layers):

        activation = None
        if activations:
            forward_name = activations[idx]
            activation = {
                'forward' : forward_name,
                'backward' : forward_name + "_back"
            }

        n_in, n_out = layer_dims[idx - 1], layer_dims[idx]

        weights = np.random.randn(n_out, n_in) * 0.05
        biases = np.zeros((n_out, 1))

        network[str(idx)] = {
            "activation" : activation,
            "cache" : None,
            "derivative" : None,
            "params" : {"W": weights, "b": biases}
        }

    return network

In [100]:
def update_parameters(alpha, n_layers, layers_dict):
    '''
    Apply one gradient-descent step to every layer.

    For layers 1 .. n_layers - 1, W and b are replaced by W - alpha * dW and
    b - alpha * db, using the gradients stored under "derivative". New
    arrays are created; the previous parameter arrays are not modified.
    Returns the same layers_dict.
    '''

    for idx in range(1, n_layers):

        layer = layers_dict[str(idx)]

        W, b = layer["params"]["W"], layer["params"]["b"]
        dW, db = layer["derivative"]["dW"], layer["derivative"]["db"]

        assert(W.shape == dW.shape)
        assert(b.shape == db.shape)

        stepped_W = W - alpha * dW
        stepped_b = b - alpha * db

        layer["params"]["W"] = stepped_W
        layer["params"]["b"] = stepped_b

    return layers_dict

In [101]:
def train(X_train, y_train, n_layers, layer_dims, activations, epochs, alpha, verbose):
    '''
    Train the network with full-batch gradient descent.

    Initialises the layers, then for each epoch runs a forward pass, computes
    the cost, back-propagates and updates the parameters with learning rate
    alpha. When verbose is set, the loss is printed on the first epoch and
    on every 100th one.

    Returns (costs, layers_dict): the cost recorded at every epoch and the
    trained layer state.
    '''

    layers_dict = initialize_dictionary(n_layers=n_layers, layer_dims=layer_dims, activations=activations)
    costs = []

    for epoch in range(1, epochs+1):

        # Forward pass and cost for the current parameters.
        layers_dict, A = forward_prop(X=X_train, n_layers=n_layers, layers_dict=layers_dict)
        J, dA = compute_cost_and_dA(y=y_train, A=A)
        costs.append(J)

        # Backward pass followed by the gradient step.
        layers_dict = backprop(X=X_train, dA=dA, n_layers=n_layers, layers_dict=layers_dict)
        layers_dict = update_parameters(alpha=alpha, n_layers=n_layers, layers_dict=layers_dict)

        is_report_epoch = (epoch == 1 or epoch%100 == 0)
        if verbose and is_report_epoch:
            print("Epoch: {}, Loss: {}".format(epoch, J))

    return costs, layers_dict
        
        
    

In [102]:
def get_params(n_layers, layers_dict):
    '''
    Collect the "params" entry of layers 1 .. n_layers - 1 into a new dict
    keyed by str(layer index). The inner parameter dicts are the same
    objects as in layers_dict, not copies.
    '''

    layer_keys = (str(idx) for idx in range(1, n_layers))
    return {key: layers_dict[key]["params"] for key in layer_keys}
        

In [129]:
def predict(X, n_layers, layers_dict, threshold):
    '''
    Predict binary labels for the examples in X.

    Runs a forward pass in inference mode and maps every output activation
    to 0 when it is below threshold and to 1 otherwise.

    Parameters
    ----------
    X : array of shape (n_features, m)
        Input batch, one example per column.
    n_layers : int
        Number of layers including the input layer.
    layers_dict : dict
        Trained layer state.
    threshold : float
        Decision threshold applied to the output activations.

    Returns
    -------
    array of shape (1, -1)
        Integer predictions (0 or 1).
    '''

    layers_dict, A = forward_prop(X=X, n_layers=n_layers, layers_dict=layers_dict, training=False)

    # Vectorised thresholding. Unlike iterating over np.squeeze(A), this also
    # works for a single example, where the squeezed array is 0-d and cannot
    # be iterated.
    predictions = np.where(A < threshold, 0, 1).reshape(1,-1)

    return predictions


In [104]:
def run(X_train, y_train, X_test, y_test, layer_dims, activations, epochs=1000, alpha=0.01, verbose=True, threshold=0.5):
    '''
    Train a network, print its train/test accuracy and return its parameters.

    Parameters
    ----------
    X_train, X_test : arrays of shape (n_features, m)
        Inputs, one example per column.
    y_train, y_test : arrays of shape (1, m)
        Binary labels.
    layer_dims : list of int
        Size of every layer; the first entry must equal n_features.
    activations : list
        One entry per layer; the first (input layer) must be None.
    epochs, alpha, verbose
        Passed through to train().
    threshold : float
        Decision threshold used when turning activations into labels.

    Returns
    -------
    dict
        The "params" (W and b) of every layer, keyed by str(layer index).
    '''

    n_layers = len(layer_dims)
    n_activations = len(activations)
    n_features = X_train.shape[0]

    assert (layer_dims[0] == n_features), "First entry in the layer_dims should be number of training features"
    assert (activations[0] is None), "Input layer's activation should be None"
    assert (n_layers == n_activations), "Number of layers and their corresponding activations do not match"

    costs, layers_dict = train(X_train, y_train, n_layers, layer_dims, activations, epochs, alpha, verbose)

    yhat_train = predict(X_train, n_layers, layers_dict, threshold)
    yhat_test = predict(X_test, n_layers, layers_dict, threshold)

    # With 0/1 labels, the mean absolute difference is the error rate.
    train_acc = 100 - np.mean(np.abs(yhat_train - y_train)) * 100
    test_acc = 100 - np.mean(np.abs(yhat_test - y_test)) * 100

    print("Train Accuracy: {:.4f}".format(train_acc))
    print("Test Accuracy: {:.4f}".format(test_acc))

    params = get_params(n_layers, layers_dict)

    return params
    
    

In [175]:
# Network architecture: the input layer is sized from the data and has no
# activation; it is followed by two leaky-ReLU layers and a sigmoid output.
n_features = X_train.shape[0]
layer_dims = [n_features, 64, 16, 1]
activations = [None, 'leaky_relu', 'leaky_relu', 'sigmoid']

In [176]:
# Show the final layer sizes and activations, one per line.
print(layer_dims, activations, sep="\n")

[12288, 64, 16, 1]
[None, 'leaky_relu', 'leaky_relu', 'sigmoid']


In [177]:
# Train the configured network and report its train/test accuracy.
params = run(
    X_train,
    y_train,
    X_test,
    y_test,
    layer_dims,
    activations,
    epochs=1500,
    alpha=0.04,
    verbose=True,
    threshold=0.5,
)

Epoch: 1, Loss: 0.70722184617536
Epoch: 100, Loss: 0.6165365084081325
Epoch: 200, Loss: 0.5180424459363124
Epoch: 300, Loss: 0.39547744931858264
Epoch: 400, Loss: 0.3056790565160951
Epoch: 500, Loss: 0.12313121990322329
Epoch: 600, Loss: 0.08374881487605544
Epoch: 700, Loss: 0.04040256478579695
Epoch: 800, Loss: 0.011683974312896757
Epoch: 900, Loss: 0.006491258995309498
Epoch: 1000, Loss: 0.004275134375809059
Epoch: 1100, Loss: 0.0030899423148545123
Epoch: 1200, Loss: 0.002375321431583761
Epoch: 1300, Loss: 0.0019032742490780678
Epoch: 1400, Loss: 0.0015731721259739705
Epoch: 1500, Loss: 0.0013328924869407592
m:  209
m:  50
Train Accuracy: 100.0000
Test Accuracy: 80.0000
