In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [36]:
charset = {
                'digit_0' : 0,
                'digit_1' : 1,
                'digit_2' : 2,
                'digit_3' : 3,
                'digit_4' : 4,
                'digit_5' : 5,
                'digit_6' : 6,
                'digit_7' : 7,
                'digit_8' : 8,
                'digit_9' : 9,
}
train_data = pd.read_csv('./dataset/train_digits_data.csv')
test_data  = pd.read_csv('./dataset/test_digits_data.csv')
X_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1]
y_train = y_train.replace(charset)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size = .01)

X_test = test_data.iloc[:, :-1].values
y_test = test_data.iloc[:, -1]

In [None]:
mm = MinMaxScaler()
X_train = mm.fit_transform(X_train)
X_dev   = mm.fit_transform(X_dev)

In [37]:
print("Data\t\t\t","Before Processing\t","After Processing")
print("=================================================================")
print("Training Set Images:\t" + str(train_data.shape)+"\t\t"+ str(X_train.shape))
print("Training Set Labels:\t" + str(train_data.iloc[:, -1].shape)+"\t\t"+ str(y_train.shape))
print("Dev Set Images:\t\t" + str(X_dev.shape)+"\t\t"+ str(X_dev.shape))
print("Dev Set Labels:\t\t" + str(y_dev.shape)+"\t\t\t"+ str(y_dev.shape))
print("Test Set Images:\t" + str(test_data.shape)+"\t\t"+ str(X_test.shape))
print("Test Set Labels:\t" + str(test_data.iloc[:, -1].shape)+"\t\t\t"+ str(test_data.iloc[:, -1].shape))
print("=================================================================")

Data			 Before Processing	 After Processing
Training Set Images:	(17000, 1025)		(16830, 1024)
Training Set Labels:	(17000,)		(16830,)
Dev Set Images:		(170, 1024)		(170, 1024)
Dev Set Labels:		(170,)			(170,)
Test Set Images:	(3000, 1025)		(3000, 1024)
Test Set Labels:	(3000,)			(3000,)


# Utilities

In [38]:
def relu(Z):
    """
        Compute the ReLU activation of Z
        
        Argument:
            - Z -- Array of the Sum of the product of Weights and input
        
        Returns:
            - A -- Array of Activation obtained by applying ReLU function. same size as that of Z
    """
    A = np.maximum(0.0,Z)
    
    cache = Z
    assert(A.shape == Z.shape)
    return A, cache

In [39]:
def relu_grad(dA, cache):
    """
        Compute the gradient of dA
        
        Arguments:
            - dA -- Array of the gradient of activation of the previous layer
            - cache -- list of other useful variables like Z
            
        Returns:
            - dZ -- array of gradient/derivative of the dA, Same size of dA
    """
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.
    
    dZ[Z < 0] = 0
    
    assert(dZ.shape == Z.shape)
    return dZ

In [40]:
def softmax(Z):
    """
        Compute the softmax activtion of Z
        
        Argument:
            - Z -- Array of the Sum of the product of Weights and input
        
        Returns:
            - A -- Array of Activation obtained by applying Softmax function. same size as that of Z
    """
    shift = Z - np.max(Z) #Avoiding underflow or overflow errors due to floating point instability in softmax
    t = np.exp(shift)
    A = np.divide(t,np.sum(t,axis = 0))
    
    cache = Z
    assert(A.shape == Z.shape)
    return A, cache

In [41]:
def init_layers():
    """
        Initializes the layers of NN with certain neural units in each layers
        
        Returns:
            - layer_dim -- list of the units of each layer of the network
    """
    layers_dim = [1024,32,32,10]
    return layers_dim

In [42]:
def init_params(layers_dim):
    """
        Initializes the parameters of the Neural Network
        
        Argument:
            - layers_dim -- Layer Dimensions of the NN
        
        Returns:
            - params -- Dictionary of the initialized parameters: Weights and biases
    """
    L = len(layers_dim)
    params = {}
        
    for l in range(1,L):
        params['W' + str(l)] = np.random.randn(layers_dim[l],layers_dim[l-1]) *0.01
        params['b' + str(l)] = np.zeros((layers_dim[l],1))
     
        assert(params['W' + str(l)].shape == (layers_dim[l],layers_dim[l-1]))
        assert(params['b' + str(l)].shape == (layers_dim[l],1))
    return params

In [43]:
def init_hyperParams(alpha = 0.01, num_iteration = 5000):
    """
        Initializes the hyper parameters
        
        Arguments:
            - alpha -- learning rate
            - num_iteration -- number of iteration the gradient descent will run
        Returns:
            - Dictionary of hyper parameters
        
    """
    hyperParams = {}
    hyperParams['learning_rate'] = alpha
    hyperParams['num_iterations'] = num_iteration
    
    
    return hyperParams

# Forward Pass

In [44]:
def forward_sum(A,W,b):
    """
        Calculates the forward sum 
        
        Arguments:
            - A -- array of activation from the previous layer
            - W -- weights of the current layer
            - b -- bias of the current layer
    """
    
    Z = np.dot(W,A) + b
    
    cache = (A,W,b)
    assert(Z.shape == (W.shape[0],Z.shape[1]))
    
    return Z, cache

In [45]:
def forward_activation(A,W,b,activation):
    
    if activation == 'relu':
        Z, sum_cache = forward_sum(A,W,b)
        A, activation_cache = relu(Z)
        
    if activation == 'softmax':
        Z, sum_cache = forward_sum(A,W,b)
        A, activation_cache = softmax(Z)
    
    cache = (sum_cache,activation_cache)
    assert(A.shape == Z.shape)
    
    return A, cache

In [46]:
def forward_prop(X,parameters):
    caches = []
    A = X
    L = len(parameters) // 2
    for l in range(1, L):
        A_prev = A 
        A, cache = forward_activation(A_prev,parameters['W' + str(l)],parameters['b' + str(l)],activation='relu')
        caches.append(cache)

    AL, cache = forward_activation(A,parameters['W' + str(L)],parameters['b' + str(L)],activation='softmax')
    caches.append(cache)
    
    assert(AL.shape == (10,X.shape[1]))
    
    return AL,caches
    

# Cost Function

In [47]:
def compute_cost(AL,Y):
    m = Y.shape[1]

    cost = -(1./m) * np.sum(np.sum(np.multiply(Y,np.log(AL)), axis = 0,keepdims=True))
    
    cost = np.squeeze(cost)      # Making sure your cost's shape is not returned as ndarray
    assert(cost.shape == ())
    
    return cost

# Backprop

In [48]:
def backward_grad(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = (1/m) * np.dot(dZ,A_prev.T)
    db = (1/m) * np.sum(dZ, axis = 1, keepdims=True )
    dA_prev = np.dot(W.T, dZ)

    
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    
    
    return dA_prev, dW, db

In [49]:
def backward_activation(dA,cache,activation):
    sum_cache, activation_cache = cache
    
    if activation == "relu":
        dZ = relu_grad(dA,activation_cache)
        dA_prev, dW, db = backward_grad(dZ, sum_cache)
        
    elif activation == "softmax":
        dZ = dA
        dA_prev, dW, db = backward_grad(dA, sum_cache)
    
    return dA_prev, dW, db
    
    

In [50]:
def backward_prop(AL, Y,caches):
    grads = {}
    L = len(caches) # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL
    
    dA = np.subtract(AL,Y)
    
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = backward_activation(dA, current_cache, activation = 'softmax')
    
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = backward_activation(grads["dA" + str(l + 1)], current_cache, activation = 'relu')
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

# Update Parameters

In [51]:
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - (learning_rate * grads["dW" + str(l+1)])
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - (learning_rate * grads["db" + str(l+1)])
    return parameters

# Prediction

In [52]:
def predict(X,y,parameters):
    m = y.shape[1]
    n = len(parameters) // 2 # number of layers in the neural network
    
    probas, caches = forward_prop(X, parameters)
   
    assert(probas.shape == y.shape)
        
    predicted_labels = np.argmax(probas,axis=0).reshape(1,probas.shape[1])
    predicted_prob = np.max(probas,axis = 0).reshape(1,m)
    
    Y = np.argmax(y,axis=0).reshape(1,y.shape[1])
    
    true_prediction = np.equal(predicted_labels,Y)
    
    num_correct_labels = np.sum(true_prediction)
    accuracy = num_correct_labels / m
        
    return predicted_labels, predicted_prob, accuracy


# Training

In [53]:
def train(X_train, Y_train, X_dev, Y_dev, layers_dim, hyperParams):

    learning_rate = hyperParams['learning_rate']
    num_iterations = hyperParams['num_iterations']
    
    np.random.seed(1)
    costs = []      # keep track of cost
    train_accs = []  # keep track of training accuracy
    val_accs = []     # keep track of Validation accuracy
    
    parameters = init_params(layers_dim)
    
    for i in range(0, num_iterations):

        AL, caches = forward_prop(X_train, parameters)
        
        cost = compute_cost(AL, Y_train)
    
        grads = backward_prop(AL, Y_train, caches)
 
        parameters = update_parameters(parameters, grads, learning_rate)
                
        _,_,train_acc = predict(X_train, Y_train,parameters)
        _,_,val_acc= predict(X_dev, Y_dev,parameters)        
        
        if i == 0 or (i+1) % 200 == 0:
            print ("Iteration: %d == Cost: %f || Training acc: %f || Val acc: %f"%(i,cost,train_acc,val_acc))
        if i == 0 or (i+1) % 100 == 0:
            costs.append(cost)
            train_accs.append(train_acc)
            val_accs.append(val_acc)
            
            
    visualize_results(costs, attr_type='costs')  
    visualize_results(train_accs, attr_type='train_accs')       
    visualize_results(val_accs, attr_type='val_accs')       
    
    return parameters

# Visualization

In [54]:
def visualize_results(attr, attr_type):
    
    plt.plot(np.squeeze(attr))
    if attr_type == 'costs':
        plt.ylabel("cost")
        plt.title("Cost")
        
    elif attr_type == 'train_accs':
        plt.ylabel("accuracy")
        plt.title("Training Accuracy")
        
    elif attr_type == 'val_accs':
        plt.ylabel("accuracy")
        plt.title("Validation Accuracy")
        
    else:
        raise ValueError("Dataset set must be training or dev or test set")
        
    plt.xlabel('iterations (per hundreds)')
    plt.show()

# Baselining the model

In [None]:
hyperParams = init_hyperParams(alpha = 0.1,num_iteration = 10000)
layers_dim = init_layers()
parameters = train(X_train.T, y_train_e.values.T,X_dev.T, y_dev_e.values.T,layers_dim, hyperParams)

Iteration: 0 == Cost: 2.297805 || Training acc: 0.242127 || Val acc: 0.223529


  A = np.divide(t,np.sum(t,axis = 0))
  cost = -(1./m) * np.sum(np.sum(np.multiply(Y,np.log(AL)), axis = 0,keepdims=True))
  cost = -(1./m) * np.sum(np.sum(np.multiply(Y,np.log(AL)), axis = 0,keepdims=True))


Iteration: 199 == Cost: nan || Training acc: 0.100238 || Val acc: 0.076471
Iteration: 399 == Cost: nan || Training acc: 0.100238 || Val acc: 0.076471
Iteration: 599 == Cost: nan || Training acc: 0.100238 || Val acc: 0.076471


In [56]:
X_train.shape

(16830, 1024)

In [62]:
y_train_e = pd.get_dummies(y_train)
y_dev_e   = pd.get_dummies(y_dev)
y_train   = pd.get_dummies(y_train)

In [69]:
y_train_e.values.T

array([[0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)