In [4]:
# Import packages
from helpers import load_all_data, vectorized_flatten, sigmoid, get_log_loss, get_accuracy, sigmoid_derivative, gradient_update, plot_loss
from helpers import sgd_with_momentum_update, prep_data,  get_best_epoch, get_results
import numpy as np

In [5]:
def initialize_two_layers(X, h1): 
    '''
    --------------------
    Parameter Initialization
    --------------------
    Parameters: 
    X: Numpy array of training features (shape = n_features X n_examples);
       only X.shape[0] is read, to size the first layer
    h1: Number of units in the hidden layer
    --------------------
    Output: 
    weights: Tuple (W1, W2) of scaled random normals with
             shapes (h1, n_features) and (1, h1)
    biases: Tuple (b1, b2) of zeros with shapes (h1, 1) and (1, 1)
    --------------------
    '''
    # Size the first layer from the data rather than a hard-coded 28**2,
    # so W1 @ X is valid for any feature count (784 gives the same result as before)
    n_features = X.shape[0]

    dim1 = 1/np.sqrt(n_features)
    W1 = dim1 * np.random.randn(h1, n_features)
    
    # NOTE(review): this scale uses the input width (W1.shape[1]) rather than
    # h1, which is W2's own fan-in -- kept as in the original; confirm intended
    dim2 = 1/np.sqrt(W1.shape[1])
    W2 = dim2 * np.random.randn(1, h1)
    
    b1 = np.zeros((h1, 1))
    b2 = np.zeros((1, 1))
    
    weights = (W1, W2)
    biases = (b1, b2)
    
    return weights, biases

In [7]:
def forward_pass_two_layers(X, weights, biases):
    '''
    ----------------------------------
    Forward propagation:
    Push the inputs through the hidden
    layer and then the output layer
    ----------------------------------
    Parameters: 
    X: Numpy array of features, one example per column
    weights: Pair (W1, W2) of layer weight matrices
    biases: Pair (b1, b2) of layer bias vectors
    --------------------
    Output: 
    activations: Tuple (z1, a1, z2, a2) holding each layer's
    pre-activation and its sigmoid output
    --------------------
    '''
    (W1, W2), (b1, b2) = weights, biases

    # Hidden layer
    hidden_pre = W1 @ X + b1
    hidden_out = sigmoid(hidden_pre)

    # Output layer
    output_pre = W2 @ hidden_out + b2
    output_out = sigmoid(output_pre)

    return (hidden_pre, hidden_out, output_pre, output_out)

In [8]:
def backpropagation_two_layers(X, y, weights, biases, activations):
    '''
    --------------------
    Backpropagation
    --------------------
    Parameters: 
    X: Numpy array of features, one example per column
    y: Labels with one example per column (the example
       count is read from y.shape[1])
    weights: Pair (W1, W2); only W2 enters the computation
    biases: Pair (b1, b2); not used by the computation
    activations: Tuple (z1, a1, z2, a2) from the forward pass
    --------------------
    Output: 
    (db1, dW1, db2, dW2): gradients averaged over the
    examples, for use in the optimization update
    --------------------
    '''
    _, W2 = weights
    _b1, _b2 = biases  # unpacked as in the original; values are not needed here
    z1, a1, _, a2 = activations
    n_examples = y.shape[1]

    # Output layer: error term, then its weight and bias gradients
    delta_out = a2 - y
    grad_W2 = np.dot(delta_out, a1.T) / n_examples
    grad_b2 = np.sum(delta_out, axis=1).reshape(-1, 1) / n_examples

    # Hidden layer: push the error back through W2 and the activation slope
    delta_hidden = np.dot(W2.T, delta_out) * sigmoid_derivative(z1)
    grad_W1 = np.dot(delta_hidden, X.T) / n_examples
    grad_b1 = np.sum(delta_hidden, axis=1).reshape(-1, 1) / n_examples

    return grad_b1, grad_W1, grad_b2, grad_W2

In [22]:
def finite_differences(example, truth, weights, biases, delta_h=1e-9):
    '''
    --------------------
    Gradient check for W2
    --------------------
    Parameters: 
    example: Input features passed to the forward pass
    truth: Label(s) for the example
    weights: Pair (W1, W2) of current weights
    biases: Pair (b1, b2) of current biases
    delta_h: Perturbation added to / subtracted from each W2 entry
    --------------------
    Output: 
    dW2: Gradient of W2 from backpropagation
    deltaW: Central-difference estimate of the same gradient
    difference: np.linalg.norm of (dW2 - deltaW)
    plus / minus output activations of the last perturbed passes
    --------------------
    '''
    W1, W2 = weights
    b1, b2 = biases
    n_rows, n_cols = W2.shape

    numeric_grad = np.zeros((n_rows, n_cols))

    # Analytic gradient at the unperturbed parameters
    base_activations = forward_pass_two_layers(example, weights, biases)
    _, _, _, dW2 = backpropagation_two_layers(example, truth, weights, biases, base_activations)

    # Nudge one W2 entry at a time in both directions and difference the losses
    for i, j in np.ndindex(n_rows, n_cols):
        W_up = np.copy(W2)
        W_down = np.copy(W2)
        W_up[i, j] += delta_h
        W_down[i, j] -= delta_h

        activations_plus = forward_pass_two_layers(example, [W1, W_up], biases)
        activations_minus = forward_pass_two_layers(example, [W1, W_down], biases)

        loss_up = get_log_loss(truth, activations_plus[-1])
        loss_down = get_log_loss(truth, activations_minus[-1])

        numeric_grad[i, j] = (loss_up - loss_down)/(2 * delta_h)

    gap = np.linalg.norm(dW2 - numeric_grad)

    return dW2, numeric_grad, gap, activations_plus[-1], activations_minus[-1]

In [23]:
# Check finite differences
def run_finite_differences(data_path, h1 = 8, idx=10):
    '''
    --------------------
    Run the gradient check on one training example
    --------------------
    Parameters: 
    data_path: Passed to prep_data to load the data splits
    h1: Number of hidden units for the freshly initialized network
    idx: Column index of the training example to check
    --------------------
    Output: 
    The five values returned by finite_differences
    (they are also printed)
    --------------------
    '''
    X_train, _, _, y_train, _, _ = prep_data(data_path)
    weights, biases = initialize_two_layers(X_train, h1)

    # Pull out a single example and its label as column vectors
    example = X_train[:, idx].reshape(-1, 1)
    truth = y_train[:, idx].reshape(-1, 1)

    check = finite_differences(example, truth, weights, biases)
    dW, deltaW, difference, activations_plus, activations_minus = check

    for label, value in (("dW", dW), ("deltaW", deltaW), ("difference", difference)):
        print(label, value)
    print(activations_plus, activations_minus)

    return (dW, deltaW, difference, activations_plus, activations_minus)

In [None]:
def update_parameters_without_momentum(weights, biases, gradients, learning_rate):
    '''
    --------------------
    Update parameters (no momentum)
    --------------------
    Parameters: 
    weights: Pair (W1, W2) of current weights
    biases: Pair (b1, b2) of current biases
    gradients: Tuple ordered as (db1, dW1, db2, dW2)
    learning_rate: Step size handed to gradient_update
    --------------------
    Output: 
    (weights, biases): pairs holding the updated parameters
    --------------------
    '''
    W1, W2 = weights
    b1, b2 = biases
    db1, dW1, db2, dW2 = gradients

    # Each parameter goes through the same helper with its own gradient
    new_weights = (
        gradient_update(W1, learning_rate, dW1),
        gradient_update(W2, learning_rate, dW2),
    )
    new_biases = (
        gradient_update(b1, learning_rate, db1),
        gradient_update(b2, learning_rate, db2),
    )

    return new_weights, new_biases

In [None]:
def initialize_velocity(weights,biases):
    '''
    --------------------
    Initialize momentum velocities
    --------------------
    Parameters: 
    weights: Pair (W1, W2) of weight arrays
    biases: Pair (b1, b2) of bias arrays
    --------------------
    Output: 
    Tuple (vw1, vw2, vb1, vb2) of zero arrays, each with the
    shape of the matching parameter
    --------------------
    '''
    W1, W2 = weights
    b1, b2 = biases

    # One zero-filled buffer per parameter, weights first and then biases
    return tuple(np.zeros(param.shape) for param in (W1, W2, b1, b2))

In [None]:
def update_parameters_with_momentum(weights,biases,gradients,learning_rate,velocity,momentum):
    '''
    --------------------
    Update parameters (SGD with momentum)
    --------------------
    Parameters: 
    weights: Pair (W1, W2) of current weights
    biases: Pair (b1, b2) of current biases
    gradients: Tuple ordered as (db1, dW1, db2, dW2)
    learning_rate: Step size handed to sgd_with_momentum_update
    velocity: Tuple ordered as (vw1, vw2, vb1, vb2)
    momentum: Momentum value handed to sgd_with_momentum_update
    --------------------
    Output: 
    (weights, biases, velocity) after one update step
    --------------------
    '''
    W1, W2 = weights
    b1, b2 = biases
    db1, dW1, db2, dW2 = gradients
    vw1, vw2, vb1, vb2 = velocity

    # (parameter, gradient, velocity) triples in the order W1, W2, b1, b2
    triples = [(W1, dW1, vw1), (W2, dW2, vw2), (b1, db1, vb1), (b2, db2, vb2)]
    stepped = [sgd_with_momentum_update(param, learning_rate, grad, vel, momentum)
               for param, grad, vel in triples]
    (W1, vw1), (W2, vw2), (b1, vb1), (b2, vb2) = stepped

    return (W1, W2), (b1, b2), (vw1, vw2, vb1, vb2)

In [None]:
def output_layer(activations_full):
    '''
    --------------------
    Turn network output into predictions
    --------------------
    Parameters: 
    activations_full: Sequence of activations; its last
    element is taken as the predicted probabilities
    --------------------
    Output: 
    (y_prob, y_pred): the probabilities and 1/0 labels
    (1 where the probability is strictly above 0.5)
    --------------------
    '''
    probabilities = activations_full[-1]
    labels = np.where(probabilities > 0.5, 1, 0)

    return probabilities, labels

In [None]:
def batchify(X):
    '''
    --------------------
    Placeholder (not implemented)
    --------------------
    Parameters: 
    X: Ignored
    --------------------
    Output: 
    None -- the function has no body yet
    --------------------
    '''
    return None

In [None]:
def batch_training(batch_size, weights, biases, epochs,
                   X, y, momentum_param, lr, X_dev,y_dev, velocity):
    '''
    --------------------
    Mini-batch training loop
    --------------------
    Parameters: 
    batch_size: Number of examples (columns) per update; must be >= 1
    weights: Initial pair (W1, W2)
    biases: Initial pair (b1, b2)
    epochs: Number of passes over the training columns
    X: Training features, one example per column
    y: Training labels, one example per column
    momentum_param: Momentum value for the parameter update
    lr: Learning rate for the parameter update
    X_dev, y_dev: Development split, scored after every epoch
    velocity: Initial velocity tuple for the momentum update
    --------------------
    Output: 
    history: Dict of lists. "weights", "biases" and "velocity" start
    with the initial values and gain one entry per epoch; "losses",
    "accuracies", "dev_loss" and "dev_accuracies" hold one entry per epoch
    --------------------
    Raises: 
    ValueError if batch_size is smaller than 1
    --------------------
    '''
    # A step of zero or less would never advance through the data
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got {}".format(batch_size))

    history = {
        "weights": [weights],
        "losses": [], 
        "biases": [biases],
        "accuracies": [],
        "velocity":[velocity],
        "dev_accuracies" :[],
        "dev_loss":[]
    }

    # Examples live on axis 1 (the batches below are column slices)
    n_examples = y.shape[1]

    for epoch in range(epochs):

        for offset in range(0, n_examples, batch_size):
            if offset % 1000 == 0:
                print("epoch :",epoch," batch:",offset)

            # Slicing past the end simply yields the shorter final batch
            stop = offset + batch_size
            X_batch = X[:, offset:stop]
            y_batch = y[:, offset:stop]

            activations = forward_pass_two_layers(X_batch, weights, biases)
            gradients = backpropagation_two_layers(X_batch, y_batch, weights, biases, activations)
            weights, biases, velocity = update_parameters_with_momentum(weights, biases, gradients,
                                                                        lr, velocity, momentum_param)

        # Score both splits with the parameters reached at the end of the epoch
        loss, accuracy = _loss_and_accuracy(X, y, weights, biases)
        loss_dev, accuracy_dev = _loss_and_accuracy(X_dev, y_dev, weights, biases)

        history["weights"].append(weights)
        history["losses"].append(loss)
        history["biases"].append(biases)
        history["velocity"].append(velocity)
        history["accuracies"].append(accuracy)
        history["dev_accuracies"].append(accuracy_dev)
        history['dev_loss'].append(loss_dev)

        # Stop once the training loss is NaN (that epoch is still recorded)
        if np.isnan(loss):
            break
        print("loss after epoch: ",epoch,": ",loss)

    return history


def _loss_and_accuracy(X, y, weights, biases):
    # Forward pass on one data split, returning (log loss, accuracy)
    y_prob, y_pred = output_layer(forward_pass_two_layers(X, weights, biases))
    return get_log_loss(y, y_prob), get_accuracy(y, y_pred)

In [None]:
def run_training(data_path, epochs, mode, model_name, h1, lr, batch_size, momentum_param, grid_search = False):
    '''
    --------------------
    Train the two-layer network and plot its history
    --------------------
    Parameters: 
    data_path: Passed to prep_data to load the train/dev/test splits
    epochs: Number of passes over the training set
    mode: 'full' (one batch spanning the training set) or
          'stochastic' (batch size 1); any other value keeps batch_size
    model_name: Prefix of the figure file names handed to plot_loss
    h1: Number of hidden units
    lr: Learning rate
    batch_size: Batch size (overridden for 'full' and 'stochastic')
    momentum_param: Momentum value for the parameter update
    grid_search: If True, also return the dev loss at the best dev epoch
    --------------------
    Output: 
    (history, best_epoch), or (history, best_epoch, dev_loss)
    when grid_search is True
    --------------------
    '''
    np.random.seed(1252908)
    
    X_train_flattened, X_dev_flattened, X_test_flattened, y_train, y_dev, y_test = prep_data(data_path)
    
    if mode == 'full': batch_size = max(y_train.shape)
    elif mode == 'stochastic': batch_size = 1
    
    weights, biases = initialize_two_layers(X_train_flattened, h1)
    velocity = initialize_velocity(weights, biases)
    
    history = batch_training(batch_size, weights, biases, 
                             epochs, X_train_flattened, y_train, 
                             momentum_param, lr, X_dev_flattened, y_dev, 
                             velocity)
    
    best_epoch, _, _ = get_best_epoch(history)

    # Plots
    # NOTE(review): the [:-2] slices leave the last two epochs out of every
    # plot (nothing is plotted when epochs <= 2) -- confirm this is intended
    plot_loss("{}_loss.png".format(model_name), history["losses"][:-2])
    plot_loss("{}_accuracy.png".format(model_name), history["accuracies"][:-2], label='Training Accuracy')
    
    # Plot dev loss
    plot_loss("{}_dev_loss.png".format(model_name), history["dev_loss"][:-2])
    plot_loss("{}_dev_accuracy.png".format(model_name), history["dev_accuracies"][:-2])
    
    # If we are running grid search return the dev. loss to calling function to compare 
    if grid_search:
        # get_best_dev_epoch is not defined or imported anywhere in this
        # notebook, so the best dev epoch is derived from the history directly
        best_dev_epoch = _best_dev_epoch_index(history)
        activations_dev = forward_pass_two_layers(X_dev_flattened, history["weights"][best_dev_epoch], 
                                                  history["biases"][best_dev_epoch])
        y_prob = activations_dev[-1]
        dev_loss = get_log_loss(y_dev,y_prob)
        return(history, best_epoch, dev_loss)  
        
    else:
        return(history, best_epoch)


def _best_dev_epoch_index(history):
    # Index into history["weights"] / history["biases"] with the lowest dev loss.
    # Those lists start with the initial parameters, so epoch k sits at k + 1;
    # index 0 (the initial parameters) is returned when no usable dev loss exists.
    dev_losses = np.asarray(history["dev_loss"], dtype=float)
    if dev_losses.size == 0 or np.all(np.isnan(dev_losses)):
        return 0
    return int(np.nanargmin(dev_losses)) + 1

In [None]:
def grid_search(data_path, epochs, mode, model_name, h1, 
                lr_grid, batch_size_grid, momentum_grid, sampling_frac=0.1):
    '''
    --------------------
    Randomly sub-sampled grid search
    --------------------
    Parameters: 
    data_path, epochs, mode, model_name, h1: Passed through to run_training
    lr_grid: Candidate learning rates
    batch_size_grid: Candidate batch sizes
    momentum_grid: Candidate momentum values
    sampling_frac: Fraction of each grid to sample (without replacement);
                   at least one value is kept from every non-empty grid
    --------------------
    Output: 
    best_params: Dict with the 'momentum', 'lr', 'batch_size' and dev 'loss'
    of the best combination plus its "weights" and "biases"
    (placeholders 0 / np.inf remain if no combination was run)
    --------------------
    '''
    best_params = {}
    best_params['loss'] = np.inf
    best_params['momentum'] = 0
    best_params['lr'] = 0
    best_params['batch_size'] = 0
    best_params["weights"] = 0
    best_params["biases"] = 0

    def _sample(grid):
        # Keep at least one candidate: int() truncation would otherwise give an
        # empty sample for short grids and the search would run nothing
        n_keep = min(len(grid), max(1, int(sampling_frac*len(grid))))
        return np.random.choice(grid, n_keep, replace = False)

    # Plain unpacking; a bare starred expression on the right-hand side is a SyntaxError
    momentum_grid, lr_grid, batch_size_grid = [_sample(grid) 
                                               for grid in [momentum_grid, lr_grid, batch_size_grid]]

    total_iters = len(momentum_grid)*len(lr_grid)*len(batch_size_grid)
    i = 0
                                     
    for m in momentum_grid:
        for lr in lr_grid:
            for bs in batch_size_grid:
                
                # Scale to a percentage to match the message text
                print("We are {} % done!".format(100*i/total_iters))
                
                history, best_epoch, dev_loss = run_training(data_path, 
                                                             epochs, mode, model_name, 
                                                             h1, lr, bs, m, grid_search = True)
                
                if dev_loss < best_params['loss'] :
                    best_params['momentum'] = m
                    best_params['lr'] = lr
                    best_params['batch_size'] = bs
                    best_params['loss'] = dev_loss
                    best_params["weights"] = history["weights"][best_epoch]
                    best_params["biases"] = history["biases"][best_epoch]
                
                i+=1
            
    return(best_params)           

In [None]:
# Shared settings for the momentum and grid search demonstrations
data_path = '../setup/data'

# Training modes to compare
modes = ['full', 'stochastic', 'mini']

# Network size and optimisation settings
h1 = 8
lr = 0.1
epochs = 100
batch_size = 10

In [None]:
# First we train without momentum
# This is why we set the momentum_param to 0
# This is equivalent to running gradient descent
# Keys match the entries of `modes` ('mini', not 'batch'), so every
# placeholder is replaced by the training loop
results_without_momentum = {'full':'', 'stochastic':'', 'mini': ''}
model_name = '../figs/{}_without_momentum'
momentum_param = 0

In [None]:
# Train once per mode and keep each run's result under the mode name
for mode in modes:
    fig_name = model_name.format(mode)
    results_without_momentum[mode] = run_training(
        data_path=data_path,
        epochs=epochs,
        mode=mode,
        model_name=fig_name,
        h1=h1,
        lr=lr,
        batch_size=batch_size,
        momentum_param=momentum_param,
    )

In [None]:
# Now we conduct the same rounds of training with momentum
# Keys match the entries of `modes` ('mini', not 'batch')
results_with_momentum = {'full':'', 'stochastic':'', 'mini': ''}
model_name = '../figs/{}_with_momentum'
momentum_param = 0.9

In [None]:
# Call with momentum
# Results go into results_with_momentum so the runs without momentum
# stored in results_without_momentum are not overwritten
for mode in modes:
    fig_name = model_name.format(mode)
    results_with_momentum[mode] = run_training(data_path, 
                                               epochs, mode, fig_name, h1, 
                                               lr, batch_size, momentum_param)

In [None]:
# Settings for the hyper-parameter grid search
model_name = 'grid_search_result'
mode = 'mini'
epochs = 2

# Candidate values for each hyper-parameter
momentum_grid = np.arange(0.1, 0.9, 0.4)
lr_grid = np.arange(0.1, 0.2, 0.1)
batch_size_grid = np.arange(1, 12000, 5999)

In [None]:
# Run the search; as the last expression of the cell its result is displayed
grid_search(
    data_path=data_path,
    epochs=epochs,
    mode=mode,
    model_name=model_name,
    h1=h1,
    lr_grid=lr_grid,
    batch_size_grid=batch_size_grid,
    momentum_grid=momentum_grid,
)