In [1]:
import numpy as np
import numpy.matlib
import matplotlib.pyplot as plt
import pickle
import copy


In [2]:
def load_batch(filename):
    """Load one pickled image batch and standardise its features.

    The file must unpickle to a mapping with a ``'data'`` entry (N samples x
    d features; values are divided by 255 here) and a ``'labels'`` entry
    (N integer class ids).

    Args:
        filename: Path of the pickled batch file.

    Returns:
        X: d x N array. Every feature (row) is shifted to zero mean and scaled
            to unit standard deviation using this batch's own statistics;
            constant features become all zeros.
        Y: K x N one-hot label matrix, where K = max(label) + 1.
        y: Length-N array of integer labels.
    """
    # SECURITY: pickle.load can execute arbitrary code while unpickling;
    # only call this on batch files that come from a trusted source.
    with open(filename, 'rb') as f:
        dataset = pickle.load(f, encoding='latin1')  # N x d

    X = np.transpose(np.asarray(dataset['data']) / 255.)  # d x N
    mean_X = np.mean(X, axis=1, keepdims=True)  # d x 1, per-feature mean
    std_X = np.std(X, axis=1, keepdims=True)  # d x 1, per-feature std
    # A constant feature has zero spread; divide by 1 so it maps to 0, not NaN.
    std_X[std_X == 0] = 1.0
    # Broadcasting the d x 1 statistics replaces the explicit repmat copies.
    X = (X - mean_X) / std_X

    y = np.array(dataset['labels'])
    # Index a K x K identity by label. An N x K identity only has N rows, so
    # indexing it by label raises IndexError whenever a label is >= N.
    Y = np.eye(np.max(y) + 1)[y].T  # K x N
    return X, Y, y

def load_all(validation_size):
    """Load the five training batch files and hold out a validation split.

    Reads ``data/data_batch_1`` .. ``data/data_batch_5`` with ``load_batch``.
    The last ``validation_size`` samples of batch 5 become the validation set;
    all remaining samples are concatenated into the training set.

    Args:
        validation_size: Number of samples taken from the end of batch 5.
            ``0`` yields an empty validation set.

    Returns:
        Tuple ``(X, Y, y, X_valid, Y_valid, y_valid)``. Samples lie along
        axis 1 of the X / Y matrices, as returned by ``load_batch``.

    Raises:
        ValueError: If ``validation_size`` is negative or exceeds the number
            of samples in batch 5.
    """
    batches = [load_batch('data/data_batch_%d' % i) for i in range(1, 6)]
    X_5, Y_5, y_5 = batches.pop()

    n_last = X_5.shape[1]
    if not 0 <= validation_size <= n_last:
        raise ValueError(
            'validation_size must be between 0 and %d, got %r'
            % (n_last, validation_size)
        )
    # Use an explicit split index: ``[:-0]`` is empty and ``[-0:]`` is the
    # whole array, so negative-offset slicing swaps the two sets when
    # validation_size == 0.
    split = n_last - validation_size

    X = np.concatenate([b[0] for b in batches] + [X_5[:, :split]], axis=1)
    Y = np.concatenate([b[1] for b in batches] + [Y_5[:, :split]], axis=1)
    y = np.concatenate([b[2] for b in batches] + [y_5[:split]])

    X_valid = X_5[:, split:]
    Y_valid = Y_5[:, split:]
    y_valid = y_5[split:]
    return X, Y, y, X_valid, Y_valid, y_valid
    

In [None]:
def batch_normalize(scores, mean, variance, eps=1e-9):
    """Standardise each row of ``scores`` with the given per-row statistics.

    Computes ``(scores - mean) / sqrt(variance + eps)`` row by row. This is
    the product ``diag(variance + eps)^(-1/2) @ (scores - mean)`` evaluated
    without building the diagonal matrix: raising the full matrix to -0.5
    element-wise would turn every off-diagonal zero into ``inf``.

    Args:
        scores: m x N array of scores, one column per sample.
        mean: Length-m array with the mean of each row.
        variance: Length-m array with the variance of each row.
        eps: Small constant added to the variance to avoid division by zero.

    Returns:
        m x N array of normalised scores.
    """
    mean_col = np.reshape(mean, (-1, 1))  # m x 1, broadcasts over samples
    std_col = np.sqrt(np.reshape(variance, (-1, 1)) + eps)  # m x 1
    return (scores - mean_col) / std_col

In [3]:
def softmax(s):
    """Column-wise softmax.

    Args:
        s: K x N array of scores, one column per sample.

    Returns:
        Array of the same shape whose columns are non-negative and sum to 1.
    """
    # Subtracting the per-column maximum leaves the result unchanged but keeps
    # exp() from overflowing to inf (which would give inf / inf = NaN).
    shifted = s - np.max(s, axis=0, keepdims=True)
    exponent = np.exp(shifted)
    return exponent / np.sum(exponent, axis=0, keepdims=True)

def evaluate_classifier(X, layers):
    """Run the forward pass of the network.

    Every layer except the last computes ``s = W @ h_prev + b`` followed by a
    ReLU; the last layer computes ``softmax(W @ h_prev + b)``. For each hidden
    layer the per-row mean and variance of ``s`` over the batch, and the
    correspondingly normalised scores, are recorded as well.

    NOTE(review): the normalised scores are only recorded. The ReLU is still
    applied to the raw scores, so ``P`` does not depend on the normalisation.
    Feeding them into the activation would require matching changes in the
    gradient code.

    Args:
        X: d x N input matrix, one column per sample.
        layers: Non-empty list of dicts with entries ``"W"`` and ``"b"``.
            ``b`` is assumed to broadcast against ``W @ h`` (e.g. m x 1) --
            TODO confirm against the layer initialisation code.

    Returns:
        Tuple ``(H, P, S, S_norm, layer_means, layer_variances)``:
            H: list of hidden activations (m x N), one per hidden layer.
            P: K x N matrix of class probabilities.
            S: list of raw hidden scores (m x N).
            S_norm: list of normalised hidden scores (m x N).
            layer_means: list of length-m mean vectors.
            layer_variances: list of length-m variance vectors.

    Raises:
        ValueError: If ``layers`` is empty.
    """
    if len(layers) == 0:
        raise ValueError("layers must contain at least one layer")

    H = []
    S = []
    S_norm = []
    layer_means = []
    layer_variances = []

    h_prev = X
    for layer in layers[:-1]:  # hidden layers only
        s = np.dot(layer["W"], h_prev) + layer["b"]  # m x N
        S.append(s)

        mean = np.mean(s, axis=1)  # length m
        variance = np.var(s, axis=1)  # length m, population variance
        layer_means.append(mean)
        layer_variances.append(variance)

        S_norm.append(batch_normalize(s, mean, variance))

        h = np.maximum(s, 0)  # ReLU; m x N
        H.append(h)
        h_prev = h

    output_layer = layers[-1]
    P = softmax(np.dot(output_layer["W"], h_prev) + output_layer["b"])  # K x N
    return H, P, S, S_norm, layer_means, layer_variances

In [20]:
def compute_cost(X, Y, layers, lmb):
    """Mean cross-entropy loss plus L2 weight penalty.

    Args:
        X: d x N input matrix, one column per sample.
        Y: K x N one-hot label matrix.
        layers: List of dicts with entries ``"W"`` and ``"b"``.
        lmb: Regularisation strength; the penalty is only added when
            ``lmb > 0``.

    Returns:
        Scalar cost: ``sum(-log p_true) / N + lmb * sum over layers of
        sum(W ** 2)``.
    """
    P = evaluate_classifier(X, layers)[1]
    # Y is one-hot, so this picks each sample's true-class probability.
    true_class_prob = np.sum(np.multiply(Y, P), axis=0)
    cross_entropy = np.sum(-np.log(true_class_prob))

    w_square_sum = 0
    if lmb > 0:
        # sum(W ** 2) equals trace(W.T @ W) but avoids building the full
        # (columns x columns) product just to read its diagonal.
        w_square_sum = sum(np.sum(np.square(layer["W"])) for layer in layers)
    return (cross_entropy / X.shape[1]) + (lmb * w_square_sum)

def compute_gradients(X, Y, layers, lmb):
    """Analytic gradients of the regularised cross-entropy cost.

    Back-propagates through the softmax output layer and the ReLU hidden
    layers as evaluated by ``evaluate_classifier`` (ReLU applied to the raw
    scores). The previous, unfinished batch-normalisation gradient lines did
    not parse and never fed into the returned values, so they were removed.

    Args:
        X: d x N input matrix, one column per sample.
        Y: K x N one-hot label matrix.
        layers: List of dicts with entries ``"W"`` and ``"b"``.
        lmb: L2 regularisation strength.

    Returns:
        Tuple ``(W_gradients, b_gradients)``. Both lists are ordered from the
        LAST layer to the FIRST (the order of the backward sweep); entry j
        belongs to ``layers[len(layers) - 1 - j]``. Each bias gradient has
        shape m x 1.
    """
    H, P = evaluate_classifier(X, layers)[:2]
    Nb = X.shape[1]  # batch size
    G = -(Y - P)  # gradient of the loss w.r.t. the output scores; K x N

    W_gradients = []
    b_gradients = []
    for i in range(len(layers) - 1, -1, -1):  # from last layer to first
        layer = layers[i]
        # The first layer sees the data; layer i > 0 sees activation H[i - 1].
        layer_input = X if i == 0 else H[i - 1]

        W_gradients.append(np.dot(G, layer_input.T) / Nb + 2 * lmb * layer["W"])
        b_gradients.append(np.sum(G, axis=1, keepdims=True) / Nb)

        if i > 0:
            # Propagate to the previous layer and gate by the ReLU derivative.
            G = np.dot(layer["W"].T, G)
            G = G * (H[i - 1] > 0)
    return W_gradients, b_gradients


[[1 2]
 [3 1]
 [2 9]]
[1.5 2.  5.5]
[[-0.5  0.5]
 [ 1.  -1. ]
 [-3.5  3.5]]
