In [1]:
import numpy as np
import scipy.optimize
import itertools

In [2]:
# loading input data
data = np.genfromtxt('./data/tt.csv', delimiter=',')
y = data[:,0].reshape(-1,1)
X = data[:, 1:]
global m = len(y)

In [3]:
# Some network architecture
input_layer_size  = 784
hidden_layer_size = 28
num_labels = 26
_lambda = 0
theta1 = np.random.rand(hidden_layer_size,785) * 2 * 0.11 - 0.11 #dimension(hidden layer size, input layer size + 1)
theta2 = np.random.rand(num_labels,hidden_layer_size + 1) * 2 * 0.11 - 0.11  #dimension(hiddent layer size + 1, labels)
#theta1 shape: (28, 785)
#theta2 shape: (26, 29)

In [17]:
# source: https://github.com/kaleko/CourseraML/
def flatten_params(thetas_list):
    """
    Hand this function a list of theta matrices, and it will flatten it
    into one long (n,1) shaped numpy array
    """
    flattened_list = [ mytheta.flatten() for mytheta in thetas_list ]
    combined = list(itertools.chain.from_iterable(flattened_list))
    assert len(combined) == (input_layer_size+1)*hidden_layer_size + (hidden_layer_size+1)*num_labels
    return np.array(combined).reshape((len(combined),1))

def reshape_params(flattened_array):
    theta1 = flattened_array[:(input_layer_size+1)*hidden_layer_size].reshape((hidden_layer_size,input_layer_size+1))
    theta2 = flattened_array[(input_layer_size+1)*hidden_layer_size:].reshape((num_labels,hidden_layer_size+1))
    return [ theta1, theta2 ]

def flattenX(myX):
    return np.array(myX.flatten()).reshape((m *(input_layer_size+1),1))

def reshapeX(flattenedX):
    return np.array(flattenedX).reshape((m,input_layer_size+1))

In [18]:
#grad1: (28, 784)
#grad2: (26, 28)
def flatten_grads(grad_list):
    flattened_list = [ my_grad.flatten() for my_grad in grad_list ]
    combined = list(itertools.chain.from_iterable(flattened_list))
    assert len(combined) == (input_layer_size)*hidden_layer_size + (hidden_layer_size)*num_labels
    return np.array(combined).reshape((len(combined),1))

def reshape_grads(flattened_array):
    grad1 = flattened_array[:(input_layer_size)*hidden_layer_size].reshape((hidden_layer_size,input_layer_size))
    grad2 = flattened_array[(input_layer_size)*hidden_layer_size:].reshape((num_labels,hidden_layer_size))
    return [ grad1, grad2 ]

In [19]:
# Flatten thetas
print(f'theta1: {np.shape(theta1)}, theta2: {np.shape(theta2)}')
nn_params = flatten_params([theta1, theta2])
print(np.shape(nn_params))

theta1: (28, 785), theta2: (26, 29)
(22734, 1)


In [20]:
def sigmoid(z):
    return ( (1 / (1 + np.exp(-z))) )

def sigmoid_gradient(z):
    return (sigmoid(z) * (1 - sigmoid(z)))

print(sigmoid(0.0)) #should return 0.5
print(sigmoid_gradient(0.0)) # should return 0.25

0.5
0.25


In [21]:
def feed_forward(theta1, theta2, X):
    '''
    a1 = (m, input_layer_size + 1), a2 = (m, hidden_layer_size + 1), a3= (m, num_labels)
    theta1 = (hidden_layer_size, input_layer_size + 1)
    theta2 = (num_labels, hidden_layer_size)
    '''
    m, _ = np.shape(X)
    
    # Input layer
    a1 = np.c_[np.ones((m,1)), X] # assigning a1 to X, and adding a bias (m, input_layer_size + 1)
    # Hidden layer
    z2 = a1.dot(theta1.T)
    a2 = sigmoid(z2)
    a2 = np.c_[np.ones((np.shape(a2)[0], 1)), a2] # bias for hidden layer
    # Output layer
    z3 = a2.dot(theta2.T)
    a3 = sigmoid(z3) #a3 = h(x)
    return (a1, a2, a3, z2, z3)

In [22]:
def compute_cost( nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lamda, yk = None):
    theta1, theta2 = reshape_params(nn_params)     
    a1,a2,a3,z2,z3 = feed_forward(theta1, theta2, X)
    # (m, num_labels) matrix. all zeros, one '1' per row in the column corresponding to output label
    if yk is None:
        y_k = np.zeros((m,num_labels))
        for i in range(0,m):
            label_index = int(y[i])
            y_k[i,label_index] = 1
    
    # J(theta) function: cross-entropy
    term1 = (-y_k * np.log(a3))
    term2 = (1 - y_k) * np.log(1 - a3)
    cost = np.sum(term1 + term2)/m
    # Regularization sum
    reg_term = np.sum(theta1 ** 2) + np.sum(theta2[:,1:] ** 2)
    reg_term = (_lambda/2/m) * reg_term
    return(cost + reg_term)

In [23]:
def compute_gradient(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lamda, yk = None):
    m, n = np.shape(X)
    theta1, theta2 = reshape_params(nn_params)     
    a1, a2, a3, z2, z3 = feed_forward(theta1, theta2, X)
    
    #back prop
     # (m, num_labels) matrix. all zeros, one '1' per row in the column corresponding to output label
    if yk is None:
        y_k = np.zeros((m,num_labels))
        for i in range(0,m):
            label_index = int(y[i])
            y_k[i,label_index] = 1
            
    delta_3 = a3 - y_k #(m, num_labels), theta2=(labels, hidden_label size)
    delta_2 = (delta_3.dot(theta2))[:,1:] * sigmoid_gradient(z2) #ignore bias
    sum_2 = delta_3.T.dot(a2) # sum of a_i * delta_i+1
    sum_1 = delta_2.T.dot(a1)
    
    # putting the gradient equation together
    theta_2_grad = (sum_2[:,1:] / m) + ((theta2[:,1:] * _lambda) / m)
    theta_1_grad = (sum_1[:,1:] / m) + (theta1[:,1:] * _lambda / m)
    
    # Sizes
    #grad1: (28, 784)
    #grad2: (26, 28)
    grad_flat = flatten_grads([theta_1_grad, theta_2_grad])
    

In [24]:
def nncost_function(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, _lambda):
    # Reshape nn params and some initializations
    m = len(y) #if shit breaks its this
    theta1, theta2 = reshape_params(nn_params)     
    J = 0
    theta1_grad = np.zeros(np.shape(theta1))  # Used for gradient 
    theta2_grad = np.zeros(np.shape(theta2))  # Used for gradient 
    
    # feed forward propogation
    a1, a2, a3, z2, z3 = feed_forward(theta1, theta2, X)
    
    # Compute cost
    # fill array of 
    y_k = np.zeros((m,num_labels))
    for i in range(0,m):
        label_index = int(y[i])
        y_k[i,label_index] = 1
    
    # J(theta) function: cross-entropy
    term1 = (-y_k * np.log(a3))
    term2 = (1 - y_k) * np.log(1 - a3)
    cost = np.sum(term1 + term2)/m
    # Regularization sum
    reg_term = np.sum(theta1 ** 2) + np.sum(theta2[:,1:] ** 2)
    reg_term = (_lambda/2/m) * reg_term
    J = cost + reg_term
    
    
    # Back propogation
    # delta_3: (m, num_labels), theta2: (num_labels, hidden_layer_size + 1(bias)) z2: (688, hidden_layer size)
    # delta_2: (m, hidden_layer,size) a1 = (m, input_layer_size + 1) a2 = (m, hidden_layer_size + 1)
    
    delta_3 = a3 - y_k #(m, num_labels), theta2=(labels, hidden_label size)
    delta_2 = (delta_3.dot(theta2))[:,1:] * sigmoid_gradient(z2) #ignore bias
    sum_2 = delta_3.T.dot(a2) # sum of a_i * delta_i+1
    sum_1 = delta_2.T.dot(a1)
    
    # putting the gradient equation together
    # !!! try running with adding in J_0 bias term
    theta_2_grad = (sum_2[:,1:] / m) + ((theta2[:,1:] * _lambda) / m)
    theta_1_grad = (sum_1[:,1:] / m) + (theta1[:,1:] * _lambda / m)
    
    grad_flat = flatten_grads([theta_1_grad, theta_2_grad])
    
    return (J, grad_flat)
    

In [25]:
J,grad = nncost_function(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, _lambda)

  return ( (1 / (1 + np.exp(-z))) )


In [26]:
def train(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, _lambda, ):
    result = scipy.optimize.fmin_cg(compute_cost, x0=nn_params, fprime=compute_gradient,
                               args=(flattenX(X),y,_lambda),maxiter=50,disp=True,full_output=True)
    return reshapeParams(result[0])
    

In [27]:
train(nn_params,input_layer_size, hidden_layer_size, num_labels, X, y, _lambda, )

ValueError: cannot reshape array of size 539392 into shape (540080,1)