In [1]:
import numpy as np
from scipy import optimize
import itertools

In [2]:
# Read the training set from CSV: column 0 is the label, the remaining columns are the features
data = np.genfromtxt('./data/tt.csv', delimiter=',')
y, X = data[:, :1], data[:, 1:]  # y keeps a trailing axis: shape (m, 1)
m = y.shape[0]                   # number of training samples

In [3]:
# Network architecture and initial weights
input_layer_size  = 784   # number of input features per sample
hidden_layer_size = 28    # number of hidden units
num_labels = 26           # number of output classes
_lambda = 0               # regularisation strength
epsilon_init = 0.11       # weights are drawn uniformly from [-epsilon_init, epsilon_init)

# A dedicated, seeded generator keeps the initial weights reproducible on
# every "Restart & Run All" without touching NumPy's global random state.
_init_rng = np.random.default_rng(42)

# Shapes are derived from the layer sizes (the +1 is the bias column) so they
# cannot drift out of sync with the constants above.
theta1 = _init_rng.uniform(-epsilon_init, epsilon_init,
                           size=(hidden_layer_size, input_layer_size + 1))  # (hidden_layer_size, input_layer_size + 1)
theta2 = _init_rng.uniform(-epsilon_init, epsilon_init,
                           size=(num_labels, hidden_layer_size + 1))        # (num_labels, hidden_layer_size + 1)
#theta1 shape: (28, 785)
#theta2 shape: (26, 29)

-0.048337698564858984
-0.08263754254905406
0.009939149313306292


In [4]:
# source: https://github.com/kaleko/CourseraML/
def flatten_params(thetas_list):
    """
    Unroll a list of theta matrices into one long (n, 1) shaped numpy array.

    Each matrix is flattened in row-major order and the results are joined
    in list order.

    Raises AssertionError if the total number of weights does not match the
    network dimensions given by the module-level input_layer_size,
    hidden_layer_size and num_labels.
    """
    pieces = [np.ravel(mytheta) for mytheta in thetas_list]
    expected_size = (input_layer_size+1)*hidden_layer_size + (hidden_layer_size+1)*num_labels
    # Check the size first so a wrong (or empty) list fails the assertion
    # rather than erroring inside np.concatenate.
    assert sum(piece.size for piece in pieces) == expected_size
    # Concatenate in NumPy directly instead of chaining every weight through
    # a Python list.
    return np.concatenate(pieces).reshape((-1, 1))

def reshape_params(flattened_array):
    """
    Split an unrolled parameter array back into [theta1, theta2].

    The leading (input_layer_size+1)*hidden_layer_size entries become theta1
    with shape (hidden_layer_size, input_layer_size+1); everything after that
    becomes theta2 with shape (num_labels, hidden_layer_size+1). The sizes
    are read from the module-level constants.
    """
    theta1_shape = (hidden_layer_size, input_layer_size + 1)
    theta2_shape = (num_labels, hidden_layer_size + 1)
    boundary = (input_layer_size + 1) * hidden_layer_size
    head = flattened_array[:boundary]
    tail = flattened_array[boundary:]
    return [head.reshape(theta1_shape), tail.reshape(theta2_shape)]

def flattenX(myX):
    """
    Unroll an array into a single (n, 1) column, in row-major order.

    The length n is inferred from the input, so the function does not rely
    on any module-level sample count. A new array is returned; the input is
    not modified.
    """
    return np.array(myX).reshape((-1, 1))

def reshapeX(flattenedX):
    """
    Fold an unrolled array back into rows of input_layer_size+1 columns.

    The number of rows is inferred from the input length, so the function
    does not rely on any module-level sample count. Raises ValueError (from
    NumPy) if the length is not a multiple of input_layer_size+1.
    """
    return np.array(flattenedX).reshape((-1, input_layer_size+1))

In [5]:
# Unroll both weight matrices into a single parameter vector and report the shapes
print('theta1: {}, theta2: {}'.format(theta1.shape, theta2.shape))
nn_params = flatten_params([theta1, theta2])
print(nn_params.shape)

theta1: (28, 785), theta2: (26, 29)
(22734, 1)


In [42]:
def sigmoid(z):
    """
    Logistic function 1 / (1 + exp(-z)), applied element-wise.

    Accepts a scalar or array-like; returns a NumPy float for scalar input
    and an array of the same shape otherwise.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) is always <= 1, so it cannot overflow for large |z|
    # (a plain exp(-z) overflows for very negative z).
    decay = np.exp(-np.abs(z))
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return out[()]  # unwraps a 0-d result to a scalar; arrays pass through

def sigmoid_gradient(z):
    """
    Derivative of the logistic function, sigmoid(z) * (1 - sigmoid(z)),
    applied element-wise.
    """
    s = sigmoid(z)  # evaluate once and reuse
    return s * (1 - s)

sigmoid(0.0) #should return 0.5

0.5

In [72]:
def cost_function(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, _lambda, return_grad=False):
    """
    Regularised cross-entropy cost of a one-hidden-layer sigmoid network.

    Parameters
    ----------
    nn_params : array-like
        Unrolled weights, theta1 followed by theta2 (any shape; it is
        flattened). theta1 is (hidden_layer_size, input_layer_size + 1) and
        theta2 is (num_labels, hidden_layer_size + 1); column 0 of each is
        the bias weight.
    input_layer_size, hidden_layer_size, num_labels : int
        Layer sizes used to reshape nn_params.
    X : array-like, shape (n_samples, input_layer_size)
        Input features, one sample per row (no bias column).
    y : array-like, n_samples entries
        Class index of each sample, used directly as a column index into the
        one-hot target matrix (so values are expected in [0, num_labels)).
    _lambda : float
        L2 regularisation strength. Bias weights are not penalised.
    return_grad : bool, optional
        If False (default) only the cost is returned. If True the
        back-propagated gradient is returned as well.

    Returns
    -------
    J : float
        The cost, when return_grad is False.
    (J, grad) : tuple
        When return_grad is True; grad is a 1-D array laid out like the
        flattened nn_params (theta1 gradient followed by theta2 gradient).
    """
    X = np.asarray(X, dtype=float)
    n_samples = X.shape[0]  # taken from the data actually passed in

    # Unroll the parameters using the sizes given as arguments.
    flat = np.asarray(nn_params, dtype=float).ravel()
    n_theta1 = hidden_layer_size * (input_layer_size + 1)
    theta1 = flat[:n_theta1].reshape((hidden_layer_size, input_layer_size + 1))
    theta2 = flat[n_theta1:].reshape((num_labels, hidden_layer_size + 1))

    # Feed-forward propagation
    # a1: (n_samples, input_layer_size + 1)
    # a2: (n_samples, hidden_layer_size + 1)
    # a3: (n_samples, num_labels)
    bias = np.ones((n_samples, 1))
    a1 = np.c_[bias, X]                # input layer with bias column
    z2 = a1.dot(theta1.T)
    a2 = np.c_[bias, sigmoid(z2)]      # hidden layer with bias column
    z3 = a2.dot(theta2.T)
    a3 = sigmoid(z3)                   # a3 = h(x)

    # One-hot encode the labels: y_k[i, y[i]] = 1
    labels = np.asarray(y).ravel().astype(int)
    y_k = np.zeros((n_samples, num_labels))
    y_k[np.arange(n_samples), labels] = 1

    # Cross-entropy: both terms carry a minus sign.
    cost = -np.sum(y_k * np.log(a3) + (1 - y_k) * np.log(1 - a3)) / n_samples
    # L2 penalty over every weight except the bias columns.
    reg_term = (_lambda / (2 * n_samples)) * (
        np.sum(theta1[:, 1:] ** 2) + np.sum(theta2[:, 1:] ** 2)
    )
    J = cost + reg_term

    if not return_grad:
        return J

    # Back propagation
    # delta_3: (n_samples, num_labels)
    # delta_2: (n_samples, hidden_layer_size)
    delta_3 = a3 - y_k
    delta_2 = delta_3.dot(theta2)[:, 1:] * sigmoid_gradient(z2)  # drop the bias column

    # Average the accumulated outer products, then add the penalty gradient
    # to the non-bias weights only.
    theta1_grad = delta_2.T.dot(a1) / n_samples
    theta2_grad = delta_3.T.dot(a2) / n_samples
    theta1_grad[:, 1:] += (_lambda / n_samples) * theta1[:, 1:]
    theta2_grad[:, 1:] += (_lambda / n_samples) * theta2[:, 1:]

    grad = np.concatenate([theta1_grad.ravel(), theta2_grad.ravel()])
    return J, grad
    

In [73]:
# Evaluate the cost function once on the loaded data with the initial parameters
J = cost_function(
    nn_params,
    input_layer_size,
    hidden_layer_size,
    num_labels,
    X,
    y,
    _lambda,
)

size delta2: (688, 28)
