In [270]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat

### Load data

In [271]:
data = loadmat('data/MINST')
data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 'y': array([[10],
        [10],
        [10],
        ...,
        [ 9],
        [ 9],
        [ 9]], dtype=uint8)}

In [272]:
print(data['X'].shape, data['y'].shape)

(5000, 400) (5000, 1)


### Encode Label

In [273]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)
y_onehot = encoder.fit_transform(data['y'])
y_onehot

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

### Activation Function

In [274]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

### Forward Propagation

In [275]:
def forward_propagate(X, theta1, theta2):
    m = X.shape[0]
    ones = np.ones([m, 1])
    # Input Layer
    a1 = np.concatenate((ones, X), axis=1)
    # Hidden Layer
    z2 =a1*theta1.T
    a2 =sigmoid(z2)
    a2 = np.concatenate((ones, a2), axis=1)
    # Output Layer
    z3 = a2*theta2.T
    h = sigmoid(z3)
    
    return a1, z2, a2, z3, h

### Cost Function

In [276]:
def cost(params, input_size, hidden_size, num_labels, X, y, learning_rate):
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)
    # reshape the parameter array into parameter matrices for each layer
    theta1 = np.matrix(np.reshape(params[0,:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
    theta2 = np.matrix(np.reshape(params[0,hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))
    # run the feed-forward pass
    a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
    # compute the cost
    J = 0
    for i in range(m):
        first_term = np.multiply(-y[i,:], np.log(h[i,:]))
        second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:]))
        J += np.sum(first_term - second_term)
        
    J = J / m
    # cost function with regularization
    J += (float(learning_rate) / (2*m) * (np.sum(np.power(theta1[:,1:], 2)) + np.sum(np.power(theta2[:,1:],2))))
    
    return J
    

### Initialize

In [277]:
# initial setup
input_size = 400
hidden_size = 25
num_labels = 10
learning_rate = 1
# randomly initialize a parameter array of the size of the full network's parameters
params = (np.random.random(size=hidden_size * (input_size + 1) + num_labels * (hidden_size + 1)) - 0.5) * 0.2
params = params.reshape(1,hidden_size * (input_size + 1) + num_labels * (hidden_size + 1))
m = data['X'].shape[0]
X = np.matrix(data['X'])
y = np.matrix(data['y'])
# unravel the parameter array into parameter matrices for each layer
theta1 = np.matrix(np.reshape(params[0,:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
theta2 = np.matrix(np.reshape(params[0,hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))

a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)

print("X")
print(X.shape)

print("\na1")
print(a1.shape)
print(a1)

print("\ntheta1")
print(theta1.shape)
# print(theta1)

print("\nz2")
print(z2.shape)
print(z2)

print("\na2")
print(a2.shape)
print(a2)

print("\ntheta2")
print(theta2.shape)
# print(theta2)

print("\nz3")
print(z3.shape)
print(z3)

print("\nh")
print(h.shape)
print(h)

X
(5000, 400)

a1
(5000, 401)
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]

theta1
(25, 401)

z2
(5000, 25)
[[-0.11124316  0.12924514  0.14151171 ...  0.25012523  0.18344987
  -0.62078322]
 [-0.10841401  0.36218979  0.14344569 ...  0.47206997  0.25884187
  -0.26665286]
 [-0.4230997   0.13939889  0.28346869 ...  0.46829956  0.35911081
  -0.07723347]
 ...
 [-0.21732322  0.78474304 -0.29920868 ...  0.18447177  0.04772086
  -0.05696469]
 [-0.28949017  0.3957031   0.14699252 ...  0.22416434  0.08403395
  -0.26863653]
 [-0.5839135  -0.40052639 -0.13595645 ... -0.05247923  0.61271555
  -0.55196313]]

a2
(5000, 26)
[[1.         0.47221785 0.53226638 ... 0.56220732 0.54573428 0.34960334]
 [1.         0.47292301 0.58957042 ... 0.61587357 0.56435158 0.433729  ]
 [1.         0.39577526 0.5347934  ... 0.61498121 0.58882517 0.48070123]
 ...
 [1.         0.44588203 0.68670145 ... 0.5459876  

In [278]:
cost(params, input_size, hidden_size, num_labels, X, y_onehot, learning_rate)

7.012200198935983

In [279]:
loss = h - y
loss.shape

(5000, 10)

### Sigmoid Gradient

In [280]:
def sigmoid_gradient(z):
    return np.multiply(sigmoid(z), (1 - sigmoid(z)))    

### Backpropagation

In [285]:
def backprop(params, input_size, hidden_size, num_labels, X, y, learning_rate):
    m = X.shape[0]
 
    #Write codes here
    theta1 = np.matrix(np.reshape(params[0,:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
    theta2 = np.matrix(np.reshape(params[0,hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))
    
    #forward
    J = 0
    a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
    J = cost(params, input_size, hidden_size, num_labels, X, y_onehot, learning_rate)
    #print("loss: ", J)
    #gradient
    loss3 = h - y
    
    gradient1 = np.zeros((hidden_size, input_size+1))
    gradient2 = np.zeros((num_labels, hidden_size+1))
    
    
    
    gradient2 = np.matmul(np.transpose(loss3), a2)
    gradient2 = gradient2 / m
    gradient2[:,1:] += (float((learning_rate)) / m) * theta2[:,1:] 

    #print("sigmoid_gradient(z2)")
    #print(sigmoid_gradient(z2))
    #print(sigmoid_gradient(z2).shape)
    
    loss2 = np.array(np.matmul(loss3, theta2[:,1:])) * np.array(sigmoid_gradient(z2))
    gradient1 = np.matmul(np.transpose(loss2), a1)
    gradient1 = gradient1 / m
    gradient1[:,1:] += ((float(learning_rate)) / m) * theta1[:,1:]
    
    #reshape size
    #print("before:" ,gradient1.shape)
    gradient1 = gradient1.reshape(1,-1)
    gradient1 = gradient1[0][0]
    #print("after:" ,gradient1.shape)  
        
    gradient2 = gradient2.reshape(1,-1)
    gradient2 = gradient2[0][0]
    #print("after:" ,gradient2.shape) 
    
    grad = np.concatenate((gradient1,gradient2),axis = 1)
    
    return J, grad

In [291]:
new_theta = params
for i in range(20):
    loss, new_theta = backprop(new_theta, input_size, hidden_size, num_labels, X, y_onehot, learning_rate)
    print("loss", loss)
    
#X = np.matrix(X)
#theta1 = np.matrix(np.reshape(params[0,:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
#theta2 = np.matrix(np.reshape(params[0,hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))
#a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
#y_pred = np.array(np.argmax(h, axis=1) + 1)

#correct = [1 if a == b else 0 for (a, b) in zip(y_pred, y)]
#accuracy = (sum(map(int, correct)) / float(len(correct)))
#print ('accuracy = {0}%'.format(accuracy * 100))

loss 7.012200198935983
loss 27.172744342834402
loss 104.2737067277503
loss 114.98052285684608
loss 115.45502854737244
loss 109.74704629954415
loss 109.94590177242236
loss 109.74894664516735
loss 109.94863447194881
loss 109.74894405963047
loss 109.94863142623663
loss 109.7489440608422
loss 109.94863142794165
loss 109.74894406084043
loss 109.94863142794117
loss 109.74894406084043
loss 109.94863142794117
loss 109.74894406084043
loss 109.94863142794117
loss 109.74894406084043


### Predict

In [294]:
from scipy.optimize import minimize
# minimize the objective function
fmin = minimize(fun=backprop, x0=params, args=(input_size, hidden_size, num_labels, X, y_onehot, learning_rate), method='TNC', jac=True, options={'maxiter': 1})
print("fmin: ", fmin)

X = np.matrix(X)
theta1 = np.matrix(np.reshape(fmin.x[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
theta2 = np.matrix(np.reshape(fmin.x[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))
a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
y_pred = np.array(np.argmax(h, axis=1) + 1)

correct = [1 if a == b else 0 for (a, b) in zip(y_pred, y)]
accuracy = (sum(map(int, correct)) / float(len(correct)))
print ('accuracy = {0}%'.format(accuracy * 100))

IndexError: too many indices for array