In [1]:
from scipy.io import loadmat
import numpy as np
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
# Load the training set. `df` is the dict returned by scipy's loadmat
# (not a pandas DataFrame); only its 'X' and 'y' entries are used here.
df = loadmat('ex4data1.mat')
X, y = df['X'], df['y']


In [2]:
# Network architecture and regularisation strength used by the cells below.
input_layer_size = 400
hidden_layer_size = 25
num_labels = 10
lmbda = 1

# Load the weight matrices stored in ex4weights.mat and unroll them,
# row-major, into a single parameter vector (Theta1 first, then Theta2).
weights = loadmat('ex4weights.mat')
theta1, theta2 = weights['Theta1'], weights['Theta2']
nn_params = np.hstack([w.flatten() for w in (theta1, theta2)])
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)), element-wise for arrays."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [3]:
def costv(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lmbda):
    """Regularised cross-entropy cost of a network with one hidden layer.

    Parameters
    ----------
    nn_params : 1-D array
        Theta1 followed by Theta2, each flattened row-major (C order).
    input_layer_size, hidden_layer_size, num_labels : int
        Layer sizes; they determine how ``nn_params`` is split and reshaped.
    X : array, one row per example
        Inputs without the bias column (it is prepended here).
    y : array of labels, 1-D or column vector
        One-hot encoded via ``pd.get_dummies``, so the sorted distinct labels
        present in ``y`` must line up with the ``num_labels`` output units.
    lmbda : float
        Regularisation strength; bias columns of the thetas are not penalised.

    Returns
    -------
    float
        The scalar cost.
    """
    split = hidden_layer_size * (input_layer_size + 1)
    theta1 = np.reshape(nn_params[:split], (hidden_layer_size, input_layer_size + 1))
    theta2 = np.reshape(nn_params[split:], (num_labels, hidden_layer_size + 1))
    m = len(y)

    # Forward pass; a column of ones is prepended at each layer as the bias unit.
    bias = np.ones((m, 1))
    a1 = np.hstack((bias, X))
    a2 = np.hstack((bias, sigmoid(np.dot(a1, theta1.T))))
    h = sigmoid(np.dot(a2, theta2.T))

    # Convert the one-hot labels to a plain float ndarray so that the sums
    # below are ordinary numpy reductions to a scalar (no DataFrame/ndarray
    # mixing, and no reliance on how np.sum reduces a DataFrame).
    y_onehot = pd.get_dummies(y.flatten()).to_numpy(dtype=float)

    log_likelihood = np.sum(y_onehot * np.log(h) + (1 - y_onehot) * np.log(1 - h))

    # Skip column 0 (bias weights) in the L2 penalty.
    penalty = np.sum(theta1[:, 1:] ** 2) + np.sum(theta2[:, 1:] ** 2)

    return -log_likelihood / m + penalty * lmbda / (2 * m)


In [4]:
# Evaluate the regularised cost for the weights loaded from ex4weights.mat
# (nn_params) with the lmbda defined above; the value is shown as cell output.
costv(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lmbda)

0.3837698590909235

In [5]:
def sigmoidGrad(z):
    """Return the derivative of the logistic function at z: s(z) * (1 - s(z))."""
    s = 1 / (1 + np.exp(-z))
    return s * (1 - s)
def randInitializeWeights(L_in, L_out, epsilon=0.12):
    """Return random initial weights for a layer, uniform in [-epsilon, epsilon).

    Parameters
    ----------
    L_in : int
        Number of incoming units (a bias column is added, hence ``L_in + 1``).
    L_out : int
        Number of outgoing units.
    epsilon : float, optional
        Half-width of the sampling interval. Defaults to 0.12, the value that
        was previously hard-coded.

    Returns
    -------
    ndarray of shape (L_out, L_in + 1)
        Drawn from numpy's global RNG (``np.random.rand``), so results are
        reproducible only if the caller seeds it.
    """
    return np.random.rand(L_out, L_in + 1) * 2 * epsilon - epsilon

# Random starting point for training. The RNG is not seeded in this notebook,
# so these values (and everything derived from them) change on every run.
initial_theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_theta2 = randInitializeWeights(hidden_layer_size, num_labels)

# Unroll both matrices row-major into the single vector the optimiser works on.
nn_initial_params = np.hstack([w.flatten() for w in (initial_theta1, initial_theta2)])
def Gradient(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lmbda):
    """Backpropagation gradient of the regularised cost computed by ``costv``.

    Parameters
    ----------
    nn_params : 1-D array
        Theta1 followed by Theta2, each flattened row-major (C order).
    input_layer_size, hidden_layer_size, num_labels : int
        Layer sizes; they determine how ``nn_params`` is split and reshaped.
    X : array, one row per example
        Inputs without the bias column (it is prepended here).
    y : array of labels, 1-D or column vector
        One-hot encoded via ``pd.get_dummies``, so the sorted distinct labels
        present in ``y`` must line up with the ``num_labels`` output units.
    lmbda : float
        Regularisation strength; bias columns are not regularised.

    Returns
    -------
    1-D ndarray
        Gradient w.r.t. Theta1 followed by gradient w.r.t. Theta2, flattened
        row-major, i.e. the same layout as ``nn_params``.
    """
    split = hidden_layer_size * (input_layer_size + 1)
    theta1 = np.reshape(nn_params[:split], (hidden_layer_size, input_layer_size + 1))
    theta2 = np.reshape(nn_params[split:], (num_labels, hidden_layer_size + 1))
    m = len(y)

    # Plain float ndarray: avoids indexing a pandas Series with
    # [np.newaxis, :], which recent pandas versions reject.
    y_onehot = pd.get_dummies(y.flatten()).to_numpy(dtype=float)

    # Forward pass for all examples at once (replaces the per-example loop).
    bias = np.ones((m, 1))
    a1 = np.hstack((bias, X))
    z2 = np.dot(a1, theta1.T)
    a2 = np.hstack((bias, sigmoid(z2)))
    a3 = sigmoid(np.dot(a2, theta2.T))

    # Output-layer error, then hidden-layer error. The bias column of theta2
    # is dropped because no error is propagated back through the bias unit.
    d3 = a3 - y_onehot
    d2 = np.dot(d3, theta2[:, 1:]) * sigmoidGrad(z2)

    # Matrix products accumulate the per-example outer products; divide by m
    # to average over the training set.
    grad1 = np.dot(d2.T, a1) / m
    grad2 = np.dot(d3.T, a2) / m

    # Regularise every weight except the bias column (column 0).
    grad1[:, 1:] += theta1[:, 1:] * lmbda / m
    grad2[:, 1:] += theta2[:, 1:] * lmbda / m

    return np.hstack((grad1.flatten(), grad2.flatten()))

In [10]:
# Evaluate the backpropagation gradient once at the randomly initialised
# parameters and print the resulting (unrolled) gradient vector.
nn_backprop_params = Gradient(nn_initial_params, input_layer_size, hidden_layer_size, num_labels, X, y, lmbda)
print('Backpropgation Paras:',nn_backprop_params)



Backpropgation Paras: [-1.87230071e-02 -7.11489618e-06 -1.71189030e-05 ...  1.79358315e-01
  1.82545899e-01  1.82833685e-01]


In [None]:
# Train with conjugate gradient, starting from the random initial parameters.
# The regularisation strength is passed as `lmbda` (rather than a literal) so
# training uses the same setting as the cost/gradient checks above.
theta_opt = opt.fmin_cg(
    f=costv,
    x0=nn_initial_params,
    fprime=Gradient,
    args=(input_layer_size, hidden_layer_size, num_labels, X, y.flatten(), lmbda),
    maxiter=100,
)

# Roll the optimised vector back into weight matrices. This must use the
# default row-major order: the parameters were unrolled with .flatten() and are
# reshaped row-major inside costv/Gradient, so order='F' would scramble them.
theta1_opt = np.reshape(theta_opt[:hidden_layer_size*(input_layer_size+1)], (hidden_layer_size, input_layer_size+1))
theta2_opt = np.reshape(theta_opt[hidden_layer_size*(input_layer_size+1):], (num_labels, hidden_layer_size+1))

In [7]:
def predict(theta1, theta2, X, y=None):
    """Predict a label for every row of X using the given network weights.

    Parameters
    ----------
    theta1 : array, shape (hidden units, input features + 1)
        Input-to-hidden weights, bias weights in column 0.
    theta2 : array, shape (output units, hidden units + 1)
        Hidden-to-output weights, bias weights in column 0.
    X : array, one row per example
        Inputs without the bias column (it is prepended here).
    y : optional
        Unused; kept only so existing calls ``predict(theta1, theta2, X, y)``
        keep working. The number of examples is taken from ``X``.

    Returns
    -------
    1-D integer ndarray
        For each example, the 1-based index of the output unit with the
        largest activation.
    """
    m = X.shape[0]
    ones = np.ones((m, 1))
    a1 = np.hstack((ones, X))
    a2 = np.hstack((ones, sigmoid(np.dot(a1, theta1.T))))
    h = sigmoid(np.dot(a2, theta2.T))
    # argmax is 0-based; labels start at 1.
    return np.argmax(h, axis=1) + 1
# Training-set accuracy (percent) of the optimised network. Print the value
# that was actually computed instead of a hard-coded number.
pred = predict(theta1_opt, theta2_opt, X, y)
print('{:.2f}'.format(np.mean(pred == y.flatten()) * 100))

97.43
