In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as opt
import multiprocessing as mp
import pandas as pd
import scipy.io as sio

In [2]:
input_layer_size  = 400
hidden_layer_size = 25
num_labels = 10
print('Loading and Visualizing Data ...\n')
mat_contents = sio.loadmat('ex4data1.mat')
X = mat_contents['X']
y = mat_contents['y']
m = len(y)
#print(m)

rand_indices = np.random.permutation(m)
#print(rand_indices)
rand_indices = rand_indices.reshape(-1, 1)
sel = X[rand_indices[0:100, :]]



Loading and Visualizing Data ...



In [3]:
def arrangeParams(t1, t2):
    #print(t1.shape)
    #print(t2.shape)
    return np.concatenate((t1.reshape(t1.size, 1, order='F'),
                           t2.reshape(t2.size, 1, order='F'))
                          , axis=0).flatten()
           

In [4]:
def displayData(X):
    fig, ax = plt.subplots(10,10,sharex=True,sharey=True)
    img_num = 0
    for i in range(10):
        for j in range(10):
            # Convert column vector into 20x20 pixel matrix
            # You have to transpose to display correctly
            img = X[img_num,:].reshape(20,20).T
            ax[i][j].imshow(img,cmap='gray')
            img_num += 1

    return (fig, ax)
    
#figure, ax = displayData(sel)
#figure.show()

In [5]:
print('\nLoading Saved Neural Network Parameters ...\n')
nn_contents = sio.loadmat('ex4weights.mat')

Theta1 = nn_contents['Theta1'] 
Theta2 = nn_contents['Theta2']
print(Theta1.shape)
print(Theta2.shape)

nn_params = arrangeParams(Theta1, Theta2)
#print(Theta1[0][1])
#print(Theta2[0][1])


Loading Saved Neural Network Parameters ...

(25, 401)
(10, 26)


In [6]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z));

In [7]:
def sigmoidGradient(z):
    return sigmoid(z) * (1-sigmoid(z))

In [8]:
def nnCostFunctionPrateek(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lamb):
    m = X.shape[0]

    X = np.c_[np.ones(m),X]
    J=0
    Theta1 = nn_params[:(hidden_layer_size*(input_layer_size+1))].reshape((input_layer_size+1),hidden_layer_size).T
    Theta2 = nn_params[-(num_labels*(hidden_layer_size+1)):].reshape(num_labels,(hidden_layer_size+1))
#     print("Theta1 shape ", Theta1.shape)
#     print("Theta2 shape ", Theta2.shape)
#     print("X shape ", X.shape)
    a1 = sigmoid(X.dot(Theta1.T)) # X = 5000* 401 , Theta1.T =401*25 a1= 5000*25
    assert(a1.shape==(X.shape[0],Theta1.shape[0]))
    a1 = np.c_[np.ones(m), a1] # add ones in a1 so a1.shape = 5000*26
    h= sigmoid(a1.dot(Theta2.T)) # a1=5000*26 and Theta2 =10*26 so h = 5000*10
    assert(h.shape == (a1.shape[0],Theta2.shape[0]))
    for i in range(1,num_labels+1):
        yk = (y==i)*1
        
        h_of_x = (h[:,i-1].reshape(-1,1))
        J = J - (np.sum((yk*np.log(h_of_x)) + ((1-yk)*np.log(1-h_of_x))))/m
    
    
    rtheta1=np.sum(np.square(Theta1[:,1:]))
    rtheta2=np.sum(np.square(Theta2[:,1:]))
    bias = lamb/(2*m)
    J= J+(bias*(rtheta1+rtheta2));
    Del1=0
    Del2=0
    for t in range(1,m+1):
        A1 = X[t-1:t].T
        Z2 = Theta1.dot(A1)
        a = np.array([1]).reshape(-1,1)
        A2 = np.concatenate((a,sigmoid(Z2)), axis=0)
        Z3 = Theta2.dot(A2)
        H = sigmoid(Z3)
        actual = y[t-1];
        yk= np.zeros((num_labels,1))
        yk[actual-1] = 1;
        del3 = H - yk;
        del2 =(Theta2[:,1:].T.dot(del3)) * sigmoidGradient(Z2)
        Del1 = Del1 + del2 * A1.T
        Del2 = Del2 + del3 * A2.T

    Theta1_grad = (Del1/m) + (lamb/m) * np.concatenate((np.zeros((hidden_layer_size,1)),Theta1[:,1:]), axis=1)
    Theta2_grad = (Del2/m) + (lamb/m) *  np.concatenate((np.zeros((num_labels,1)), Theta2[:,1:]), axis=1)

    grad = np.concatenate((Theta1_grad.T.reshape(Theta1_grad.size,1), Theta2_grad.reshape(Theta2_grad.size,1)), axis=0)  
    grad = grad.reshape(-1)
    print(J, grad.shape)
    return J,grad

In [9]:
def nnCostFunction(nn_params_l, input_layer_size_l, hidden_layer_size_l, num_labels_l, X_l, y_l, lambda_l):
   
    Theta1_l = nn_params_l[0:(hidden_layer_size_l * (input_layer_size_l + 1))].reshape(input_layer_size_l + 1, hidden_layer_size_l).T
    Theta2_l = nn_params_l[(Theta1_l.size):nn_params_l.size].reshape(hidden_layer_size_l + 1, num_labels_l).T
   
    m = X_l.shape[0] #5000X400    
    X_l = np.c_[np.ones((m, 1)), X_l] #5000X401
   
    z1 = X_l.dot(Theta1_l.T)
    a1 = sigmoid(z1)
    a1 = np.c_[np.ones((m, 1)), a1]
    
    z2=  a1.dot(Theta2_l.T)
    h = sigmoid(z2)
   
    J = 0
    for k in range(1, num_labels_l + 1):
        yk = (y_l==k) * 1
        J = J - (1/m) * np.sum(yk * np.log(h.T[k-1:k].T) + (1-yk) * np.log(1-h.T[k-1:k].T))    
  
    
    rtheta1 = np.sum(np.square(Theta1_l[:,1:]))
    rtheta2 = np.sum(np.square(Theta2_l[:,1:]))
    bias = lambda_l/(2*m)
    
    J= J + (bias * (rtheta1+rtheta2))
    
    Del1, Del2 = 0, 0
    
    for t in range(m): 
        A1 = X_l[t,:].T.reshape(-1, 1) # all columns with one row at a time
        Z2 = Theta1_l.dot(A1)
        A2 = np.concatenate((np.c_[np.array([1])], sigmoid(Z2).reshape(-1,1)))
        Z3 = Theta2_l.dot(A2)
        H = sigmoid(Z3)
        actual = y_l[t].reshape(-1,1)
        yk = np.zeros((num_labels_l,1))
        yk[actual - 1] = 1
        del3 = H - yk;
        del2 = (Theta2_l[:,1:].T.dot(del3)) * sigmoidGradient(Z2).reshape(-1, 1)
       
        Del1 = Del1 + del2 * A1.T
        Del2 = Del2 + del3 * A2.T
    
       
    Theta1_grad = (Del1/m) + (lambda_l/m) * np.c_[np.zeros((hidden_layer_size_l,1)), Theta1_l[:,1:]]
    Theta2_grad = (Del2/m) + (lambda_l/m) * np.c_[np.zeros((num_labels_l,1)), Theta2_l[:,1:]]
    
    grad = arrangeParams(Theta1_grad, Theta2_grad)
    grad = grad.reshape(-1, 1)
    print(J, grad.shape)
    return J, grad
    
#calling    
lambda_val = 0

J, grad = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lambda_val)
print(['Cost at parameters (loaded from ex4weights): \n(this value should be about 0.287629)\n'], J);


0.2876291651613189 (10285, 1)
['Cost at parameters (loaded from ex4weights): \n(this value should be about 0.287629)\n'] 0.2876291651613189


In [10]:
print('\nChecking Cost Function (w/ Regularization) ... \n')
lambda_val = 1

J, grad = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lambda_val)
print(['Cost at parameters (loaded from ex4weights):\n(this value should be about 0.383770)\n'], J)




Checking Cost Function (w/ Regularization) ... 

0.38376985909092365 (10285, 1)
['Cost at parameters (loaded from ex4weights):\n(this value should be about 0.383770)\n'] 0.38376985909092365


In [11]:
print('\nEvaluating sigmoid gradient...\n')

g = sigmoidGradient(np.array([-1, -0.5, 0, 0.5, 1]))
print('Sigmoid gradient evaluated at [-1 -0.5 0 0.5 1]:\n  ')
print(g)



Evaluating sigmoid gradient...

Sigmoid gradient evaluated at [-1 -0.5 0 0.5 1]:
  
[0.19661193 0.23500371 0.25       0.23500371 0.19661193]


In [12]:
def randInitializeWeights(L_in, L_out):
    epsilon_init = 0.12
    return np.random.rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init

In [13]:


# ================ Part 6: Initializing Pameters ================
#  In this part of the exercise, you will be starting to implment a two
#  layer neural network that classifies digits. You will start by
#  implementing a function to initialize the weights of the neural network
#  (randInitializeWeights.m)
np.random.seed(0) #always used before random for testing
print('\nInitializing Neural Network Parameters ...\n')
initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)
initial_nn_params = arrangeParams(initial_Theta1, initial_Theta2)
print(initial_nn_params[1:5])


Initializing Neural Network Parameters ...

[ 0.10302994 -0.0220073   0.08647456  0.11775641]


In [14]:
def debugInitializeWeights(fan_out, fan_in):
    W = np.zeros((fan_out, 1 + fan_in))
    W = (np.sin(np.arange(1, W.size + 1)).reshape(W.shape)) / 10
    return W


In [15]:
def computeNumericalGradient(J, theta):

    numgrad = np.zeros(theta.shape)
    perturb = np.zeros(theta.shape)
    e = 1e-4
    #print("---------computeNumericalGradient-----------------")
    #print(theta.size)
    #print("---------computeNumericalGradient-----------------")
    for p in range(theta.size):
        #Set perturbation vector
        perturb[p] = e
        loss1 = J(theta - perturb)
        loss2 = J(theta + perturb)
        #Compute Numerical Gradient
        numgrad[p] = (loss2 - loss1) / (2*e)
        perturb[p] = 0
    return numgrad
    

In [16]:
#np.exp(0.00001)
1e-4

0.0001

In [17]:
def checkNNGradients(lamb=0):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    X  = debugInitializeWeights(m, input_layer_size - 1)
    y = np.mod([i for i in range(1,m+1) ], num_labels).reshape(-1,1)
    nn_params = arrangeParams(Theta1, Theta2)# np.concatenate((Theta1.T.reshape(Theta1.size,1), Theta2.reshape(Theta2.size,1)))

    #print(Theta1)
    #print(Theta2)
    cost, grad = nnCostFunction(nn_params,input_layer_size,hidden_layer_size, num_labels, X, y, lamb)
    
    def reduced_cost_func(p): 
        #print("--------------------------------------")
        #print(p)
        #print("--------------------------------------")
        return nnCostFunction(p,input_layer_size,hidden_layer_size,num_labels,X,y,lamb)[0]
     
    numgrad = computeNumericalGradient(reduced_cost_func, nn_params)

    #print(np.c_[numgrad, grad])

    return


In [18]:
# print('\nChecking Backpropagation... \n');

# #Check gradients by running checkNNGradients
checkNNGradients()

2.0993852797173775 (38, 1)
2.0993835131753165 (38, 1)
2.0993870462561848 (38, 1)
2.099384308840557 (38, 1)
2.0993862505997076 (38, 1)
2.099385997846823 (38, 1)
2.0993845615905786 (38, 1)
2.099387027996981 (38, 1)
2.099383531446314 (38, 1)
2.0993864489217042 (38, 1)
2.099384110508848 (38, 1)
2.0993852313567896 (38, 1)
2.0993853280779513 (38, 1)
2.099385256046543 (38, 1)
2.0993853033882366 (38, 1)
2.099385302578475 (38, 1)
2.099385256856291 (38, 1)
2.0993853280671604 (38, 1)
2.0993852313676316 (38, 1)
2.0993853090927916 (38, 1)
2.0993852503419443 (38, 1)
2.099385209040645 (38, 1)
2.0993853503940785 (38, 1)
2.0993852513520173 (38, 1)
2.09938530808279 (38, 1)
2.099385319871672 (38, 1)
2.099385239563108 (38, 1)
2.0993853514222853 (38, 1)
2.0993852080125506 (38, 1)
2.099385317046896 (38, 1)
2.09938524238782 (38, 1)
2.0993852517043696 (38, 1)
2.0993853077303797 (38, 1)
2.099385272736462 (38, 1)
2.0993852866983027 (38, 1)
2.099385300247191 (38, 1)
2.099385259187569 (38, 1)
2.0993853088522316 (

In [19]:
print('\nChecking Backpropagation (w/ Regularization) ... \n')
lambda_val = 3
checkNNGradients(lambda_val)
debug_J, debug_grad  = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lambda_val);

print(['\n\nCost at (fixed) debugging parameters (w/ lambda = %f): \n(for lambda = 3, this value should be about 0.576051)\n\n'], lambda_val, debug_J)





Checking Backpropagation (w/ Regularization) ... 

2.1459566783371105 (38, 1)
2.1459549117950494 (38, 1)
2.1459584448759177 (38, 1)
2.1459557074602897 (38, 1)
2.1459576492194405 (38, 1)
2.1459573964665557 (38, 1)
2.1459559602103115 (38, 1)
2.1459584266167138 (38, 1)
2.145954930066047 (38, 1)
2.145957847541437 (38, 1)
2.145955509128581 (38, 1)
2.1459511771919613 (38, 1)
2.145962185482245 (38, 1)
2.145958334159265 (38, 1)
2.14595502851498 (38, 1)
2.145959968324873 (38, 1)
2.1459533943493585 (38, 1)
2.145950786042759 (38, 1)
2.1459625766314985 (38, 1)
2.145961216636005 (38, 1)
2.1459521460381965 (38, 1)
2.1459557639403295 (38, 1)
2.1459575987338595 (38, 1)
2.145952711052158 (38, 1)
2.145960651622115 (38, 1)
2.145962721432644 (38, 1)
2.145950641241601 (38, 1)
2.145952851314977 (38, 1)
2.1459605113593243 (38, 1)
2.1459558194033708 (38, 1)
2.1459575432708107 (38, 1)
2.1459611941390744 (38, 1)
2.145952168535141 (38, 1)
2.1459507382067153 (38, 1)
2.1459626244675154 (38, 1)
2.145959921304432 (

In [23]:

#=================== Part 8: Training NN ===================
#  You have now implemented all the code necessary to train a neural 
#  network. To train your neural network, we will now use "fmincg", which
#  is a function which works similarly to "fminunc". Recall that these
#  advanced optimizers are able to train our cost functions efficiently as
#  long as we provide them with the gradient computations.
#
print('\nTraining Neural Network... \n')
lambda_val = 1
#print(initial_nn_params.shape)
#res = opt.minimize(nnCostFunction, initial_nn_params, args=(input_layer_size, hidden_layer_size, num_labels, X, y, lambda_val), jac=True, options={'maxiter':50})
def min_cost_func(p):
    j, g = nnCostFunction(p, input_layer_size, hidden_layer_size, num_labels, X, y, lambda_val)
    return j, g.flatten()


#min_cost_func(initial_nn_params)
res = opt.minimize(min_cost_func, initial_nn_params, jac=True, method='CG', options={'maxiter':120, 'eps': 0.8 })
print(res)


Training Neural Network... 

6.909270510710377 (10285, 1)
6.909270510710377 (10285, 1)
4.351684303986592 (10285, 1)
3.304453113873082 (10285, 1)
3.254024171558584 (10285, 1)
3.2463980418051968 (10285, 1)
3.219764637381702 (10285, 1)
3.173006291981995 (10285, 1)
3.0332331460088486 (10285, 1)
2.9170591525269134 (10285, 1)
2.6866924547673654 (10285, 1)
2.660525089713059 (10285, 1)
2.2842999211248505 (10285, 1)
2.297751842111555 (10285, 1)
2.049531660488122 (10285, 1)
1.930778251952054 (10285, 1)
1.8532783110841131 (10285, 1)
1.744318214121175 (10285, 1)
1.6438132026145353 (10285, 1)
1.5854617739813086 (10285, 1)
1.542603031819505 (10285, 1)
1.4857130217140375 (10285, 1)
1.4425446239865014 (10285, 1)
1.352208280327473 (10285, 1)
1.3159236795453264 (10285, 1)
1.2660018676285436 (10285, 1)
1.2398195688821008 (10285, 1)
1.2029863787560018 (10285, 1)
1.174386533577021 (10285, 1)
1.1109330536965558 (10285, 1)
1.08785502426485 (10285, 1)
1.050301434371719 (10285, 1)
1.025239039466908 (10285, 1)

KeyboardInterrupt: 

In [None]:
def test1(a, b, c):
    print("in test1")
    print("a = ", a)
    print("b = ", b)
    print("c = ", c)

def test2():
    a1 = 10
    b1 = 20
    c1 = 30
    def test3(d):
        print("d = ", d)
        return "test3"
    test1(test3, b1, c1)

test2()

    

In [None]:
#print(a[:,2:3])
#a

In [None]:
a = np.random.rand(4, 4)
a

In [None]:
A1 = a[1,:].T.reshape(-1, 1)
A1

In [None]:
A2 = a[1:2].T
A2

In [None]:
y = np.random.rand(4, 1)
y

In [None]:
print((y[2]).shape)

In [None]:
print(y[2,:].reshape(-1, 1).shape)