In [1]:
#Data set
import numpy as np
import matplotlib.pyplot as plt
import pickle
import pandas as pd

N = 100 # number of points per class
D = 2 # dimensionality
K = 3 # number of classes
X = np.zeros((N*K,D))
y = np.zeros(N*K, dtype='uint8')
for j in range(K):
  ix = range(N*j,N*(j+1))
  r = np.linspace(0.0,1,N) # radius
  t = np.linspace(j*4,(j+1)*4,N) + np.random.randn(N)*0.2 # theta
  X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
  y[ix] = j

pickle.dump(X,open('dataX.pickle','wb'))
pickle.dump(y,open('dataY.pickle','wb'))

In [2]:
#Partition
from sklearn.model_selection import train_test_split
data_X_train, data_X_test, data_Y_train, data_Y_test = train_test_split(X, y, test_size=0.20)
print (" Training Dataset X Shape : ",data_X_train.shape)
print (" Test Dataset X Shape : ",data_X_test.shape)
print (" Training Dataset Y Shape : ",data_Y_train.shape)
print (" Test Dataset Y Shape : ",data_Y_test.shape)

 Training Dataset X Shape :  (240, 2)
 Test Dataset X Shape :  (60, 2)
 Training Dataset Y Shape :  (240,)
 Test Dataset Y Shape :  (60,)


In [3]:
def ffn(X,y,s,r,g_iter,h1):
    #For simplicity we will take the batch size to be the same as number of examples
    num_examples = X.shape[0]
    h = h1
    np.random.seed(1234)
    W = 0.01 * np.random.randn(D,h)
    b = np.zeros((1,h))
    W2 = 0.01 * np.random.randn(h,K)
    b2 = np.zeros((1,K))
    
    #Initial value for the Gradient Descent Parameter
    step_size = s #Also called learning rate

    #For simplicity, we will not hand tune this algorithm parameter as well.
    reg= r
    
    # gradient descent loop
    for i in range(g_iter):
      
  # evaluate class scores, [N x K]
        hidden_layer = np.maximum(0, np.dot(X, W) + b) # note, ReLU activation
        scores = np.dot(hidden_layer, W2) + b2
  
  # compute the class probabilities
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
  
  # compute the loss: average cross-entropy loss and regularization
        corect_logprobs = -np.log(probs[range(num_examples),y])
        data_loss = np.sum(corect_logprobs)/num_examples
        reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
        loss = data_loss + reg_loss
        #if i % 1000 == 0:
            #print "iteration %d: loss %f" % (i, loss)
  
  # compute the gradient on scores
        dscores = probs
        dscores[range(num_examples),y] -= 1
        dscores /= num_examples
  
  # backpropate the gradient to the parameters
  # first backprop into parameters W2 and b2
        dW2 = np.dot(hidden_layer.T, dscores)
        db2 = np.sum(dscores, axis=0, keepdims=True)
  # next backprop into hidden layer
        dhidden = np.dot(dscores, W2.T)
  # backprop the ReLU non-linearity
        dhidden[hidden_layer <= 0] = 0
  # finally into W,b
        dW = np.dot(X.T, dhidden)
        db = np.sum(dhidden, axis=0, keepdims=True)
  
  # add regularization gradient contribution
        dW2 += reg * W2
        dW += reg * W
  
  # perform a parameter update
        W += -step_size * dW
        b += -step_size * db
        W2 += -step_size * dW2
        b2 += -step_size * db2
        
    return(W,b,W2,b2)

In [5]:
#Question 5.4.1
giter=[50,100,150]
for j in giter[:]:
    W,b,W2,b2= ffn(data_X_train,data_Y_train,0.4,0.001,500,j)

    hidden_layer = np.maximum(0, np.dot(data_X_train, W) + b)
    scores = np.dot(hidden_layer, W2) + b2
    predicted_class = np.argmax(scores, axis=1)
    print('Number of Hidden Units: %.2f' % j)
    print ('train accuracy : %.2f' % (np.mean(predicted_class == data_Y_train)*100))

    hidden_layer = np.maximum(0, np.dot(data_X_test, W) + b)
    scores = np.dot(hidden_layer, W2) + b2
    predicted_class = np.argmax(scores, axis=1)
    Test_Acc=(np.mean(predicted_class == data_Y_test)*100)
    print ('Test accuracy: %.2f' % Test_Acc)

Number of Hidden Units: 50.00
train accuracy : 65.83
Test accuracy: 58.33
Number of Hidden Units: 100.00
train accuracy : 67.92
Test accuracy: 61.67
Number of Hidden Units: 150.00
train accuracy : 69.58
Test accuracy: 61.67


In [6]:
def ffnl(X,y,s,r,g_iter,h):
    #For simplicity we will take the batch size to be the same as number of examples
    num_examples = X.shape[0]
    h = h
    np.random.seed(1234)
    W = 0.01 * np.random.randn(D,h)
    b = np.zeros((1,h))
    W2 = 0.01 * np.random.randn(h,K)
    b2 = np.zeros((1,K))
    
    #Initial value for the Gradient Descent Parameter
    step_size = s #Also called learning rate

    #For simplicity, we will not hand tune this algorithm parameter as well.
    reg= r
    
    # gradient descent loop
    for i in range(g_iter):
      
  # evaluate class scores, [N x K]
        #hidden_layer = np.maximum(0.01*x, np.dot(X, W) + b)# note, ReLU activation
        hidden_layer = np.maximum(np.dot(0.01*X,W)+b  , np.dot(X,W)+ b) # Leaky ReLU activation
        scores = np.dot(hidden_layer, W2) + b2
         
  # compute the class probabilities
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
  
  # compute the loss: average cross-entropy loss and regularization
        corect_logprobs = -np.log(probs[range(num_examples),y])
        data_loss = np.sum(corect_logprobs)/num_examples
        reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
        loss = data_loss + reg_loss
        #if i % 1000 == 0:
            #print "iteration %d: loss %f" % (i, loss)
  
  # compute the gradient on scores
        dscores = probs
        dscores[range(num_examples),y] -= 1
        dscores /= num_examples
  
  # backpropate the gradient to the parameters
  # first backprop into parameters W2 and b2
        dW2 = np.dot(hidden_layer.T, dscores)
        db2 = np.sum(dscores, axis=0, keepdims=True)
  # next backprop into hidden layer
        dhidden = np.dot(dscores, W2.T)
  # backprop the ReLU non-linearity
        dhidden[hidden_layer <= 0] *= 0.01
  # finally into W,b
        dW = np.dot(X.T, dhidden)
        db = np.sum(dhidden, axis=0, keepdims=True)
  
  # add regularization gradient contribution
        dW2 += reg * W2
        dW += reg * W
  
  # perform a parameter update
        W += -step_size * dW
        b += -step_size * db
        W2 += -step_size * dW2
        b2 += -step_size * db2
        
    return(W,b,W2,b2)

In [9]:
#Question 5.4.2
giter=[50,100,150]
for j in giter[:]:
    W,b,W2,b2= ffnl(data_X_train,data_Y_train,0.4,0.001,1000,j)

    hidden_layer = np.maximum(np.dot(0.01*data_X_train,W)+b  , np.dot(data_X_train,W)+ b)
    scores = np.dot(hidden_layer, W2) + b2
    predicted_class = np.argmax(scores, axis=1)
    print('Number of Hidden Units: %.2f' % j)
    print ('train accuracy : %.2f' % (np.mean(predicted_class == data_Y_train)*100))

    hidden_layer = np.maximum(np.dot(0.01*data_X_test,W)+b  , np.dot(data_X_test,W)+ b)
    scores = np.dot(hidden_layer, W2) + b2
    predicted_class = np.argmax(scores, axis=1)
    Test_Acc=(np.mean(predicted_class == data_Y_test)*100)
    print ('Test accuracy: %.2f' % Test_Acc)

Number of Hidden Units: 50.00
train accuracy : 55.42
Test accuracy: 55.00
Number of Hidden Units: 100.00
train accuracy : 56.25
Test accuracy: 55.00
Number of Hidden Units: 150.00
train accuracy : 54.17
Test accuracy: 53.33


In [11]:
def ffnmo(X,y,step_size, reg, no_iter,h):
        
    # Start with an initial set of parameters randomly
    h1 = h # size of hidden layer1
    #h2 = 100 # size of hidden layer2
    np.random.seed(1234)
    W1 = 0.01 * np.random.randn(D,h1)
    b1 = np.zeros((1,h1))
    W2 = 0.01 * np.random.randn(D,h1)
    b2 = np.zeros((1,h1))
    W3 = 0.01 * np.random.randn(h1,K)
    b3 = np.zeros((1,K))

    num_examples = X.shape[0]

    # gradient descent loop
    for i in range(no_iter):

        # evaluate class scores, [N x K]
        #hidden_layer1 = np.maximum(0, np.dot(X, W1) + b1) # note, ReLU activation
        #hidden_layer2 = np.maximum(0, np.dot(X, W2) + b2) # note, ReLU activation
        hidden_layer = np.maximum(np.dot(X, W1) + b1, np.dot(X, W2) + b2) # note, ReLU activation
        scores = np.dot(hidden_layer, W3) + b3

        # compute the class probabilities
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]

        # compute the loss: average cross-entropy loss and regularization
        corect_logprobs = -np.log(probs[range(num_examples),y])
        data_loss = np.sum(corect_logprobs)/num_examples
        reg_loss = 0.5*reg*np.sum(W1*W1) + 0.5*reg*np.sum(W2*W2) + 0.5*reg*np.sum(W3*W3)
        loss = data_loss + reg_loss
        #if i % 1000 == 0:
            #print "iteration %d: loss %f" % (i, loss)

        # compute the gradient on scores
        dscores = probs
        dscores[range(num_examples),y] -= 1
        dscores /= num_examples
        
        dW3 = np.dot(hidden_layer.T, dscores)
        db3 = np.sum(dscores, axis=0, keepdims=True)
        
        #dhidden[hidden_layer <= 0] = 0
        dhidden2 = np.where(np.dot(X, W2) + b2 >= np.dot(X, W1) + b1, np.dot(dscores, W3.T), 0)
        dhidden  = np.where(np.dot(X, W2) + b2 <= np.dot(X, W1) + b1, np.dot(dscores, W3.T), 0)
  
        # finally into W,b, W2,b2
  
        dW2 = np.dot(X.T, dhidden2)
        db2 = np.sum(dhidden2, axis=0, keepdims=True)
        dW1 = np.dot(X.T, dhidden)
        db1 = np.sum(dhidden, axis=0, keepdims=True)
        

        # add regularization gradient contribution
        dW3 += reg * W3
        dW2 += reg * W2
        dW1 += reg * W1

        # perform a parameter update
        W1 += -step_size * dW1
        b1 += -step_size * db1
        W2 += -step_size * dW2
        b2 += -step_size * db2
        W3 += -step_size * dW3
        b3 += -step_size * db3
    return(W1,b1,W2,b2,W3,b3)


In [13]:
#Question 5.4.3
giter=[50,100,150]
for j in giter[:]:
    W1,b1,W2,b2,W3,b3 = ffnmo(data_X_train,data_Y_train, 0.7,0.0005,1000,j)
    hidden_layer = np.maximum(np.dot(data_X_train, W1) + b1, np.dot(data_X_train, W2) + b2) # note, ReLU activation
    scores = np.dot(hidden_layer, W3) + b3

    predicted_class = np.argmax(scores, axis=1)
    print('Number of Hidden Units: %.2f' % j)
    print ('train accuracy : %.2f' % (np.mean(predicted_class == data_Y_train)*100))

    hidden_layer = np.maximum(np.dot(data_X_test, W1) + b1, np.dot(data_X_test, W2) + b2) # note, ReLU activation
    scores = np.dot(hidden_layer, W3) + b3
    predicted_class = np.argmax(scores, axis=1)
    Test_Acc=(np.mean(predicted_class == data_Y_test)*100)
    print ('Test accuracy: %.2f' % Test_Acc)

Number of Hidden Units: 50.00
train accuracy : 95.83
Test accuracy: 93.33
Number of Hidden Units: 100.00
train accuracy : 94.17
Test accuracy: 95.00
Number of Hidden Units: 150.00
train accuracy : 95.42
Test accuracy: 93.33
