In [3]:
import sys
sys.version

'3.8.8 (default, Apr 13 2021, 12:59:45) \n[Clang 10.0.0 ]'

# Read in Data

The MNIST dataset is contained in keras

In [1]:
import typing
import numpy as np 
from keras.datasets import mnist

In [2]:
(X_train, y_train),(X_test, y_test) = mnist.load_data()

In [5]:
def phsp(x, x_name = None):
    if x_name != None:
        print(f"{x_name} is of shape {x.shape}")

In [6]:
phsp(X_train, "x_train")
phsp(y_train, "y_train")
phsp(X_test, "x_test")
phsp(y_test, "y_test")

x_train is of shape (60000, 28, 28)
y_train is of shape (60000,)
x_test is of shape (10000, 28, 28)
y_test is of shape (10000,)


## Filter to NUM_EXAMPLES

In [7]:
NUM_EXAMPLES = 1000

In [8]:
train_imgs = X_train[0:NUM_EXAMPLES].reshape(NUM_EXAMPLES, 28*28)/255
train_lbls = y_train[0:NUM_EXAMPLES]

test_imgs = X_test.reshape(len(X_test), 28*28)/255
test_lbls = y_test

In [9]:
phsp(train_imgs,"train_images")
phsp(train_lbls, "labels")

train_images is of shape (1000, 784)
labels is of shape (1000,)


In [10]:
#rows, columns
train_imgs[0:5, 0:2] 

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

## make the labels one-hot

In [11]:
ohe = np.zeros((len(train_lbls), 10))
phsp(ohe,"train_lbls_ohe")

train_lbls_ohe is of shape (1000, 10)


In [12]:
ohe[0:5,0:4]

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [13]:
for i,l in enumerate(train_lbls):
    ohe[i][l] = 1

ohe[0:5,0:4]

array([[0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.]])

In [14]:
train_lbls = ohe
phsp(train_lbls, "train_lbls")

train_lbls is of shape (1000, 10)


## same for test

In [15]:
ohe = np.zeros((len(test_lbls), 10))
for i, l in enumerate(test_lbls):
    ohe[i][l] = 1

test_lbls = ohe
phsp(test_lbls, "test_lbls")

test_lbls is of shape (10000, 10)


# Initialize Parameters

In [127]:
np.random.seed(1)

relu = lambda x:(x>=0)*x
relu2deriv = lambda x: x>=0

batch_size = 100
alpha, iterations, hidden_size, pixels_per_image, num_labels = (0.005, 300, 40, 784, 10)

In [123]:
weights_0_1 = 0.2*np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size, num_labels)) - 0.1

In [124]:
weights_0_1[2:5, 0:3]

array([[ 0.07666122,  0.02473444,  0.05018849],
       [-0.09602397, -0.0947578 , -0.0943387 ],
       [-0.05203045, -0.00124606,  0.02399114]])

## Naive Attempt

In [110]:
#without drop out

for j in range(iterations):
    error, correct_cnt = (0.0, 0)
    
    for i in range(len(train_imgs)):
        layer_0 = train_imgs[i:i+1] #single row (1,784)
        layer_1 = relu(np.dot(layer_0, weights_0_1)) # (1,40)
        layer_2 = np.dot(layer_1, weights_1_2) # (1,10)
        
        error = error + np.sum((train_lbls[i:i+1] - layer_2) **2)
        correct_cnt += int(np.argmax(layer_2) == np.argmax(train_lbls[i:i+1]))
        
        layer_2_delta = (train_lbls[i:i+1] - layer_2)
        layer_1_delta = np.dot(layer_2_delta, weights_1_2.T) * relu2deriv(layer_1) #take the dir of the original output
        
        weights_1_2 += alpha*(np.dot(layer_1.T, layer_2_delta)) #same as np.dot
        weights_0_1 += alpha*(np.dot(layer_0.T, layer_1_delta))
    
    if(j % 50 == 0 or j == iterations -1):
        print(f"Iteration:{j}, Train Error:{error/float(len(train_imgs)):.3f}, Train Correct:{correct_cnt/float(len(train_imgs))}")
        
        

Iteration:0, Train Error:0.722, Train Correct:0.537
Iteration:50, Train Error:0.204, Train Correct:0.966
Iteration:100, Train Error:0.167, Train Correct:0.984
Iteration:150, Train Error:0.145, Train Correct:0.991
Iteration:200, Train Error:0.130, Train Correct:0.998
Iteration:250, Train Error:0.120, Train Correct:0.999
Iteration:299, Train Error:0.113, Train Correct:0.999


## Test Accuracy

Shows overfitting

In [111]:
## try on test
error, correct_cnt = (0.0, 0)

for i in range(len(test_imgs)):
    layer_0 = test_imgs[i:i+1]
    layer_1 = relu(np.dot(layer_0,weights_0_1))
    layer_2 = np.dot(layer_1, weights_1_2)
    
    error = error + np.sum((test_lbls[i:i+1] - layer_2)**2)
    correct_cnt += int(np.argmax(layer_2) == np.argmax(test_lbls[i:i+1]))
    
print(f"Test Error:{error/float(len(test_imgs)):.3f}, Test Correct:{correct_cnt/float(len(test_imgs)):.3f}")

Test Error:0.614, Test Correct:0.718


# Add dropout

In [112]:
#without drop out

for j in range(iterations):
    error, correct_cnt = (0.0, 0)
    
    for i in range(len(train_imgs)):
        layer_0 = train_imgs[i:i+1] #single row (1,784)
        layer_1 = relu(np.dot(layer_0, weights_0_1)) # (1,40)
        
        #####################################################
        dropout_mask = np.random.randint(2, size  = layer_1.shape) #0 or 1 for (1,40)
        layer_1 = layer_1*dropout_mask * 2 #randomly turn on off nodes, make sure to amplify volume of remaining
        #######################################################
        
        
        layer_2 = np.dot(layer_1, weights_1_2) # (1,10)
        
        error = error + np.sum((train_lbls[i:i+1] - layer_2) **2)
        correct_cnt += int(np.argmax(layer_2) == np.argmax(train_lbls[i:i+1]))
        
        layer_2_delta = (train_lbls[i:i+1] - layer_2)
        layer_1_delta = (np.dot(layer_2_delta, weights_1_2.T)) * relu2deriv(layer_1) #take the dir of the original output
        
        ####################################################
        layer_1_delta = layer_1_delta * dropout_mask #turn off those that were original turned off, can't update these weights
        
        weights_1_2 += alpha*(np.dot(layer_1.T, layer_2_delta)) #same as np.dot
        weights_0_1 += alpha*(np.dot(layer_0.T, layer_1_delta))
    
    if(j % 50 == 0 or j == iterations -1):
        print(f"Iteration:{j}, Train Error:{error/float(len(train_imgs)):.3f}, Train Correct:{correct_cnt/float(len(train_imgs))}")
        
        

Iteration:0, Train Error:0.606, Train Correct:0.655
Iteration:50, Train Error:0.435, Train Correct:0.767
Iteration:100, Train Error:0.437, Train Correct:0.782
Iteration:150, Train Error:0.415, Train Correct:0.811
Iteration:200, Train Error:0.394, Train Correct:0.828
Iteration:250, Train Error:0.380, Train Correct:0.838
Iteration:299, Train Error:0.373, Train Correct:0.84


In [113]:
## try on test
error, correct_cnt = (0.0, 0)

for i in range(len(test_imgs)):
    layer_0 = test_imgs[i:i+1]
    layer_1 = relu(np.dot(layer_0,weights_0_1))
    layer_2 = np.dot(layer_1, weights_1_2)
    
    error = error + np.sum((test_lbls[i:i+1] - layer_2)**2)
    correct_cnt += int(np.argmax(layer_2) == np.argmax(test_lbls[i:i+1]))
    
print(f"Test Error:{error/float(len(test_imgs)):.3f}, Test Correct:{correct_cnt/float(len(test_imgs)):.3f}")

Test Error:0.418, Test Correct:0.813


## Add batch size

In [180]:
batch_size = 100
alpha, iterations = (0.001, 300)
pixels_per_image, num_labels, hidden_size = (784, 10, 100)

weights_0_1 = 0.2*np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size, num_labels)) - 0.1

In [161]:
for j in range(iterations):
    error, correct_cnt = (0.0, 0)
    
    for i in range(int(len(train_imgs) / batch_size)):
        
        batch_start, batch_end = ((i * batch_size),((i+1)*batch_size))
        
        layer_0 = train_imgs[batch_start:batch_end] #single row (1,784) or with batch (100, 784)
        
        layer_1 = relu(np.dot(layer_0, weights_0_1)) # (1,40) 
        dropout_mask = np.random.randint(2, size  = layer_1.shape) 
        layer_1 = layer_1*dropout_mask * 2 
        
        layer_2 = np.dot(layer_1, weights_1_2) # (1,10)
        
        error = error + np.sum((train_lbls[batch_start:batch_end] - layer_2)**2)

        layer_2_delta = (train_lbls[batch_start:batch_end] - layer_2)
        
        layer_1_delta = (np.dot(layer_2_delta, weights_1_2.T)) * relu2deriv(layer_1) #take the dir of the original output
        layer_1_delta = layer_1_delta * dropout_mask
        
        weights_1_2 += alpha*(np.dot(layer_1.T, layer_2_delta)) #same as np.dot
        weights_0_1 += alpha*(np.dot(layer_0.T, layer_1_delta))
        
        for k in range(batch_size):
            correct_cnt += int(np.argmax(layer_2[k:k+1]) == np.argmax(train_lbls[batch_start+k:batch_start+k+1]))
        
    
    if(j % 50 == 0 or j == iterations -1):
        
        test_error = 0.0
        test_correct_cnt = 0

        for i in range(len(test_imgs)):
            layer_0 = test_imgs[i:i+1]
            layer_1 = relu(np.dot(layer_0,weights_0_1))
            layer_2 = np.dot(layer_1, weights_1_2)

            test_error += np.sum((test_lbls[i:i+1] - layer_2) ** 2)
            test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_lbls[i:i+1]))

        print(f'''Iteration:{j}, 
                  Train Error:{error/float(len(train_imgs)):.3f},
                  Train Correct:{correct_cnt/float(len(train_imgs))},
                  Test Error: {test_error/float(len(test_imgs)):3f},
                  Test Correct: {test_correct_cnt/float(len(test_imgs))}''')

Iteration:0, 
                  Train Error:1.196,
                  Train Correct:0.168,
                  Test Error: 0.824656,
                  Test Correct: 0.3782
Iteration:50, 
                  Train Error:0.453,
                  Train Correct:0.797,
                  Test Error: 0.439282,
                  Test Correct: 0.8059
Iteration:100, 
                  Train Error:0.426,
                  Train Correct:0.816,
                  Test Error: 0.431470,
                  Test Correct: 0.8128
Iteration:150, 
                  Train Error:0.391,
                  Train Correct:0.837,
                  Test Error: 0.427535,
                  Test Correct: 0.8174
Iteration:200, 
                  Train Error:0.382,
                  Train Correct:0.849,
                  Test Error: 0.425232,
                  Test Correct: 0.802
Iteration:250, 
                  Train Error:0.382,
                  Train Correct:0.842,
                  Test Error: 0.422566,
                 

#### New Activation Functions

In [34]:
tanh = lambda x: np.tanh(x)
tanh2deriv = lambda x: (1-(x**2))

def softmax(x):
    tmp = np.exp(x)
    return tmp / np.sum(tmp, axis = 1, keepdims = True)

In [35]:
batch_size = 100
#### alpha is much higher ? 
alpha, iterations, hidden_size = (2, 300, 100)
pixels_per_image, num_labelse = (784, 10)

###### adjusted to be between -0.01 and 0.01 
weights_0_1 = 0.02*np.random.random((pixels_per_image, hidden_size)) - 0.01
weights_1_2 = 0.2*np.random.random((hidden_size, num_labels)) - 0.1

In [36]:
for j in range(iterations):
    error, correct_cnt = (0.0, 0)
    
    for i in range(int(len(train_imgs) / batch_size)):
        
        batch_start, batch_end = ((i * batch_size),((i+1)*batch_size))
        
        layer_0 = train_imgs[batch_start:batch_end] #single row (1,784) or with batch (100, 784)
        
        layer_1 = tanh(np.dot(layer_0, weights_0_1)) # (1,40) 
        dropout_mask = np.random.randint(2, size  = layer_1.shape) 
        layer_1 = layer_1*dropout_mask * 2 
        
        layer_2 = softmax(np.dot(layer_1, weights_1_2)) # (1,10)
        
        
        ##############################
        layer_2_delta = (train_lbls[batch_start:batch_end] - layer_2)/ (batch_size * layer_2.shape[0])
        ##############################
        
        layer_1_delta = (np.dot(layer_2_delta, weights_1_2.T)) * tanh2deriv(layer_1) #take the dir of the original output
        layer_1_delta = layer_1_delta * dropout_mask
        
        weights_1_2 += alpha*(np.dot(layer_1.T, layer_2_delta)) #same as np.dot
        weights_0_1 += alpha*(np.dot(layer_0.T, layer_1_delta))
        
        for k in range(batch_size):
            correct_cnt += int(np.argmax(layer_2[k:k+1]) == np.argmax(train_lbls[batch_start+k:batch_start+k+1]))
        
    
    if(j % 50 == 0 or j == iterations -1):
        
        test_correct_cnt = 0

        for i in range(len(test_imgs)):
            layer_0 = test_imgs[i:i+1]
            ###########################################
            layer_1 = tanh(np.dot(layer_0,weights_0_1))
            ###########################################
            layer_2 = np.dot(layer_1, weights_1_2)

            test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_lbls[i:i+1]))

        print(f'''Iteration:{j}, 
                  Train Accuracy:{correct_cnt/float(len(train_imgs))},
                  Test Accuracy: {test_correct_cnt/float(len(test_imgs))}''')

Iteration:0, 
                  Train Accuracy:0.141,
                  Test Accuracy: 0.2986
Iteration:50, 
                  Train Accuracy:0.829,
                  Test Accuracy: 0.7974
Iteration:100, 
                  Train Accuracy:0.878,
                  Test Accuracy: 0.8398
Iteration:150, 
                  Train Accuracy:0.912,
                  Test Accuracy: 0.8568
Iteration:200, 
                  Train Accuracy:0.931,
                  Test Accuracy: 0.8633
Iteration:250, 
                  Train Accuracy:0.938,
                  Test Accuracy: 0.8682
Iteration:299, 
                  Train Accuracy:0.954,
                  Test Accuracy: 0.8727
