In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from datasets import MNISTDataset
import math

In [None]:
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()


data = MNISTDataset(train_images.reshape([-1, 784]), train_labels, 
                    test_images.reshape([-1, 784]), test_labels,
                    batch_size=128)

In [None]:
train_steps = 1000
learning_rate = 0.1

n_input = 784
h1 = 512
h2 = 128
n_classes = 10

# weights and bias initializations
W1 = tf.Variable(tf.random.uniform(shape = (n_input,h1), minval = -(math.sqrt(6)/math.sqrt(n_input+h1)),  
                            maxval = (math.sqrt(6)/math.sqrt(n_input+h1)))) # Xavier uniform
W2 = tf.Variable(tf.random.uniform(shape = (h1,h2), minval = -(math.sqrt(6)/math.sqrt(h1+h2)),
                             maxval = (math.sqrt(6)/math.sqrt(h1+h2)))) 
out = tf.Variable(tf.random.uniform(shape = (h2,n_classes), minval = -(math.sqrt(6/(h2+n_classes))),
                                   maxval = math.sqrt(6/(h2+n_classes)) ))

b1 = tf.Variable(tf.random.uniform([h1]))
b2 = tf.Variable(tf.random.uniform([h2]))
b_out = tf.Variable(tf.random.uniform([n_classes]))


In [None]:
for step in range(train_steps):
    img_batch, lbl_batch = data.next_batch()
   
    with tf.GradientTape() as tape:
        logit1 = tf.nn.sigmoid(tf.matmul(img_batch, W1) + b1)
        logit2 = tf.nn.sigmoid(tf.matmul(logit1, W2) + b2)
        output = tf.matmul(logit2,out) + b_out
        xent = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=output, labels=lbl_batch))
    grads = tape.gradient(xent, [W1, b1, W2, b2, out, b_out])    
    
    W1.assign_sub(learning_rate * grads[0])
    b1.assign_sub(learning_rate * grads[1])
    W2.assign_sub(learning_rate * grads[2])
    b2.assign_sub(learning_rate * grads[3])
    out.assign_sub(learning_rate * grads[4])
    b_out.assign_sub(learning_rate * grads[5])

        
    if not step % 100:
        preds = tf.argmax(output, axis=1, output_type=tf.int32)
        acc = tf.reduce_mean(tf.cast(tf.equal(preds, lbl_batch),
                             tf.float32))
        print("Loss: {} Accuracy: {}".format(xent, acc))

Loss: 2.7673261165618896 Accuracy: 0.125
Loss: 2.2043304443359375 Accuracy: 0.1484375
Loss: 1.8858424425125122 Accuracy: 0.5234375
Loss: 1.376692295074463 Accuracy: 0.6875
Loss: 1.0477204322814941 Accuracy: 0.734375
Starting new epoch...
Loss: 0.7562645673751831 Accuracy: 0.828125
Loss: 0.6474988460540771 Accuracy: 0.8671875
Loss: 0.6458619832992554 Accuracy: 0.875
Loss: 0.6489923000335693 Accuracy: 0.8046875
Loss: 0.4788413643836975 Accuracy: 0.8671875
Starting new epoch...


In [None]:
test_preds1 = tf.matmul(data.test_data, W1) + b1
test_preds2 = tf.matmul(test_preds1, W2) + b2
test_preds3 = tf.argmax(tf.matmul(test_preds2, out) + b_out, axis=1,
                       output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds3, data.test_labels),
                             tf.float32))
print(acc)

tf.Tensor(0.7957, shape=(), dtype=float32)


We used the simple softmax on the outer layer and sigmoid in the hidden layers. 


The accuracy comes out to be 80.37 % which is worse than the linear model

We'll try adjusting the activation functions now

In [None]:
for step in range(train_steps):
    img_batch, lbl_batch = data.next_batch()
   
    with tf.GradientTape() as tape:
        logit1 = tf.nn.relu(tf.matmul(img_batch, W1) + b1)
        logit2 = tf.nn.relu(tf.matmul(logit1, W2) + b2)
        output = tf.matmul(logit2,out) + b_out
        xent = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=output, labels=lbl_batch))
    grads = tape.gradient(xent, [W1, b1, W2, b2, out, b_out])    
    
    W1.assign_sub(learning_rate * grads[0])
    b1.assign_sub(learning_rate * grads[1])
    W2.assign_sub(learning_rate * grads[2])
    b2.assign_sub(learning_rate * grads[3])
    out.assign_sub(learning_rate * grads[4])
    b_out.assign_sub(learning_rate * grads[5])

        
    if not step % 100:
        preds = tf.argmax(output, axis=1, output_type=tf.int32)
        acc = tf.reduce_mean(tf.cast(tf.equal(preds, lbl_batch),
                             tf.float32))
        print("Loss: {} Accuracy: {}".format(xent, acc))

Loss: 1.6021804809570312 Accuracy: 0.796875
Loss: 0.24951386451721191 Accuracy: 0.921875
Loss: 0.19135567545890808 Accuracy: 0.9296875
Loss: 0.32472795248031616 Accuracy: 0.9375
Loss: 0.13926321268081665 Accuracy: 0.9609375
Starting new epoch...
Loss: 0.15404875576496124 Accuracy: 0.9453125
Loss: 0.31452053785324097 Accuracy: 0.921875
Loss: 0.18411535024642944 Accuracy: 0.9375
Loss: 0.08679554611444473 Accuracy: 0.9609375
Starting new epoch...
Loss: 0.13405629992485046 Accuracy: 0.9609375


In [None]:
test_preds1 = tf.matmul(data.test_data, W1) + b1
test_preds2 = tf.matmul(test_preds1, W2) + b2
test_preds3 = tf.argmax(tf.matmul(test_preds2, out) + b_out, axis=1,
                       output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds3, data.test_labels),
                             tf.float32))
print(acc)

tf.Tensor(0.8061, shape=(), dtype=float32)


Using RELU on the hidden layers and Softmax on the Outer layer we observed that the training accuracy increased significantly

## Adjusting weights and biases

In [None]:
train_steps = 1000
learning_rate = 0.1

n_input = 784
h1 = 512
h2 = 128
n_classes = 10

# weights and bias initializations
W1 = tf.Variable(tf.random.uniform(shape = (n_input,h1), minval = -0.1,maxval = 0.1)) # Xavier uniform
W2 = tf.Variable(tf.random.uniform(shape = (h1,h2), minval = -0.1,maxval = 0.1)) 
out = tf.Variable(tf.random.uniform(shape = (h2,n_classes), minval = -0.1,maxval =0.2))

b1 = tf.Variable(tf.random.uniform([h1]))
b2 = tf.Variable(tf.random.uniform([h2]))
b_out = tf.Variable(tf.random.uniform([n_classes]))


In [None]:
for step in range(train_steps):
    img_batch, lbl_batch = data.next_batch()
   
    with tf.GradientTape() as tape:
        logit1 = tf.nn.sigmoid(tf.matmul(img_batch, W1) + b1)
        logit2 = tf.nn.sigmoid(tf.matmul(logit1, W2) + b2)
        output = tf.matmul(logit2,out) + b_out
        xent = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=output, labels=lbl_batch))
    grads = tape.gradient(xent, [W1, b1, W2, b2, out, b_out])    
    
    W1.assign_sub(learning_rate * grads[0])
    b1.assign_sub(learning_rate * grads[1])
    W2.assign_sub(learning_rate * grads[2])
    b2.assign_sub(learning_rate * grads[3])
    out.assign_sub(learning_rate * grads[4])
    b_out.assign_sub(learning_rate * grads[5])

        
    if not step % 100:
        preds = tf.argmax(output, axis=1, output_type=tf.int32)
        acc = tf.reduce_mean(tf.cast(tf.equal(preds, lbl_batch),
                             tf.float32))
        print("Loss: {} Accuracy: {}".format(xent, acc))

Loss: 2.309579372406006 Accuracy: 0.1484375
Loss: 2.1901614665985107 Accuracy: 0.28125
Loss: 1.9276642799377441 Accuracy: 0.4140625
Loss: 1.4804365634918213 Accuracy: 0.640625
Starting new epoch...
Loss: 1.254069209098816 Accuracy: 0.6875
Loss: 0.877936840057373 Accuracy: 0.78125
Loss: 0.7249398231506348 Accuracy: 0.84375
Loss: 0.6332966685295105 Accuracy: 0.8359375
Loss: 0.541893720626831 Accuracy: 0.8671875
Starting new epoch...
Loss: 0.5570704340934753 Accuracy: 0.8359375


In [None]:
test_preds1 = tf.matmul(data.test_data, W1) + b1
test_preds2 = tf.matmul(test_preds1, W2) + b2
test_preds3 = tf.argmax(tf.matmul(test_preds2, out) + b_out, axis=1,
                       output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds3, data.test_labels),
                             tf.float32))
print(acc)

tf.Tensor(0.098, shape=(), dtype=float32)


Adjust weights and biases gives loss of 0.55 and Accuracy of 83%


Now adjusting the activation function (relu) on the changed weights

In [None]:
for step in range(train_steps):
    img_batch, lbl_batch = data.next_batch()
   
    with tf.GradientTape() as tape:
        logit1 = tf.nn.relu(tf.matmul(img_batch, W1) + b1)
        logit2 = tf.nn.relu(tf.matmul(logit1, W2) + b2)
        output = tf.matmul(logit2,out) + b_out
        xent = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=output, labels=lbl_batch))
    grads = tape.gradient(xent, [W1, b1, W2, b2, out, b_out])    
    
    W1.assign_sub(learning_rate * grads[0])
    b1.assign_sub(learning_rate * grads[1])
    W2.assign_sub(learning_rate * grads[2])
    b2.assign_sub(learning_rate * grads[3])
    out.assign_sub(learning_rate * grads[4])
    b_out.assign_sub(learning_rate * grads[5])

        
    if not step % 100:
        preds = tf.argmax(output, axis=1, output_type=tf.int32)
        acc = tf.reduce_mean(tf.cast(tf.equal(preds, lbl_batch),
                             tf.float32))
        print("Loss: {} Accuracy: {}".format(xent, acc))

Loss: 1.2685776948928833 Accuracy: 0.796875
Loss: 0.24316102266311646 Accuracy: 0.8984375
Loss: 0.19263552129268646 Accuracy: 0.9453125
Loss: 0.312516450881958 Accuracy: 0.90625
Starting new epoch...
Loss: 0.3839570879936218 Accuracy: 0.890625
Loss: 0.23785501718521118 Accuracy: 0.9296875
Loss: 0.22646576166152954 Accuracy: 0.921875
Loss: 0.11145719885826111 Accuracy: 0.9609375
Starting new epoch...
Loss: 0.17879125475883484 Accuracy: 0.9296875
Loss: 0.1379394680261612 Accuracy: 0.953125


After adjusting the activatio function we see improvement in loss and accuracy.
We get loss of 0.13 and accuracy of 95%.

In [None]:
test_preds1 = tf.matmul(data.test_data, W1) + b1
test_preds2 = tf.matmul(test_preds1, W2) + b2
test_preds3 = tf.argmax(tf.matmul(test_preds2, out) + b_out, axis=1,
                       output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds3, data.test_labels),
                             tf.float32))
print(acc)

tf.Tensor(0.8328, shape=(), dtype=float32)


## Fashion MNIST

In [None]:
tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()


data = MNISTDataset(train_images.reshape([-1, 784]), train_labels, 
                    test_images.reshape([-1, 784]), test_labels,
                    batch_size=128)

In [None]:
train_steps = 1000
learning_rate = 0.1

n_input = 784
h1 = 512
h2 = 128
n_classes = 10

# weights and bias initializations
W1 = tf.Variable(tf.random.uniform(shape = (n_input,h1), minval = -(math.sqrt(6)/math.sqrt(n_input+h1)),  
                            maxval = (math.sqrt(6)/math.sqrt(n_input+h1)))) # Xavier uniform
W2 = tf.Variable(tf.random.uniform(shape = (h1,h2), minval = -(math.sqrt(6)/math.sqrt(h1+h2)),
                             maxval = (math.sqrt(6)/math.sqrt(h1+h2)))) 
out = tf.Variable(tf.random.uniform(shape = (h2,n_classes), minval = -(math.sqrt(6/(h2+n_classes))),
                                   maxval = math.sqrt(6/(h2+n_classes)) ))

b1 = tf.Variable(tf.random.uniform([h1]))
b2 = tf.Variable(tf.random.uniform([h2]))
b_out = tf.Variable(tf.random.uniform([n_classes]))

In [None]:
for step in range(train_steps):
    img_batch, lbl_batch = data.next_batch()
   
    with tf.GradientTape() as tape:
        logit1 = tf.nn.sigmoid(tf.matmul(img_batch, W1) + b1)
        logit2 = tf.nn.sigmoid(tf.matmul(logit1, W2) + b2)
        output = tf.matmul(logit2,out) + b_out
        xent = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=output, labels=lbl_batch))
    grads = tape.gradient(xent, [W1, b1, W2, b2, out, b_out])    
    
    W1.assign_sub(learning_rate * grads[0])
    b1.assign_sub(learning_rate * grads[1])
    W2.assign_sub(learning_rate * grads[2])
    b2.assign_sub(learning_rate * grads[3])
    out.assign_sub(learning_rate * grads[4])
    b_out.assign_sub(learning_rate * grads[5])

        
    if not step % 100:
        preds = tf.argmax(output, axis=1, output_type=tf.int32)
        acc = tf.reduce_mean(tf.cast(tf.equal(preds, lbl_batch),
                             tf.float32))
        print("Loss: {} Accuracy: {}".format(xent, acc))

Loss: 2.687591075897217 Accuracy: 0.0859375
Loss: 2.206146240234375 Accuracy: 0.3125
Loss: 1.8497662544250488 Accuracy: 0.5
Loss: 1.419837474822998 Accuracy: 0.6796875
Loss: 1.0113914012908936 Accuracy: 0.7265625
Starting new epoch...
Loss: 0.7953387498855591 Accuracy: 0.8125
Loss: 0.7797795534133911 Accuracy: 0.8046875
Loss: 0.550279438495636 Accuracy: 0.8515625
Loss: 0.536864161491394 Accuracy: 0.8203125
Loss: 0.5461123585700989 Accuracy: 0.890625
Starting new epoch...


In [None]:
test_preds1 = tf.matmul(data.test_data, W1) + b1
test_preds2 = tf.matmul(test_preds1, W2) + b2
test_preds3 = tf.argmax(tf.matmul(test_preds2, out) + b_out, axis=1,
                       output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds3, data.test_labels),
                             tf.float32))
print(acc)

tf.Tensor(0.8166, shape=(), dtype=float32)


Adjusting activation function

In [None]:
for step in range(train_steps):
    img_batch, lbl_batch = data.next_batch()
   
    with tf.GradientTape() as tape:
        logit1 = tf.nn.relu(tf.matmul(img_batch, W1) + b1)
        logit2 = tf.nn.relu(tf.matmul(logit1, W2) + b2)
        output = tf.matmul(logit2,out) + b_out
        xent = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=output, labels=lbl_batch))
    grads = tape.gradient(xent, [W1, b1, W2, b2, out, b_out])    
    
    W1.assign_sub(learning_rate * grads[0])
    b1.assign_sub(learning_rate * grads[1])
    W2.assign_sub(learning_rate * grads[2])
    b2.assign_sub(learning_rate * grads[3])
    out.assign_sub(learning_rate * grads[4])
    b_out.assign_sub(learning_rate * grads[5])

        
    if not step % 100:
        preds = tf.argmax(output, axis=1, output_type=tf.int32)
        acc = tf.reduce_mean(tf.cast(tf.equal(preds, lbl_batch),
                             tf.float32))
        print("Loss: {} Accuracy: {}".format(xent, acc))

Loss: 1.1053563356399536 Accuracy: 0.8671875
Loss: 0.5295026302337646 Accuracy: 0.8671875
Loss: 0.24275965988636017 Accuracy: 0.9296875
Loss: 0.33690667152404785 Accuracy: 0.9140625
Loss: 0.2732604146003723 Accuracy: 0.9140625
Starting new epoch...
Loss: 0.14272314310073853 Accuracy: 0.9453125
Loss: 0.19466015696525574 Accuracy: 0.9453125
Loss: 0.13944914937019348 Accuracy: 0.96875
Loss: 0.163396418094635 Accuracy: 0.9609375
Starting new epoch...
Loss: 0.19348980486392975 Accuracy: 0.9609375


Adjusting the activation function gives us better loss (0.193) and accuracy (96%).

In [None]:
test_preds1 = tf.matmul(data.test_data, W1) + b1
test_preds2 = tf.matmul(test_preds1, W2) + b2
test_preds3 = tf.argmax(tf.matmul(test_preds2, out) + b_out, axis=1,
                       output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds3, data.test_labels),
                             tf.float32))
print(acc)

tf.Tensor(0.8411, shape=(), dtype=float32)
