In [None]:
import numpy as np
#%matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.core.debugger import set_trace
import warnings
import math
warnings.filterwarnings('ignore')
np.random.seed(1234)

In [None]:
import tensorflow as tf

# Task 1: Acquire the Data

In [None]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
assert x_train.shape == (60000, 28, 28)
assert x_test.shape == (10000, 28, 28)
assert y_train.shape == (60000,)
assert y_test.shape == (10000,)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [None]:
def one_hot(y, c):

    # y--> label/ground truth.
    # c--> Number of classes.

    # A zero matrix of size (m, c)
    y_hot = np.zeros((len(y), c))

    # Putting 1 for column where the label is,
    # Using multidimensional indexing.
    y_hot[np.arange(len(y)), y] = 1

    return y_hot



In [None]:
#vectorize
x_train_f = np.reshape(x_train, (60000,  784))
x_test_f = np.reshape(x_test, (10000,  784))

x_train_f=x_train_f[0:1000]
y_train=y_train[0:1000]
x_test_f=x_test_f[0:400]
y_test=y_test[0:400]


In [None]:
#normalize
def norm(arr):
  mean = np.mean(arr, axis = 0)
  arr = np.subtract(arr,mean)
  std = np.std(arr, axis = 0)
  arr = np.divide(arr,std)
  return arr

#x_train_fn = norm(x_train_f)
x_train_fn=x_train_f/255
#x_test_fn = norm(x_test_f)
x_test_fn=x_test_f/255


# Task 2: Implement a Multilayer Perceptron

In [None]:
class GradientDescent:

    def __init__(self, learning_rate=.001, max_iters=1e4, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.max_iters = max_iters
        self.epsilon = epsilon

    def run(self, gradient_fn, x, y, params):
        norms = np.array([np.inf])
        t = 1
        losses=[]
        while np.any(norms > self.epsilon) and t < self.max_iters:
            grad, yh= gradient_fn(x, y, params)

            momentum = 0.9                         # momentum

            for p in range(len(grad)):
                #update_p = np.zeros(grad[p].shape)

                params[p]-= self.learning_rate * grad[p]

#Using SGD with Nesterov acceleration Optimizer to update the weights and biases: https://medium.com/@neuralthreads/l1-l2-regularization-adding-penalties-to-the-loss-function-b5c330d30b3f

                # update_p= -self.learning_rate * grad[p] + momentum * update_p
                # update_p_= -self.learning_rate * grad[p] + momentum * update_p
                # params[p]+= update_p_

            t += 1
            loss = -np.mean(np.log(yh[np.arange(len(y)), y]))
            if t%100==0:

              print('Epoch {epoch}==> Loss = {loss}' .format(epoch=t, loss=loss))
              losses.append(loss)
            norms = np.array([np.linalg.norm(g) for g in grad])
        return params , losses

In [None]:
# Useful functions
# ReLu
def relu(x):
  return(np.maximum(0,x))

# Three different Softmax
def stable_softmax(x):
    z = x - np.max(x, axis=-1, keepdims=True)
    numerator = np.exp(z)
    denominator = np.sum(numerator, axis=-1, keepdims=True)
    softmax = numerator / denominator
    return softmax

def softmax(X):
    X_exp = np.exp(X)
    partition = X_exp.sum(1, keepdims=True)
    return X_exp / partition

def logsoftmax(x, recover_probs=True):  #This gives us the most stable softmax! No underflowing or overflowing. Theory explained here: https://ogunlao.github.io/2020/04/26/you_dont_really_know_softmax.html

    max_x = np.max(x)
    exp_x = np.exp(x - max_x)
    sum_exp_x = np.sum(exp_x)
    log_sum_exp_x = np.log(sum_exp_x)
    max_plus_log_sum_exp_x = max_x + log_sum_exp_x
    log_probs = x - max_plus_log_sum_exp_x

    # Recover probs
    if recover_probs:
        exp_log_probs = np.exp(log_probs)
        sum_log_probs = np.sum(exp_log_probs)
        probs = exp_log_probs / sum_log_probs
        return probs

    return log_probs

# tanh
def tanh_act(x):
  return (np.tanh(x))

# Leaky ReLu
def leaky_relu(x, l):
  return (np.maximum(0, x) + l * np.minimum(0, x))

#Relu derivative
def reluDerivative(x):
     x[x<=0] = 0
     x[x>0] = 1
     return x

# #Cross entropy
# def cross_E(y_true, y_pred):                    # CE
#     return -np.sum(y_true * np.log(y_pred + 10**-100))
# def cross_E_grad(y_true, y_pred):               # CE derivative
#     return -y_true/(y_pred + 10**-100)

In [None]:

class MLP:

    def __init__(self, hidden_units, HL, activation):
        self.M = hidden_units
        self.HL = HL
        self.activation = activation

        dv_sum=np.zeros((784, self.M))
    def fit(self, x, y, optimizer):
        # D: number of features
        # N: number of samples
        N, D = x.shape

        def gradient(x, y, params):

          if (self.HL==0):
            v,b1= params
            yh = stable_softmax(np.dot(x,v)+b1)  #N x M
            y_hot=one_hot(y,10)
            dy = yh-y_hot
            dv = np.dot(x.T, dy)/N
            dparams = [dv, b1]
            return dparams, yh

          elif (self.HL==1):
            v, w ,b1,b2= params
            z = self.activation(np.dot(x, v)+b1) #N x C
            yh =stable_softmax(np.dot(z, w)+b2)#N
            dy = (yh - one_hot(y,10))
            dw = np.dot(z.T, dy)/N  #M x C
            dv_sum=np.zeros((D, self.M))

            for i in range (0,10):
              dz = np.outer(dy[:,i], w[:,i])
              dv = np.dot(x.T, dz * reluDerivative(z))/N
              dv_sum = dv_sum+dv

            dparams = [dv_sum, dw, b1, b2]
            return dparams , yh

          elif (self.HL==2):
            v, w ,p ,b1 ,b2 ,b3 = params
            z = self.activation(np.dot(x, v)+b1)    #N x C
            k = self.activation(np.dot(z, w)+b2)
            yh = stable_softmax(np.dot(k, p)+b3)     #N

            dy = (yh - one_hot(y,10))
            dp = np.dot(k.T, dy)/N        #M x C
            dw_sum=np.zeros((self.M, self.M))

            for i in range (0,10):
              dk=np.outer(dy[:,i], p[:,i])
              dw = np.dot(z.T, dk * reluDerivative(k))/N
              dw_sum=dw_sum+dw

            dv_sum=np.zeros((D, self.M))

            for i in range (0,10):
              dz = np.outer(dk[:,i], w[:,i])
              dv = np.dot(x.T, dz * reluDerivative(z))/N
              dv_sum = dv_sum+dv

            dparams = [dv_sum, dw_sum, dp , b1, b2,b3]
            return dparams , yh

        if (self.HL==0):
          v = np.random.randn(D, 10) * .01
          b1 = np.random.randn(1, 10) *.01
          params0 = [v,b1]
          self.params, losses = optimizer.run(gradient, x, y, params0)

        elif (self.HL==1):
          w = np.random.randn(self.M,10) * .01 # changed from 0.1 to math.sqrt(self.M)
          v = np.random.randn(D,self.M) * .01
          b1 = np.random.randn(1, self.M) *.01
          b2 = np.random.randn(1, 10) *.01
          params0 = [v,w,b1,b2]
          self.params, losses = optimizer.run(gradient, x, y, params0)

        elif (self.HL==2):
          p = np.random.randn(self.M, 10) * .01
          w = np.random.randn(self.M, self.M) * .01
          v = np.random.randn(D, self.M) * .01
          b1 = np.random.randn(1, self.M) *.01
          b2 = np.random.randn(1, self.M) *.01
          b3 = np.random.randn(1, 10) *.01
          params0 = [v,w,p,b1,b2,b3]
          self.params, losses = optimizer.run(gradient, x, y, params0)

        return self, losses

    def predict(self, x):

      if (self.HL==1):
        v, w , b1, b2= self.params
        z = self.activation(np.dot(x, v)+b1) #N x M
        yh = stable_softmax(np.dot(z, w)+b2)#N

      elif (self.HL==0):
        v,b1=self.params
        yh=stable_softmax(np.dot(x,v)+b1)

      elif (self.HL==2):
        v, w, p, b1, b2, b3 = self.params
        z = self.activation(np.dot(x, v)+b1) #N x M
        k = self.activation(np.dot(z, w)+b2) #M x M
        yh = stable_softmax(np.dot(k, p)+b3)  #N

      return np.argmax(yh, axis=1)


In [None]:
#swith 4000
model = MLP(hidden_units=64, HL=2, activation=relu)
optimizer = GradientDescent(learning_rate=.1, max_iters=4000)
output, losses_2 = model.fit(x_train_fn, y_train, optimizer)

yh= output.predict(x_test_fn)


Epoch 100==> Loss = 2.3020973009466354
Epoch 200==> Loss = 2.301132763925152
Epoch 300==> Loss = 2.29675791759154
Epoch 400==> Loss = 2.1737323019188564
Epoch 500==> Loss = 1.9103757437126843
Epoch 600==> Loss = 1.8909053047789421
Epoch 700==> Loss = 1.8725384997831924
Epoch 800==> Loss = 1.843791758074598
Epoch 900==> Loss = 1.7998864417412992
Epoch 1000==> Loss = 1.7153661273852854
Epoch 1100==> Loss = 1.5934241246189953
Epoch 1200==> Loss = 1.4908926184181173
Epoch 1300==> Loss = 1.4311755245889475
Epoch 1400==> Loss = 1.3976332543002083
Epoch 1500==> Loss = 1.3721931363093913
Epoch 1600==> Loss = 1.3467147671212514
Epoch 1700==> Loss = 1.31875604681474
Epoch 1800==> Loss = 1.2880675458020376
Epoch 1900==> Loss = 1.25678862921562
Epoch 2000==> Loss = 1.2289277305716941
Epoch 2100==> Loss = 1.225612070991332
Epoch 2200==> Loss = 1.208850049366838
Epoch 2300==> Loss = 1.1948454418182457
Epoch 2400==> Loss = 1.1838679986364087
Epoch 2500==> Loss = 1.1700367252824417
Epoch 2600==> Loss 

In [None]:

print(yh)

print(losses_2)

[9 2 1 1 4 1 6 4 7 7 4 7 7 3 4 1 2 6 8 0 2 7 7 7 1 2 4 3 9 1 8 0 3 1 8 0 7
 7 7 9 0 1 3 7 6 9 6 1 4 4 2 4 7 2 6 2 8 4 8 0 7 7 8 7 1 1 3 1 7 8 7 0 2 0
 4 1 1 4 8 2 1 8 5 9 7 0 1 4 0 6 5 3 4 7 1 8 0 1 6 4 3 4 7 6 7 8 7 9 9 4 2
 7 7 0 9 2 8 4 7 2 0 0 9 9 1 0 5 4 1 5 4 1 9 1 8 6 4 1 6 7 1 6 0 0 1 6 1 1
 4 6 4 4 1 1 7 6 4 7 9 1 7 2 0 9 0 9 4 7 4 6 3 7 4 1 4 4 1 0 9 1 0 9 6 8 7
 9 9 4 4 7 1 2 0 6 3 2 5 1 2 1 1 0 4 9 4 4 0 7 9 0 4 1 8 4 1 3 1 2 7 4 8 8
 1 0 7 7 0 4 7 0 7 8 9 2 9 0 9 1 4 4 9 2 9 2 4 8 4 4 2 4 9 7 4 7 5 4 4 5 4
 3 0 2 8 0 0 4 1 2 9 0 1 4 0 4 1 0 8 1 7 4 0 1 6 1 0 4 6 7 8 1 7 9 7 0 7 5
 1 9 8 3 1 1 4 8 0 0 4 9 7 7 1 3 2 2 4 4 7 1 2 2 5 4 6 7 6 4 7 7 7 3 3 7 1
 7 1 1 7 2 2 3 4 0 3 1 0 1 9 4 9 9 1 7 2 1 1 0 6 4 4 6 1 4 4 4 4 7 1 2 2 5
 0 7 9 4 0 9 3 9 1 6 8 0 6 1 9 2 1 6 2 2 6 4 9 5 2 0 4 4 1 2]
[2.3020973009466354, 2.301132763925152, 2.29675791759154, 2.1737323019188564, 1.9103757437126843, 1.8909053047789421, 1.8725384997831924, 1.8437917580745977, 1.7998864417412987, 1.71536612

In [None]:
def evaluate_acc(true_y,predicted_y):
  print("true y shape: " + str(true_y.shape[0]))
  print("np.sum(predicted_y == true_y): " + str(np.sum(predicted_y == true_y)))
  return (np.sum(predicted_y == true_y)/true_y.shape[0])

In [None]:
mlp_acc_1= evaluate_acc(y_test,yh)
print("--------------------------------")
print("# of hidden layers: 2")
print("# of units: 64")
print("Accuracy: " + str(mlp_acc_1))


true y shape: 400
np.sum(predicted_y == true_y): 253
--------------------------------
# of hidden layers: 1
# of units: 128
Accuracy: 0.6325


# Task 3: Experiments

In [None]:
#with 8000
model = MLP(hidden_units=64, HL=2, activation=relu)
optimizer = GradientDescent(learning_rate=.1, max_iters=8000)
output, losses_3 = model.fit(x_train_fn, y_train, optimizer)

yh= output.predict(x_test_fn)

Epoch 100==> Loss = 2.302169032410297
Epoch 200==> Loss = 2.301444603921684
Epoch 300==> Loss = 2.2995403510339383
Epoch 400==> Loss = 2.2912015344613446
Epoch 500==> Loss = 2.139489770208229
Epoch 600==> Loss = 1.919475065487825
Epoch 700==> Loss = 1.8914866787005722
Epoch 800==> Loss = 1.8930506821611384
Epoch 900==> Loss = 1.8785875817879578
Epoch 1000==> Loss = 1.8545208300405833
Epoch 1100==> Loss = 1.8241916922562162
Epoch 1200==> Loss = 1.77801948433492
Epoch 1300==> Loss = 1.7013605873312665
Epoch 1400==> Loss = 1.6042910374438566
Epoch 1500==> Loss = 1.5203407222961918
Epoch 1600==> Loss = 1.4618189253687444
Epoch 1700==> Loss = 1.4185711249359234
Epoch 1800==> Loss = 1.3863631094317244
Epoch 1900==> Loss = 1.3596389520428767
Epoch 2000==> Loss = 1.3371487599816319
Epoch 2100==> Loss = 1.3205008243944842
Epoch 2200==> Loss = 1.3123652085572943
Epoch 2300==> Loss = 1.3052072338912273
Epoch 2400==> Loss = 1.2975836209771805
Epoch 2500==> Loss = 1.2895834648641533
Epoch 2600==> L

In [None]:
def evaluate_acc(true_y,predicted_y):
  print("true y shape: " + str(true_y.shape[0]))
  print("np.sum(predicted_y == true_y): " + str(np.sum(predicted_y == true_y)))
  return (np.sum(predicted_y == true_y)/true_y.shape[0])

mlp_acc_1= evaluate_acc(y_test,yh)
print("--------------------------------")
print("# of hidden layers: 2")
print("# of units: 64")
print("Accuracy: " + str(mlp_acc_1))

true y shape: 400
np.sum(predicted_y == true_y): 251
--------------------------------
# of hidden layers: 2
# of units: 64
Accuracy: 0.6275


In [None]:
#with 8000
model = MLP(hidden_units=64, HL=1, activation=relu)
optimizer = GradientDescent(learning_rate=.1, max_iters=4000)
output, losses_4 = model.fit(x_train_fn, y_train, optimizer)

yh_2= output.predict(x_test_fn)

Epoch 100==> Loss = 0.9352407673058577
Epoch 200==> Loss = 0.6923510009046661
Epoch 300==> Loss = 0.5367905304631673
Epoch 400==> Loss = 0.4408959096271017
Epoch 500==> Loss = 0.3684593991823587
Epoch 600==> Loss = 0.3182868244074773
Epoch 700==> Loss = 0.3115668185914102
Epoch 800==> Loss = 0.2430013002558912
Epoch 900==> Loss = 0.21357973095702756
Epoch 1000==> Loss = 0.18091889992765015
Epoch 1100==> Loss = 0.16784292437993298
Epoch 1200==> Loss = 0.1573885861744172
Epoch 1300==> Loss = 0.1615248510583485
Epoch 1400==> Loss = 0.1235048917748485
Epoch 1500==> Loss = 0.0932180732849198
Epoch 1600==> Loss = 0.07959145556891103
Epoch 1700==> Loss = 0.07562179914144367
Epoch 1800==> Loss = 0.061751336887682175
Epoch 1900==> Loss = 0.054884688138625536
Epoch 2000==> Loss = 0.04890900610290354
Epoch 2100==> Loss = 0.04353724449600618
Epoch 2200==> Loss = 0.03923849771083114
Epoch 2300==> Loss = 0.035269843224788704
Epoch 2400==> Loss = 0.03206275021283487
Epoch 2500==> Loss = 0.02922342757

In [None]:
def evaluate_acc(true_y,predicted_y):
  print("true y shape: " + str(true_y.shape[0]))
  print("np.sum(predicted_y == true_y): " + str(np.sum(predicted_y == true_y)))
  return (np.sum(predicted_y == true_y)/true_y.shape[0])

mlp_acc_1= evaluate_acc(y_test,yh_2)
print("--------------------------------")
print("# of hidden layers: 1")
print("# of units: 64")
print("Accuracy: " + str(mlp_acc_1))

true y shape: 400
np.sum(predicted_y == true_y): 324
--------------------------------
# of hidden layers: 1
# of units: 64
Accuracy: 0.81


In [None]:
#with 8000
model = MLP(hidden_units=64, HL=0, activation=relu)
optimizer = GradientDescent(learning_rate=.1, max_iters=4000)
output, losses_5 = model.fit(x_train_fn, y_train, optimizer)

yh_2= output.predict(x_test_fn)

NameError: ignored

In [None]:
print(losses_2 , "\n", losses_3, "\n", losses_4 )

NameError: ignored

## 1. 3 Different Models


In [None]:
#MLP with no hidden layers, i.e., it directly maps the inputs to outputs
model = MLP(M=128, HL=0, activation=relu)
optimizer = GradientDescent(learning_rate=.1, max_iters=20000)
yh = model.fit(x_train_f, y_train, optimizer).predict(x_test_f)
mlp_acc = evaluate_acc(y_test,yh)
print("--------------------------------")
print("# of hidden layers: none")
print("# of units: 128")
print("Activation function: ReLu")
print("Accuracy: " + str(mlp_acc))

#MLP with a single hidden layer having 128 units and ReLU activations
model = MLP(M=128, HL=1, activation=relu)
optimizer = GradientDescent(learning_rate=.1, max_iters=20000)
yh = model.fit(x_train_f, y_train, optimizer).predict(x_test_f)
mlp_acc = evaluate_acc(y_test,yh)
print("--------------------------------")
print("# of hidden layers: 1")
print("# of units: 128")
print("Activation function: ReLu")
print("Accuracy: " + str(mlp_acc))

#MLP with 2 hidden layers each having 128 units with ReLU activations
model = MLP(M=128, HL=2, activation=relu)
optimizer = GradientDescent(learning_rate=.1, max_iters=20000)
yh = model.fit(x_train_f, y_train, optimizer).predict(x_test_f)
mlp_acc = evaluate_acc(y_test,yh)
print("--------------------------------")
print("# of hidden layers: 2")
print("# of units: 128")
print("Activation function: ReLu")
print("Accuracy: " + str(mlp_acc))

## 2. Activation Functions

In [None]:
# Activation function: tanh
model = MLP(M=128, HL=2, activation=tanh_act)
optimizer = GradientDescent(learning_rate=.1, max_iters=20000)
yh = model.fit(x_train_f, y_train, optimizer).predict(x_test_f)
mlp_acc = evaluate_acc(y_test,yh)
print("--------------------------------")
print("# of hidden layers: 2")
print("# of units: 128")
print("Activation function: tanh")
print("Accuracy: " + str(mlp_acc))

# Activation function: Leaky-ReLu
# we probably need training for the parameters of leaky ReLu !!
model = MLP(M=128, HL=2, activation=leaky_relu)
optimizer = GradientDescent(learning_rate=.1, max_iters=20000)
yh = model.fit(x_train_f, y_train, optimizer).predict(x_test_f)
mlp_acc = evaluate_acc(y_test,yh)
print("--------------------------------")
print("# of hidden layers: 2")
print("# of units: 128")
print("Activation function: Leaky ReLu")
print("Accuracy: " + str(mlp_acc))

## 3. Dropout Regularization

In [None]:
# Dropout during training

In [None]:
# Dropout during testing

## 4. Training with Unnormalized Images

In [None]:
model = MLP(M=128, HL=2, activation=relu)
optimizer = GradientDescent(learning_rate=.1, max_iters=20000)
yh = model.fit(x_train_f, y_train, optimizer).predict(x_test_f)
mlp_acc = evaluate_acc(y_test,yh)
print("--------------------------------")
print("# of hidden layers: 2")
print("# of units: 128")
print("Activation function: ReLu")
print("Data: unnormalized")
print("Accuracy: " + str(mlp_acc))

## 5. Convolutional Neural Network

In [None]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt


training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)

test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 26421880/26421880 [00:01<00:00, 18913215.00it/s]


Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 29515/29515 [00:00<00:00, 317909.42it/s]


Extracting data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 4422102/4422102 [00:00<00:00, 5562990.85it/s]


Extracting data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5148/5148 [00:00<00:00, 15184442.33it/s]


Extracting data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw



In [None]:
print(training_data)

Dataset FashionMNIST
    Number of datapoints: 60000
    Root location: data
    Split: Train
    StandardTransform
Transform: ToTensor()


In [None]:
labels_map = {
    0: "T-Shirt",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle Boot",
}
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows + 1):
    sample_idx = torch.randint(len(training_data), size=(1,)).item()
    img, label = training_data[sample_idx]
    figure.add_subplot(rows, cols, i)
    plt.title(labels_map[label])
    plt.axis("off")
    plt.imshow(img.squeeze(), cmap="gray")
plt.show()

In [None]:
from torch.utils.data import DataLoader

train_dataloader = torch.utils.data.DataLoader(training_data, batch_size=100, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=100, shuffle=True)

In [None]:
print(train_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x7dded9affcd0>


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()

        self.conv_net = nn.Sequential(
          nn.Conv2d(in_channels=1, out_channels=3, kernel_size=4, stride=1, padding=2), #output 28x28
          nn.ReLU(),
          nn.MaxPool2d(2, 2), #output 14x14
          nn.Conv2d(in_channels=3, out_channels=1, kernel_size=2, stride=1, padding=1),  #output 14x14
          nn.ReLU(),
          nn.MaxPool2d(2, 2), #output 7x7
          nn.Flatten(),
          nn.Linear(49, 128),
          nn.Linear(128, 128),
          nn.Linear(128, 10) #output layer
        )

    def forward(self, x):
        out = self.conv_net(x)
        return out

model = CNN()

# If GPU is available, move the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"Network Architecture: \n {model}")
print(f"The model is at : {device}")

In [None]:
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

In [None]:
# training took about 11min
losses = []
accuracies = []
for epoch in range(50):
    epoch_loss = []
    train_acc = []

    # Iterate over data
    for batch_idx, (images, labels) in enumerate(train_dataloader):
      #move tensor to the same device (CPU/GPU) as the model
      images, labels = images.to(device), labels.to(device)

      # Zero the parameter gradients
      optimizer.zero_grad()

      # forward + loss calc + backward + step
      outputs = model(images)
      loss = loss_func(outputs, labels)
      loss.backward()
      optimizer.step()

      if batch_idx % 500 == 0:
        print(f"Epoch: {epoch+1}, Loss: {loss.item():.4f}")

      epoch_loss.append(loss.item())

    with torch.no_grad():
      for images, labels in test_dataloader:
        images, labels = images.to(device), labels.to(device)
        #images = images.view(-1, 28*28)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    losses.append(np.mean(epoch_loss))
    accuracies.append(100 * correct / total)

In [None]:
fig, ax1 = plt.subplots()
plt.title('CNN Loss and Accuracy During Training')

color = 'tab:red'
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss', color=color)
ax1.plot(np.arange(len(losses)), losses, color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:blue'
ax2.set_ylabel('accuracy', color=color)  # we already handled the x-label with ax1
ax2.plot(np.arange(len(accuracies)), accuracies, color=color)
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()

In [None]:
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_dataloader:
        images, labels = images.to(device), labels.to(device)
        #images = images.view(-1, 28*28)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test Accuracy: {(100 * correct / total)} %")

## 6. MLP Architecture

## 7. Plots of Results