# Neural Networks for MNIST dataset

In [1]:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from torch.utils.data import Dataset
import pandas as pd
import numpy as np

In [2]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

## Loading MNIST
Here we load the dataset and create data loaders.

In [3]:
train_ds = datasets.MNIST('../data', train=True, download=True, 
                       transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
test_ds = datasets.MNIST('../data', train=False, download=True, 
                       transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))

In [4]:
batch_size = 64
kwargs = {'num_workers': 1, 'pin_memory': True} 

train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=False, **kwargs)

## Feed Forward Neural Network

In [5]:
# for the number of neurons in the hidden unit
def get_model(M = 300):
    net = nn.Sequential(nn.Linear(28*28, M),
                        nn.ReLU(),
                        nn.Linear(M, 10))
    return net.cuda()

In [6]:
def train_model(train_loader, test_loader, num_epochs, model, optimizer):
    sum_loss = 0.0
    total = 0
    for epoch in range(num_epochs):
        model.train()
        for i, (images, labels) in enumerate(train_loader):  
            batch = images.shape[0] # size of the batch
            # Convert torch tensor to Variable, change shape of the input
            images = Variable(images.view(-1, 28*28)).cuda()
            labels = Variable(labels).cuda()
        
            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = model(images)
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
        
            total += batch
            sum_loss += batch * loss.data[0]
                
        train_loss = sum_loss/total
        print('Epoch [%d/%d], Loss: %.4f' %(epoch+1, num_epochs, train_loss))
        val_acc, val_loss = model_accuracy_loss(model, test_loader)
        print('Epoch [%d/%d], Valid Accuracy: %.4f, Valid Loss: %.4f' %(epoch+1, num_epochs, val_acc, val_loss))
    return val_acc, val_loss, train_loss

In [7]:
def model_accuracy_loss(model, test_loader):
    model.eval()
    correct = 0
    sum_loss = 0.0
    total = 0
    for images, labels in test_loader:
        images = Variable(images.view(-1, 28*28)).cuda()
        labels = Variable(labels).cuda()
        outputs = model(images)
        _, pred = torch.max(outputs.data, 1)
        loss = F.cross_entropy(outputs, labels)
        sum_loss += labels.size(0)*loss.data[0]
        total += labels.size(0)
        correct += pred.eq(labels.data).cpu().sum()
    return 100 * correct / total, sum_loss/ total

## Training

Learning Rate tuning

In [30]:
%%time
learning_rates = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
validation_accuracy0 = []

for r in learning_rates:
    net = get_model()
    optimizer = optim.Adam(net.parameters(), lr=r)
    model_accuracy_loss(net, test_loader)
    val_acc, _, _ = train_model(train_loader, test_loader, num_epochs=10, model=net, optimizer=optimizer)
    validation_accuracy0.append(val_acc)

Epoch [1/10], Loss: 82.9692
Epoch [1/10], Valid Accuracy: 10.0900, Valid Loss: 2.7915
Epoch [2/10], Loss: 42.6954
Epoch [2/10], Valid Accuracy: 10.6600, Valid Loss: 2.7205
Epoch [3/10], Loss: 29.3076
Epoch [3/10], Valid Accuracy: 8.9400, Valid Loss: 2.6788
Epoch [4/10], Loss: 22.5738
Epoch [4/10], Valid Accuracy: 10.3000, Valid Loss: 2.6880
Epoch [5/10], Loss: 18.5326
Epoch [5/10], Valid Accuracy: 10.3100, Valid Loss: 2.7285
Epoch [6/10], Loss: 15.8397
Epoch [6/10], Valid Accuracy: 10.1100, Valid Loss: 2.6987
Epoch [7/10], Loss: 13.9158
Epoch [7/10], Valid Accuracy: 9.6000, Valid Loss: 2.7176
Epoch [8/10], Loss: 12.4728
Epoch [8/10], Valid Accuracy: 11.3700, Valid Loss: 2.7930
Epoch [9/10], Loss: 11.3502
Epoch [9/10], Valid Accuracy: 10.1100, Valid Loss: 2.7261
Epoch [10/10], Loss: 10.4528
Epoch [10/10], Valid Accuracy: 9.8100, Valid Loss: 2.6930
Epoch [1/10], Loss: 2.2936
Epoch [1/10], Valid Accuracy: 38.2100, Valid Loss: 1.8948
Epoch [2/10], Loss: 1.9857
Epoch [2/10], Valid Accuracy:

In [31]:
pd.DataFrame(data=[learning_rates, validation_accuracy0],index=['Learning Rate','Validation Accuracy'])

Unnamed: 0,0,1,2,3,4,5
Learning Rate,1.0,0.1,0.01,0.001,0.0001,1e-05
Validation Accuracy,9.81,9.86,95.27,97.98,97.61,92.76


In [10]:
%%time
learning_rates = np.linspace(0.0001, 0.001, 5)
validation_accuracy0 = []

for r in learning_rates:
    net = get_model()
    optimizer = optim.Adam(net.parameters(), lr=r)
    model_accuracy_loss(net, test_loader)
    val_acc, _, _ = train_model(train_loader, test_loader, num_epochs=10, model=net, optimizer=optimizer)
    validation_accuracy0.append(val_acc)


Epoch [1/10], Loss: 0.4896
Epoch [1/10], Valid Accuracy: 92.4800, Valid Loss: 0.2697
Epoch [2/10], Loss: 0.3653
Epoch [2/10], Valid Accuracy: 94.2100, Valid Loss: 0.2028
Epoch [3/10], Loss: 0.3053
Epoch [3/10], Valid Accuracy: 95.2300, Valid Loss: 0.1615
Epoch [4/10], Loss: 0.2664
Epoch [4/10], Valid Accuracy: 95.9900, Valid Loss: 0.1362
Epoch [5/10], Loss: 0.2382
Epoch [5/10], Valid Accuracy: 96.4500, Valid Loss: 0.1206
Epoch [6/10], Loss: 0.2161
Epoch [6/10], Valid Accuracy: 96.8700, Valid Loss: 0.1056
Epoch [7/10], Loss: 0.1983
Epoch [7/10], Valid Accuracy: 97.0900, Valid Loss: 0.0975
Epoch [8/10], Loss: 0.1834
Epoch [8/10], Valid Accuracy: 97.2800, Valid Loss: 0.0913
Epoch [9/10], Loss: 0.1708
Epoch [9/10], Valid Accuracy: 97.4000, Valid Loss: 0.0840
Epoch [10/10], Loss: 0.1600
Epoch [10/10], Valid Accuracy: 97.4800, Valid Loss: 0.0827
Epoch [1/10], Loss: 0.3097
Epoch [1/10], Valid Accuracy: 94.9400, Valid Loss: 0.1700
Epoch [2/10], Loss: 0.2237
Epoch [2/10], Valid Accuracy: 96.780

In [11]:
pd.DataFrame(data=[learning_rates, validation_accuracy0],index=['Learning Rate','Validation Accuracy'])

Unnamed: 0,0,1,2,3,4
Learning Rate,0.0001,0.000325,0.00055,0.000775,0.001
Validation Accuracy,97.48,97.97,97.67,97.82,97.88


## Number of neurons M in the hidden layer

In [27]:
%%time
M = [10, 50, 100, 300, 1000, 2000]
validation_accuracy = []

for m in M:
    net = get_model(m)
    optimizer = optim.Adam(net.parameters(), lr=0.01)
    model_accuracy_loss(net, test_loader)
    val_acc, val_loss, train_loss = train_model(train_loader, test_loader, num_epochs=10, model=net, optimizer=optimizer)
    validation_accuracy.append(val_acc)
    


Epoch [1/10], Loss: 0.4706
Epoch [1/10], Valid Accuracy: 89.2600, Valid Loss: 0.3844
Epoch [2/10], Loss: 0.4162
Epoch [2/10], Valid Accuracy: 90.1400, Valid Loss: 0.3476
Epoch [3/10], Loss: 0.3939
Epoch [3/10], Valid Accuracy: 89.6400, Valid Loss: 0.3680
Epoch [4/10], Loss: 0.3805
Epoch [4/10], Valid Accuracy: 90.5300, Valid Loss: 0.3343
Epoch [5/10], Loss: 0.3720
Epoch [5/10], Valid Accuracy: 89.4600, Valid Loss: 0.3722
Epoch [6/10], Loss: 0.3663
Epoch [6/10], Valid Accuracy: 90.3000, Valid Loss: 0.3543
Epoch [7/10], Loss: 0.3614
Epoch [7/10], Valid Accuracy: 90.8300, Valid Loss: 0.3368
Epoch [8/10], Loss: 0.3578
Epoch [8/10], Valid Accuracy: 90.1300, Valid Loss: 0.3444
Epoch [9/10], Loss: 0.3550
Epoch [9/10], Valid Accuracy: 91.2900, Valid Loss: 0.3262
Epoch [10/10], Loss: 0.3524
Epoch [10/10], Valid Accuracy: 90.1200, Valid Loss: 0.3660
Epoch [1/10], Loss: 0.2908
Epoch [1/10], Valid Accuracy: 93.6100, Valid Loss: 0.2247
Epoch [2/10], Loss: 0.2466
Epoch [2/10], Valid Accuracy: 94.150

In [28]:
pd.DataFrame(data=[M, validation_accuracy],index=['M','Validation Accuracy'])

Unnamed: 0,0,1,2,3,4,5
M,10.0,50.0,100.0,300.0,1000.0,2000.0
Validation Accuracy,90.12,94.95,95.12,94.99,95.53,95.19


If we look at the end of 10 epoches, M = 1000 seems to be the best with a Validation Accuracy of 95.53, however we do see that most of the models overfit: i.e. loss decreases while the validation accuracy increases.

## Models with L2 regularization
To add L2 regularization use the `weight_decay` argument on the optimizer

In [26]:
%%time
weight_decay = [0, 0.0001, 0.001, 0.01, 0.1, 0.3]
validation_accuracy2 = []
Train_loss = []
Validation_loss = []

for decay_r in weight_decay:
    net = get_model(300)
    optimizer = optim.Adam(net.parameters(), lr=0.001, weight_decay = decay_r)
    model_accuracy_loss(net, test_loader)
    val_acc, train_loss, val_loss = train_model(train_loader, test_loader, num_epochs=20, model=net, optimizer=optimizer)
    print(val_acc, train_loss, val_loss)
    validation_accuracy2.append(val_acc)
    Train_loss.append(round(train_loss,4))
    Validation_loss.append(round(val_loss,4))

Epoch [1/20], Loss: 0.2240
Epoch [1/20], Valid Accuracy: 95.7200, Valid Loss: 0.1341
Epoch [2/20], Loss: 0.1587
Epoch [2/20], Valid Accuracy: 97.1700, Valid Loss: 0.0930
Epoch [3/20], Loss: 0.1268
Epoch [3/20], Valid Accuracy: 97.6300, Valid Loss: 0.0735
Epoch [4/20], Loss: 0.1068
Epoch [4/20], Valid Accuracy: 97.6600, Valid Loss: 0.0796
Epoch [5/20], Loss: 0.0929
Epoch [5/20], Valid Accuracy: 97.7600, Valid Loss: 0.0726
Epoch [6/20], Loss: 0.0824
Epoch [6/20], Valid Accuracy: 97.7500, Valid Loss: 0.0814
Epoch [7/20], Loss: 0.0740
Epoch [7/20], Valid Accuracy: 98.2000, Valid Loss: 0.0673
Epoch [8/20], Loss: 0.0674
Epoch [8/20], Valid Accuracy: 98.1400, Valid Loss: 0.0777
Epoch [9/20], Loss: 0.0618
Epoch [9/20], Valid Accuracy: 97.6400, Valid Loss: 0.0961
Epoch [10/20], Loss: 0.0573
Epoch [10/20], Valid Accuracy: 98.0300, Valid Loss: 0.0944
Epoch [11/20], Loss: 0.0533
Epoch [11/20], Valid Accuracy: 98.0900, Valid Loss: 0.0809
Epoch [12/20], Loss: 0.0501
Epoch [12/20], Valid Accuracy: 98

Epoch [14/20], Valid Accuracy: 90.2100, Valid Loss: 0.4397
Epoch [15/20], Loss: 0.4751
Epoch [15/20], Valid Accuracy: 88.6400, Valid Loss: 0.4623
Epoch [16/20], Loss: 0.4743
Epoch [16/20], Valid Accuracy: 89.8200, Valid Loss: 0.4432
Epoch [17/20], Loss: 0.4736
Epoch [17/20], Valid Accuracy: 88.8800, Valid Loss: 0.4498
Epoch [18/20], Loss: 0.4729
Epoch [18/20], Valid Accuracy: 90.0600, Valid Loss: 0.4379
Epoch [19/20], Loss: 0.4723
Epoch [19/20], Valid Accuracy: 89.7800, Valid Loss: 0.4451
Epoch [20/20], Loss: 0.4718
Epoch [20/20], Valid Accuracy: 89.6800, Valid Loss: 0.4420
89.68 0.4420184807538986 0.4717634092092514
Epoch [1/20], Loss: 0.8626
Epoch [1/20], Valid Accuracy: 85.0900, Valid Loss: 0.7986
Epoch [2/20], Loss: 0.8499
Epoch [2/20], Valid Accuracy: 84.0800, Valid Loss: 0.7997
Epoch [3/20], Loss: 0.8427
Epoch [3/20], Valid Accuracy: 85.5500, Valid Loss: 0.7985
Epoch [4/20], Loss: 0.8382
Epoch [4/20], Valid Accuracy: 85.7600, Valid Loss: 0.7900
Epoch [5/20], Loss: 0.8343
Epoch [5

In [27]:
weight_decay = [0, 0.0001, 0.001, 0.01, 0.1, 0.3]
pd.DataFrame(data=[weight_decay, validation_accuracy2,Train_loss,Validation_loss],
             index=['Weight decay ','Validation Accuracy','Train Loss','Validation Loss'])

Unnamed: 0,0,1,2,3,4,5
Weight decay,0.0,0.0001,0.001,0.01,0.1,0.3
Validation Accuracy,97.54,97.85,97.71,96.22,89.68,85.79
Train Loss,0.1475,0.0836,0.0777,0.1439,0.442,0.7806
Validation Loss,0.0342,0.0402,0.0712,0.1692,0.4718,0.8181


## Models with Dropout

In [12]:
def get_model_v2(M = 300, p=0):
    modules = []
    modules.append(nn.Linear(28*28, M))
    modules.append(nn.ReLU())
    if p > 0:
        modules.append(nn.Dropout(p))
    modules.append(nn.Linear(M, 10))
    return nn.Sequential(*modules).cuda()

In [15]:
%%time
dropout = [0.1, 0.3, 0.5, 0.7, 0.9]
validation_accuracy3 = []
Train_loss3 = []
Validation_loss3 = []

for p1 in dropout:
    net2 = get_model_v2(M = 300, p=p1)
    optimizer = optim.Adam(net2.parameters(), lr=0.001)
    model_accuracy_loss(net2, test_loader)
    val_acc, train_loss, val_loss = train_model(train_loader, test_loader, 
                                                num_epochs=20, model=net2, 
                                                optimizer=optimizer)
    print(val_acc, train_loss, val_loss)
    validation_accuracy3.append(val_acc)
    Train_loss3.append(round(train_loss,4))
    Validation_loss3.append(round(val_loss,4))

Epoch [1/20], Loss: 0.2344
Epoch [1/20], Valid Accuracy: 96.6700, Valid Loss: 0.1097
Epoch [2/20], Loss: 0.1678
Epoch [2/20], Valid Accuracy: 97.3500, Valid Loss: 0.0874
Epoch [3/20], Loss: 0.1364
Epoch [3/20], Valid Accuracy: 97.5800, Valid Loss: 0.0796
Epoch [4/20], Loss: 0.1166
Epoch [4/20], Valid Accuracy: 97.4200, Valid Loss: 0.0832
Epoch [5/20], Loss: 0.1025
Epoch [5/20], Valid Accuracy: 97.9900, Valid Loss: 0.0707
Epoch [6/20], Loss: 0.0922
Epoch [6/20], Valid Accuracy: 98.0200, Valid Loss: 0.0717
Epoch [7/20], Loss: 0.0838
Epoch [7/20], Valid Accuracy: 97.5500, Valid Loss: 0.0888
Epoch [8/20], Loss: 0.0773
Epoch [8/20], Valid Accuracy: 97.7600, Valid Loss: 0.0803
Epoch [9/20], Loss: 0.0718
Epoch [9/20], Valid Accuracy: 98.1700, Valid Loss: 0.0761
Epoch [10/20], Loss: 0.0672
Epoch [10/20], Valid Accuracy: 98.0000, Valid Loss: 0.0799
Epoch [11/20], Loss: 0.0631
Epoch [11/20], Valid Accuracy: 98.0800, Valid Loss: 0.0829
Epoch [12/20], Loss: 0.0599
Epoch [12/20], Valid Accuracy: 98

Epoch [14/20], Valid Accuracy: 95.2100, Valid Loss: 0.1649
Epoch [15/20], Loss: 0.5628
Epoch [15/20], Valid Accuracy: 95.2700, Valid Loss: 0.1715
Epoch [16/20], Loss: 0.5588
Epoch [16/20], Valid Accuracy: 95.3100, Valid Loss: 0.1745
Epoch [17/20], Loss: 0.5545
Epoch [17/20], Valid Accuracy: 95.5500, Valid Loss: 0.1655
Epoch [18/20], Loss: 0.5507
Epoch [18/20], Valid Accuracy: 95.3200, Valid Loss: 0.1716
Epoch [19/20], Loss: 0.5478
Epoch [19/20], Valid Accuracy: 95.5600, Valid Loss: 0.1668
Epoch [20/20], Loss: 0.5446
Epoch [20/20], Valid Accuracy: 95.5600, Valid Loss: 0.1641
95.56 0.16408598736524582 0.5445897932958603
CPU times: user 3min 49s, sys: 45.5 s, total: 4min 34s
Wall time: 13min 41s


In [16]:
pd.DataFrame(data=[dropout, validation_accuracy3, Train_loss3, Validation_loss3],
             index=['Dropout rate','Validation Accuracy','Train Loss','Validation Loss'])

Unnamed: 0,0,1,2,3,4
Dropout rate,0.1,0.3,0.5,0.7,0.9
Validation Accuracy,98.11,98.22,98.13,97.93,95.56
Train Loss,0.1046,0.0852,0.0798,0.0815,0.1641
Validation Loss,0.0431,0.0634,0.0998,0.1824,0.5446


## Models with 3 layer

In [17]:
def get_model_v3(M = 500, p=0):
    modules = []
    modules.append(nn.Linear(28*28, M))
    modules.append(nn.ReLU())
    if p > 0:
        modules.append(nn.Dropout(p))
    modules.append(nn.Linear(M, int(.3*M)))
    modules.append(nn.ReLU())
    modules.append(nn.Linear(int(.3*M), 10))
    
    return nn.Sequential(*modules).cuda()

In [23]:
%%time
M = [300, 500, 800]
dropout = [0.3, 0.5]
weight_decay = [0.0001, 0.001]

best_validation_accuracy = 0
best_para = None

for m in M:
    for p1 in dropout:
        for decay_r in weight_decay:
            
            net3 = get_model_v3(M = m, p=p1)
            optimizer = optim.Adam(net3.parameters(), lr=0.001, weight_decay = decay_r)
            model_accuracy_loss(net, test_loader)
            val_acc, train_loss, val_loss = train_model(train_loader, test_loader, 
                                                        num_epochs=16, model=net3, 
                                                        optimizer=optimizer)
            if val_acc > best_validation_accuracy:
                best_validation_accuracy = val_acc
                best_para = [decay_r, p1, m]
print('best parameters:', best_para)
print('best validation accuracy:', best_validation_accuracy)

Epoch [1/16], Loss: 0.2781
Epoch [1/16], Valid Accuracy: 96.0900, Valid Loss: 0.1316
Epoch [2/16], Loss: 0.2051
Epoch [2/16], Valid Accuracy: 96.9600, Valid Loss: 0.0991
Epoch [3/16], Loss: 0.1717
Epoch [3/16], Valid Accuracy: 97.5500, Valid Loss: 0.0782
Epoch [4/16], Loss: 0.1513
Epoch [4/16], Valid Accuracy: 97.9100, Valid Loss: 0.0703
Epoch [5/16], Loss: 0.1369
Epoch [5/16], Valid Accuracy: 97.6000, Valid Loss: 0.0771
Epoch [6/16], Loss: 0.1263
Epoch [6/16], Valid Accuracy: 97.9600, Valid Loss: 0.0650
Epoch [7/16], Loss: 0.1181
Epoch [7/16], Valid Accuracy: 97.7800, Valid Loss: 0.0750
Epoch [8/16], Loss: 0.1113
Epoch [8/16], Valid Accuracy: 97.8300, Valid Loss: 0.0744
Epoch [9/16], Loss: 0.1057
Epoch [9/16], Valid Accuracy: 97.9100, Valid Loss: 0.0704
Epoch [10/16], Loss: 0.1010
Epoch [10/16], Valid Accuracy: 98.1600, Valid Loss: 0.0661
Epoch [11/16], Loss: 0.0968
Epoch [11/16], Valid Accuracy: 98.0700, Valid Loss: 0.0677
Epoch [12/16], Loss: 0.0934
Epoch [12/16], Valid Accuracy: 98

Epoch [1/16], Loss: 0.2842
Epoch [1/16], Valid Accuracy: 96.4300, Valid Loss: 0.1179
Epoch [2/16], Loss: 0.2214
Epoch [2/16], Valid Accuracy: 96.7900, Valid Loss: 0.0994
Epoch [3/16], Loss: 0.1909
Epoch [3/16], Valid Accuracy: 97.2400, Valid Loss: 0.0917
Epoch [4/16], Loss: 0.1718
Epoch [4/16], Valid Accuracy: 97.7300, Valid Loss: 0.0762
Epoch [5/16], Loss: 0.1588
Epoch [5/16], Valid Accuracy: 96.8800, Valid Loss: 0.0988
Epoch [6/16], Loss: 0.1483
Epoch [6/16], Valid Accuracy: 97.8400, Valid Loss: 0.0715
Epoch [7/16], Loss: 0.1402
Epoch [7/16], Valid Accuracy: 97.9400, Valid Loss: 0.0678
Epoch [8/16], Loss: 0.1337
Epoch [8/16], Valid Accuracy: 97.8000, Valid Loss: 0.0680
Epoch [9/16], Loss: 0.1285
Epoch [9/16], Valid Accuracy: 97.6800, Valid Loss: 0.0743
Epoch [10/16], Loss: 0.1240
Epoch [10/16], Valid Accuracy: 98.0300, Valid Loss: 0.0659
Epoch [11/16], Loss: 0.1199
Epoch [11/16], Valid Accuracy: 98.0400, Valid Loss: 0.0652
Epoch [12/16], Loss: 0.1165
Epoch [12/16], Valid Accuracy: 98

In [24]:
print('best parameters:', best_para)
print('best validation accuracy:', best_validation_accuracy)

best parameters: [0.0001, 0.5, 800]
best validation accuracy: 98.29


In [25]:
net4 = get_model_v3(M = 800, p=.5)
optimizer = optim.Adam(net4.parameters(), lr=0.001, weight_decay = 0.0001)
model_accuracy_loss(net4, test_loader)
val_acc, train_loss, val_loss = train_model(train_loader, test_loader, 
                                            num_epochs=20, model=net4, 
                                            optimizer=optimizer)

Epoch [1/20], Loss: 0.2645
Epoch [1/20], Valid Accuracy: 96.2800, Valid Loss: 0.1194
Epoch [2/20], Loss: 0.2052
Epoch [2/20], Valid Accuracy: 96.9500, Valid Loss: 0.0927
Epoch [3/20], Loss: 0.1779
Epoch [3/20], Valid Accuracy: 97.2000, Valid Loss: 0.0883
Epoch [4/20], Loss: 0.1614
Epoch [4/20], Valid Accuracy: 97.4100, Valid Loss: 0.0826
Epoch [5/20], Loss: 0.1494
Epoch [5/20], Valid Accuracy: 97.6600, Valid Loss: 0.0727
Epoch [6/20], Loss: 0.1404
Epoch [6/20], Valid Accuracy: 97.8300, Valid Loss: 0.0717
Epoch [7/20], Loss: 0.1331
Epoch [7/20], Valid Accuracy: 97.6900, Valid Loss: 0.0819
Epoch [8/20], Loss: 0.1272
Epoch [8/20], Valid Accuracy: 97.5400, Valid Loss: 0.0800
Epoch [9/20], Loss: 0.1225
Epoch [9/20], Valid Accuracy: 97.7300, Valid Loss: 0.0701
Epoch [10/20], Loss: 0.1184
Epoch [10/20], Valid Accuracy: 98.1200, Valid Loss: 0.0643
Epoch [11/20], Loss: 0.1147
Epoch [11/20], Valid Accuracy: 97.7800, Valid Loss: 0.0725
Epoch [12/20], Loss: 0.1117
Epoch [12/20], Valid Accuracy: 98

The 3-layer NN turned out to be doing similar to 2-layer NN with dropout rate of 0.3, the validation accuract are both 98.22