# Q 1, 2

![title](Q1.jpg)

![title](Q2.jpg)

# Q 3

In [1]:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from torch.utils.data import Dataset
import pandas as pd
import numpy as np

  (fname, cnt))
  (fname, cnt))


In [2]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

  'Matplotlib is building the font cache using fc-list. '


In [4]:
train_ds = datasets.MNIST('data', train=True, download=True, 
                       transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
test_ds = datasets.MNIST('data', train=False, download=True, 
                       transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Processing...
Done!


In [5]:
batch_size = 64
kwargs = {'num_workers': 1, 'pin_memory': True} 

train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=False, **kwargs)

In [6]:
# for the number of neurons in the hidden unit
def get_model(M = 300):
    net = nn.Sequential(nn.Linear(28*28, M),
                        nn.ReLU(),
                        nn.Linear(M, 10))
    return net.cuda()

In [20]:
def train_model(train_loader, test_loader, num_epochs, model, optimizer):
    for epoch in range(num_epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for i, (images, labels) in enumerate(train_loader):  
            batch = images.shape[0] # size of the batch
            # Convert torch tensor to Variable, change shape of the input
            images = Variable(images.view(-1, 28*28)).cuda()
            labels = Variable(labels).cuda()
        
            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = model(images)
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
        
            total += batch
            sum_loss += batch * loss.data[0]
                
        train_loss = sum_loss/total
        print('Epoch [%d/%d], Loss: %.4f' %(epoch+1, num_epochs, train_loss))
        val_acc, val_loss = model_accuracy_loss(model, test_loader)
        print('Epoch [%d/%d], Valid Accuracy: %.4f, Valid Loss: %.4f' %(epoch+1, num_epochs, val_acc, val_loss))
    return val_acc, val_loss, train_loss

In [21]:
def model_accuracy_loss(model, test_loader):
    model.eval()
    correct = 0
    sum_loss = 0.0
    total = 0
    for images, labels in test_loader:
        images = Variable(images.view(-1, 28*28)).cuda()
        labels = Variable(labels).cuda()
        outputs = model(images)
        _, pred = torch.max(outputs.data, 1)
        loss = F.cross_entropy(outputs, labels)
        sum_loss += labels.size(0)*loss.data[0]
        total += labels.size(0)
        correct += pred.eq(labels.data).cpu().sum()
    return 100 * correct / total, sum_loss/ total

## Q 3.1

In [22]:
%%time
learning_rates = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
val_accs = []

for r in learning_rates:
    net = get_model()
    optimizer = optim.Adam(net.parameters(), lr=r)
    val_acc = train_model(train_loader, test_loader, num_epochs=10, model=net, optimizer=optimizer)[0]
    val_accs.append(val_acc)

Epoch [1/10], Loss: 124.5998
Epoch [1/10], Valid Accuracy: 11.4000, Valid Loss: 3.0034
Epoch [2/10], Loss: 2.5318
Epoch [2/10], Valid Accuracy: 9.8400, Valid Loss: 2.9144
Epoch [3/10], Loss: 2.3731
Epoch [3/10], Valid Accuracy: 10.1600, Valid Loss: 2.9165
Epoch [4/10], Loss: 2.3663
Epoch [4/10], Valid Accuracy: 9.8500, Valid Loss: 2.9412
Epoch [5/10], Loss: 2.3675
Epoch [5/10], Valid Accuracy: 10.3400, Valid Loss: 2.8960
Epoch [6/10], Loss: 2.3794
Epoch [6/10], Valid Accuracy: 10.3400, Valid Loss: 2.9082
Epoch [7/10], Loss: 2.3738
Epoch [7/10], Valid Accuracy: 10.3400, Valid Loss: 2.9277
Epoch [8/10], Loss: 2.3637
Epoch [8/10], Valid Accuracy: 9.8100, Valid Loss: 2.9689
Epoch [9/10], Loss: 2.3852
Epoch [9/10], Valid Accuracy: 9.8100, Valid Loss: 2.8988
Epoch [10/10], Loss: 2.3752
Epoch [10/10], Valid Accuracy: 9.8500, Valid Loss: 2.9238
Epoch [1/10], Loss: 3.0976
Epoch [1/10], Valid Accuracy: 29.7000, Valid Loss: 1.8964
Epoch [2/10], Loss: 2.1077
Epoch [2/10], Valid Accuracy: 11.4300, 

In [23]:
pd.DataFrame({'Learning Rate':learning_rates, 'Validation Accuracy': val_accs})

Unnamed: 0,Learning Rate,Validation Accuracy
0,1.0,9.85
1,0.1,11.35
2,0.01,94.88
3,0.001,98.06
4,0.0001,97.67
5,1e-05,93.04


In [24]:
%%time
learning_rates = np.linspace(0.0001, 0.001, num=5)
val_accs = []

for r in learning_rates:
    net = get_model()
    optimizer = optim.Adam(net.parameters(), lr=r)
    val_acc = train_model(train_loader, test_loader, num_epochs=10, model=net, optimizer=optimizer)[0]
    val_accs.append(val_acc)

Epoch [1/10], Loss: 0.4902
Epoch [1/10], Valid Accuracy: 92.3100, Valid Loss: 0.2671
Epoch [2/10], Loss: 0.2408
Epoch [2/10], Valid Accuracy: 94.2200, Valid Loss: 0.2022
Epoch [3/10], Loss: 0.1846
Epoch [3/10], Valid Accuracy: 95.3400, Valid Loss: 0.1611
Epoch [4/10], Loss: 0.1490
Epoch [4/10], Valid Accuracy: 95.9100, Valid Loss: 0.1385
Epoch [5/10], Loss: 0.1241
Epoch [5/10], Valid Accuracy: 96.5200, Valid Loss: 0.1191
Epoch [6/10], Loss: 0.1056
Epoch [6/10], Valid Accuracy: 96.8200, Valid Loss: 0.1053
Epoch [7/10], Loss: 0.0913
Epoch [7/10], Valid Accuracy: 97.0600, Valid Loss: 0.0984
Epoch [8/10], Loss: 0.0793
Epoch [8/10], Valid Accuracy: 97.3500, Valid Loss: 0.0896
Epoch [9/10], Loss: 0.0700
Epoch [9/10], Valid Accuracy: 97.4000, Valid Loss: 0.0863
Epoch [10/10], Loss: 0.0620
Epoch [10/10], Valid Accuracy: 97.4900, Valid Loss: 0.0837
Epoch [1/10], Loss: 0.3088
Epoch [1/10], Valid Accuracy: 95.2400, Valid Loss: 0.1662
Epoch [2/10], Loss: 0.1351
Epoch [2/10], Valid Accuracy: 96.540

In [25]:
pd.DataFrame({'Learning Rate':learning_rates, 'Validation Accuracy': val_accs})

Unnamed: 0,Learning Rate,Validation Accuracy
0,0.0001,97.49
1,0.000325,98.0
2,0.00055,97.88
3,0.000775,97.94
4,0.001,97.75


## Q 3.2

In [26]:
%%time
hidden_layer_size = [10, 50, 100, 300, 1000, 2000]
val_accs = []

for m in hidden_layer_size:
    net = get_model(m)
    optimizer = optim.Adam(net.parameters(), lr=0.01)
    val_acc = train_model(train_loader, test_loader, num_epochs=10, model=net, optimizer=optimizer)[0]
    val_accs.append(val_acc)

Epoch [1/10], Loss: 0.4872
Epoch [1/10], Valid Accuracy: 90.5100, Valid Loss: 0.3295
Epoch [2/10], Loss: 0.3461
Epoch [2/10], Valid Accuracy: 89.3700, Valid Loss: 0.3652
Epoch [3/10], Loss: 0.3295
Epoch [3/10], Valid Accuracy: 90.6900, Valid Loss: 0.3172
Epoch [4/10], Loss: 0.3234
Epoch [4/10], Valid Accuracy: 90.9500, Valid Loss: 0.3121
Epoch [5/10], Loss: 0.3215
Epoch [5/10], Valid Accuracy: 91.6000, Valid Loss: 0.2978
Epoch [6/10], Loss: 0.3141
Epoch [6/10], Valid Accuracy: 91.3500, Valid Loss: 0.3017
Epoch [7/10], Loss: 0.3135
Epoch [7/10], Valid Accuracy: 89.7700, Valid Loss: 0.3673
Epoch [8/10], Loss: 0.3096
Epoch [8/10], Valid Accuracy: 89.4600, Valid Loss: 0.3726
Epoch [9/10], Loss: 0.3057
Epoch [9/10], Valid Accuracy: 90.6300, Valid Loss: 0.3308
Epoch [10/10], Loss: 0.3078
Epoch [10/10], Valid Accuracy: 91.2200, Valid Loss: 0.3196
Epoch [1/10], Loss: 0.2941
Epoch [1/10], Valid Accuracy: 93.3700, Valid Loss: 0.2439
Epoch [2/10], Loss: 0.2164
Epoch [2/10], Valid Accuracy: 94.330

In [27]:
pd.DataFrame({'hidden layer size': hidden_layer_size, 'Validation Accuracy': val_accs})

Unnamed: 0,Validation Accuracy,hidden layer size
0,91.22,10
1,95.13,50
2,95.12,100
3,95.37,300
4,95.72,1000
5,95.9,2000


For all hidden layer sizes, hidden_layer_size = 2000 seems to be the best with a Validation Accuracy of 95.90, however most of the models overfit: validation loss decreases while the training loss increases.

## Q 3.3

In [29]:
%%time
weight_decay = [0, 0.0001, 0.001, 0.01, 0.1, 0.3]
Train_loss = []
Validation_loss = []
Validation_accuracy = []

for decay_param in weight_decay:
    net = get_model(300)
    optimizer = optim.Adam(net.parameters(), lr=0.001, weight_decay = decay_param)
    val_acc, train_loss, val_loss = train_model(train_loader, test_loader, num_epochs=20, model=net, optimizer=optimizer)
    print(val_acc, train_loss, val_loss)
    Validation_accuracy.append(val_acc)
    Train_loss.append(round(train_loss,4))
    Validation_loss.append(round(val_loss,4))

Epoch [1/20], Loss: 0.2243
Epoch [1/20], Valid Accuracy: 96.3900, Valid Loss: 0.1141
Epoch [2/20], Loss: 0.0904
Epoch [2/20], Valid Accuracy: 97.3700, Valid Loss: 0.0833
Epoch [3/20], Loss: 0.0607
Epoch [3/20], Valid Accuracy: 97.6400, Valid Loss: 0.0734
Epoch [4/20], Loss: 0.0452
Epoch [4/20], Valid Accuracy: 97.0800, Valid Loss: 0.0933
Epoch [5/20], Loss: 0.0349
Epoch [5/20], Valid Accuracy: 98.0000, Valid Loss: 0.0677
Epoch [6/20], Loss: 0.0261
Epoch [6/20], Valid Accuracy: 97.6500, Valid Loss: 0.0796
Epoch [7/20], Loss: 0.0230
Epoch [7/20], Valid Accuracy: 97.6900, Valid Loss: 0.0806
Epoch [8/20], Loss: 0.0203
Epoch [8/20], Valid Accuracy: 97.5600, Valid Loss: 0.0939
Epoch [9/20], Loss: 0.0161
Epoch [9/20], Valid Accuracy: 97.7000, Valid Loss: 0.0860
Epoch [10/20], Loss: 0.0175
Epoch [10/20], Valid Accuracy: 97.9000, Valid Loss: 0.0807
Epoch [11/20], Loss: 0.0126
Epoch [11/20], Valid Accuracy: 97.2600, Valid Loss: 0.1173
Epoch [12/20], Loss: 0.0150
Epoch [12/20], Valid Accuracy: 98

Epoch [14/20], Valid Accuracy: 90.0200, Valid Loss: 0.4452
Epoch [15/20], Loss: 0.4617
Epoch [15/20], Valid Accuracy: 90.1900, Valid Loss: 0.4299
Epoch [16/20], Loss: 0.4625
Epoch [16/20], Valid Accuracy: 90.2100, Valid Loss: 0.4313
Epoch [17/20], Loss: 0.4616
Epoch [17/20], Valid Accuracy: 90.3600, Valid Loss: 0.4370
Epoch [18/20], Loss: 0.4607
Epoch [18/20], Valid Accuracy: 90.2100, Valid Loss: 0.4329
Epoch [19/20], Loss: 0.4612
Epoch [19/20], Valid Accuracy: 89.2800, Valid Loss: 0.4508
Epoch [20/20], Loss: 0.4606
Epoch [20/20], Valid Accuracy: 90.2300, Valid Loss: 0.4352
90.23 0.4351710855960846 0.4606189204533895
Epoch [1/20], Loss: 0.8623
Epoch [1/20], Valid Accuracy: 78.4200, Valid Loss: 0.8514
Epoch [2/20], Loss: 0.8373
Epoch [2/20], Valid Accuracy: 83.4800, Valid Loss: 0.8129
Epoch [3/20], Loss: 0.8325
Epoch [3/20], Valid Accuracy: 84.5000, Valid Loss: 0.8007
Epoch [4/20], Loss: 0.8276
Epoch [4/20], Valid Accuracy: 82.8100, Valid Loss: 0.8086
Epoch [5/20], Loss: 0.8212
Epoch [5

In [30]:
pd.DataFrame({'Weight Decay': weight_decay, 'Train_loss': Train_loss, 
              'Validation_loss': Validation_loss, 'Validation Accuracy': Validation_accuracy})

Unnamed: 0,Train_loss,Validation Accuracy,Validation_loss,Weight Decay
0,0.1183,98.1,0.0089,0.0
1,0.0784,98.11,0.0145,0.0001
2,0.0847,97.3,0.0531,0.001
3,0.1411,96.36,0.1555,0.01
4,0.4352,90.23,0.4606,0.1
5,0.7851,85.9,0.8112,0.3


## Q 3.4

In [31]:
def get_model_v2(M = 300, p=0):
    modules = []
    modules.append(nn.Linear(28*28, M))
    modules.append(nn.ReLU())
    if p > 0:
        modules.append(nn.Dropout(p))
    modules.append(nn.Linear(M, 10))
    return nn.Sequential(*modules).cuda()

In [32]:
%%time
dropout = np.linspace(0, 1, num=5)
validation_accuracy_4 = []
Train_loss_4 = []
Validation_loss_4 = []

for d in dropout:
    net2 = get_model_v2(M = 300, p=d)
    optimizer = optim.Adam(net2.parameters(), lr=0.001)
    val_acc, train_loss, val_loss = train_model(train_loader, test_loader, num_epochs=20, model=net2, optimizer=optimizer)
    validation_accuracy_4.append(val_acc)
    Train_loss_4.append(round(train_loss,4))
    Validation_loss_4.append(round(val_loss,4))

Epoch [1/20], Loss: 0.2173
Epoch [1/20], Valid Accuracy: 96.7900, Valid Loss: 0.1059
Epoch [2/20], Loss: 0.0902
Epoch [2/20], Valid Accuracy: 97.1000, Valid Loss: 0.0930
Epoch [3/20], Loss: 0.0617
Epoch [3/20], Valid Accuracy: 97.4400, Valid Loss: 0.0817
Epoch [4/20], Loss: 0.0435
Epoch [4/20], Valid Accuracy: 97.6100, Valid Loss: 0.0828
Epoch [5/20], Loss: 0.0353
Epoch [5/20], Valid Accuracy: 97.0600, Valid Loss: 0.0995
Epoch [6/20], Loss: 0.0292
Epoch [6/20], Valid Accuracy: 97.9600, Valid Loss: 0.0731
Epoch [7/20], Loss: 0.0225
Epoch [7/20], Valid Accuracy: 97.8400, Valid Loss: 0.0836
Epoch [8/20], Loss: 0.0212
Epoch [8/20], Valid Accuracy: 97.9600, Valid Loss: 0.0787
Epoch [9/20], Loss: 0.0191
Epoch [9/20], Valid Accuracy: 97.7800, Valid Loss: 0.0887
Epoch [10/20], Loss: 0.0151
Epoch [10/20], Valid Accuracy: 97.8200, Valid Loss: 0.0870
Epoch [11/20], Loss: 0.0151
Epoch [11/20], Valid Accuracy: 97.8000, Valid Loss: 0.0925
Epoch [12/20], Loss: 0.0132
Epoch [12/20], Valid Accuracy: 98

Epoch [16/20], Valid Accuracy: 10.1400, Valid Loss: 2.3175
Epoch [17/20], Loss: 2.3013
Epoch [17/20], Valid Accuracy: 10.2900, Valid Loss: 2.3173
Epoch [18/20], Loss: 2.3013
Epoch [18/20], Valid Accuracy: 10.4100, Valid Loss: 2.3174
Epoch [19/20], Loss: 2.3013
Epoch [19/20], Valid Accuracy: 10.3900, Valid Loss: 2.3171
Epoch [20/20], Loss: 2.3013
Epoch [20/20], Valid Accuracy: 10.5000, Valid Loss: 2.3173
CPU times: user 3min 59s, sys: 43.4 s, total: 4min 42s
Wall time: 12min 56s


In [39]:
pd.DataFrame({'Dropout': dropout, 'Train loss': Train_loss_4, 
              'Validation loss': Validation_loss_4, 'Validation Accuracy': validation_accuracy_4})

Unnamed: 0,Dropout,Train loss,Validation Accuracy,Validation loss
0,0.0,0.12,97.81,0.0098
1,0.25,0.0908,98.3,0.0254
2,0.5,0.0863,97.96,0.0627
3,0.75,0.0931,97.49,0.1796
4,1.0,2.3173,10.5,2.3013


Dropout parameter 0.25 achieves the best performance, because it reduce overfitting by zeroing out some of the neurons, while keep enough neurons to remain accurate.  
Dropout does help to increase testing accuracy(p = 0.25 or 0.5) compared to the model without a dropout(p = 0), and overall has better test accuracy than L2 regularization(except for p = 1, which means to drop all neurons).

## Q 3.5

In [48]:
def get_model_v3(M = 500, p=0):
    modules = []
    modules.append(nn.Linear(28*28, M))
    modules.append(nn.ReLU())
    if p > 0:
        modules.append(nn.Dropout(p))
    modules.append(nn.Linear(M, int(0.2*M)))
    modules.append(nn.ReLU())
    modules.append(nn.Linear(int(0.2*M), 10))
    
    return nn.Sequential(*modules).cuda()

In [49]:
%%time
hidden_layer_size = [300, 400, 500]
weight_decay = [0.0001, 0.001]
dropout = [0.25, 0.5]

df = pd.DataFrame({'hidden_layer_size': np.repeat(hidden_layer_size, 4), 
                   'weight_decay': np.tile(np.repeat(weight_decay,2),3),
                   'dropout': np.tile(dropout,6),
                    'Train_loss': np.zeros(12), 'Validation Accuracy': np.zeros(12), 'Validation_loss': np.zeros(12)})
print(df.head())

   Train_loss  Validation Accuracy  Validation_loss  dropout  \
0         0.0                  0.0              0.0     0.25   
1         0.0                  0.0              0.0     0.50   
2         0.0                  0.0              0.0     0.25   
3         0.0                  0.0              0.0     0.50   
4         0.0                  0.0              0.0     0.25   

   hidden_layer_size  weight_decay  
0                300        0.0001  
1                300        0.0001  
2                300        0.0010  
3                300        0.0010  
4                400        0.0001  
CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 6.4 ms


In [52]:
i = 0
for m in hidden_layer_size:
    for decay_param in weight_decay:
        for d in dropout:
            net3 = get_model_v3(M = m, p=d)
            optimizer = optim.Adam(net3.parameters(), lr=0.001, weight_decay = decay_param)
            val_acc, train_loss, val_loss = train_model(train_loader, test_loader, num_epochs=10, model=net3, optimizer=optimizer)
            df.iloc[i,0:3] = train_loss, val_acc, val_loss
            i += 1

Epoch [1/10], Loss: 0.2725
Epoch [1/10], Valid Accuracy: 96.4400, Valid Loss: 0.1163
Epoch [2/10], Loss: 0.1263
Epoch [2/10], Valid Accuracy: 97.2200, Valid Loss: 0.0935
Epoch [3/10], Loss: 0.0988
Epoch [3/10], Valid Accuracy: 97.5600, Valid Loss: 0.0824
Epoch [4/10], Loss: 0.0836
Epoch [4/10], Valid Accuracy: 97.5300, Valid Loss: 0.0787
Epoch [5/10], Loss: 0.0723
Epoch [5/10], Valid Accuracy: 97.6800, Valid Loss: 0.0750
Epoch [6/10], Loss: 0.0695
Epoch [6/10], Valid Accuracy: 97.3400, Valid Loss: 0.0875
Epoch [7/10], Loss: 0.0612
Epoch [7/10], Valid Accuracy: 98.0100, Valid Loss: 0.0679
Epoch [8/10], Loss: 0.0569
Epoch [8/10], Valid Accuracy: 98.0300, Valid Loss: 0.0642
Epoch [9/10], Loss: 0.0557
Epoch [9/10], Valid Accuracy: 97.5100, Valid Loss: 0.0795
Epoch [10/10], Loss: 0.0543
Epoch [10/10], Valid Accuracy: 98.1300, Valid Loss: 0.0678
Epoch [1/10], Loss: 0.3266
Epoch [1/10], Valid Accuracy: 95.3500, Valid Loss: 0.1480
Epoch [2/10], Loss: 0.1762
Epoch [2/10], Valid Accuracy: 96.450

Epoch [7/10], Valid Accuracy: 97.7600, Valid Loss: 0.0760
Epoch [8/10], Loss: 0.0891
Epoch [8/10], Valid Accuracy: 97.6100, Valid Loss: 0.0769
Epoch [9/10], Loss: 0.0830
Epoch [9/10], Valid Accuracy: 97.9900, Valid Loss: 0.0696
Epoch [10/10], Loss: 0.0822
Epoch [10/10], Valid Accuracy: 97.9200, Valid Loss: 0.0734
Epoch [1/10], Loss: 0.2451
Epoch [1/10], Valid Accuracy: 95.8000, Valid Loss: 0.1343
Epoch [2/10], Loss: 0.1333
Epoch [2/10], Valid Accuracy: 96.6400, Valid Loss: 0.1065
Epoch [3/10], Loss: 0.1175
Epoch [3/10], Valid Accuracy: 97.0300, Valid Loss: 0.0917
Epoch [4/10], Loss: 0.1108
Epoch [4/10], Valid Accuracy: 97.0700, Valid Loss: 0.0912
Epoch [5/10], Loss: 0.1063
Epoch [5/10], Valid Accuracy: 97.5100, Valid Loss: 0.0821
Epoch [6/10], Loss: 0.1020
Epoch [6/10], Valid Accuracy: 97.1900, Valid Loss: 0.0920
Epoch [7/10], Loss: 0.0994
Epoch [7/10], Valid Accuracy: 96.5300, Valid Loss: 0.1006
Epoch [8/10], Loss: 0.0966
Epoch [8/10], Valid Accuracy: 97.4800, Valid Loss: 0.0807
Epoch

In [56]:
df.sort_values(['Validation Accuracy'], ascending = False)

Unnamed: 0,Train_loss,Validation Accuracy,Validation_loss,dropout,hidden_layer_size,weight_decay
4,0.061655,98.31,0.051297,0.25,400,0.0001
0,0.067766,98.13,0.054272,0.25,300,0.0001
5,0.068044,98.06,0.083824,0.5,400,0.0001
1,0.068863,98.02,0.093582,0.5,300,0.0001
9,0.073385,97.92,0.082238,0.5,500,0.0001
6,0.07328,97.67,0.094512,0.25,400,0.001
2,0.075247,97.63,0.091765,0.25,300,0.001
8,0.087734,97.51,0.049999,0.25,500,0.0001
3,0.08312,97.46,0.133356,0.5,300,0.001
10,0.08422,97.38,0.094174,0.25,500,0.001


The 3-layer NN has best validation accuracy when dropout rate = 0.25, hidden_layer_size = 400, 
weight_decay = 0.0001, which is similar to 2-layer NN, and the best validation accuracy(98.31) is also similar to 2-layer NN(98.30).