# Neural Networks for MNIST dataset

In [7]:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from torch.utils.data import Dataset
import numpy as np

In [8]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

## Loading MNIST
Here we load the dataset and create data loaders.

In [9]:
train_ds = datasets.MNIST('data', train=True, download=True, 
                       transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
test_ds = datasets.MNIST('data', train=False, download=True, 
                       transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))

In [10]:
batch_size = 64
#batch_size = 5 # for testing
kwargs = {'num_workers': 1, 'pin_memory': True} 

train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=False, **kwargs)

## Feed Forward Neural Network

In [11]:
# for the number of neurons in the hidden unit
def get_model(M = 300):
    net = nn.Sequential(nn.Linear(28*28, M),
                        nn.ReLU(),
                        nn.Linear(M, 10))
    return net.cuda()

In [12]:
def train_model(train_loader, test_loader, num_epochs, model, optimizer):
    model.train()
    sum_loss = 0.0
    total = 0
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):  
            batch = images.shape[0] # size of the batch
            # Convert torch tensor to Variable, change shape of the input
            images = Variable(images.view(-1, 28*28)).cuda()
            labels = Variable(labels).cuda()
        
            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = model(images)
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
        
            total += batch
            sum_loss += batch * loss.data[0]
            if (i+1) % 100 == 0:
                print ('Epoch [%d/%d], Loss: %.4f' 
                   %(epoch+1, num_epochs, sum_loss/total))
                
        train_loss = sum_loss/total
        print('Epoch [%d/%d], Loss: %.4f' %(epoch+1, num_epochs, train_loss))
        val_acc, val_loss = model_accuracy_loss(model, test_loader)
        print('Epoch [%d/%d], Valid Accuracy: %.4f, Valid Loss: %.4f' %(epoch+1, num_epochs, val_acc, val_loss))
    return val_acc, val_loss, train_loss

In [13]:
def model_accuracy_loss(model, test_loader):
    model.eval()
    correct = 0
    sum_loss = 0.0
    total = 0
    for images, labels in test_loader:
        images = Variable(images.view(-1, 28*28)).cuda()
        labels = Variable(labels).cuda()
        outputs = model(images)
        _, pred = torch.max(outputs.data, 1)
        loss = F.cross_entropy(outputs, labels)
        sum_loss += labels.size(0)*loss.data[0]
        total += labels.size(0)
        correct += pred.eq(labels.data).cpu().sum()
    return 100 * correct / float(total), sum_loss/ float(total)

### 1.

Report a table of validation accuracy for the following values of learning rate 1, 0.1, 0.01, 0.001, 0.0001, 0.00001. 

In [8]:
learning_rates = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
dic = []
for i in learning_rates:
    net = get_model()
    optimizer = optim.Adam(net.parameters(), lr=i)
    Valid_Accuracy, _, _ = train_model(train_loader, test_loader, num_epochs=10, model=net, optimizer=optimizer)
    dic.append({'learning rate':i, 'valid accuracy':Valid_Accuracy})

Epoch [1/10], Loss: 1024.3426
Epoch [1/10], Loss: 514.0586
Epoch [1/10], Loss: 343.6475
Epoch [1/10], Loss: 265.1234
Epoch [1/10], Loss: 214.5760
Epoch [1/10], Loss: 180.5429
Epoch [1/10], Loss: 155.1018
Epoch [1/10], Loss: 136.7563
Epoch [1/10], Loss: 122.2315
Epoch [1/10], Loss: 117.4913
Epoch [1/10], Valid Accuracy: 10.7700, Valid Loss: 2.8178
Epoch [2/10], Loss: 106.5061
Epoch [2/10], Loss: 97.3512
Epoch [2/10], Loss: 89.8744
Epoch [2/10], Loss: 83.3316
Epoch [2/10], Loss: 77.7456
Epoch [2/10], Loss: 73.0718
Epoch [2/10], Loss: 68.7543
Epoch [2/10], Loss: 64.9340
Epoch [2/10], Loss: 61.5290
Epoch [2/10], Loss: 60.3460
Epoch [2/10], Valid Accuracy: 9.3800, Valid Loss: 2.7802
Epoch [3/10], Loss: 57.4102
Epoch [3/10], Loss: 54.7829
Epoch [3/10], Loss: 52.3728
Epoch [3/10], Loss: 50.1753
Epoch [3/10], Loss: 48.1618
Epoch [3/10], Loss: 46.4380
Epoch [3/10], Loss: 44.7259
Epoch [3/10], Loss: 43.1424
Epoch [3/10], Loss: 41.6737
Epoch [3/10], Loss: 41.1495
Epoch [3/10], Valid Accuracy: 10.

In [9]:
import pandas as pd
pd.DataFrame(dic, columns=['learning rate', 'valid accuracy']).sort_values('valid accuracy', ascending=False)

Unnamed: 0,learning rate,valid accuracy
3,0.001,97.75
4,0.0001,97.52
2,0.01,94.8
5,1e-05,92.84
0,1.0,10.52
1,0.1,10.2


Interpolate between the best two values

In [10]:
learning_rates = [0.001, 0.0007, 0.0005, 0.0003, 0.0001]
dic = []
for i in learning_rates:
    net = get_model()
    optimizer = optim.Adam(net.parameters(), lr=i)
    Valid_Accuracy, _, _ = train_model(train_loader, test_loader, num_epochs=10, model=net, optimizer=optimizer)
    dic.append({'learning rate':i, 'valid accuracy':Valid_Accuracy})

Epoch [1/10], Loss: 0.5552
Epoch [1/10], Loss: 0.4216
Epoch [1/10], Loss: 0.3597
Epoch [1/10], Loss: 0.3184
Epoch [1/10], Loss: 0.2905
Epoch [1/10], Loss: 0.2687
Epoch [1/10], Loss: 0.2538
Epoch [1/10], Loss: 0.2388
Epoch [1/10], Loss: 0.2275
Epoch [1/10], Loss: 0.2226
Epoch [1/10], Valid Accuracy: 96.6400, Valid Loss: 0.1036
Epoch [2/10], Loss: 0.2095
Epoch [2/10], Loss: 0.1993
Epoch [2/10], Loss: 0.1914
Epoch [2/10], Loss: 0.1840
Epoch [2/10], Loss: 0.1781
Epoch [2/10], Loss: 0.1721
Epoch [2/10], Loss: 0.1675
Epoch [2/10], Loss: 0.1630
Epoch [2/10], Loss: 0.1586
Epoch [2/10], Loss: 0.1571
Epoch [2/10], Valid Accuracy: 97.3800, Valid Loss: 0.0876
Epoch [3/10], Loss: 0.1521
Epoch [3/10], Loss: 0.1477
Epoch [3/10], Loss: 0.1439
Epoch [3/10], Loss: 0.1404
Epoch [3/10], Loss: 0.1373
Epoch [3/10], Loss: 0.1341
Epoch [3/10], Loss: 0.1312
Epoch [3/10], Loss: 0.1288
Epoch [3/10], Loss: 0.1267
Epoch [3/10], Loss: 0.1258
Epoch [3/10], Valid Accuracy: 97.3400, Valid Loss: 0.0814
Epoch [4/10], Lo

In [11]:
pd.DataFrame(dic, columns=['learning rate', 'valid accuracy']).sort_values('valid accuracy', ascending=False)

Unnamed: 0,learning rate,valid accuracy
0,0.001,98.18
3,0.0003,98.09
1,0.0007,98.04
2,0.0005,97.87
4,0.0001,97.62


### 2.

In [12]:
hidden_layer_size = [10, 50, 100, 300, 1000, 2000]
dic = []
for i in hidden_layer_size:
    net = get_model(M=i)
    optimizer = optim.Adam(net.parameters(), lr=0.01)
    Valid_Accuracy, Valid_Loss, Train_Loss = train_model(train_loader, test_loader, num_epochs=10, model=net, optimizer=optimizer)
    dic.append({'hidden layer size':i, 'valid accuracy':Valid_Accuracy, 
                'valid loss':Valid_Loss, 'train loss':Train_Loss})

Epoch [1/10], Loss: 0.7418
Epoch [1/10], Loss: 0.5795
Epoch [1/10], Loss: 0.5128
Epoch [1/10], Loss: 0.4838
Epoch [1/10], Loss: 0.4628
Epoch [1/10], Loss: 0.4538
Epoch [1/10], Loss: 0.4407
Epoch [1/10], Loss: 0.4322
Epoch [1/10], Loss: 0.4246
Epoch [1/10], Loss: 0.4207
Epoch [1/10], Valid Accuracy: 90.9900, Valid Loss: 0.3123
Epoch [2/10], Loss: 0.4142
Epoch [2/10], Loss: 0.4069
Epoch [2/10], Loss: 0.4017
Epoch [2/10], Loss: 0.3979
Epoch [2/10], Loss: 0.3950
Epoch [2/10], Loss: 0.3902
Epoch [2/10], Loss: 0.3880
Epoch [2/10], Loss: 0.3846
Epoch [2/10], Loss: 0.3826
Epoch [2/10], Loss: 0.3817
Epoch [2/10], Valid Accuracy: 90.0900, Valid Loss: 0.3344
Epoch [3/10], Loss: 0.3791
Epoch [3/10], Loss: 0.3765
Epoch [3/10], Loss: 0.3744
Epoch [3/10], Loss: 0.3714
Epoch [3/10], Loss: 0.3692
Epoch [3/10], Loss: 0.3679
Epoch [3/10], Loss: 0.3659
Epoch [3/10], Loss: 0.3643
Epoch [3/10], Loss: 0.3621
Epoch [3/10], Loss: 0.3612
Epoch [3/10], Valid Accuracy: 90.5600, Valid Loss: 0.3137
Epoch [4/10], Lo

In [13]:
import pandas as pd
pd.DataFrame(dic, columns=['hidden layer size', 'valid accuracy', 'valid loss', 'train loss']).sort_values('valid accuracy', ascending=False)

Unnamed: 0,hidden layer size,valid accuracy,valid loss,train loss
5,2000,95.33,0.274892,0.195315
4,1000,95.3,0.245478,0.189591
1,50,94.94,0.23572,0.18943
3,300,94.78,0.290227,0.186927
2,100,94.68,0.257502,0.18259
0,10,91.63,0.309325,0.312295


When hidden layer size parameter is equal to 2000, the model achieves the best performance. When hidden layer size parameter is bigger than 50, some models begin overfitting since the loss on the validation set begin to increase.

### 3.

In [14]:
weight_decay = [0, 0.0001, 0.001, 0.01, 0.1, 0.3]
dic = []
for i in weight_decay:
    net = get_model(M=300)
    optimizer = optim.Adam(net.parameters(), lr=0.001, weight_decay = i)
    Valid_Accuracy, Valid_Loss, Train_Loss = train_model(train_loader, test_loader, num_epochs=20, model=net, optimizer=optimizer)
    dic.append({'weight decay':i, 'valid accuracy':Valid_Accuracy, 
                'valid loss':Valid_Loss, 'train loss':Train_Loss})

Epoch [1/20], Loss: 0.5599
Epoch [1/20], Loss: 0.4207
Epoch [1/20], Loss: 0.3606
Epoch [1/20], Loss: 0.3194
Epoch [1/20], Loss: 0.2912
Epoch [1/20], Loss: 0.2702
Epoch [1/20], Loss: 0.2504
Epoch [1/20], Loss: 0.2367
Epoch [1/20], Loss: 0.2259
Epoch [1/20], Loss: 0.2214
Epoch [1/20], Valid Accuracy: 96.6700, Valid Loss: 0.1067
Epoch [2/20], Loss: 0.2093
Epoch [2/20], Loss: 0.1987
Epoch [2/20], Loss: 0.1910
Epoch [2/20], Loss: 0.1841
Epoch [2/20], Loss: 0.1773
Epoch [2/20], Loss: 0.1719
Epoch [2/20], Loss: 0.1667
Epoch [2/20], Loss: 0.1625
Epoch [2/20], Loss: 0.1584
Epoch [2/20], Loss: 0.1568
Epoch [2/20], Valid Accuracy: 97.2400, Valid Loss: 0.0862
Epoch [3/20], Loss: 0.1520
Epoch [3/20], Loss: 0.1476
Epoch [3/20], Loss: 0.1435
Epoch [3/20], Loss: 0.1396
Epoch [3/20], Loss: 0.1364
Epoch [3/20], Loss: 0.1334
Epoch [3/20], Loss: 0.1306
Epoch [3/20], Loss: 0.1282
Epoch [3/20], Loss: 0.1259
Epoch [3/20], Loss: 0.1251
Epoch [3/20], Valid Accuracy: 97.7200, Valid Loss: 0.0723
Epoch [4/20], Lo

In [15]:
import pandas as pd
pd.DataFrame(dic, columns=['weight decay', 'valid accuracy', 'valid loss', 'train loss']).sort_values('valid accuracy', ascending=False)

Unnamed: 0,weight decay,valid accuracy,valid loss,train loss
2,0.001,98.03,0.067723,0.071328
0,0.0,97.94,0.126481,0.033517
1,0.0001,97.83,0.085553,0.039328
3,0.01,96.14,0.14223,0.168963
4,0.1,89.95,0.440528,0.470957
5,0.3,84.24,0.787354,0.818047


### 4.

In [14]:
def get_model_v2(M = 300, p=0):
    modules = []
    modules.append(nn.Linear(28*28, M))
    modules.append(nn.ReLU())
    if p > 0:
        modules.append(nn.Dropout(p))
    modules.append(nn.Linear(M, 10))
    return nn.Sequential(*modules).cuda()

In [9]:
dropout = [0, 0.1, 0.3, 0.5, 0.7, 0.9]
dic = []
for i in dropout:
    net = get_model_v2(M = 300, p=i)
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    Valid_Accuracy, Valid_Loss, Train_Loss = train_model(train_loader, test_loader, num_epochs=20, model=net, optimizer=optimizer)
    dic.append({'dropout':i, 'valid accuracy':Valid_Accuracy, 
                'valid loss':Valid_Loss, 'train loss':Train_Loss})

Epoch [1/20], Loss: 0.5528
Epoch [1/20], Loss: 0.4241
Epoch [1/20], Loss: 0.3577
Epoch [1/20], Loss: 0.3196
Epoch [1/20], Loss: 0.2914
Epoch [1/20], Loss: 0.2699
Epoch [1/20], Loss: 0.2502
Epoch [1/20], Loss: 0.2367
Epoch [1/20], Loss: 0.2246
Epoch [1/20], Loss: 0.2206
Epoch [1/20], Valid Accuracy: 96.8700, Valid Loss: 0.1032
Epoch [2/20], Loss: 0.2089
Epoch [2/20], Loss: 0.1989
Epoch [2/20], Loss: 0.1902
Epoch [2/20], Loss: 0.1831
Epoch [2/20], Loss: 0.1767
Epoch [2/20], Loss: 0.1702
Epoch [2/20], Loss: 0.1652
Epoch [2/20], Loss: 0.1615
Epoch [2/20], Loss: 0.1579
Epoch [2/20], Loss: 0.1565
Epoch [2/20], Valid Accuracy: 97.2900, Valid Loss: 0.0857
Epoch [3/20], Loss: 0.1518
Epoch [3/20], Loss: 0.1472
Epoch [3/20], Loss: 0.1433
Epoch [3/20], Loss: 0.1397
Epoch [3/20], Loss: 0.1367
Epoch [3/20], Loss: 0.1335
Epoch [3/20], Loss: 0.1307
Epoch [3/20], Loss: 0.1279
Epoch [3/20], Loss: 0.1256
Epoch [3/20], Loss: 0.1248
Epoch [3/20], Valid Accuracy: 97.2800, Valid Loss: 0.0849
Epoch [4/20], Lo

In [10]:
import pandas as pd
pd.DataFrame(dic, columns=['dropout', 'valid accuracy', 'valid loss', 'train loss']).sort_values('valid accuracy', ascending=False)

Unnamed: 0,dropout,valid accuracy,valid loss,train loss
1,0.1,98.22,0.101321,0.034461
3,0.5,98.21,0.091109,0.039493
4,0.7,98.0,0.118887,0.046584
2,0.3,97.89,0.116289,0.036282
5,0.9,97.88,0.119293,0.074395
0,0.0,97.85,0.12698,0.033881


When dropout parameter is equal to 0.1, the model achieves the best performance, because the accuracy on the validation set is the highest. When dropout parameter is equal to 0, the accuracy on the validation is the lowest, so the dropout helps to increase testing accuracy compared to the model without a dropout.  
According to the accuracy on the validation set, the dropout performs better than L2 regularization.

### 5.

In [17]:
# for the number of neurons in the hidden unit
def get_model_v3(M = 300, N = 300):
    net = nn.Sequential(nn.Linear(28*28, M),
                        nn.ReLU(),
                        nn.Linear(M, N),
                        nn.ReLU(),
                        nn.Linear(N, 10))
    return net.cuda()

tune `learning rate`

In [18]:
learning_rates = [0.01, 0.001, 0.0001]
dic = []
for i in learning_rates:
    net = get_model_v3()
    optimizer = optim.Adam(net.parameters(), lr=i)
    Valid_Accuracy, Valid_Loss, Train_Loss = train_model(train_loader, test_loader, num_epochs=10, model=net, optimizer=optimizer)
    dic.append({'learning rate':i, 'valid accuracy':Valid_Accuracy, 'valid loss':Valid_Loss, 'train loss':Train_Loss})

Epoch [1/10], Loss: 0.6579
Epoch [1/10], Loss: 0.5016
Epoch [1/10], Loss: 0.4419
Epoch [1/10], Loss: 0.4059
Epoch [1/10], Loss: 0.3815
Epoch [1/10], Loss: 0.3649
Epoch [1/10], Loss: 0.3428
Epoch [1/10], Loss: 0.3309
Epoch [1/10], Loss: 0.3302
Epoch [1/10], Loss: 0.3262
Epoch [1/10], Valid Accuracy: 93.6400, Valid Loss: 0.2228
Epoch [2/10], Loss: 0.3144
Epoch [2/10], Loss: 0.3074
Epoch [2/10], Loss: 0.3001
Epoch [2/10], Loss: 0.2958
Epoch [2/10], Loss: 0.2920
Epoch [2/10], Loss: 0.2870
Epoch [2/10], Loss: 0.2817
Epoch [2/10], Loss: 0.2778
Epoch [2/10], Loss: 0.2734
Epoch [2/10], Loss: 0.2720
Epoch [2/10], Valid Accuracy: 94.5400, Valid Loss: 0.2137
Epoch [3/10], Loss: 0.2667
Epoch [3/10], Loss: 0.2621
Epoch [3/10], Loss: 0.2587
Epoch [3/10], Loss: 0.2584
Epoch [3/10], Loss: 0.2557
Epoch [3/10], Loss: 0.2534
Epoch [3/10], Loss: 0.2514
Epoch [3/10], Loss: 0.2503
Epoch [3/10], Loss: 0.2484
Epoch [3/10], Loss: 0.2474
Epoch [3/10], Valid Accuracy: 95.7800, Valid Loss: 0.1680
Epoch [4/10], Lo

In [19]:
import pandas as pd
pd.DataFrame(dic, columns=['learning rate', 'valid accuracy', 'valid loss', 'train loss']).sort_values('valid accuracy', ascending=False)

Unnamed: 0,learning rate,valid accuracy,valid loss,train loss
2,0.0001,98.03,0.063571,0.126649
1,0.001,97.9,0.09166,0.05988
0,0.01,95.83,0.204209,0.179612


tune `hide layer size`

In [20]:
hidden_layer_size_M = [50, 100, 300]
hidden_layer_size_N = [50, 100, 300]
dic = []
for i in hidden_layer_size_M:
    for j in hidden_layer_size_N:
        net = get_model_v3(M=i,N=j)
        optimizer = optim.Adam(net.parameters(), lr=0.0001)
        Valid_Accuracy, Valid_Loss, Train_Loss = train_model(train_loader, test_loader, num_epochs=10, model=net, optimizer=optimizer)
        dic.append({'hidden layer size M':i, 'hidden layer size N':j,
                    'valid accuracy':Valid_Accuracy, 'valid loss':Valid_Loss, 'train loss':Train_Loss})

Epoch [1/10], Loss: 2.0670
Epoch [1/10], Loss: 1.7281
Epoch [1/10], Loss: 1.4488
Epoch [1/10], Loss: 1.2503
Epoch [1/10], Loss: 1.1111
Epoch [1/10], Loss: 1.0060
Epoch [1/10], Loss: 0.9246
Epoch [1/10], Loss: 0.8593
Epoch [1/10], Loss: 0.8065
Epoch [1/10], Loss: 0.7887
Epoch [1/10], Valid Accuracy: 90.3100, Valid Loss: 0.3508
Epoch [2/10], Loss: 0.7486
Epoch [2/10], Loss: 0.7130
Epoch [2/10], Loss: 0.6824
Epoch [2/10], Loss: 0.6555
Epoch [2/10], Loss: 0.6314
Epoch [2/10], Loss: 0.6101
Epoch [2/10], Loss: 0.5919
Epoch [2/10], Loss: 0.5749
Epoch [2/10], Loss: 0.5598
Epoch [2/10], Loss: 0.5543
Epoch [2/10], Valid Accuracy: 91.9400, Valid Loss: 0.2790
Epoch [3/10], Loss: 0.5415
Epoch [3/10], Loss: 0.5289
Epoch [3/10], Loss: 0.5174
Epoch [3/10], Loss: 0.5065
Epoch [3/10], Loss: 0.4958
Epoch [3/10], Loss: 0.4867
Epoch [3/10], Loss: 0.4776
Epoch [3/10], Loss: 0.4695
Epoch [3/10], Loss: 0.4625
Epoch [3/10], Loss: 0.4597
Epoch [3/10], Valid Accuracy: 92.6600, Valid Loss: 0.2500
Epoch [4/10], Lo

In [21]:
import pandas as pd
pd.DataFrame(dic, columns=['hidden layer size M', 'hidden layer size N', 'valid accuracy', 'valid loss', 'train loss']).sort_values('valid accuracy', ascending=False)

Unnamed: 0,hidden layer size M,hidden layer size N,valid accuracy,valid loss,train loss
8,300,300,97.88,0.06554,0.126097
7,300,100,97.75,0.068794,0.146043
6,300,50,97.62,0.082118,0.168407
5,100,300,97.28,0.089812,0.176632
4,100,100,96.75,0.108933,0.209025
3,100,50,96.55,0.114149,0.22156
2,50,300,96.39,0.118,0.214698
1,50,100,95.95,0.135648,0.250303
0,50,50,95.26,0.157863,0.273716


tune `weight decay` and `dropout`

In [22]:
def get_model_v4(M = 300, N = 300, p=0):
    modules = []
    modules.append(nn.Linear(28*28, M))
    modules.append(nn.ReLU())
    if p > 0:
        modules.append(nn.Dropout(p))
    modules.append(nn.Linear(M, N))
    modules.append(nn.ReLU())
    if p > 0:
        modules.append(nn.Dropout(p))
    modules.append(nn.Linear(N, 10))
    return nn.Sequential(*modules).cuda()

In [26]:
dropout = [0, 0.1, 0.2]
weight_decay = [0, 0.001, 0.01, 0.1]

dic = []
for i in dropout:
    for j in weight_decay:
        net = get_model_v4(p=i)
        optimizer = optim.Adam(net.parameters(), lr=0.0001, weight_decay=j)
        Valid_Accuracy, Valid_Loss, Train_Loss = train_model(train_loader, test_loader, num_epochs=10, model=net, optimizer=optimizer)
        dic.append({'dropout':i, 'weight_decay':j,'valid accuracy':Valid_Accuracy, 
                    'valid loss':Valid_Loss, 'train loss':Train_Loss})

Epoch [1/10], Loss: 1.4854
Epoch [1/10], Loss: 1.0118
Epoch [1/10], Loss: 0.8065
Epoch [1/10], Loss: 0.6930
Epoch [1/10], Loss: 0.6159
Epoch [1/10], Loss: 0.5666
Epoch [1/10], Loss: 0.5268
Epoch [1/10], Loss: 0.4942
Epoch [1/10], Loss: 0.4662
Epoch [1/10], Loss: 0.4574
Epoch [1/10], Valid Accuracy: 93.2600, Valid Loss: 0.2351
Epoch [2/10], Loss: 0.4355
Epoch [2/10], Loss: 0.4162
Epoch [2/10], Loss: 0.4000
Epoch [2/10], Loss: 0.3852
Epoch [2/10], Loss: 0.3727
Epoch [2/10], Loss: 0.3611
Epoch [2/10], Loss: 0.3506
Epoch [2/10], Loss: 0.3410
Epoch [2/10], Loss: 0.3319
Epoch [2/10], Loss: 0.3286
Epoch [2/10], Valid Accuracy: 95.1800, Valid Loss: 0.1661
Epoch [3/10], Loss: 0.3200
Epoch [3/10], Loss: 0.3117
Epoch [3/10], Loss: 0.3050
Epoch [3/10], Loss: 0.2976
Epoch [3/10], Loss: 0.2911
Epoch [3/10], Loss: 0.2845
Epoch [3/10], Loss: 0.2792
Epoch [3/10], Loss: 0.2744
Epoch [3/10], Loss: 0.2687
Epoch [3/10], Loss: 0.2670
Epoch [3/10], Valid Accuracy: 96.3100, Valid Loss: 0.1233
Epoch [4/10], Lo

In [27]:
import pandas as pd
pd.DataFrame(dic, columns=['dropout', 'weight_decay', 'valid accuracy', 'valid loss', 'train loss']).sort_values('valid accuracy', ascending=False)

Unnamed: 0,dropout,weight_decay,valid accuracy,valid loss,train loss
8,0.2,0.0,97.92,0.065806,0.131763
0,0.0,0.0,97.88,0.070429,0.126676
5,0.1,0.001,97.88,0.069784,0.13573
4,0.1,0.0,97.86,0.067516,0.127544
9,0.2,0.001,97.86,0.067847,0.137454
1,0.0,0.001,97.67,0.076876,0.134824
10,0.2,0.01,96.75,0.128541,0.204176
2,0.0,0.01,96.66,0.129169,0.199194
6,0.1,0.01,96.57,0.132899,0.202179
7,0.1,0.1,88.37,0.521691,0.577097
