# With MNIST data set, to randomly use 10000 images as the training set, 5000 images as the validation set and use the test set as provided, and explore how does the network architecture [number of hidden layers, number of dimensions per hidden layer, activation functions], Batch normalization, optimization algorithms, learning rate and batch size would affect the training accuracy and validation accuracy. Finally, for the best model that achieved in validation accuracy, you further train it with the whole training set, what would be your training accuracy, validation accuracy and test accuracy.  Please summarize what you have learned from this experiment.

In [2]:
!pip install torch torchvision
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/share/apps/python/3.8.6/intel/bin/python -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/share/apps/python/3.8.6/intel/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import random_split

In [4]:
# load MNIST dataset using torchvision module
# split the dataset into training (10000 images), validation (5000 images), and test sets
image_path = './'
transform = transforms.Compose([
        transforms.ToTensor()
])

mnist_dataset = torchvision.datasets.MNIST(
        root=image_path, train=True,
        transform=transform, download=False
)

train_size = 10000
val_size = 5000
test_size = len(mnist_dataset) - train_size - val_size

train_ds, val_ds, test_ds = random_split(mnist_dataset, [train_size, val_size, test_size])

In [5]:
# verify length of each dataset
print('Training set size: ', len(train_ds))
print('Validation set size: ', len(val_ds))
print('Test set size: ', len(test_ds))

Training set size:  10000
Validation set size:  5000
Test set size:  45000


# Explore number of hidden layers, number of dimensions per hidden layer, activation functions.

In [6]:
# construct NN model 
hidden_units = [32, 16]
image_size = mnist_dataset[0][0].shape
input_size = image_size[0] * image_size[1] * image_size[2]

all_layers = [nn.Flatten()]
for hidden_unit in hidden_units:
    layer = nn.Linear(input_size, hidden_unit)
    all_layers.append(layer)
    all_layers.append(nn.ReLU())
    input_size = hidden_unit

all_layers.append(nn.Linear(hidden_units[-1], 10))
model = nn.Sequential(*all_layers)

model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=32, bias=True)
  (2): ReLU()
  (3): Linear(in_features=32, out_features=16, bias=True)
  (4): ReLU()
  (5): Linear(in_features=16, out_features=10, bias=True)
)

In [7]:
# define loss function
loss_fn = nn.CrossEntropyLoss()

# define Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [8]:
# define training function
def train(model, train_dl, loss_fn, optimizer, epochs=10):
    for epoch in range(epochs):
        for batch in train_dl:
            images, labels = batch
            outputs = model(images)
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

In [9]:
# function call
batch_size = 32
train_dl = DataLoader(train_ds, batch_size)
val_dl = DataLoader(val_ds, batch_size)
train(model, train_dl, loss_fn, optimizer)

Epoch 1/10, Loss: 0.3033
Epoch 2/10, Loss: 0.1942
Epoch 3/10, Loss: 0.1593
Epoch 4/10, Loss: 0.1285
Epoch 5/10, Loss: 0.1064
Epoch 6/10, Loss: 0.0798
Epoch 7/10, Loss: 0.0608
Epoch 8/10, Loss: 0.0468
Epoch 9/10, Loss: 0.0392
Epoch 10/10, Loss: 0.0316


In [10]:
# evaluate model on test set
def evaluate(model, test_dl):
    correct = 0
    total = 0
    for batch in test_dl:
        images, labels = batch
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print(f"Accuracy: {correct/total*100:.2f}%")

evaluate(model, val_dl)

Accuracy: 92.28%


In [11]:
# tweak parameters and retrain the model
# parameters: number of hidden layers, number of dimensions per hidden layer, activation functions

# construct NN model -- Trial 2
hidden_units = [32, 16]
image_size = mnist_dataset[0][0].shape
input_size = image_size[0] * image_size[1] * image_size[2]

all_layers = [nn.Flatten()]
for hidden_unit in hidden_units:
    layer = nn.Linear(input_size, hidden_unit)
    all_layers.append(layer)
    all_layers.append(nn.ReLU())
    input_size = hidden_unit

all_layers.append(nn.Linear(hidden_units[-1], 10))
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=32, bias=True)
  (2): ReLU()
  (3): Linear(in_features=32, out_features=16, bias=True)
  (4): ReLU()
  (5): Linear(in_features=16, out_features=10, bias=True)
)

In [12]:
# use cross-entropy loss function
loss_fn = nn.CrossEntropyLoss()

# use SGD optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [13]:
# train the model
batch_size = 32
train_dl = DataLoader(train_ds, batch_size)
val_dl = DataLoader(val_ds, batch_size)
train(model, train_dl, loss_fn, optimizer)

Epoch 1/10, Loss: 2.3202
Epoch 2/10, Loss: 2.2983
Epoch 3/10, Loss: 2.2749
Epoch 4/10, Loss: 2.2503
Epoch 5/10, Loss: 2.2227
Epoch 6/10, Loss: 2.1915
Epoch 7/10, Loss: 2.1567
Epoch 8/10, Loss: 2.1177
Epoch 9/10, Loss: 2.0743
Epoch 10/10, Loss: 2.0264


In [14]:
# evaluate model on test set
evaluate(model, val_dl)

Accuracy: 37.88%


In [15]:
# construct NN model -- Trial 3
hidden_units = [32, 16]
image_size = mnist_dataset[0][0].shape
input_size = image_size[0] * image_size[1] * image_size[2]

all_layers = [nn.Flatten()]
for hidden_unit in hidden_units:
    layer = nn.Linear(input_size, hidden_unit)
    all_layers.append(layer)
    all_layers.append(nn.ReLU())
    input_size = hidden_unit

all_layers.append(nn.Linear(hidden_units[-1], 10))
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=32, bias=True)
  (2): ReLU()
  (3): Linear(in_features=32, out_features=16, bias=True)
  (4): ReLU()
  (5): Linear(in_features=16, out_features=10, bias=True)
)

In [16]:
# use cross-entropy loss function
loss_fn = nn.CrossEntropyLoss()

# use RMSprop optimizer
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)

In [17]:
# train the model
batch_size = 32
train_dl = DataLoader(train_ds, batch_size)
val_dl = DataLoader(val_ds, batch_size)
train(model, train_dl, loss_fn, optimizer)

Epoch 1/10, Loss: 0.5309
Epoch 2/10, Loss: 0.3773
Epoch 3/10, Loss: 0.2972
Epoch 4/10, Loss: 0.2408
Epoch 5/10, Loss: 0.1766
Epoch 6/10, Loss: 0.1224
Epoch 7/10, Loss: 0.0871
Epoch 8/10, Loss: 0.0648
Epoch 9/10, Loss: 0.0497
Epoch 10/10, Loss: 0.0445


In [18]:
# evaluate model on test set
evaluate(model, val_dl)

Accuracy: 92.74%


# Between using SGD, Adam, and RMSProp, the best optimizer for this classification problem will be the RMSProp optimizer.

In [19]:
# construct NN model -- Trial 4 (5 hidden layers, 32, 16, 8, 4, 2 dimensions per layer, Softmax activation function)
hidden_units = [32, 16, 8, 4, 2]
image_size = mnist_dataset[0][0].shape
input_size = image_size[0] * image_size[1] * image_size[2]

all_layers = [nn.Flatten()]
for hidden_unit in hidden_units:
    layer = nn.Linear(input_size, hidden_unit)
    all_layers.append(layer)
    all_layers.append(nn.Softmax())
    input_size = hidden_unit

all_layers.append(nn.Linear(hidden_units[-1], 10))
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=32, bias=True)
  (2): Softmax(dim=None)
  (3): Linear(in_features=32, out_features=16, bias=True)
  (4): Softmax(dim=None)
  (5): Linear(in_features=16, out_features=8, bias=True)
  (6): Softmax(dim=None)
  (7): Linear(in_features=8, out_features=4, bias=True)
  (8): Softmax(dim=None)
  (9): Linear(in_features=4, out_features=2, bias=True)
  (10): Softmax(dim=None)
  (11): Linear(in_features=2, out_features=10, bias=True)
)

In [20]:
# use cross-entropy loss function
loss_fn = nn.CrossEntropyLoss()

# use RMSProp optimizer but increase learning rate
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-2)

In [21]:
# train the model with increased batch size 
batch_size = 64
train_dl = DataLoader(train_ds, batch_size)
val_dl = DataLoader(val_ds, batch_size)
train(model, train_dl, loss_fn, optimizer)

  input = module(input)


Epoch 1/10, Loss: 1.8222
Epoch 2/10, Loss: 1.7595
Epoch 3/10, Loss: 1.6966
Epoch 4/10, Loss: 1.6492
Epoch 5/10, Loss: 1.8485
Epoch 6/10, Loss: 1.6031
Epoch 7/10, Loss: 1.5109
Epoch 8/10, Loss: 1.4111
Epoch 9/10, Loss: 1.3475
Epoch 10/10, Loss: 1.3208


In [22]:
# evaluate model on test set
evaluate(model, val_dl)

Accuracy: 37.50%


In [23]:
# construct NN model -- Trial 5 (5 hidden layers, 32, 16, 8, 4, 2 dimensions per layer, ReLU activation function)
hidden_units = [32, 16, 8, 4, 2]
image_size = mnist_dataset[0][0].shape
input_size = image_size[0] * image_size[1] * image_size[2]

all_layers = [nn.Flatten()]
for hidden_unit in hidden_units:
    layer = nn.Linear(input_size, hidden_unit)
    all_layers.append(layer)
    all_layers.append(nn.ReLU())
    input_size = hidden_unit

all_layers.append(nn.Linear(hidden_units[-1], 10))
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=32, bias=True)
  (2): ReLU()
  (3): Linear(in_features=32, out_features=16, bias=True)
  (4): ReLU()
  (5): Linear(in_features=16, out_features=8, bias=True)
  (6): ReLU()
  (7): Linear(in_features=8, out_features=4, bias=True)
  (8): ReLU()
  (9): Linear(in_features=4, out_features=2, bias=True)
  (10): ReLU()
  (11): Linear(in_features=2, out_features=10, bias=True)
)

In [24]:
# use cross-entropy loss function
loss_fn = nn.CrossEntropyLoss()

# use RMSProp optimizer
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)

In [25]:
# train the model
batch_size = 32
train_dl = DataLoader(train_ds, batch_size)
val_dl = DataLoader(val_ds, batch_size)
train(model, train_dl, loss_fn, optimizer)

Epoch 1/10, Loss: 2.0656
Epoch 2/10, Loss: 1.9504
Epoch 3/10, Loss: 1.8738
Epoch 4/10, Loss: 1.8231
Epoch 5/10, Loss: 1.7858
Epoch 6/10, Loss: 1.7349
Epoch 7/10, Loss: 1.6579
Epoch 8/10, Loss: 1.6046
Epoch 9/10, Loss: 1.5541
Epoch 10/10, Loss: 1.5313


In [26]:
# evaluate model on test set
evaluate(model, val_dl)

Accuracy: 45.02%


# Based on trials 4 and 5, it appears using a lower batch size (32) as opposed to a higher one (64) yields better accuracy. Additionally, with more hidden layers the accuracy worsens. Also, using the Softmax activation function yields a lower accuracy result than using ReLU.

In [27]:
# construct NN model -- Trial 6
hidden_units = [32, 16]
image_size = mnist_dataset[0][0].shape
input_size = image_size[0] * image_size[1] * image_size[2]

all_layers = [nn.Flatten()]
for hidden_unit in hidden_units:
    layer = nn.Linear(input_size, hidden_unit)
    all_layers.append(layer)
    all_layers.append(nn.ReLU())
    input_size = hidden_unit

all_layers.append(nn.Linear(hidden_units[-1], 10))
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=32, bias=True)
  (2): ReLU()
  (3): Linear(in_features=32, out_features=16, bias=True)
  (4): ReLU()
  (5): Linear(in_features=16, out_features=10, bias=True)
)

In [28]:
# use cross-entropy loss function
loss_fn = nn.CrossEntropyLoss()

# use RMSProp optimizer
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3)

In [29]:
# train the model
batch_size = 32
train_dl = DataLoader(train_ds, batch_size)
val_dl = DataLoader(val_ds, batch_size)
train(model, train_dl, loss_fn, optimizer)

Epoch 1/10, Loss: 0.4404
Epoch 2/10, Loss: 0.2819
Epoch 3/10, Loss: 0.2163
Epoch 4/10, Loss: 0.1818
Epoch 5/10, Loss: 0.1559
Epoch 6/10, Loss: 0.1324
Epoch 7/10, Loss: 0.1095
Epoch 8/10, Loss: 0.0950
Epoch 9/10, Loss: 0.0815
Epoch 10/10, Loss: 0.0692


In [30]:
# evaluate model on test set
evaluate(model, val_dl)

Accuracy: 91.44%


In [31]:
# construct NN model -- Trial 7
hidden_units = [32, 16]
image_size = mnist_dataset[0][0].shape
input_size = image_size[0] * image_size[1] * image_size[2]

all_layers = [nn.Flatten()]
for hidden_unit in hidden_units:
    layer = nn.Linear(input_size, hidden_unit)
    all_layers.append(layer)
    all_layers.append(nn.ReLU())
    input_size = hidden_unit

all_layers.append(nn.Linear(hidden_units[-1], 10))
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=32, bias=True)
  (2): ReLU()
  (3): Linear(in_features=32, out_features=16, bias=True)
  (4): ReLU()
  (5): Linear(in_features=16, out_features=10, bias=True)
)

In [32]:
# use cross-entropy loss function
loss_fn = nn.CrossEntropyLoss()

# use RMSProp optimizer
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-4)

In [33]:
# train the model
batch_size = 32
train_dl = DataLoader(train_ds, batch_size)
val_dl = DataLoader(val_ds, batch_size)
train(model, train_dl, loss_fn, optimizer)

Epoch 1/10, Loss: 1.6943
Epoch 2/10, Loss: 1.2404
Epoch 3/10, Loss: 0.9222
Epoch 4/10, Loss: 0.7292
Epoch 5/10, Loss: 0.6094
Epoch 6/10, Loss: 0.5321
Epoch 7/10, Loss: 0.4776
Epoch 8/10, Loss: 0.4361
Epoch 9/10, Loss: 0.4015
Epoch 10/10, Loss: 0.3714


In [34]:
# evaluate model on test set
evaluate(model, val_dl)

Accuracy: 86.54%


# Decreasing the learning rate to 1e-4 is worse than if the learning rate is 1e-3, therefore the best learning rate to train this classification problem is 1e-3.

# The best neural network architecture found is to build two hidden layers with dimensions 32 and 16. Use ReLU as an activation function, use cross-entropy loss function, use RMSProp as the optimizer with a learning rate of 1e-3, and finally to set the batch size to 32.