**Challenge: Implement a Multiclass Classification Neural Network using PyTorch**

Objective:
Build a neural network using PyTorch to predict handwritten digits of MNIST.

Steps:

1. **Data Preparation**: Load the MNIST dataset using ```torchvision.datasets.MNIST```. Standardize/normalize the features. Split the dataset into training and testing sets using, for example, ```sklearn.model_selection.train_test_split()```. **Bonus scores**: *use PyTorch's built-in* ```DataLoader``` *to split the dataset*.

2. **Neural Network Architecture**: Define a simple feedforward neural network using PyTorch's ```nn.Module```. Design the input layer to match the number of features in the MNIST dataset and the output layer to have as many neurons as there are classes (10). You can experiment with the number of hidden layers and neurons to optimize the performance. **Bonus scores**: *Make your architecture flexible enough to have as many hidden layers as the user wants, and use hyperparameter optimization to select the best number of hidden layers.*

3. **Loss Function and Optimizer**: Choose an appropriate loss function for multiclass classification. Select an optimizer, like SGD (Stochastic Gradient Descent) or Adam.

4. **Training**: Write a training loop to iterate over the dataset.
Forward pass the input through the network, calculate the loss, and perform backpropagation. Update the weights of the network using the chosen optimizer.

5. **Testing**: Evaluate the trained model on the test set. Calculate the accuracy of the model.

6. **Optimization**: Experiment with hyperparameters (learning rate, number of epochs, etc.) to optimize the model's performance. Consider adjusting the neural network architecture for better results. **Notice that you can't use the optimization algorithms from scikit-learn that we saw in lab1: e.g.,** ```GridSearchCV```.


In [None]:
#imports
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms


In [None]:
#1

# ToTensor scales pixels to [0, 1]; Normalize with mean 0.5 and std 0.5 then maps them to [-1, 1].
# Mean and std are passed as one-element sequences (one entry per image channel).
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

# get the training and test set of MNIST (downloaded into ./data if not already present)
trainset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
testset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# wrap both sets in mini-batch loaders
batch_size = 32
# training batches are reshuffled every epoch
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
# evaluation does not depend on sample order, so the test set is read in a fixed order
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

In [None]:
#2

# In this case all hidden layers have the same size. To let the user customize the size of every hidden layer,
# we could instead take a list containing all the layer sizes as a parameter and change the for loop
# so that the input size of each layer equals the output size of the layer before it.

class NET(nn.Module):
  """Fully connected feed-forward classifier with equally sized hidden layers.

  The network is: input layer (input_size -> hidden_size), followed by
  `hidden_layers` linear layers (hidden_size -> hidden_size), followed by an
  output layer (hidden_size -> num_classes). ReLU is applied after every layer
  except the output layer.

  Args:
    input_size: number of features of one flattened sample.
    hidden_layers: number of hidden_size -> hidden_size layers placed after
      the input layer (0 gives a network with only input and output layers).
    hidden_size: output width of the input layer and width of every hidden layer.
    num_classes: number of output scores per sample; defaults to 10.
  """

  def __init__(self, input_size, hidden_layers, hidden_size, num_classes=10):
    super().__init__()

    # Define input layer
    self.input_layer = nn.Linear(input_size, hidden_size)

    # Define hidden layers; ModuleList registers their parameters with the module
    self.hidden_layers = nn.ModuleList(
      nn.Linear(hidden_size, hidden_size) for _ in range(hidden_layers)
    )

    # Define output layer
    self.output_layer = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    """Return the raw output scores, one row of `num_classes` values per sample.

    Every dimension of `x` after the first (batch) dimension is flattened into
    a single feature dimension before the input layer is applied.
    """
    # reshape (unlike view) also accepts non-contiguous tensors
    x = x.reshape(x.size(0), -1)

    # Apply activation function (ReLU) to input layer
    x = torch.relu(self.input_layer(x))

    # Apply activation function and pass through each hidden layer
    for layer in self.hidden_layers:
      x = torch.relu(layer(x))

    # Output layer without activation
    return self.output_layer(x)


In [None]:
# Network configuration
input_size = 28 * 28   # features per sample once the network flattens the image
hidden_layers = 2      # hidden_size -> hidden_size layers stacked after the input layer
hidden_size = 256      # width shared by the input layer's output and every hidden layer

model = NET(
  input_size=input_size,
  hidden_layers=hidden_layers,
  hidden_size=hidden_size,
)

In [None]:
#3

# Cross-entropy loss on the network's raw output scores
loss_fn = nn.CrossEntropyLoss()
# Adam over every parameter of the model
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


In [None]:
#4

n_epochs = 5

# Training: n_epochs passes over the training loader, one optimizer step per mini-batch
for epoch in range(n_epochs):

  # placeholder value, replaced by the loss of each processed mini-batch
  loss = 0

  for inputs, targets in trainloader:

    # drop the gradients of the previous step before computing new ones
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    loss.backward()
    optimizer.step()

  # the value reported is the loss of the last mini-batch of the epoch
  print(f'Finished epoch {epoch + 1}, latest loss {loss}')

Finished epoch 1, latest loss 0.00017945421859622002
Finished epoch 2, latest loss 0.001070813974365592
Finished epoch 3, latest loss 0.07683932036161423
Finished epoch 4, latest loss 0.010269719175994396
Finished epoch 5, latest loss 0.00015121296746656299


In [None]:
#5

#testing

# Count how many test samples get the right label; no gradients are needed here
correct = 0
total = 0
with torch.no_grad():
    for images, labels in testloader:

        scores = model(images)

        # predicted class = position of the largest score in each row
        predicted = torch.max(scores, dim=1).indices

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy of the model on the test images: {accuracy:.2f}%')

Accuracy of the model on the test images: 97.89%


In [None]:
#6

#experiments with various hyperparameters

# Search over learning rate and number of hidden layers: every combination gets a
# freshly initialised model trained for n_epochs, and the most accurate one is kept.
# NOTE(review): the accuracy used for selection is measured on the test set, so the
# reported best accuracy is an optimistic estimate; a separate validation split
# would be the cleaner choice.

learning_rates = [0.001, 0.01, 0.1]
hl1 = 2
hl2 = 3
hl3 = 4

best_accuracy = 0
# [learning rate, hidden layers] of the best model found so far
best_set_of_parameters = None

for rate in learning_rates:

  for n_hidden in (hl1, hl2, hl3):

    #new model and new optimizer for this (learning rate, hidden layers) pair
    candidate = NET(input_size, n_hidden, hidden_size)
    candidate_optimizer = optim.Adam(candidate.parameters(), lr=rate)

    #training the model
    for epoch in range(n_epochs):

      for x,y in trainloader:

        candidate_optimizer.zero_grad()
        candidate_loss = loss_fn(candidate(x), y)
        candidate_loss.backward()
        candidate_optimizer.step()

    #Testing the model
    total = 0
    candidate_correct = 0

    with torch.no_grad():
      for x,y in testloader:

        predicted = candidate(x).argmax(dim=1)

        total += y.size(0)
        candidate_correct += (predicted == y).sum().item()

    candidate_accuracy = 100 * candidate_correct / total

    # every candidate is compared on its own, so a deeper model is never skipped
    # just because a shallower one trained with the same rate already improved the best
    if best_set_of_parameters is None or candidate_accuracy > best_accuracy:
      best_accuracy = candidate_accuracy
      best_set_of_parameters = [rate, n_hidden]

print(f'Best Accuracy: {best_accuracy:.2f}%, optimal set of hyperparameters: Learning Rate: {best_set_of_parameters[0]}; Hidden layers: {best_set_of_parameters[1]}')

Best Accuracy: 96.58%, optimal set of hyperparameters: Learning Rate: 0.001; Hidden layers: 10


In [None]:
#If we want we can also test with different numbers of epochs and learning rates
#with the following code. I didn't run it because of the very long time it would need

# Search over learning rate, number of hidden layers and number of training epochs.
# Each (learning rate, hidden layers) model is trained once up to the largest epoch
# count and evaluated every time it reaches one of the requested epoch counts, so the
# reported number of epochs is the number the model was actually trained for.
# NOTE(review): selection uses test-set accuracy; a separate validation split would
# give an unbiased estimate.

learning_rates = [0.001, 0.01, 0.1]
epoch_list = [5, 10, 15]
hl1 = 2
hl2 = 3
hl3 = 4

best_accuracy = 0
# [learning rate, epochs, hidden layers] of the best model found so far
best_set_of_parameters = None

# epoch counts in increasing order so training can continue from one to the next
epoch_checkpoints = sorted(set(epoch_list))

for rate in learning_rates:

  for n_hidden in (hl1, hl2, hl3):

    #new model and new optimizer for this (learning rate, hidden layers) pair
    candidate = NET(input_size, n_hidden, hidden_size)
    candidate_optimizer = optim.Adam(candidate.parameters(), lr=rate)

    epochs_done = 0

    for n in epoch_checkpoints:

      #training the model: only the epochs still missing to reach n in total
      for epoch in range(epochs_done, n):

        for x,y in trainloader:

          candidate_optimizer.zero_grad()
          candidate_loss = loss_fn(candidate(x), y)
          candidate_loss.backward()
          candidate_optimizer.step()

      epochs_done = max(epochs_done, n)

      #Testing the model after n epochs of training
      total = 0
      candidate_correct = 0

      with torch.no_grad():
        for x,y in testloader:

          predicted = candidate(x).argmax(dim=1)

          total += y.size(0)
          candidate_correct += (predicted == y).sum().item()

      candidate_accuracy = 100 * candidate_correct / total

      # every candidate is compared on its own, so none is skipped by an earlier match
      if best_set_of_parameters is None or candidate_accuracy > best_accuracy:
        best_accuracy = candidate_accuracy
        best_set_of_parameters = [rate, n, n_hidden]

print(f'Best Accuracy: {best_accuracy:.2f}%, optimal set of hyperparameters: Learning Rate: {best_set_of_parameters[0]}; Epochs: {best_set_of_parameters[1]}; Hidden layers: {best_set_of_parameters[2]}')