In [1]:
import torch
from typing import Callable


class MLP(torch.nn.Module):
    """
    A fully connected classifier: one or more hidden Linear layers, each
    followed by the same activation, and a final Linear output layer that
    returns raw (un-normalized) class scores.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_classes: int,
        hidden_count: int = 1,
        activation: Callable = torch.nn.ReLU,
        initializer: Callable = torch.nn.init.ones_,
    ) -> None:
        """
        Initialize the MLP.

        Arguments:
            input_size: The dimension D of the input data.
            hidden_size: The number of neurons H in each hidden layer.
            num_classes: The number of classes C.
            hidden_count: The number of hidden layers. At least one hidden
                layer is always created, even if this is less than 1.
            activation: A zero-argument factory (e.g. a Module class) that
                returns the activation applied after every hidden layer.
            initializer: An in-place initializer applied to the weight of
                every Linear layer (biases keep PyTorch's default init).
        """
        super().__init__()
        self.activation = activation()

        def make_layer(in_features: int, out_features: int) -> torch.nn.Linear:
            # Build one Linear layer and immediately initialize its weight, so
            # random initializers consume the RNG in layer order.
            layer = torch.nn.Linear(in_features, out_features)
            initializer(layer.weight)
            return layer

        # The hidden layers MUST live in a ModuleList rather than a plain Python
        # list: only registered sub-modules are visible to `parameters()`,
        # `to(device)` and `state_dict()`. With a plain list the optimizer never
        # updates these layers and they are left behind on the CPU.
        self.hidden_layers = torch.nn.ModuleList([make_layer(input_size, hidden_size)])
        for _ in range(1, hidden_count):
            self.hidden_layers.append(make_layer(hidden_size, hidden_size))
        self.output_layer = make_layer(hidden_size, num_classes)

    def forward(self, x):
        """
        Forward pass of the network.

        Arguments:
            x: The input data; its last dimension must equal `input_size`.

        Returns:
            The output of the final Linear layer (no softmax is applied).
        """
        for hidden_layer in self.hidden_layers:
            x = self.activation(hidden_layer(x))
        return self.output_layer(x)

In [2]:
# MNIST:
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Compose

# PyTorch:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

# Other:
from typing import Tuple
from tqdm.notebook import tqdm


# Per-image preprocessing shared by the training and test datasets.
_transform_list = [
    # Convert the loaded image into a tensor.
    ToTensor(),
    # Collapse the image tensor into a single 1-D vector.
    lambda image: image.view(-1),
]


def get_mnist_data(
    batch_size: int = 64, root: str = "data"
) -> Tuple[DataLoader, DataLoader]:
    """
    Get the MNIST data from torchvision, downloading it if necessary.

    Both splits use the transforms listed in `_transform_list`.

    Arguments:
        batch_size (int): The number of samples per batch for both loaders.
        root (str): The directory in which the dataset is stored/downloaded.

    Returns:
        train_loader (DataLoader): The training data loader (reshuffled
            every epoch).
        test_loader (DataLoader): The test data loader (fixed order).

    """
    # Build the transform pipeline once and share it between both splits:
    transform = Compose(_transform_list)
    # Get the training and the test data:
    train_data = MNIST(root=root, train=True, download=True, transform=transform)
    test_data = MNIST(root=root, train=False, download=True, transform=transform)
    # Only the training data is shuffled; evaluation does not depend on the
    # sample order, so the test loader keeps a deterministic order.
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
    # Return the data loaders:
    return train_loader, test_loader


def train(
    model: torch.nn.Module,
    train_loader: DataLoader,
    test_loader: DataLoader,
    num_epochs: int,
    learning_rate: float,
    device: torch.device,
) -> None:
    """
    Train a classifier with Adam and cross-entropy loss, evaluating after
    every epoch.

    After each epoch the accuracy on `test_loader` is computed and written
    into the progress bar description.

    Arguments:
        model (torch.nn.Module): The model to train (modified in place and
            moved to `device`).
        train_loader (DataLoader): Yields `(data, labels)` training batches.
        test_loader (DataLoader): Yields `(data, labels)` evaluation batches.
        num_epochs (int): The number of epochs to train for.
        learning_rate (float): The learning rate passed to Adam.
        device (torch.device): The device to use for training.

    Returns:
        None

    """
    # Move the model to the device BEFORE building the optimizer, so the
    # optimizer is guaranteed to hold the on-device parameters:
    model.to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)
    criterion = CrossEntropyLoss()
    progress_bar = tqdm(range(num_epochs))

    for epoch in progress_bar:
        # ---- Training pass ----
        model.train()
        for data, labels in train_loader:
            data = data.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(data), labels)
            loss.backward()
            optimizer.step()

        # ---- Evaluation pass ----
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for data, labels in test_loader:
                data = data.to(device)
                labels = labels.to(device)
                # The predicted class is the index of the largest score:
                predictions = model(data).argmax(dim=1)
                total += labels.size(0)
                correct += (predictions == labels).sum().item()

        # An empty test loader has no defined accuracy; report NaN instead of
        # crashing with a ZeroDivisionError after a full epoch of training.
        accuracy = correct / total if total else float("nan")
        # Update the progress bar:
        progress_bar.set_description(f"Epoch: {epoch}, Accuracy: {accuracy:.4f}")

In [3]:
# Build the MNIST loaders once (downloads the data on first use); every
# experiment cell below reuses these same two loaders.
train_loader, test_loader = get_mnist_data()

In [7]:
# Experiment 1: 2 hidden layers of 512 units, LeakyReLU, Xavier-normal weights.
model1 = MLP(
    input_size=784,
    hidden_size=512,
    num_classes=10,
    hidden_count=2,
    activation=torch.nn.LeakyReLU,
    initializer=torch.nn.init.xavier_normal_,
)

# Train for 10 epochs on the GPU when one is available, otherwise on the CPU.
train(
    model=model1,
    train_loader=train_loader,
    test_loader=test_loader,
    num_epochs=10,
    learning_rate=0.001,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

  0%|          | 0/10 [00:00<?, ?it/s]

In [9]:
# Experiment 2: 2 hidden layers of 1024 units, LeakyReLU, Xavier-normal weights.
model2 = MLP(
    input_size=784,
    hidden_size=1024,
    num_classes=10,
    hidden_count=2,
    activation=torch.nn.LeakyReLU,
    initializer=torch.nn.init.xavier_normal_,
)

# Train for 10 epochs on the GPU when one is available, otherwise on the CPU.
train(
    model=model2,
    train_loader=train_loader,
    test_loader=test_loader,
    num_epochs=10,
    learning_rate=0.001,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

  0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
# Experiment 3: 3 hidden layers of 1024 units, LeakyReLU, Xavier-normal weights.
model3 = MLP(
    input_size=784,
    hidden_size=1024,
    num_classes=10,
    hidden_count=3,
    activation=torch.nn.LeakyReLU,
    initializer=torch.nn.init.xavier_normal_,
)

# Train for 10 epochs on the GPU when one is available, otherwise on the CPU.
train(
    model=model3,
    train_loader=train_loader,
    test_loader=test_loader,
    num_epochs=10,
    learning_rate=0.001,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

  0%|          | 0/10 [00:00<?, ?it/s]

In [12]:
# Experiment 4: 2 hidden layers of 1500 units, LeakyReLU, Xavier-normal weights.
model4 = MLP(
    input_size=784,
    hidden_size=1500,
    num_classes=10,
    hidden_count=2,
    activation=torch.nn.LeakyReLU,
    initializer=torch.nn.init.xavier_normal_,
)

# Train for 10 epochs on the GPU when one is available, otherwise on the CPU.
train(
    model=model4,
    train_loader=train_loader,
    test_loader=test_loader,
    num_epochs=10,
    learning_rate=0.001,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

  0%|          | 0/10 [00:00<?, ?it/s]

In [13]:
# Experiment 5: 3 hidden layers of 512 units, LeakyReLU, Xavier-normal weights.
# Named `model5` (not `model4`) so that it does not overwrite the 1500-unit
# model trained in the previous cell.
model5 = MLP(
    input_size=784,
    hidden_size=512,
    num_classes=10,
    hidden_count=3,
    activation=torch.nn.LeakyReLU,
    initializer=torch.nn.init.xavier_normal_,
)

# Train for 10 epochs on the GPU when one is available, otherwise on the CPU.
train(
    model=model5,
    train_loader=train_loader,
    test_loader=test_loader,
    num_epochs=10,
    learning_rate=0.001,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

  0%|          | 0/10 [00:00<?, ?it/s]