# Number of Parameters vs Generalization

In [1]:
### Import libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device", device)

Device cuda


In [3]:
class model_class(nn.Module):
    """Fully connected classifier with a configurable stack of hidden layers.

    Every hidden layer is an ``nn.Linear`` followed by ``nn.ReLU``; the final
    ``nn.Linear`` produces ``output_size`` raw scores (no activation).

    Args:
        input_size: number of features per sample after flattening.
        output_size: number of values produced per sample.
        hidden_sizes: widths of the hidden layers, in order. ``None`` or an
            empty sequence yields a single linear layer.
    """

    def __init__(self, input_size, output_size, hidden_sizes=None):
        super().__init__()

        # Store model architecture parameters
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_sizes = list(hidden_sizes) if hidden_sizes else []

        # Build the stack from the normalized list so hidden_sizes=None works
        layers = []
        in_features = input_size
        for h_size in self.hidden_sizes:
            layers.append(nn.Linear(in_features, h_size))
            layers.append(nn.ReLU())  # Activation after each hidden layer
            in_features = h_size
        layers.append(nn.Linear(in_features, output_size))
        # Attribute name kept as ``layers`` so state_dict keys stay the same
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        """Flatten ``x`` to ``(-1, input_size)`` and apply every layer in order."""
        # Flatten with the configured width rather than a hard-coded 784;
        # reshape (unlike view) also accepts non-contiguous inputs.
        x = x.reshape(-1, self.input_size)
        for layer in self.layers:
            x = layer(x)
        return x

In [9]:
### training function for MNIST data set
def train_eval(model, num_epochs, learning_rate, train_loader, test_loader, device=None):
    """Train ``model`` with Adam and cross-entropy, evaluating after each epoch.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to per-class scores.
        num_epochs: number of passes over ``train_loader``; must be >= 1.
        learning_rate: learning rate passed to ``torch.optim.Adam``.
        train_loader: iterable of ``(inputs, labels)`` batches used for
            training; must expose a sized ``.sampler`` (as ``DataLoader`` does).
        test_loader: iterable of ``(inputs, labels)`` batches used for evaluation.
        device: device to run on. Defaults to CUDA when available, else CPU.

    Returns:
        Tuple ``(loss, accuracy)`` for the last epoch: the mean per-sample
        training loss as a Python float and the test accuracy in percent.

    Raises:
        ValueError: if ``num_epochs`` is smaller than 1.
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # model to device
    model = model.to(device)
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    total_loss = []
    total_accuracy = []
    for epoch in range(num_epochs):
        train_loss = 0.0

        # Training
        model.train()
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # .item() detaches the value from the autograd graph; weighting by
            # the batch size keeps the epoch mean exact when the last batch is
            # smaller than the others.
            train_loss += loss.item() * images.size(0)

        train_loss = train_loss / len(train_loader.sampler)
        total_loss.append(train_loss)

        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in test_loader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        total_accuracy.append(accuracy)

        print(f"Epoch {epoch}; loss: {total_loss[-1]}")
    return total_loss[-1], total_accuracy[-1]

In [10]:
# MNIST dataset: both splits live under the same root and are converted to
# tensors; only the training split requests a download.
DATA_ROOT = '../data'
train_dataset = torchvision.datasets.MNIST(
    root=DATA_ROOT,
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
test_dataset = torchvision.datasets.MNIST(
    root=DATA_ROOT,
    train=False,
    transform=transforms.ToTensor(),
)

# Data loaders: same batch size for both, shuffling only for training
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [11]:
# Hidden-layer widths for model1 .. model11, in that order. Every network
# takes 784 inputs and produces 10 outputs; only the hidden widths vary.
hidden_layouts = [
    [8, 16],
    [16, 32],
    [32, 32],
    [32, 64],
    [64, 64],
    [64, 128],
    [128, 128],
    [128, 256],
    [256, 256],
    [32, 128],
    [8, 256],
]

# Networks are constructed one after another in list order, so the sequence
# of weight initialisations matches writing the eleven constructors out by hand.
(model1, model2, model3, model4, model5, model6,
 model7, model8, model9, model10, model11) = (
    model_class(input_size=784, output_size=10, hidden_sizes=layout)
    for layout in hidden_layouts
)

In [12]:
# Every model is trained with the same hyper-parameters and the same loaders;
# only the network differs between runs.
run_settings = dict(
    num_epochs=10,
    learning_rate=0.001,
    train_loader=train_loader,
    test_loader=test_loader,
)

m1_loss, m1_acc = train_eval(model1, **run_settings)
m2_loss, m2_acc = train_eval(model2, **run_settings)
m3_loss, m3_acc = train_eval(model3, **run_settings)
m4_loss, m4_acc = train_eval(model4, **run_settings)
m5_loss, m5_acc = train_eval(model5, **run_settings)
m6_loss, m6_acc = train_eval(model6, **run_settings)
m7_loss, m7_acc = train_eval(model7, **run_settings)
m8_loss, m8_acc = train_eval(model8, **run_settings)
m9_loss, m9_acc = train_eval(model9, **run_settings)
m10_loss, m10_acc = train_eval(model10, **run_settings)
m11_loss, m11_acc = train_eval(model11, **run_settings)

Epoch 0; loss: 0.5908742547035217
Epoch 1; loss: 0.35295194387435913
Epoch 2; loss: 0.31762877106666565
Epoch 3; loss: 0.2908426523208618
Epoch 4; loss: 0.2707972228527069
Epoch 5; loss: 0.2583707571029663
Epoch 6; loss: 0.24866297841072083
Epoch 7; loss: 0.23953397572040558
Epoch 8; loss: 0.2322501838207245
Epoch 9; loss: 0.22479446232318878


In [13]:
# Report the last-epoch training loss and test accuracy returned for model1
model1_summary = f"loss: {m1_loss}; accuracy: {m1_acc}"
print(model1_summary)

loss: 0.22479446232318878; accuracy: 93.17
