In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm  # For progress bars

import time


In [2]:
# Determine the device
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("Using MPS device for accelerated training.")
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using CUDA device for accelerated training.")
else:
    device = torch.device('cpu')
    print("Using CPU for training.")


Using MPS device for accelerated training.


In [3]:
# Define transformations: Convert images to tensors and normalize
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))  # MNIST mean and std
])

# Load training and testing datasets
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True)

# Define DataLoaders
initial_batch_size = 64  # Default batch size; will vary in 2.2

train_loader = DataLoader(dataset=train_dataset, batch_size=initial_batch_size, shuffle=True, num_workers=0)
test_loader = DataLoader(dataset=test_dataset, batch_size=initial_batch_size, shuffle=False, num_workers=0)


In [4]:
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)  # Input channels=1 for MNIST
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)  # 2x2 Max pooling
        # Fully connected layers
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)  # 10 classes for MNIST
    
    def forward(self, x):
        # Convolutional layer 1
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        # Convolutional layer 2
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        # Flatten
        x = x.view(-1, 64 * 7 * 7)
        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [5]:
class SGD_Momentum:
    def __init__(self, params, lr=0.01, beta=0.9):
        self.params = list(params)
        self.lr = lr
        self.beta = beta
        self.velocities = [torch.zeros_like(p.data) for p in self.params]
    
    def step(self):
        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is not None:
                    # Update velocity
                    self.velocities[i] = self.beta * self.velocities[i] + (1 - self.beta) * p.grad
                    # Update parameters
                    p.data -= self.lr * self.velocities[i]
    
    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()


In [6]:
class NAG:
    def __init__(self, params, lr=0.01, beta=0.95):
        self.params = list(params)
        self.lr = lr
        self.beta = beta
        self.velocities = [torch.zeros_like(p.data) for p in self.params]
    
    def step(self):
        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is not None:
                    prev_velocity = self.velocities[i].clone()
                    # Update velocity
                    self.velocities[i] = self.beta * self.velocities[i] + (1 - self.beta) * p.grad
                    # Update parameters with NAG
                    p.data -= self.lr * (self.beta * prev_velocity + (1 - self.beta) * self.velocities[i])
    
    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()


In [7]:
class RMSprop:
    def __init__(self, params, lr=0.001, beta=0.95, gamma=1.0, eps=1e-8):
        self.params = list(params)
        self.lr = lr
        self.beta = beta
        self.gamma = gamma
        self.eps = eps
        self.squares = [torch.zeros_like(p.data) for p in self.params]
    
    def step(self):
        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is not None:
                    # Update squared gradients
                    self.squares[i] = self.beta * self.squares[i] + (1 - self.beta) * (p.grad ** 2)
                    # Update parameters
                    p.data -= self.lr * (p.grad / (self.squares[i].sqrt() + self.eps))
    
    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()


In [8]:
class Adam:
    def __init__(self, params, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.params = list(params)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.m = [torch.zeros_like(p.data) for p in self.params]
        self.v = [torch.zeros_like(p.data) for p in self.params]
        self.t = 0  # Time step
    
    def step(self):
        self.t += 1
        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is not None:
                    # Update biased first moment estimate
                    self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * p.grad
                    # Update biased second moment estimate
                    self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (p.grad ** 2)
                    # Compute bias-corrected first moment estimate
                    m_hat = self.m[i] / (1 - self.beta1 ** self.t)
                    # Compute bias-corrected second moment estimate
                    v_hat = self.v[i] / (1 - self.beta2 ** self.t)
                    # Update parameters
                    p.data -= self.lr * m_hat / (v_hat.sqrt() + self.eps)
    
    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()


In [9]:
def train_model(model, train_loader, test_loader, optimizer, num_epochs=10, l1_lambda=1e-5):
    """
    Trains the CNN model using the provided optimizer.

    Args:
        model (nn.Module): The CNN model to train.
        train_loader (DataLoader): DataLoader for training data.
        test_loader (DataLoader): DataLoader for test data.
        optimizer: Optimizer instance.
        num_epochs (int): Number of training epochs.
        l1_lambda (float): L1 regularization coefficient.
    
    Returns:
        Tuple containing training loss history, validation loss history, and validation accuracy history.
    """
    # Loss function
    criterion = nn.CrossEntropyLoss()
    
    # Lists to store losses and accuracies
    train_loss_history = []
    val_loss_history = []
    val_accuracy_history = []
    
    model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
        
        for images, labels in progress_bar:
            images = images.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            # L1 Regularization
            l1_norm = sum(p.abs().sum() for p in model.parameters())
            loss += l1_lambda * l1_norm
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())
        
        # Calculate average training loss
        avg_train_loss = running_loss / len(train_loader)
        train_loss_history.append(avg_train_loss)
        
        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for images, labels in test_loader:
                images = images.to(device)
                labels = labels.to(device)
                
                outputs = model(images)
                loss = criterion(outputs, labels)
                
                # L1 Regularization
                l1_norm = sum(p.abs().sum() for p in model.parameters())
                loss += l1_lambda * l1_norm
                
                val_loss += loss.item()
                
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        avg_val_loss = val_loss / len(test_loader)
        val_loss_history.append(avg_val_loss)
        
        val_accuracy = 100 * correct / total
        val_accuracy_history.append(val_accuracy)
        
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')
    
    return train_loss_history, val_loss_history, val_accuracy_history


In [10]:
# Hyperparameters for Momentum
learning_rate_momentum = 0.01
beta_momentum = 0.9
batch_size_momentum = 64  # Adjust as needed

# Define DataLoaders
train_loader_momentum = DataLoader(dataset=train_dataset, batch_size=batch_size_momentum, shuffle=True, num_workers=0)
test_loader_momentum = DataLoader(dataset=test_dataset, batch_size=batch_size_momentum, shuffle=False, num_workers=0)

# Initialize model and optimizer
model_momentum = CNNModel().to(device)
optimizer_momentum = SGD_Momentum(model_momentum.parameters(), lr=learning_rate_momentum, beta=beta_momentum)

print("Training with Momentum Optimizer")
start_time = time.time()
train_loss_momentum, val_loss_momentum, val_acc_momentum = train_model(
    model_momentum, train_loader_momentum, test_loader_momentum, optimizer_momentum, num_epochs=10, l1_lambda=1e-5)
end_time = time.time()
print(f"Momentum Optimizer Training Time: {end_time - start_time:.2f} seconds")


Training with Momentum Optimizer


Epoch 1/10: 100%|██████████| 938/938 [00:11<00:00, 78.83batch/s, loss=0.295] 


Epoch [1/10], Train Loss: 0.6743, Val Loss: 0.2585, Val Accuracy: 93.57%


Epoch 2/10: 100%|██████████| 938/938 [00:08<00:00, 106.72batch/s, loss=0.122] 


Epoch [2/10], Train Loss: 0.2121, Val Loss: 0.1663, Val Accuracy: 96.53%


Epoch 3/10: 100%|██████████| 938/938 [00:08<00:00, 108.56batch/s, loss=0.107] 


Epoch [3/10], Train Loss: 0.1516, Val Loss: 0.1250, Val Accuracy: 97.47%


Epoch 4/10: 100%|██████████| 938/938 [00:08<00:00, 107.97batch/s, loss=0.0939]


Epoch [4/10], Train Loss: 0.1261, Val Loss: 0.1077, Val Accuracy: 97.92%


Epoch 5/10: 100%|██████████| 938/938 [00:08<00:00, 108.01batch/s, loss=0.0722]


Epoch [5/10], Train Loss: 0.1122, Val Loss: 0.1038, Val Accuracy: 98.05%


Epoch 6/10: 100%|██████████| 938/938 [00:08<00:00, 106.51batch/s, loss=0.0502]


Epoch [6/10], Train Loss: 0.1025, Val Loss: 0.0912, Val Accuracy: 98.50%


Epoch 7/10: 100%|██████████| 938/938 [00:08<00:00, 107.35batch/s, loss=0.166] 


Epoch [7/10], Train Loss: 0.0961, Val Loss: 0.0884, Val Accuracy: 98.42%


Epoch 8/10: 100%|██████████| 938/938 [00:08<00:00, 108.84batch/s, loss=0.0696]


Epoch [8/10], Train Loss: 0.0899, Val Loss: 0.0851, Val Accuracy: 98.67%


Epoch 9/10: 100%|██████████| 938/938 [00:08<00:00, 106.56batch/s, loss=0.18]  


Epoch [9/10], Train Loss: 0.0860, Val Loss: 0.0879, Val Accuracy: 98.47%


Epoch 10/10: 100%|██████████| 938/938 [00:08<00:00, 106.33batch/s, loss=0.0507]


Epoch [10/10], Train Loss: 0.0835, Val Loss: 0.0803, Val Accuracy: 98.71%
Momentum Optimizer Training Time: 98.36 seconds


In [11]:
# Hyperparameters for NAG
learning_rate_nag = 0.01
beta_nag = 0.95
batch_size_nag = 64  # Adjust as needed

# Define DataLoaders
train_loader_nag = DataLoader(dataset=train_dataset, batch_size=batch_size_nag, shuffle=True, num_workers=0)
test_loader_nag = DataLoader(dataset=test_dataset, batch_size=batch_size_nag, shuffle=False, num_workers=0)

# Initialize model and optimizer
model_nag = CNNModel().to(device)
optimizer_nag = NAG(model_nag.parameters(), lr=learning_rate_nag, beta=beta_nag)

print("\nTraining with Nesterov's Accelerated Gradient (NAG) Optimizer")
start_time = time.time()
train_loss_nag, val_loss_nag, val_acc_nag = train_model(
    model_nag, train_loader_nag, test_loader_nag, optimizer_nag, num_epochs=10, l1_lambda=1e-5)
end_time = time.time()
print(f"NAG Optimizer Training Time: {end_time - start_time:.2f} seconds")



Training with Nesterov's Accelerated Gradient (NAG) Optimizer


Epoch 1/10: 100%|██████████| 938/938 [00:08<00:00, 105.72batch/s, loss=0.194]


Epoch [1/10], Train Loss: 0.6625, Val Loss: 0.2446, Val Accuracy: 93.93%


Epoch 2/10: 100%|██████████| 938/938 [00:08<00:00, 108.64batch/s, loss=0.235] 


Epoch [2/10], Train Loss: 0.2068, Val Loss: 0.1558, Val Accuracy: 96.62%


Epoch 3/10: 100%|██████████| 938/938 [00:08<00:00, 110.89batch/s, loss=0.101] 


Epoch [3/10], Train Loss: 0.1493, Val Loss: 0.1236, Val Accuracy: 97.53%


Epoch 4/10: 100%|██████████| 938/938 [00:08<00:00, 105.60batch/s, loss=0.0832]


Epoch [4/10], Train Loss: 0.1232, Val Loss: 0.1129, Val Accuracy: 97.76%


Epoch 5/10: 100%|██████████| 938/938 [00:09<00:00, 100.85batch/s, loss=0.0595]


Epoch [5/10], Train Loss: 0.1095, Val Loss: 0.1009, Val Accuracy: 98.16%


Epoch 6/10: 100%|██████████| 938/938 [00:09<00:00, 101.90batch/s, loss=0.102] 


Epoch [6/10], Train Loss: 0.1000, Val Loss: 0.0961, Val Accuracy: 98.37%


Epoch 7/10: 100%|██████████| 938/938 [00:09<00:00, 101.42batch/s, loss=0.0823]


Epoch [7/10], Train Loss: 0.0926, Val Loss: 0.0974, Val Accuracy: 98.40%


Epoch 8/10: 100%|██████████| 938/938 [00:08<00:00, 106.04batch/s, loss=0.0646]


Epoch [8/10], Train Loss: 0.0884, Val Loss: 0.0826, Val Accuracy: 98.76%


Epoch 9/10: 100%|██████████| 938/938 [00:08<00:00, 106.44batch/s, loss=0.0718]


Epoch [9/10], Train Loss: 0.0843, Val Loss: 0.0833, Val Accuracy: 98.69%


Epoch 10/10: 100%|██████████| 938/938 [00:08<00:00, 108.33batch/s, loss=0.0635]


Epoch [10/10], Train Loss: 0.0814, Val Loss: 0.0808, Val Accuracy: 98.78%
NAG Optimizer Training Time: 96.28 seconds


In [12]:
# Hyperparameters for RMSprop
learning_rate_rmsprop = 0.001
beta_rmsprop = 0.95
gamma_rmsprop = 1.0
eps_rmsprop = 1e-8
batch_size_rmsprop = 64  # Adjust as needed

# Define DataLoaders
train_loader_rmsprop = DataLoader(dataset=train_dataset, batch_size=batch_size_rmsprop, shuffle=True, num_workers=0)
test_loader_rmsprop = DataLoader(dataset=test_dataset, batch_size=batch_size_rmsprop, shuffle=False, num_workers=0)

# Initialize model and optimizer
model_rmsprop = CNNModel().to(device)
optimizer_rmsprop = RMSprop(model_rmsprop.parameters(), lr=learning_rate_rmsprop, beta=beta_rmsprop, gamma=gamma_rmsprop, eps=eps_rmsprop)

print("\nTraining with RMSprop Optimizer")
start_time = time.time()
train_loss_rmsprop, val_loss_rmsprop, val_acc_rmsprop = train_model(
    model_rmsprop, train_loader_rmsprop, test_loader_rmsprop, optimizer_rmsprop, num_epochs=10, l1_lambda=1e-5)
end_time = time.time()
print(f"RMSprop Optimizer Training Time: {end_time - start_time:.2f} seconds")



Training with RMSprop Optimizer


Epoch 1/10: 100%|██████████| 938/938 [00:08<00:00, 104.66batch/s, loss=0.0385]


Epoch [1/10], Train Loss: 0.1756, Val Loss: 0.0670, Val Accuracy: 98.85%


Epoch 2/10: 100%|██████████| 938/938 [00:09<00:00, 96.29batch/s, loss=0.0393] 


Epoch [2/10], Train Loss: 0.0753, Val Loss: 0.0634, Val Accuracy: 98.87%


Epoch 3/10: 100%|██████████| 938/938 [00:08<00:00, 105.58batch/s, loss=0.0425]


Epoch [3/10], Train Loss: 0.0618, Val Loss: 0.0695, Val Accuracy: 98.75%


Epoch 4/10: 100%|██████████| 938/938 [00:08<00:00, 104.74batch/s, loss=0.0498]


Epoch [4/10], Train Loss: 0.0552, Val Loss: 0.0609, Val Accuracy: 98.80%


Epoch 5/10: 100%|██████████| 938/938 [00:08<00:00, 111.14batch/s, loss=0.0277]


Epoch [5/10], Train Loss: 0.0505, Val Loss: 0.0604, Val Accuracy: 99.02%


Epoch 6/10: 100%|██████████| 938/938 [00:08<00:00, 109.12batch/s, loss=0.0342]


Epoch [6/10], Train Loss: 0.0457, Val Loss: 0.0512, Val Accuracy: 99.10%


Epoch 7/10: 100%|██████████| 938/938 [00:08<00:00, 110.12batch/s, loss=0.0253]


Epoch [7/10], Train Loss: 0.0426, Val Loss: 0.0540, Val Accuracy: 99.03%


Epoch 8/10: 100%|██████████| 938/938 [00:08<00:00, 109.50batch/s, loss=0.0531]


Epoch [8/10], Train Loss: 0.0401, Val Loss: 0.0567, Val Accuracy: 98.99%


Epoch 9/10: 100%|██████████| 938/938 [00:08<00:00, 110.22batch/s, loss=0.0227]


Epoch [9/10], Train Loss: 0.0382, Val Loss: 0.0573, Val Accuracy: 99.03%


Epoch 10/10: 100%|██████████| 938/938 [00:08<00:00, 109.89batch/s, loss=0.0246]


Epoch [10/10], Train Loss: 0.0362, Val Loss: 0.0536, Val Accuracy: 99.06%
RMSprop Optimizer Training Time: 94.99 seconds


In [13]:
# Hyperparameters for Adam
learning_rate_adam = 0.001
beta1_adam = 0.9
beta2_adam = 0.999
eps_adam = 1e-8
batch_size_adam = 64  # Adjust as needed

# Define DataLoaders
train_loader_adam = DataLoader(dataset=train_dataset, batch_size=batch_size_adam, shuffle=True, num_workers=0)
test_loader_adam = DataLoader(dataset=test_dataset, batch_size=batch_size_adam, shuffle=False, num_workers=0)

# Initialize model and optimizer
model_adam = CNNModel().to(device)
optimizer_adam = Adam(model_adam.parameters(), lr=learning_rate_adam, beta1=beta1_adam, beta2=beta2_adam, eps=eps_adam)

print("\nTraining with Adam Optimizer")
start_time = time.time()
train_loss_adam, val_loss_adam, val_acc_adam = train_model(
    model_adam, train_loader_adam, test_loader_adam, optimizer_adam, num_epochs=10, l1_lambda=1e-5)
end_time = time.time()
print(f"Adam Optimizer Training Time: {end_time - start_time:.2f} seconds")



Training with Adam Optimizer


Epoch 1/10: 100%|██████████| 938/938 [00:08<00:00, 104.85batch/s, loss=0.0916]


Epoch [1/10], Train Loss: 0.1764, Val Loss: 0.1265, Val Accuracy: 97.17%


Epoch 2/10: 100%|██████████| 938/938 [00:08<00:00, 104.58batch/s, loss=0.0398]


Epoch [2/10], Train Loss: 0.0856, Val Loss: 0.0841, Val Accuracy: 98.54%


Epoch 3/10: 100%|██████████| 938/938 [00:10<00:00, 90.94batch/s, loss=0.0697] 


Epoch [3/10], Train Loss: 0.0723, Val Loss: 0.0783, Val Accuracy: 98.79%


Epoch 4/10: 100%|██████████| 938/938 [00:09<00:00, 98.18batch/s, loss=0.0728] 


Epoch [4/10], Train Loss: 0.0659, Val Loss: 0.0646, Val Accuracy: 99.29%


Epoch 5/10: 100%|██████████| 938/938 [00:09<00:00, 98.62batch/s, loss=0.0415] 


Epoch [5/10], Train Loss: 0.0619, Val Loss: 0.0643, Val Accuracy: 99.05%


Epoch 6/10: 100%|██████████| 938/938 [00:09<00:00, 99.64batch/s, loss=0.0547] 


Epoch [6/10], Train Loss: 0.0553, Val Loss: 0.0706, Val Accuracy: 98.94%


Epoch 7/10: 100%|██████████| 938/938 [00:09<00:00, 98.93batch/s, loss=0.0378] 


Epoch [7/10], Train Loss: 0.0521, Val Loss: 0.0657, Val Accuracy: 99.02%


Epoch 8/10: 100%|██████████| 938/938 [00:09<00:00, 99.74batch/s, loss=0.0996] 


Epoch [8/10], Train Loss: 0.0512, Val Loss: 0.0629, Val Accuracy: 99.16%


Epoch 9/10: 100%|██████████| 938/938 [00:09<00:00, 98.77batch/s, loss=0.0497] 


Epoch [9/10], Train Loss: 0.0474, Val Loss: 0.0647, Val Accuracy: 98.95%


Epoch 10/10: 100%|██████████| 938/938 [00:10<00:00, 87.08batch/s, loss=0.102]  


Epoch [10/10], Train Loss: 0.0440, Val Loss: 0.0665, Val Accuracy: 99.09%
Adam Optimizer Training Time: 103.43 seconds


In [14]:
print("\nFinal Validation Accuracies:")

print("\nMomentum Optimizer:")
for batch_size in [64]:
    acc = val_acc_momentum
    print(f"Batch Size {batch_size}: {acc:.2f}%")

print("\nNesterov's Accelerated Gradient Optimizer:")
for batch_size in [64]:
    acc = val_acc_nag
    print(f"Batch Size {batch_size}: {acc:.2f}%")

print("\nRMSprop Optimizer:")
for batch_size in [64]:
    acc = val_acc_rmsprop
    print(f"Batch Size {batch_size}: {acc:.2f}%")

print("\nAdam Optimizer:")
for batch_size in [64]:
    acc = val_acc_adam
    print(f"Batch Size {batch_size}: {acc:.2f}%")



Final Validation Accuracies:

Momentum Optimizer:


TypeError: unsupported format string passed to list.__format__

# 2.2

In [15]:
batch_sizes = [4, 8, 16, 32]

def compare_optimizers(optimizer_class, optimizer_name, batch_sizes, learning_rate, **kwargs):
    """
    Trains the CNN model using a specific optimizer across different batch sizes.

    Args:
        optimizer_class: The optimizer class to instantiate.
        optimizer_name (str): Name of the optimizer for display.
        batch_sizes (list): List of batch sizes to test.
        learning_rate (float): Learning rate for the optimizer.
        **kwargs: Additional hyperparameters for the optimizer.
    
    Returns:
        dict: A dictionary containing training and validation losses and accuracies for each batch size.
    """
    results = {}
    for batch_size in batch_sizes:
        print(f"\nTraining with {optimizer_name}, Batch Size: {batch_size}")
        # Define DataLoaders
        train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
        test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
        
        # Initialize model and optimizer
        model = CNNModel().to(device)
        optimizer = optimizer_class(model.parameters(), lr=learning_rate, **kwargs)
        
        # Train the model
        train_loss, val_loss, val_acc = train_model(
            model, train_loader, test_loader, optimizer, num_epochs=10, l1_lambda=1e-5)
        
        # Store the results
        results[batch_size] = {
            'train_loss': train_loss,
            'val_loss': val_loss,
            'val_accuracy': val_acc
        }
    return results


In [16]:
# Hyperparameters for Momentum
learning_rate_momentum = 0.01
beta_momentum = 0.9

momentum_results = compare_optimizers(
    optimizer_class=SGD_Momentum,
    optimizer_name="Momentum",
    batch_sizes=batch_sizes,
    learning_rate=learning_rate_momentum,
    beta=beta_momentum
)



Training with Momentum, Batch Size: 4


Epoch 1/10: 100%|██████████| 15000/15000 [01:52<00:00, 133.87batch/s, loss=0.0447]


Epoch [1/10], Train Loss: 0.1795, Val Loss: 0.0914, Val Accuracy: 98.60%


Epoch 2/10: 100%|██████████| 15000/15000 [01:49<00:00, 137.27batch/s, loss=0.0479]


Epoch [2/10], Train Loss: 0.0863, Val Loss: 0.0765, Val Accuracy: 98.81%


Epoch 3/10: 100%|██████████| 15000/15000 [01:50<00:00, 135.52batch/s, loss=0.0392]


Epoch [3/10], Train Loss: 0.0705, Val Loss: 0.0702, Val Accuracy: 98.96%


Epoch 4/10: 100%|██████████| 15000/15000 [01:49<00:00, 136.60batch/s, loss=0.0369]


Epoch [4/10], Train Loss: 0.0610, Val Loss: 0.0657, Val Accuracy: 99.05%


Epoch 5/10: 100%|██████████| 15000/15000 [01:47<00:00, 138.97batch/s, loss=0.0345]


Epoch [5/10], Train Loss: 0.0525, Val Loss: 0.0649, Val Accuracy: 98.97%


Epoch 6/10: 100%|██████████| 15000/15000 [02:18<00:00, 108.30batch/s, loss=0.0388]


Epoch [6/10], Train Loss: 0.0476, Val Loss: 0.0627, Val Accuracy: 99.06%


Epoch 7/10: 100%|██████████| 15000/15000 [01:47<00:00, 139.89batch/s, loss=0.0299]


Epoch [7/10], Train Loss: 0.0427, Val Loss: 0.0531, Val Accuracy: 99.27%


Epoch 8/10: 100%|██████████| 15000/15000 [01:47<00:00, 139.01batch/s, loss=0.0282]


Epoch [8/10], Train Loss: 0.0388, Val Loss: 0.0576, Val Accuracy: 99.16%


Epoch 9/10: 100%|██████████| 15000/15000 [01:47<00:00, 139.35batch/s, loss=0.027] 


Epoch [9/10], Train Loss: 0.0372, Val Loss: 0.0513, Val Accuracy: 99.26%


Epoch 10/10: 100%|██████████| 15000/15000 [01:47<00:00, 139.39batch/s, loss=0.0258]


Epoch [10/10], Train Loss: 0.0350, Val Loss: 0.0564, Val Accuracy: 99.13%

Training with Momentum, Batch Size: 8


Epoch 1/10: 100%|██████████| 7500/7500 [00:57<00:00, 130.55batch/s, loss=0.248] 


Epoch [1/10], Train Loss: 0.2160, Val Loss: 0.0951, Val Accuracy: 98.22%


Epoch 2/10: 100%|██████████| 7500/7500 [00:53<00:00, 139.94batch/s, loss=0.0659]


Epoch [2/10], Train Loss: 0.0972, Val Loss: 0.0945, Val Accuracy: 98.38%


Epoch 3/10: 100%|██████████| 7500/7500 [01:24<00:00, 88.53batch/s, loss=0.0469] 


Epoch [3/10], Train Loss: 0.0794, Val Loss: 0.0809, Val Accuracy: 98.71%


Epoch 4/10: 100%|██████████| 7500/7500 [01:41<00:00, 73.96batch/s, loss=0.0407]  


Epoch [4/10], Train Loss: 0.0704, Val Loss: 0.0696, Val Accuracy: 99.01%


Epoch 5/10: 100%|██████████| 7500/7500 [01:10<00:00, 106.57batch/s, loss=0.0447]


Epoch [5/10], Train Loss: 0.0629, Val Loss: 0.0737, Val Accuracy: 98.82%


Epoch 6/10: 100%|██████████| 7500/7500 [01:07<00:00, 110.99batch/s, loss=0.0378]


Epoch [6/10], Train Loss: 0.0561, Val Loss: 0.0635, Val Accuracy: 99.12%


Epoch 7/10: 100%|██████████| 7500/7500 [01:52<00:00, 66.73batch/s, loss=0.0886] 


Epoch [7/10], Train Loss: 0.0523, Val Loss: 0.0618, Val Accuracy: 99.13%


Epoch 8/10: 100%|██████████| 7500/7500 [00:55<00:00, 134.06batch/s, loss=0.0419]


Epoch [8/10], Train Loss: 0.0480, Val Loss: 0.0571, Val Accuracy: 99.20%


Epoch 9/10: 100%|██████████| 7500/7500 [01:56<00:00, 64.13batch/s, loss=0.0332]  


Epoch [9/10], Train Loss: 0.0451, Val Loss: 0.0554, Val Accuracy: 99.22%


Epoch 10/10: 100%|██████████| 7500/7500 [00:50<00:00, 147.74batch/s, loss=0.0329]


Epoch [10/10], Train Loss: 0.0426, Val Loss: 0.0620, Val Accuracy: 98.92%

Training with Momentum, Batch Size: 16


Epoch 1/10: 100%|██████████| 3750/3750 [00:42<00:00, 87.58batch/s, loss=0.128]  


Epoch [1/10], Train Loss: 0.3055, Val Loss: 0.1211, Val Accuracy: 97.77%


Epoch 2/10: 100%|██████████| 3750/3750 [00:52<00:00, 71.46batch/s, loss=0.0467] 


Epoch [2/10], Train Loss: 0.1133, Val Loss: 0.0960, Val Accuracy: 98.13%


Epoch 3/10: 100%|██████████| 3750/3750 [00:30<00:00, 124.69batch/s, loss=0.0466]


Epoch [3/10], Train Loss: 0.0924, Val Loss: 0.0865, Val Accuracy: 98.61%


Epoch 4/10: 100%|██████████| 3750/3750 [00:53<00:00, 70.11batch/s, loss=0.171]  


Epoch [4/10], Train Loss: 0.0807, Val Loss: 0.0730, Val Accuracy: 98.92%


Epoch 5/10: 100%|██████████| 3750/3750 [04:45<00:00, 13.11batch/s, loss=0.0424] 


Epoch [5/10], Train Loss: 0.0743, Val Loss: 0.0732, Val Accuracy: 98.96%


Epoch 6/10: 100%|██████████| 3750/3750 [00:29<00:00, 129.06batch/s, loss=0.0466]


Epoch [6/10], Train Loss: 0.0679, Val Loss: 0.0683, Val Accuracy: 99.02%


Epoch 7/10: 100%|██████████| 3750/3750 [00:27<00:00, 134.30batch/s, loss=0.0415]


Epoch [7/10], Train Loss: 0.0632, Val Loss: 0.0657, Val Accuracy: 99.15%


Epoch 8/10: 100%|██████████| 3750/3750 [00:28<00:00, 132.50batch/s, loss=0.0469]


Epoch [8/10], Train Loss: 0.0590, Val Loss: 0.0709, Val Accuracy: 99.00%


Epoch 9/10: 100%|██████████| 3750/3750 [03:01<00:00, 20.68batch/s, loss=0.0394] 


Epoch [9/10], Train Loss: 0.0558, Val Loss: 0.0641, Val Accuracy: 99.04%


Epoch 10/10: 100%|██████████| 3750/3750 [00:59<00:00, 63.17batch/s, loss=0.0378] 


Epoch [10/10], Train Loss: 0.0528, Val Loss: 0.0625, Val Accuracy: 99.09%

Training with Momentum, Batch Size: 32


Epoch 1/10: 100%|██████████| 1875/1875 [00:17<00:00, 107.71batch/s, loss=0.222] 


Epoch [1/10], Train Loss: 0.3949, Val Loss: 0.1543, Val Accuracy: 96.63%


Epoch 2/10: 100%|██████████| 1875/1875 [00:17<00:00, 105.52batch/s, loss=0.262] 


Epoch [2/10], Train Loss: 0.1470, Val Loss: 0.1178, Val Accuracy: 97.66%


Epoch 3/10: 100%|██████████| 1875/1875 [00:17<00:00, 106.19batch/s, loss=0.106] 


Epoch [3/10], Train Loss: 0.1153, Val Loss: 0.1029, Val Accuracy: 98.08%


Epoch 4/10: 100%|██████████| 1875/1875 [00:17<00:00, 109.70batch/s, loss=0.15]  


Epoch [4/10], Train Loss: 0.1002, Val Loss: 0.0971, Val Accuracy: 98.28%


Epoch 5/10: 100%|██████████| 1875/1875 [00:17<00:00, 105.15batch/s, loss=0.0519]


Epoch [5/10], Train Loss: 0.0913, Val Loss: 0.0983, Val Accuracy: 98.29%


Epoch 6/10: 100%|██████████| 1875/1875 [00:18<00:00, 102.58batch/s, loss=0.186] 


Epoch [6/10], Train Loss: 0.0838, Val Loss: 0.0859, Val Accuracy: 98.43%


Epoch 7/10: 100%|██████████| 1875/1875 [00:18<00:00, 102.24batch/s, loss=0.0657]


Epoch [7/10], Train Loss: 0.0793, Val Loss: 0.0839, Val Accuracy: 98.64%


Epoch 8/10: 100%|██████████| 1875/1875 [00:18<00:00, 102.30batch/s, loss=0.15]  


Epoch [8/10], Train Loss: 0.0742, Val Loss: 0.0793, Val Accuracy: 98.73%


Epoch 9/10: 100%|██████████| 1875/1875 [00:17<00:00, 104.25batch/s, loss=0.0617]


Epoch [9/10], Train Loss: 0.0701, Val Loss: 0.0801, Val Accuracy: 98.70%


Epoch 10/10: 100%|██████████| 1875/1875 [00:18<00:00, 102.72batch/s, loss=0.114] 


Epoch [10/10], Train Loss: 0.0674, Val Loss: 0.0798, Val Accuracy: 98.70%


In [None]:
# Hyperparameters for NAG
learning_rate_nag = 0.01
beta_nag = 0.95

nag_results = compare_optimizers(
    optimizer_class=NAG,
    optimizer_name="Nesterov's Accelerated Gradient",
    batch_sizes=batch_sizes,
    learning_rate=learning_rate_nag,
    beta=beta_nag
)



Training with Nesterov's Accelerated Gradient, Batch Size: 4


Epoch 1/10: 100%|██████████| 15000/15000 [02:12<00:00, 113.06batch/s, loss=0.114] 


Epoch [1/10], Train Loss: 0.1894, Val Loss: 0.1029, Val Accuracy: 98.10%


Epoch 2/10: 100%|██████████| 15000/15000 [01:58<00:00, 126.06batch/s, loss=0.0418]


Epoch [2/10], Train Loss: 0.0860, Val Loss: 0.0781, Val Accuracy: 98.64%


Epoch 3/10: 100%|██████████| 15000/15000 [05:52<00:00, 42.58batch/s, loss=0.0391]  


Epoch [3/10], Train Loss: 0.0710, Val Loss: 0.0645, Val Accuracy: 99.19%


Epoch 4/10: 100%|██████████| 15000/15000 [05:04<00:00, 49.29batch/s, loss=0.0365] 


Epoch [4/10], Train Loss: 0.0614, Val Loss: 0.0683, Val Accuracy: 98.95%


Epoch 5/10: 100%|██████████| 15000/15000 [01:58<00:00, 126.38batch/s, loss=0.158] 


Epoch [5/10], Train Loss: 0.0537, Val Loss: 0.0585, Val Accuracy: 99.24%


Epoch 6/10: 100%|██████████| 15000/15000 [02:00<00:00, 124.37batch/s, loss=0.0317]


Epoch [6/10], Train Loss: 0.0472, Val Loss: 0.0632, Val Accuracy: 98.97%


Epoch 7/10: 100%|██████████| 15000/15000 [02:01<00:00, 123.33batch/s, loss=0.0307]


Epoch [7/10], Train Loss: 0.0441, Val Loss: 0.0591, Val Accuracy: 99.10%


Epoch 8/10: 100%|██████████| 15000/15000 [02:01<00:00, 123.61batch/s, loss=0.0279]


Epoch [8/10], Train Loss: 0.0385, Val Loss: 0.0589, Val Accuracy: 99.09%


Epoch 9/10: 100%|██████████| 15000/15000 [01:51<00:00, 133.99batch/s, loss=0.0274]


Epoch [9/10], Train Loss: 0.0355, Val Loss: 0.0542, Val Accuracy: 99.20%


Epoch 10/10: 100%|██████████| 15000/15000 [01:52<00:00, 133.19batch/s, loss=0.0879]


Epoch [10/10], Train Loss: 0.0346, Val Loss: 0.0524, Val Accuracy: 99.17%

Training with Nesterov's Accelerated Gradient, Batch Size: 8


Epoch 1/10: 100%|██████████| 7500/7500 [00:57<00:00, 129.40batch/s, loss=0.048] 


Epoch [1/10], Train Loss: 0.2367, Val Loss: 0.1014, Val Accuracy: 98.14%


Epoch 2/10: 100%|██████████| 7500/7500 [01:02<00:00, 120.15batch/s, loss=0.0491]


Epoch [2/10], Train Loss: 0.0951, Val Loss: 0.0814, Val Accuracy: 98.78%


Epoch 3/10: 100%|██████████| 7500/7500 [01:02<00:00, 120.22batch/s, loss=0.042] 


Epoch [3/10], Train Loss: 0.0778, Val Loss: 0.0839, Val Accuracy: 98.64%


Epoch 4/10: 100%|██████████| 7500/7500 [01:01<00:00, 122.47batch/s, loss=0.0418]


Epoch [4/10], Train Loss: 0.0687, Val Loss: 0.0779, Val Accuracy: 98.71%


Epoch 5/10: 100%|██████████| 7500/7500 [01:01<00:00, 122.75batch/s, loss=0.0452]


Epoch [5/10], Train Loss: 0.0616, Val Loss: 0.0679, Val Accuracy: 99.03%


Epoch 6/10: 100%|██████████| 7500/7500 [01:02<00:00, 120.13batch/s, loss=0.0374]


Epoch [6/10], Train Loss: 0.0564, Val Loss: 0.0624, Val Accuracy: 99.14%


Epoch 7/10: 100%|██████████| 7500/7500 [01:02<00:00, 119.56batch/s, loss=0.0435]


Epoch [7/10], Train Loss: 0.0515, Val Loss: 0.0688, Val Accuracy: 98.97%


Epoch 8/10: 100%|██████████| 7500/7500 [01:02<00:00, 119.93batch/s, loss=0.0345]


Epoch [8/10], Train Loss: 0.0476, Val Loss: 0.0660, Val Accuracy: 99.04%


Epoch 9/10: 100%|██████████| 7500/7500 [01:02<00:00, 120.81batch/s, loss=0.033] 


Epoch [9/10], Train Loss: 0.0438, Val Loss: 0.0619, Val Accuracy: 99.04%


Epoch 10/10: 100%|██████████| 7500/7500 [00:55<00:00, 136.10batch/s, loss=0.0319]


Epoch [10/10], Train Loss: 0.0410, Val Loss: 0.0593, Val Accuracy: 99.11%

Training with Nesterov's Accelerated Gradient, Batch Size: 16


Epoch 1/10: 100%|██████████| 3750/3750 [01:15<00:00, 49.74batch/s, loss=0.0544] 


Epoch [1/10], Train Loss: 0.3213, Val Loss: 0.1185, Val Accuracy: 97.83%


Epoch 2/10: 100%|██████████| 3750/3750 [00:27<00:00, 138.83batch/s, loss=0.0918]


Epoch [2/10], Train Loss: 0.1123, Val Loss: 0.0950, Val Accuracy: 98.26%


Epoch 3/10: 100%|██████████| 3750/3750 [00:24<00:00, 151.40batch/s, loss=0.102] 


Epoch [3/10], Train Loss: 0.0910, Val Loss: 0.0831, Val Accuracy: 98.63%


Epoch 4/10: 100%|██████████| 3750/3750 [00:24<00:00, 155.13batch/s, loss=0.149] 


Epoch [4/10], Train Loss: 0.0800, Val Loss: 0.0805, Val Accuracy: 98.59%


Epoch 5/10: 100%|██████████| 3750/3750 [00:24<00:00, 152.42batch/s, loss=0.0479]


Epoch [5/10], Train Loss: 0.0727, Val Loss: 0.0705, Val Accuracy: 98.98%


Epoch 6/10: 100%|██████████| 3750/3750 [00:27<00:00, 137.76batch/s, loss=0.113] 


Epoch [6/10], Train Loss: 0.0676, Val Loss: 0.0797, Val Accuracy: 98.66%


Epoch 7/10: 100%|██████████| 3750/3750 [00:27<00:00, 137.74batch/s, loss=0.048] 


Epoch [7/10], Train Loss: 0.0624, Val Loss: 0.0691, Val Accuracy: 98.86%


Epoch 8/10:  42%|████▏     | 1563/3750 [00:11<00:16, 134.16batch/s, loss=0.0405]

In [None]:
# Hyperparameters for RMSprop
learning_rate_rmsprop = 0.001
beta_rmsprop = 0.95
gamma_rmsprop = 1.0
eps_rmsprop = 1e-8

rmsprop_results = compare_optimizers(
    optimizer_class=RMSprop,
    optimizer_name="RMSprop",
    batch_sizes=batch_sizes,
    learning_rate=learning_rate_rmsprop,
    beta=beta_rmsprop,
    gamma=gamma_rmsprop,
    eps=eps_rmsprop
)


In [None]:
# Hyperparameters for Adam
learning_rate_adam = 0.001
beta1_adam = 0.9
beta2_adam = 0.999
eps_adam = 1e-8

adam_results = compare_optimizers(
    optimizer_class=Adam,
    optimizer_name="Adam",
    batch_sizes=batch_sizes,
    learning_rate=learning_rate_adam,
    beta1=beta1_adam,
    beta2=beta2_adam,
    eps=eps_adam
)


In [None]:
def plot_results(optimizer_results, optimizer_name):
    for batch_size, result in optimizer_results.items():
        epochs = range(1, len(result['train_loss']) + 1)
        plt.figure(figsize=(12, 5))
        
        # Plot Training and Validation Loss
        plt.subplot(1, 2, 1)
        plt.plot(epochs, result['train_loss'], 'b-', label='Training Loss')
        plt.plot(epochs, result['val_loss'], 'r-', label='Validation Loss')
        plt.title(f'{optimizer_name} - Batch Size {batch_size}')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        
        # Plot Validation Accuracy
        plt.subplot(1, 2, 2)
        plt.plot(epochs, result['val_accuracy'], 'g-', label='Validation Accuracy')
        plt.title(f'{optimizer_name} - Batch Size {batch_size}')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy (%)')
        plt.legend()
        
        plt.tight_layout()
        plt.show()


In [None]:
print("\nPlotting Results for Momentum Optimizer")
plot_results(momentum_results, "Momentum")

print("\nPlotting Results for Nesterov's Accelerated Gradient Optimizer")
plot_results(nag_results, "Nesterov's Accelerated Gradient")

print("\nPlotting Results for RMSprop Optimizer")
plot_results(rmsprop_results, "RMSprop")

print("\nPlotting Results for Adam Optimizer")
plot_results(adam_results, "Adam")


In [None]:
print("\nFinal Validation Accuracies:")

print("\nMomentum Optimizer:")
for batch_size in batch_sizes:
    acc = momentum_results[batch_size]['val_accuracy'][-1]
    print(f"Batch Size {batch_size}: {acc:.2f}%")

print("\nNesterov's Accelerated Gradient Optimizer:")
for batch_size in batch_sizes:
    acc = nag_results[batch_size]['val_accuracy'][-1]
    print(f"Batch Size {batch_size}: {acc:.2f}%")

print("\nRMSprop Optimizer:")
for batch_size in batch_sizes:
    acc = rmsprop_results[batch_size]['val_accuracy'][-1]
    print(f"Batch Size {batch_size}: {acc:.2f}%")

print("\nAdam Optimizer:")
for batch_size in batch_sizes:
    acc = adam_results[batch_size]['val_accuracy'][-1]
    print(f"Batch Size {batch_size}: {acc:.2f}%")
