# Problem 2

In [54]:
# Problem 2: Optimizers from Scratch

## 2.1 Optimizer Implementation

### **Imports and Device Configuration**


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm  # For progress bars

import time


In [55]:
# Determine the device

device = torch.device('cpu')


In [56]:
# Define transformations: Convert images to tensors and normalize
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))  # MNIST mean and std
])

# Load training and testing datasets
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True)

# Define DataLoaders
batch_size = 64  # Initial batch size; will vary in 2.2

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)


In [57]:
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)  # Input channels=1 for MNIST
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)  # 2x2 Max pooling
        # Fully connected layers
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)  # 10 classes for MNIST
    
    def forward(self, x):
        # Convolutional layer 1
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        # Convolutional layer 2
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        # Flatten
        x = x.view(-1, 64 * 7 * 7)
        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [58]:
class SGD_Momentum:
    def __init__(self, params, lr=0.01, beta=0.9):
        self.params = list(params)
        self.lr = lr
        self.beta = beta
        self.velocities = [torch.zeros_like(p.data) for p in self.params]
    
    def step(self):
        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is not None:
                    # Update velocity
                    self.velocities[i] = self.beta * self.velocities[i] + (1 - self.beta) * p.grad
                    # Update parameters
                    p.data -= self.lr * self.velocities[i]
    
    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()


In [59]:
class NAG:
    def __init__(self, params, lr=0.01, beta=0.95):
        self.params = list(params)
        self.lr = lr
        self.beta = beta
        self.velocities = [torch.zeros_like(p.data) for p in self.params]
    
    def step(self):
        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is not None:
                    prev_velocity = self.velocities[i]
                    # Update velocity
                    self.velocities[i] = self.beta * self.velocities[i] + p.grad
                    # Update parameters
                    p.data -= self.lr * (self.beta * prev_velocity + (1 - self.beta) * self.velocities[i])
    
    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()


In [60]:
class RMSprop:
    def __init__(self, params, lr=0.001, beta=0.95, gamma=1.0, eps=1e-8):
        self.params = list(params)
        self.lr = lr
        self.beta = beta
        self.gamma = gamma
        self.eps = eps
        self.squares = [torch.zeros_like(p.data) for p in self.params]
    
    def step(self):
        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is not None:
                    # Update squared gradients
                    self.squares[i] = self.beta * self.squares[i] + (1 - self.beta) * (p.grad ** 2)
                    # Update parameters
                    p.data -= self.lr * (p.grad / (self.squares[i].sqrt() + self.eps))
    
    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()


In [61]:
class Adam:
    def __init__(self, params, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.params = list(params)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.m = [torch.zeros_like(p.data) for p in self.params]
        self.v = [torch.zeros_like(p.data) for p in self.params]
        self.t = 0  # Time step
    
    def step(self):
        self.t += 1
        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is not None:
                    # Update biased first moment estimate
                    self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * p.grad
                    # Update biased second moment estimate
                    self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (p.grad ** 2)
                    # Compute bias-corrected first moment estimate
                    m_hat = self.m[i] / (1 - self.beta1 ** self.t)
                    # Compute bias-corrected second moment estimate
                    v_hat = self.v[i] / (1 - self.beta2 ** self.t)
                    # Update parameters
                    p.data -= self.lr * m_hat / (v_hat.sqrt() + self.eps)
    
    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()


In [62]:
def train_model(model, train_loader, test_loader, optimizer, num_epochs=10, l1_lambda=1e-5):
    """
    Trains the CNN model using the provided optimizer.

    Args:
        model (nn.Module): The CNN model to train.
        train_loader (DataLoader): DataLoader for training data.
        test_loader (DataLoader): DataLoader for test data.
        optimizer: Optimizer instance.
        num_epochs (int): Number of training epochs.
        l1_lambda (float): L1 regularization coefficient.
    
    Returns:
        Tuple containing training loss history, validation loss history, and validation accuracy history.
    """
    # Loss function
    criterion = nn.CrossEntropyLoss()
    
    # Lists to store losses and accuracies
    train_loss_history = []
    val_loss_history = []
    val_accuracy_history = []
    
    model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
        
        for images, labels in progress_bar:
            images = images.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            # L1 Regularization
            l1_norm = sum(p.abs().sum() for p in model.parameters())
            loss += l1_lambda * l1_norm
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())
        
        # Calculate average training loss
        avg_train_loss = running_loss / len(train_loader)
        train_loss_history.append(avg_train_loss)
        
        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for images, labels in test_loader:
                images = images.to(device)
                labels = labels.to(device)
                
                outputs = model(images)
                loss = criterion(outputs, labels)
                
                # L1 Regularization
                l1_norm = sum(p.abs().sum() for p in model.parameters())
                loss += l1_lambda * l1_norm
                
                val_loss += loss.item()
                
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        avg_val_loss = val_loss / len(test_loader)
        val_loss_history.append(avg_val_loss)
        
        val_accuracy = 100 * correct / total
        val_accuracy_history.append(val_accuracy)
        
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')
    
    return train_loss_history, val_loss_history, val_accuracy_history


In [63]:
# Hyperparameters for Momentum
learning_rate_momentum = 0.01
beta_momentum = 0.9
batch_size_momentum = 64  # Adjust as needed

# Define DataLoaders
train_loader_momentum = DataLoader(dataset=train_dataset, batch_size=batch_size_momentum, shuffle=True, num_workers=0)
test_loader_momentum = DataLoader(dataset=test_dataset, batch_size=batch_size_momentum, shuffle=False, num_workers=0)

# Initialize model and optimizer
model_momentum = CNNModel()
optimizer_momentum = SGD_Momentum(model_momentum.parameters(), lr=learning_rate_momentum, beta=beta_momentum)

print("Training with Momentum Optimizer")
start_time = time.time()
train_loss_momentum, val_loss_momentum, val_acc_momentum = train_model(
    model_momentum, train_loader_momentum, test_loader_momentum, optimizer_momentum, num_epochs=10, l1_lambda=1e-5)
end_time = time.time()
print(f"Momentum Optimizer Training Time: {end_time - start_time:.2f} seconds")


Training with Momentum Optimizer


Epoch 1/10: 100%|██████████| 938/938 [00:19<00:00, 48.05batch/s, loss=0.194]


Epoch [1/10], Train Loss: 0.6748, Val Loss: 0.2399, Val Accuracy: 94.01%


Epoch 2/10: 100%|██████████| 938/938 [00:17<00:00, 52.15batch/s, loss=0.126] 


Epoch [2/10], Train Loss: 0.2037, Val Loss: 0.1577, Val Accuracy: 96.50%


Epoch 3/10: 100%|██████████| 938/938 [00:18<00:00, 51.46batch/s, loss=0.127] 


Epoch [3/10], Train Loss: 0.1472, Val Loss: 0.1360, Val Accuracy: 97.10%


Epoch 4/10: 100%|██████████| 938/938 [00:18<00:00, 51.84batch/s, loss=0.0843]


Epoch [4/10], Train Loss: 0.1236, Val Loss: 0.1090, Val Accuracy: 97.83%


Epoch 5/10: 100%|██████████| 938/938 [00:18<00:00, 51.88batch/s, loss=0.14]  


Epoch [5/10], Train Loss: 0.1099, Val Loss: 0.1040, Val Accuracy: 98.03%


Epoch 6/10: 100%|██████████| 938/938 [00:18<00:00, 50.86batch/s, loss=0.048] 


Epoch [6/10], Train Loss: 0.1011, Val Loss: 0.0937, Val Accuracy: 98.33%


Epoch 7/10: 100%|██████████| 938/938 [00:18<00:00, 51.58batch/s, loss=0.0738]


Epoch [7/10], Train Loss: 0.0946, Val Loss: 0.0866, Val Accuracy: 98.52%


Epoch 8/10: 100%|██████████| 938/938 [00:18<00:00, 51.86batch/s, loss=0.0816]


Epoch [8/10], Train Loss: 0.0895, Val Loss: 0.0898, Val Accuracy: 98.42%


Epoch 9/10: 100%|██████████| 938/938 [00:18<00:00, 51.58batch/s, loss=0.289] 


Epoch [9/10], Train Loss: 0.0858, Val Loss: 0.0831, Val Accuracy: 98.61%


Epoch 10/10: 100%|██████████| 938/938 [00:18<00:00, 51.87batch/s, loss=0.0927]


Epoch [10/10], Train Loss: 0.0817, Val Loss: 0.0831, Val Accuracy: 98.66%
Momentum Optimizer Training Time: 193.50 seconds


In [64]:
# Hyperparameters for NAG
learning_rate_nag = 0.01
beta_nag = 0.95
batch_size_nag = 64  # Adjust as needed

# Define DataLoaders
train_loader_nag = DataLoader(dataset=train_dataset, batch_size=batch_size_nag, shuffle=True, num_workers=0)
test_loader_nag = DataLoader(dataset=test_dataset, batch_size=batch_size_nag, shuffle=False, num_workers=0)

# Initialize model and optimizer
model_nag = CNNModel()
optimizer_nag = NAG(model_nag.parameters(), lr=learning_rate_nag, beta=beta_nag)

print("\nTraining with Nesterov's Accelerated Gradient (NAG) Optimizer")
start_time = time.time()
train_loss_nag, val_loss_nag, val_acc_nag = train_model(
    model_nag, train_loader_nag, test_loader_nag, optimizer_nag, num_epochs=10, l1_lambda=1e-5)
end_time = time.time()
print(f"NAG Optimizer Training Time: {end_time - start_time:.2f} seconds")



Training with Nesterov's Accelerated Gradient (NAG) Optimizer


Epoch 1/10: 100%|██████████| 938/938 [00:18<00:00, 51.40batch/s, loss=0.0856]


Epoch [1/10], Train Loss: 0.4008, Val Loss: 0.1189, Val Accuracy: 98.21%


Epoch 2/10: 100%|██████████| 938/938 [00:18<00:00, 51.14batch/s, loss=0.0641]


Epoch [2/10], Train Loss: 0.1185, Val Loss: 0.0934, Val Accuracy: 98.93%


Epoch 3/10: 100%|██████████| 938/938 [00:18<00:00, 51.53batch/s, loss=0.0557]


Epoch [3/10], Train Loss: 0.0951, Val Loss: 0.0969, Val Accuracy: 98.61%


Epoch 4/10: 100%|██████████| 938/938 [00:18<00:00, 51.70batch/s, loss=0.261] 


Epoch [4/10], Train Loss: 0.0819, Val Loss: 0.0845, Val Accuracy: 98.92%


Epoch 5/10: 100%|██████████| 938/938 [00:18<00:00, 51.54batch/s, loss=0.0504]


Epoch [5/10], Train Loss: 0.0729, Val Loss: 0.0771, Val Accuracy: 99.03%


Epoch 6/10: 100%|██████████| 938/938 [00:18<00:00, 51.59batch/s, loss=0.0729]


Epoch [6/10], Train Loss: 0.0638, Val Loss: 0.0743, Val Accuracy: 99.01%


Epoch 7/10: 100%|██████████| 938/938 [00:18<00:00, 50.52batch/s, loss=0.0474]


Epoch [7/10], Train Loss: 0.0572, Val Loss: 0.0725, Val Accuracy: 99.03%


Epoch 8/10: 100%|██████████| 938/938 [00:18<00:00, 50.79batch/s, loss=0.039] 


Epoch [8/10], Train Loss: 0.0541, Val Loss: 0.0748, Val Accuracy: 98.91%


Epoch 9/10: 100%|██████████| 938/938 [00:18<00:00, 50.20batch/s, loss=0.0371]


Epoch [9/10], Train Loss: 0.0510, Val Loss: 0.0717, Val Accuracy: 98.97%


Epoch 10/10: 100%|██████████| 938/938 [00:18<00:00, 50.84batch/s, loss=0.0341]


Epoch [10/10], Train Loss: 0.0442, Val Loss: 0.0624, Val Accuracy: 99.16%
NAG Optimizer Training Time: 194.19 seconds


In [65]:
# Hyperparameters for RMSprop
learning_rate_rmsprop = 0.001
beta_rmsprop = 0.95
gamma_rmsprop = 1.0
eps_rmsprop = 1e-8
batch_size_rmsprop = 64  # Adjust as needed

# Define DataLoaders
train_loader_rmsprop = DataLoader(dataset=train_dataset, batch_size=batch_size_rmsprop, shuffle=True, num_workers=0)
test_loader_rmsprop = DataLoader(dataset=test_dataset, batch_size=batch_size_rmsprop, shuffle=False, num_workers=0)

# Initialize model and optimizer
model_rmsprop = CNNModel()
optimizer_rmsprop = RMSprop(model_rmsprop.parameters(), lr=learning_rate_rmsprop, beta=beta_rmsprop, gamma=gamma_rmsprop, eps=eps_rmsprop)

print("\nTraining with RMSprop Optimizer")
start_time = time.time()
train_loss_rmsprop, val_loss_rmsprop, val_acc_rmsprop = train_model(
    model_rmsprop, train_loader_rmsprop, test_loader_rmsprop, optimizer_rmsprop, num_epochs=10, l1_lambda=1e-5)
end_time = time.time()
print(f"RMSprop Optimizer Training Time: {end_time - start_time:.2f} seconds")



Training with RMSprop Optimizer


Epoch 1/10: 100%|██████████| 938/938 [00:17<00:00, 53.57batch/s, loss=0.0384]


Epoch [1/10], Train Loss: 0.1814, Val Loss: 0.0668, Val Accuracy: 98.83%


Epoch 2/10: 100%|██████████| 938/938 [00:17<00:00, 53.99batch/s, loss=0.0421]


Epoch [2/10], Train Loss: 0.0764, Val Loss: 0.0729, Val Accuracy: 98.64%


Epoch 3/10: 100%|██████████| 938/938 [00:17<00:00, 53.94batch/s, loss=0.0297]


Epoch [3/10], Train Loss: 0.0629, Val Loss: 0.0612, Val Accuracy: 98.94%


Epoch 4/10: 100%|██████████| 938/938 [00:17<00:00, 53.95batch/s, loss=0.0284]


Epoch [4/10], Train Loss: 0.0553, Val Loss: 0.0544, Val Accuracy: 99.13%


Epoch 5/10: 100%|██████████| 938/938 [00:17<00:00, 54.05batch/s, loss=0.0292]


Epoch [5/10], Train Loss: 0.0505, Val Loss: 0.0525, Val Accuracy: 99.11%


Epoch 6/10: 100%|██████████| 938/938 [00:17<00:00, 53.90batch/s, loss=0.0274]


Epoch [6/10], Train Loss: 0.0462, Val Loss: 0.0497, Val Accuracy: 99.28%


Epoch 7/10: 100%|██████████| 938/938 [00:17<00:00, 54.07batch/s, loss=0.0288]


Epoch [7/10], Train Loss: 0.0437, Val Loss: 0.0543, Val Accuracy: 99.07%


Epoch 8/10: 100%|██████████| 938/938 [00:17<00:00, 54.08batch/s, loss=0.0277]


Epoch [8/10], Train Loss: 0.0407, Val Loss: 0.0558, Val Accuracy: 99.04%


Epoch 9/10: 100%|██████████| 938/938 [00:17<00:00, 53.91batch/s, loss=0.0737]


Epoch [9/10], Train Loss: 0.0394, Val Loss: 0.0504, Val Accuracy: 99.15%


Epoch 10/10: 100%|██████████| 938/938 [00:17<00:00, 53.79batch/s, loss=0.0231]


Epoch [10/10], Train Loss: 0.0377, Val Loss: 0.0543, Val Accuracy: 99.04%
RMSprop Optimizer Training Time: 184.43 seconds


In [66]:
# Hyperparameters for Adam
learning_rate_adam = 0.001
beta1_adam = 0.9
beta2_adam = 0.999
eps_adam = 1e-8
batch_size_adam = 64  # Adjust as needed

# Define DataLoaders
train_loader_adam = DataLoader(dataset=train_dataset, batch_size=batch_size_adam, shuffle=True, num_workers=0)
test_loader_adam = DataLoader(dataset=test_dataset, batch_size=batch_size_adam, shuffle=False, num_workers=0)

# Initialize model and optimizer
model_adam = CNNModel()
optimizer_adam = Adam(model_adam.parameters(), lr=learning_rate_adam, beta1=beta1_adam, beta2=beta2_adam, eps=eps_adam)

print("\nTraining with Adam Optimizer")
start_time = time.time()
train_loss_adam, val_loss_adam, val_acc_adam = train_model(
    model_adam, train_loader_adam, test_loader_adam, optimizer_adam, num_epochs=10, l1_lambda=1e-5)
end_time = time.time()
print(f"Adam Optimizer Training Time: {end_time - start_time:.2f} seconds")



Training with Adam Optimizer


Epoch 1/10: 100%|██████████| 938/938 [00:17<00:00, 52.49batch/s, loss=0.0871]


Epoch [1/10], Train Loss: 0.1718, Val Loss: 0.0956, Val Accuracy: 98.10%


Epoch 2/10: 100%|██████████| 938/938 [00:17<00:00, 52.91batch/s, loss=0.0712]


Epoch [2/10], Train Loss: 0.0839, Val Loss: 0.0737, Val Accuracy: 98.77%


Epoch 3/10: 100%|██████████| 938/938 [00:17<00:00, 52.54batch/s, loss=0.0575]


Epoch [3/10], Train Loss: 0.0718, Val Loss: 0.0820, Val Accuracy: 98.67%


Epoch 4/10: 100%|██████████| 938/938 [03:23<00:00,  4.61batch/s, loss=0.0429]  


Epoch [4/10], Train Loss: 0.0672, Val Loss: 0.0739, Val Accuracy: 98.98%


Epoch 5/10: 100%|██████████| 938/938 [00:17<00:00, 53.73batch/s, loss=0.0697]


Epoch [5/10], Train Loss: 0.0611, Val Loss: 0.0687, Val Accuracy: 99.01%


Epoch 6/10: 100%|██████████| 938/938 [08:04<00:00,  1.94batch/s, loss=0.533]   


Epoch [6/10], Train Loss: 0.0565, Val Loss: 0.0778, Val Accuracy: 98.80%


Epoch 7/10: 100%|██████████| 938/938 [00:17<00:00, 54.49batch/s, loss=0.048] 


Epoch [7/10], Train Loss: 0.0549, Val Loss: 0.0693, Val Accuracy: 99.02%


Epoch 8/10: 100%|██████████| 938/938 [00:17<00:00, 54.94batch/s, loss=0.039] 


Epoch [8/10], Train Loss: 0.0536, Val Loss: 0.0687, Val Accuracy: 99.09%


Epoch 9/10: 100%|██████████| 938/938 [00:32<00:00, 29.15batch/s, loss=0.0358]


Epoch [9/10], Train Loss: 0.0474, Val Loss: 0.0632, Val Accuracy: 99.19%


Epoch 10/10: 100%|██████████| 938/938 [00:17<00:00, 54.51batch/s, loss=0.0357]


Epoch [10/10], Train Loss: 0.0458, Val Loss: 0.0632, Val Accuracy: 99.22%
Adam Optimizer Training Time: 852.53 seconds


In [67]:
print("\nFinal Validation Accuracies:")

print("Momentum Optimizer:")
for batch_size in [64]:
    acc = val_acc_momentum
    print(f"Batch Size {batch_size}: {acc:.2f}%")

print("\nNesterov's Accelerated Gradient Optimizer:")
for batch_size in [64]:
    acc = val_acc_nag
    print(f"Batch Size {batch_size}: {acc:.2f}%")

print("\nRMSprop Optimizer:")
for batch_size in [64]:
    acc = val_acc_rmsprop
    print(f"Batch Size {batch_size}: {acc:.2f}%")

print("\nAdam Optimizer:")
for batch_size in [64]:
    acc = val_acc_adam
    print(f"Batch Size {batch_size}: {acc:.2f}%")



Final Validation Accuracies:
Momentum Optimizer:


TypeError: unsupported format string passed to list.__format__

# 2.2

In [68]:
batch_sizes = [4, 8, 16, 32]

def compare_optimizers(optimizer_class, optimizer_name, batch_sizes, learning_rate, **kwargs):
    """
    Trains the CNN model using a specific optimizer across different batch sizes.

    Args:
        optimizer_class: The optimizer class to instantiate.
        optimizer_name (str): Name of the optimizer for display.
        batch_sizes (list): List of batch sizes to test.
        learning_rate (float): Learning rate for the optimizer.
        **kwargs: Additional hyperparameters for the optimizer.
    
    Returns:
        dict: A dictionary containing training and validation losses and accuracies for each batch size.
    """
    results = {}
    for batch_size in batch_sizes:
        print(f"\nTraining with {optimizer_name}, Batch Size: {batch_size}")
        # Define DataLoaders
        train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
        test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
        
        # Initialize model and optimizer
        model = CNNModel()
        optimizer = optimizer_class(model.parameters(), lr=learning_rate, **kwargs)
        
        # Train the model
        train_loss, val_loss, val_acc = train_model(
            model, train_loader, test_loader, optimizer, num_epochs=10, l1_lambda=1e-5)
        
        # Store the results
        results[batch_size] = {
            'train_loss': train_loss,
            'val_loss': val_loss,
            'val_accuracy': val_acc
        }
    return results


In [69]:
# Hyperparameters for Momentum
learning_rate_momentum = 0.01
beta_momentum = 0.9

momentum_results = compare_optimizers(
    optimizer_class=SGD_Momentum,
    optimizer_name="Momentum",
    batch_sizes=batch_sizes,
    learning_rate=learning_rate_momentum,
    beta=beta_momentum
)



Training with Momentum, Batch Size: 4


Epoch 1/10: 100%|██████████| 15000/15000 [00:53<00:00, 280.09batch/s, loss=0.0604]


Epoch [1/10], Train Loss: 0.1888, Val Loss: 0.0906, Val Accuracy: 98.63%


Epoch 2/10: 100%|██████████| 15000/15000 [01:12<00:00, 207.11batch/s, loss=0.0671]


Epoch [2/10], Train Loss: 0.0882, Val Loss: 0.0760, Val Accuracy: 98.93%


Epoch 3/10: 100%|██████████| 15000/15000 [01:11<00:00, 209.43batch/s, loss=0.0399]


Epoch [3/10], Train Loss: 0.0708, Val Loss: 0.0837, Val Accuracy: 98.61%


Epoch 4/10: 100%|██████████| 15000/15000 [01:11<00:00, 211.20batch/s, loss=0.0387]


Epoch [4/10], Train Loss: 0.0607, Val Loss: 0.0840, Val Accuracy: 98.48%


Epoch 5/10: 100%|██████████| 15000/15000 [01:12<00:00, 208.09batch/s, loss=0.0347]


Epoch [5/10], Train Loss: 0.0529, Val Loss: 0.0804, Val Accuracy: 98.70%


Epoch 6/10: 100%|██████████| 15000/15000 [01:13<00:00, 204.89batch/s, loss=0.0327]


Epoch [6/10], Train Loss: 0.0487, Val Loss: 0.0625, Val Accuracy: 99.05%


Epoch 7/10: 100%|██████████| 15000/15000 [01:12<00:00, 207.43batch/s, loss=0.0308]


Epoch [7/10], Train Loss: 0.0431, Val Loss: 0.0555, Val Accuracy: 99.25%


Epoch 8/10: 100%|██████████| 15000/15000 [01:21<00:00, 184.56batch/s, loss=0.0288]


Epoch [8/10], Train Loss: 0.0397, Val Loss: 0.0611, Val Accuracy: 99.02%


Epoch 9/10: 100%|██████████| 15000/15000 [01:27<00:00, 170.93batch/s, loss=0.0317]


Epoch [9/10], Train Loss: 0.0371, Val Loss: 0.0569, Val Accuracy: 99.11%


Epoch 10/10: 100%|██████████| 15000/15000 [01:45<00:00, 141.66batch/s, loss=0.0261]


Epoch [10/10], Train Loss: 0.0350, Val Loss: 0.0521, Val Accuracy: 99.25%

Training with Momentum, Batch Size: 8


Epoch 1/10: 100%|██████████| 7500/7500 [00:47<00:00, 159.21batch/s, loss=0.0481]


Epoch [1/10], Train Loss: 0.2227, Val Loss: 0.0975, Val Accuracy: 98.32%


Epoch 2/10: 100%|██████████| 7500/7500 [00:46<00:00, 162.79batch/s, loss=0.343] 


Epoch [2/10], Train Loss: 0.0987, Val Loss: 0.0944, Val Accuracy: 98.28%


Epoch 3/10: 100%|██████████| 7500/7500 [00:46<00:00, 159.62batch/s, loss=0.0468]


Epoch [3/10], Train Loss: 0.0808, Val Loss: 0.0695, Val Accuracy: 99.03%


Epoch 4/10: 100%|██████████| 7500/7500 [00:47<00:00, 159.05batch/s, loss=0.042] 


Epoch [4/10], Train Loss: 0.0701, Val Loss: 0.0702, Val Accuracy: 99.06%


Epoch 5/10: 100%|██████████| 7500/7500 [00:46<00:00, 160.47batch/s, loss=0.0426]


Epoch [5/10], Train Loss: 0.0635, Val Loss: 0.0612, Val Accuracy: 99.21%


Epoch 6/10: 100%|██████████| 7500/7500 [00:47<00:00, 158.37batch/s, loss=0.0379]


Epoch [6/10], Train Loss: 0.0574, Val Loss: 0.0685, Val Accuracy: 99.08%


Epoch 7/10: 100%|██████████| 7500/7500 [00:47<00:00, 158.34batch/s, loss=0.0389]


Epoch [7/10], Train Loss: 0.0534, Val Loss: 0.0751, Val Accuracy: 98.73%


Epoch 8/10: 100%|██████████| 7500/7500 [00:47<00:00, 156.79batch/s, loss=0.0369]


Epoch [8/10], Train Loss: 0.0489, Val Loss: 0.0601, Val Accuracy: 99.16%


Epoch 9/10: 100%|██████████| 7500/7500 [00:47<00:00, 157.53batch/s, loss=0.0334]


Epoch [9/10], Train Loss: 0.0455, Val Loss: 0.0638, Val Accuracy: 99.04%


Epoch 10/10: 100%|██████████| 7500/7500 [00:47<00:00, 158.42batch/s, loss=0.0339]


Epoch [10/10], Train Loss: 0.0418, Val Loss: 0.0582, Val Accuracy: 99.23%

Training with Momentum, Batch Size: 16


Epoch 1/10: 100%|██████████| 3750/3750 [00:37<00:00, 98.91batch/s, loss=0.241]  


Epoch [1/10], Train Loss: 0.3061, Val Loss: 0.1234, Val Accuracy: 97.55%


Epoch 2/10: 100%|██████████| 3750/3750 [00:38<00:00, 97.69batch/s, loss=0.071]  


Epoch [2/10], Train Loss: 0.1152, Val Loss: 0.0954, Val Accuracy: 98.30%


Epoch 3/10: 100%|██████████| 3750/3750 [01:08<00:00, 54.74batch/s, loss=0.0442] 


Epoch [3/10], Train Loss: 0.0930, Val Loss: 0.0847, Val Accuracy: 98.71%


Epoch 4/10: 100%|██████████| 3750/3750 [01:26<00:00, 43.12batch/s, loss=0.0436] 


Epoch [4/10], Train Loss: 0.0821, Val Loss: 0.0757, Val Accuracy: 98.88%


Epoch 5/10: 100%|██████████| 3750/3750 [00:56<00:00, 66.40batch/s, loss=0.0435] 


Epoch [5/10], Train Loss: 0.0749, Val Loss: 0.0732, Val Accuracy: 98.87%


Epoch 6/10: 100%|██████████| 3750/3750 [00:49<00:00, 75.72batch/s, loss=0.0414] 


Epoch [6/10], Train Loss: 0.0691, Val Loss: 0.0734, Val Accuracy: 98.83%


Epoch 7/10: 100%|██████████| 3750/3750 [00:42<00:00, 89.09batch/s, loss=0.0948] 


Epoch [7/10], Train Loss: 0.0637, Val Loss: 0.0662, Val Accuracy: 99.14%


Epoch 8/10: 100%|██████████| 3750/3750 [01:34<00:00, 39.80batch/s, loss=0.0475] 


Epoch [8/10], Train Loss: 0.0595, Val Loss: 0.0701, Val Accuracy: 98.98%


Epoch 9/10: 100%|██████████| 3750/3750 [00:38<00:00, 98.56batch/s, loss=0.04]   


Epoch [9/10], Train Loss: 0.0561, Val Loss: 0.0649, Val Accuracy: 99.09%


Epoch 10/10: 100%|██████████| 3750/3750 [00:41<00:00, 89.60batch/s, loss=0.048]  


Epoch [10/10], Train Loss: 0.0533, Val Loss: 0.0643, Val Accuracy: 99.16%

Training with Momentum, Batch Size: 32


Epoch 1/10: 100%|██████████| 1875/1875 [01:35<00:00, 19.68batch/s, loss=0.082] 


Epoch [1/10], Train Loss: 0.4024, Val Loss: 0.1700, Val Accuracy: 96.24%


Epoch 2/10: 100%|██████████| 1875/1875 [00:28<00:00, 65.26batch/s, loss=0.22]  


Epoch [2/10], Train Loss: 0.1416, Val Loss: 0.1145, Val Accuracy: 97.78%


Epoch 3/10: 100%|██████████| 1875/1875 [00:28<00:00, 66.73batch/s, loss=0.0864]


Epoch [3/10], Train Loss: 0.1126, Val Loss: 0.1156, Val Accuracy: 97.73%


Epoch 4/10: 100%|██████████| 1875/1875 [00:42<00:00, 43.94batch/s, loss=0.0755]


Epoch [4/10], Train Loss: 0.0990, Val Loss: 0.0886, Val Accuracy: 98.55%


Epoch 5/10: 100%|██████████| 1875/1875 [00:52<00:00, 35.77batch/s, loss=0.108] 


Epoch [5/10], Train Loss: 0.0891, Val Loss: 0.0880, Val Accuracy: 98.49%


Epoch 6/10: 100%|██████████| 1875/1875 [00:30<00:00, 62.36batch/s, loss=0.142] 


Epoch [6/10], Train Loss: 0.0824, Val Loss: 0.0836, Val Accuracy: 98.53%


Epoch 7/10: 100%|██████████| 1875/1875 [00:53<00:00, 35.08batch/s, loss=0.0471]


Epoch [7/10], Train Loss: 0.0771, Val Loss: 0.0761, Val Accuracy: 98.83%


Epoch 8/10: 100%|██████████| 1875/1875 [04:45<00:00,  6.56batch/s, loss=0.0991]


Epoch [8/10], Train Loss: 0.0735, Val Loss: 0.0746, Val Accuracy: 98.92%


Epoch 9/10: 100%|██████████| 1875/1875 [00:28<00:00, 65.52batch/s, loss=0.0429]


Epoch [9/10], Train Loss: 0.0694, Val Loss: 0.0721, Val Accuracy: 98.97%


Epoch 10/10: 100%|██████████| 1875/1875 [00:27<00:00, 67.18batch/s, loss=0.0476]


Epoch [10/10], Train Loss: 0.0659, Val Loss: 0.0806, Val Accuracy: 98.71%


In [70]:
# Hyperparameters for NAG
learning_rate_nag = 0.01
beta_nag = 0.95

nag_results = compare_optimizers(
    optimizer_class=NAG,
    optimizer_name="Nesterov's Accelerated Gradient",
    batch_sizes=batch_sizes,
    learning_rate=learning_rate_nag,
    beta=beta_nag
)



Training with Nesterov's Accelerated Gradient, Batch Size: 4


Epoch 1/10: 100%|██████████| 15000/15000 [04:16<00:00, 58.53batch/s, loss=2.42]  


Epoch [1/10], Train Loss: 2.1128, Val Loss: 2.3843, Val Accuracy: 11.35%


Epoch 2/10: 100%|██████████| 15000/15000 [01:17<00:00, 193.23batch/s, loss=2.33]


Epoch [2/10], Train Loss: 2.3748, Val Loss: 2.3628, Val Accuracy: 10.10%


Epoch 3/10: 100%|██████████| 15000/15000 [01:17<00:00, 192.80batch/s, loss=2.5] 


Epoch [3/10], Train Loss: 2.3622, Val Loss: 2.3630, Val Accuracy: 8.92%


Epoch 4/10: 100%|██████████| 15000/15000 [01:23<00:00, 179.77batch/s, loss=2.43]


Epoch [4/10], Train Loss: 2.3537, Val Loss: 2.3472, Val Accuracy: 9.80%


Epoch 5/10: 100%|██████████| 15000/15000 [01:30<00:00, 165.13batch/s, loss=2.36]


Epoch [5/10], Train Loss: 2.3484, Val Loss: 2.3525, Val Accuracy: 9.80%


Epoch 6/10: 100%|██████████| 15000/15000 [01:16<00:00, 197.26batch/s, loss=2.33]


Epoch [6/10], Train Loss: 2.3442, Val Loss: 2.3350, Val Accuracy: 10.10%


Epoch 7/10: 100%|██████████| 15000/15000 [01:16<00:00, 197.34batch/s, loss=2.33]


Epoch [7/10], Train Loss: 2.3392, Val Loss: 2.3325, Val Accuracy: 10.10%


Epoch 8/10: 100%|██████████| 15000/15000 [01:18<00:00, 190.76batch/s, loss=2.38]


Epoch [8/10], Train Loss: 2.3358, Val Loss: 2.3391, Val Accuracy: 9.80%


Epoch 9/10: 100%|██████████| 15000/15000 [04:20<00:00, 57.55batch/s, loss=2.36]  


Epoch [9/10], Train Loss: 2.3335, Val Loss: 2.3281, Val Accuracy: 10.10%


Epoch 10/10: 100%|██████████| 15000/15000 [01:17<00:00, 193.49batch/s, loss=2.44]


Epoch [10/10], Train Loss: 2.3312, Val Loss: 2.3256, Val Accuracy: 10.28%

Training with Nesterov's Accelerated Gradient, Batch Size: 8


Epoch 1/10: 100%|██████████| 7500/7500 [00:45<00:00, 165.09batch/s, loss=0.223] 


Epoch [1/10], Train Loss: 0.4396, Val Loss: 0.3415, Val Accuracy: 92.82%


Epoch 2/10: 100%|██████████| 7500/7500 [00:45<00:00, 163.05batch/s, loss=0.0569]


Epoch [2/10], Train Loss: 0.2783, Val Loss: 0.3359, Val Accuracy: 92.59%


Epoch 3/10: 100%|██████████| 7500/7500 [00:46<00:00, 160.52batch/s, loss=0.0741]


Epoch [3/10], Train Loss: 0.2601, Val Loss: 0.2862, Val Accuracy: 94.67%


Epoch 4/10: 100%|██████████| 7500/7500 [00:44<00:00, 170.19batch/s, loss=0.48]  


Epoch [4/10], Train Loss: 0.2716, Val Loss: 0.2774, Val Accuracy: 94.59%


Epoch 5/10: 100%|██████████| 7500/7500 [00:49<00:00, 152.69batch/s, loss=0.0711]


Epoch [5/10], Train Loss: 0.3022, Val Loss: 0.3091, Val Accuracy: 95.18%


Epoch 6/10: 100%|██████████| 7500/7500 [00:46<00:00, 160.54batch/s, loss=0.705] 


Epoch [6/10], Train Loss: 0.3411, Val Loss: 0.2805, Val Accuracy: 95.38%


Epoch 7/10: 100%|██████████| 7500/7500 [00:49<00:00, 152.04batch/s, loss=0.0764]


Epoch [7/10], Train Loss: 0.2867, Val Loss: 0.3089, Val Accuracy: 95.22%


Epoch 8/10: 100%|██████████| 7500/7500 [00:46<00:00, 161.69batch/s, loss=0.71]  


Epoch [8/10], Train Loss: 0.2868, Val Loss: 0.3503, Val Accuracy: 93.84%


Epoch 9/10: 100%|██████████| 7500/7500 [00:47<00:00, 158.08batch/s, loss=0.451] 


Epoch [9/10], Train Loss: 0.3869, Val Loss: 0.2825, Val Accuracy: 95.58%


Epoch 10/10: 100%|██████████| 7500/7500 [00:46<00:00, 162.26batch/s, loss=0.0964]


Epoch [10/10], Train Loss: 0.4721, Val Loss: 0.4419, Val Accuracy: 92.85%

Training with Nesterov's Accelerated Gradient, Batch Size: 16


Epoch 1/10: 100%|██████████| 3750/3750 [00:38<00:00, 96.90batch/s, loss=0.164]  


Epoch [1/10], Train Loss: 0.2983, Val Loss: 0.1569, Val Accuracy: 97.01%


Epoch 2/10: 100%|██████████| 3750/3750 [00:38<00:00, 98.26batch/s, loss=0.0586] 


Epoch [2/10], Train Loss: 0.1352, Val Loss: 0.1027, Val Accuracy: 98.29%


Epoch 3/10: 100%|██████████| 3750/3750 [00:38<00:00, 96.94batch/s, loss=0.0724] 


Epoch [3/10], Train Loss: 0.1162, Val Loss: 0.1711, Val Accuracy: 96.60%


Epoch 4/10: 100%|██████████| 3750/3750 [00:38<00:00, 96.80batch/s, loss=0.09]   


Epoch [4/10], Train Loss: 0.1012, Val Loss: 0.1003, Val Accuracy: 98.25%


Epoch 5/10: 100%|██████████| 3750/3750 [00:38<00:00, 96.43batch/s, loss=0.0397] 


Epoch [5/10], Train Loss: 0.0951, Val Loss: 0.1120, Val Accuracy: 98.16%


Epoch 6/10: 100%|██████████| 3750/3750 [00:38<00:00, 97.95batch/s, loss=0.0512] 


Epoch [6/10], Train Loss: 0.0982, Val Loss: 0.1013, Val Accuracy: 98.26%


Epoch 7/10: 100%|██████████| 3750/3750 [00:38<00:00, 96.35batch/s, loss=0.0574] 


Epoch [7/10], Train Loss: 0.0972, Val Loss: 0.1097, Val Accuracy: 98.11%


Epoch 8/10: 100%|██████████| 3750/3750 [00:38<00:00, 97.16batch/s, loss=0.116]  


Epoch [8/10], Train Loss: 0.0995, Val Loss: 0.1051, Val Accuracy: 98.42%


Epoch 9/10: 100%|██████████| 3750/3750 [00:40<00:00, 92.04batch/s, loss=0.0424] 


Epoch [9/10], Train Loss: 0.0960, Val Loss: 0.1135, Val Accuracy: 98.17%


Epoch 10/10: 100%|██████████| 3750/3750 [00:42<00:00, 88.63batch/s, loss=0.0421] 


Epoch [10/10], Train Loss: 0.0975, Val Loss: 0.1227, Val Accuracy: 98.30%

Training with Nesterov's Accelerated Gradient, Batch Size: 32


Epoch 1/10: 100%|██████████| 1875/1875 [00:28<00:00, 65.71batch/s, loss=0.115] 


Epoch [1/10], Train Loss: 0.3108, Val Loss: 0.1197, Val Accuracy: 97.96%


Epoch 2/10: 100%|██████████| 1875/1875 [00:29<00:00, 62.66batch/s, loss=0.0543]


Epoch [2/10], Train Loss: 0.1104, Val Loss: 0.0910, Val Accuracy: 98.66%


Epoch 3/10: 100%|██████████| 1875/1875 [00:32<00:00, 57.33batch/s, loss=0.0768]


Epoch [3/10], Train Loss: 0.0867, Val Loss: 0.0924, Val Accuracy: 98.41%


Epoch 4/10: 100%|██████████| 1875/1875 [00:29<00:00, 62.64batch/s, loss=0.0386]


Epoch [4/10], Train Loss: 0.0714, Val Loss: 0.0735, Val Accuracy: 98.84%


Epoch 5/10: 100%|██████████| 1875/1875 [00:29<00:00, 64.30batch/s, loss=0.0616]


Epoch [5/10], Train Loss: 0.0616, Val Loss: 0.0781, Val Accuracy: 98.63%


Epoch 6/10: 100%|██████████| 1875/1875 [00:31<00:00, 58.79batch/s, loss=0.0351]


Epoch [6/10], Train Loss: 0.0588, Val Loss: 0.0707, Val Accuracy: 98.89%


Epoch 7/10: 100%|██████████| 1875/1875 [00:30<00:00, 61.18batch/s, loss=0.0356]


Epoch [7/10], Train Loss: 0.0526, Val Loss: 0.0720, Val Accuracy: 98.85%


Epoch 8/10: 100%|██████████| 1875/1875 [00:30<00:00, 61.58batch/s, loss=0.0316]


Epoch [8/10], Train Loss: 0.0499, Val Loss: 0.0686, Val Accuracy: 99.00%


Epoch 9/10: 100%|██████████| 1875/1875 [00:31<00:00, 59.66batch/s, loss=0.0303]


Epoch [9/10], Train Loss: 0.0464, Val Loss: 0.0754, Val Accuracy: 98.76%


Epoch 10/10: 100%|██████████| 1875/1875 [00:31<00:00, 60.32batch/s, loss=0.0625]


Epoch [10/10], Train Loss: 0.0437, Val Loss: 0.0690, Val Accuracy: 99.02%


In [71]:
# Hyperparameters for RMSprop
learning_rate_rmsprop = 0.001
beta_rmsprop = 0.95
gamma_rmsprop = 1.0
eps_rmsprop = 1e-8

rmsprop_results = compare_optimizers(
    optimizer_class=RMSprop,
    optimizer_name="RMSprop",
    batch_sizes=batch_sizes,
    learning_rate=learning_rate_rmsprop,
    beta=beta_rmsprop,
    gamma=gamma_rmsprop,
    eps=eps_rmsprop
)



Training with RMSprop, Batch Size: 4


Epoch 1/10: 100%|██████████| 15000/15000 [01:21<00:00, 184.24batch/s, loss=0.0253]


Epoch [1/10], Train Loss: 0.1449, Val Loss: 0.0791, Val Accuracy: 98.24%


Epoch 2/10:  36%|███▌      | 5401/15000 [00:28<00:51, 186.65batch/s, loss=0.0218]


KeyboardInterrupt: 

In [None]:
# Hyperparameters for Adam
learning_rate_adam = 0.001
beta1_adam = 0.9
beta2_adam = 0.999
eps_adam = 1e-8

adam_results = compare_optimizers(
    optimizer_class=Adam,
    optimizer_name="Adam",
    batch_sizes=batch_sizes,
    learning_rate=learning_rate_adam,
    beta1=beta1_adam,
    beta2=beta2_adam,
    eps=eps_adam
)


In [43]:
def plot_results(optimizer_results, optimizer_name):
    for batch_size, result in optimizer_results.items():
        epochs = range(1, len(result['train_loss']) + 1)
        plt.figure(figsize=(12, 5))
        
        # Plot Training and Validation Loss
        plt.subplot(1, 2, 1)
        plt.plot(epochs, result['train_loss'], 'b-', label='Training Loss')
        plt.plot(epochs, result['val_loss'], 'r-', label='Validation Loss')
        plt.title(f'{optimizer_name} - Batch Size {batch_size}')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        
        # Plot Validation Accuracy
        plt.subplot(1, 2, 2)
        plt.plot(epochs, result['val_accuracy'], 'g-', label='Validation Accuracy')
        plt.title(f'{optimizer_name} - Batch Size {batch_size}')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy (%)')
        plt.legend()
        
        plt.tight_layout()
        plt.show()


In [None]:
print("\nPlotting Results for Momentum Optimizer")
plot_results(momentum_results, "Momentum")

print("\nPlotting Results for Nesterov's Accelerated Gradient Optimizer")
plot_results(nag_results, "Nesterov's Accelerated Gradient")

print("\nPlotting Results for RMSprop Optimizer")
plot_results(rmsprop_results, "RMSprop")

print("\nPlotting Results for Adam Optimizer")
plot_results(adam_results, "Adam")


In [None]:
print("\nFinal Validation Accuracies:")

print("Momentum Optimizer:")
for batch_size in batch_sizes:
    acc = momentum_results[batch_size]['val_accuracy'][-1]
    print(f"Batch Size {batch_size}: {acc:.2f}%")

print("\nNesterov's Accelerated Gradient Optimizer:")
for batch_size in batch_sizes:
    acc = nag_results[batch_size]['val_accuracy'][-1]
    print(f"Batch Size {batch_size}: {acc:.2f}%")

print("\nRMSprop Optimizer:")
for batch_size in batch_sizes:
    acc = rmsprop_results[batch_size]['val_accuracy'][-1]
    print(f"Batch Size {batch_size}: {acc:.2f}%")

print("\nAdam Optimizer:")
for batch_size in batch_sizes:
    acc = adam_results[batch_size]['val_accuracy'][-1]
    print(f"Batch Size {batch_size}: {acc:.2f}%")
