# Discussion - Module 4
# Pros and Cons of Neural Network Optimization Algorithms

**Directions:**
In this discussion, select one of the following optimization algorithms: (1) Nesterov momentum, (2) RMSProp, (3) Adam, or (4) the BFGS algorithm. Apply it to the Fashion-MNIST machine learning dataset, and respond to the following prompts:

1. Report the validation error on Fashion-MNIST using your selected algorithm as the optimizer.   
2. Compare the convergence speed of SGD against your selected algorithm. What improvements did your algorithm have over SGD?
3. Experiment with different choices for adjusting the learning rate. Which learning rate returns the best validation error?
4. What challenges can you think of in applying this optimization algorithm to deep neural networks?

In [1]:
! pip install torch torchvision

Defaulting to user installation because normal site-packages is not writeable


# Train a simple neural network on the Fashion-MNIST dataset using PyTorch and the Adam optimizer

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Subset, DataLoader

# Set random seed for reproducibility.
# This seeds PyTorch's default random number generator, so it is called before
# the randomised steps later in this cell (train/validation split, weight
# initialisation, batch shuffling).
torch.manual_seed(3)

# Define a simple neural network architecture
class Net(nn.Module):
    """Fully connected classifier with two ReLU hidden layers and a linear output.

    With the default sizes the network is 784 -> 128 -> 64 -> 10, i.e. it
    takes flattened 28x28 inputs and produces 10 scores per sample. The sizes
    are constructor arguments so the class can be reused for other input
    widths or class counts; calling ``Net()`` builds the original network.

    Args:
        in_features: Number of values per sample once the input is flattened.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        num_classes: Number of output scores per sample.
    """

    def __init__(
        self,
        in_features: int = 28 * 28,
        hidden1: int = 128,
        hidden2: int = 64,
        num_classes: int = 10,
    ) -> None:
        super().__init__()
        # Attribute names and creation order are kept as-is so state_dict keys
        # and seeded weight initialisation stay the same for the defaults.
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return one row of raw class scores per input sample.

        The input is reshaped to ``(-1, in_features)``, so any leading layout
        is accepted as long as the element count is a multiple of
        ``in_features``. No softmax is applied to the output of ``fc3``.
        """
        # Use the width fc1 was built with instead of a hard-coded 28 * 28.
        x = x.view(-1, self.fc1.in_features)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Hyperparameters
batch_size = 64
learning_rate = 0.001  # Adjust the learning rates here
epochs = 10

# Load Fashion-MNIST dataset
# ToTensor converts each image to a float tensor; Normalize((0.5,), (0.5,))
# then applies (x - 0.5) / 0.5 to the single channel.
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
trainset = torchvision.datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

# Split the dataset into training and validation sets (e.g., 80% train, 20% validation)
train_size = int(0.8 * len(trainset))
val_size = len(trainset) - train_size  # remainder, so the two sizes always sum to len(trainset)
train_dataset, val_dataset = torch.utils.data.random_split(trainset, [train_size, val_size])

# Only the training loader is shuffled; validation and test keep a fixed order.
trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)  # not consumed in this cell

# Initialize the model and optimizer
model = Net()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Use Adam optimizer
criterion = nn.CrossEntropyLoss()

# Training loop with validation
for epoch in range(epochs):
    # Put the model in training mode explicitly. This does not change a network
    # made only of Linear layers, but keeps the loop correct if mode-dependent
    # layers (dropout, batch norm) are added later.
    model.train()
    running_loss = 0.0
    for inputs, labels in trainloader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Calculate validation loss
    model.eval()  # evaluation mode for the validation pass
    val_loss = 0.0
    with torch.no_grad():  # no gradients are needed for evaluation
        for inputs, labels in valloader:
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()

    # Both figures are divided by the number of batches, i.e. they are the
    # mean of the per-batch losses, not a per-sample mean.
    print(f"Epoch {epoch+1}, Training Loss: {running_loss / len(trainloader)}, Validation Loss: {val_loss / len(valloader)}")

print("Finished Training")


Epoch 1, Training Loss: 0.5430803653200468, Validation Loss: 0.4048712338380357
Epoch 2, Training Loss: 0.3924556744496028, Validation Loss: 0.38634490190034215
Epoch 3, Training Loss: 0.3522614185611407, Validation Loss: 0.342245960409971
Epoch 4, Training Loss: 0.32539155301451683, Validation Loss: 0.34133574984809184
Epoch 5, Training Loss: 0.3049181672135989, Validation Loss: 0.3376363071196891
Epoch 6, Training Loss: 0.2906282551288605, Validation Loss: 0.3440606260394796
Epoch 7, Training Loss: 0.27523438665270805, Validation Loss: 0.3186814149326466
Epoch 8, Training Loss: 0.2636294405559699, Validation Loss: 0.3324200415468596
Epoch 9, Training Loss: 0.253545482908686, Validation Loss: 0.32546375137060246
Epoch 10, Training Loss: 0.24232167422771453, Validation Loss: 0.3303912768655635
Finished Training


# Compare the convergence speed of SGD against your selected algorithm. What improvements did your algorithm have over SGD?

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Subset, DataLoader

# Set random seed for reproducibility.
# Same seed value (3) as the Adam cell above. It seeds PyTorch's default random
# number generator before the randomised steps later in this cell
# (train/validation split, weight initialisation, batch shuffling).
torch.manual_seed(3)

# Define a simple neural network architecture
class Net(nn.Module):
    """Fully connected classifier with two ReLU hidden layers and a linear output.

    With the default sizes the network is 784 -> 128 -> 64 -> 10, i.e. it
    takes flattened 28x28 inputs and produces 10 scores per sample. The sizes
    are constructor arguments so the class can be reused for other input
    widths or class counts; calling ``Net()`` builds the original network.

    Args:
        in_features: Number of values per sample once the input is flattened.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        num_classes: Number of output scores per sample.
    """

    def __init__(
        self,
        in_features: int = 28 * 28,
        hidden1: int = 128,
        hidden2: int = 64,
        num_classes: int = 10,
    ) -> None:
        super().__init__()
        # Attribute names and creation order are kept as-is so state_dict keys
        # and seeded weight initialisation stay the same for the defaults.
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return one row of raw class scores per input sample.

        The input is reshaped to ``(-1, in_features)``, so any leading layout
        is accepted as long as the element count is a multiple of
        ``in_features``. No softmax is applied to the output of ``fc3``.
        """
        # Use the width fc1 was built with instead of a hard-coded 28 * 28.
        x = x.view(-1, self.fc1.in_features)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Hyperparameters
batch_size = 64
learning_rate = 0.001 # Adjust the learning rates here
epochs = 10

# Load Fashion-MNIST dataset
# ToTensor converts each image to a float tensor; Normalize((0.5,), (0.5,))
# then applies (x - 0.5) / 0.5 to the single channel.
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
trainset = torchvision.datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

# Split the dataset into training and validation sets (e.g., 80% train, 20% validation)
train_size = int(0.8 * len(trainset))
val_size = len(trainset) - train_size  # remainder, so the two sizes always sum to len(trainset)
train_dataset, val_dataset = torch.utils.data.random_split(trainset, [train_size, val_size])

# Only the training loader is shuffled; validation and test keep a fixed order.
trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)  # not consumed in this cell

# Initialize the model and optimizer for SGD
model_sgd = Net()
# Plain SGD: no momentum argument is passed, and the learning rate is the
# value set in the hyperparameters above.
optimizer_sgd = optim.SGD(model_sgd.parameters(), lr=learning_rate)  # Use SGD optimizer
criterion = nn.CrossEntropyLoss()

# Training loop with SGD
for epoch in range(epochs):
    # Put the model in training mode explicitly. This does not change a network
    # made only of Linear layers, but keeps the loop correct if mode-dependent
    # layers (dropout, batch norm) are added later.
    model_sgd.train()
    running_loss = 0.0
    for inputs, labels in trainloader:
        optimizer_sgd.zero_grad()  # clear gradients left over from the previous step
        outputs = model_sgd(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_sgd.step()
        running_loss += loss.item()

    # Calculate validation loss
    model_sgd.eval()  # evaluation mode for the validation pass
    val_loss = 0.0
    with torch.no_grad():  # no gradients are needed for evaluation
        for inputs, labels in valloader:
            outputs = model_sgd(inputs)
            val_loss += criterion(outputs, labels).item()

    # Both figures are divided by the number of batches, i.e. they are the
    # mean of the per-batch losses, not a per-sample mean.
    print(f"Epoch {epoch+1}, Training Loss (SGD): {running_loss / len(trainloader)}, Validation Loss: {val_loss / len(valloader)}")

print("Finished Training with SGD")

Epoch 1, Training Loss (SGD): 2.2087842995325726, Validation Loss: 2.0642517513417182
Epoch 2, Training Loss (SGD): 1.835270226319631, Validation Loss: 1.6002116894468348
Epoch 3, Training Loss (SGD): 1.4162894668579102, Validation Loss: 1.2574609961915524
Epoch 4, Training Loss (SGD): 1.1445761154492695, Validation Loss: 1.042324146691789
Epoch 5, Training Loss (SGD): 0.9742124049663544, Validation Loss: 0.9061808991939464
Epoch 6, Training Loss (SGD): 0.8652092943191528, Validation Loss: 0.8169343173503876
Epoch 7, Training Loss (SGD): 0.7922242699464163, Validation Loss: 0.7563707964851502
Epoch 8, Training Loss (SGD): 0.7414061694939931, Validation Loss: 0.7131430947400154
Epoch 9, Training Loss (SGD): 0.7043368597428004, Validation Loss: 0.6826441643085885
Epoch 10, Training Loss (SGD): 0.6761515806516012, Validation Loss: 0.6576209036593742
Finished Training with SGD
