In [None]:
# a simple pytorch neural network for adding two numbers

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# define the neural network
class Adder(nn.Module):
    """Minimal network for learning to add two numbers.

    The whole model is one linear layer that maps a 2-element input to a
    single output value.
    """

    def __init__(self):
        super().__init__()
        # Two inputs (the numbers to add) -> one output (the predicted sum).
        self.fc = nn.Linear(in_features=2, out_features=1)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        prediction = self.fc(x)
        return prediction
    
# create the neural network
net = Adder()

# create the optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# create the loss function
criterion = nn.MSELoss()

# train the neural network: every step draws one random pair of numbers and
# uses their sum as the regression target
num_steps = 10000
log_every = 1000  # printing the loss on every step floods the cell output
for i in range(num_steps):
    x = torch.tensor(np.random.rand(2), dtype=torch.float32)
    # keepdim=True makes the target shape (1,), the same as net(x).
    # A 0-d target triggers MSELoss's size-mismatch (broadcasting) warning.
    y = x.sum(dim=0, keepdim=True)
    optimizer.zero_grad()
    y_pred = net(x)
    loss = criterion(y_pred, y)
    loss.backward()
    optimizer.step()
    if (i + 1) % log_every == 0:
        print('loss:', loss.item())

# test the neural network on one fixed input and compare the prediction
# with the true sum
x = torch.tensor((0.1, 0.2), dtype=torch.float32)
y_pred = net(x)
y = torch.sum(x)
print('x:', x)
print('y:', y)
print('y_pred:', y_pred)
# signed error: prediction minus ground truth, as plain Python floats
print('error:', y_pred.item() - y.item())

In [None]:
# visualize the neural network's computation graph using torchviz
from torchviz import make_dot
x = torch.tensor([0.1, 0.2], dtype=torch.float32)
# the forward pass builds the autograd graph that make_dot renders; the graph
# is the cell's last expression so the notebook displays it
y_pred = net(x)
make_dot(y_pred, params=dict(net.named_parameters()))

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interactive, IntSlider, FloatSlider, Dropdown, fixed

# ---------------------
# Generate synthetic data
# ---------------------
def generate_data(num_samples=200, noise=0.1):
    """Sample a noisy two-class 2-D dataset with an XOR-like label pattern.

    Points are drawn uniformly from [-1, 1) in both coordinates. A point is
    labelled 1.0 when the product of its coordinates is positive and 0.0
    otherwise. The labels are computed from the clean points; Gaussian noise
    scaled by ``noise`` is added to the coordinates afterwards.

    Args:
        num_samples: Number of points to draw.
        noise: Scale applied to the standard-normal jitter.

    Returns:
        Tuple ``(x, y)``: ``x`` has shape ``(num_samples, 2)`` and ``y`` is a
        float32 array of shape ``(num_samples,)``.
    """
    clean_points = 2 * np.random.rand(num_samples, 2) - 1
    first_coord, second_coord = clean_points[:, 0], clean_points[:, 1]
    labels = np.asarray(first_coord * second_coord > 0, dtype=np.float32)
    jitter = noise * np.random.randn(num_samples, 2)
    return clean_points + jitter, labels

# ---------------------
# Define a small neural network
# ---------------------
class SimpleMLP(nn.Module):
    """One-hidden-layer perceptron: Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of outputs; the final sigmoid keeps each in (0, 1).
    """

    def __init__(self, input_dim=2, hidden_dim=16, output_dim=1):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid output."""
        return self.net(x)
    
# ---------------------
# Training function
# ---------------------
def train_model(optimizer_name, lr, epochs, batch_size):
    """Train a fresh SimpleMLP on synthetic data and plot the outcome.

    Generates 200 noisy samples, trains with shuffled mini-batches and binary
    cross-entropy loss, then shows two figures: the per-epoch training loss
    and the decision regions over the data.

    Args:
        optimizer_name: "SGD", "Adam" or "RMSProp"; any other value falls
            back to Adam.
        lr: Learning rate passed to the optimizer.
        epochs: Number of passes over the dataset.
        batch_size: Samples per optimizer step; the last batch of an epoch
            may be smaller.

    Returns:
        None. The results are shown as matplotlib figures.
    """
    # Data
    x_data, y_data = generate_data(num_samples=200, noise=0.1)
    x_tensor = torch.from_numpy(x_data).float()
    y_tensor = torch.from_numpy(y_data).float().view(-1, 1)

    model = SimpleMLP()
    criterion = nn.BCELoss()

    # Select optimizer; unknown names deliberately fall back to Adam
    optimizer_classes = {
        "SGD": optim.SGD,
        "Adam": optim.Adam,
        "RMSProp": optim.RMSprop,
    }
    optimizer_cls = optimizer_classes.get(optimizer_name, optim.Adam)
    optimizer = optimizer_cls(model.parameters(), lr=lr)

    losses = []
    model.train()
    dataset_size = x_tensor.shape[0]

    for epoch in range(epochs):
        # Mini-batch updates over a fresh shuffle of the dataset
        perm = torch.randperm(dataset_size)
        epoch_loss = 0.0

        for i in range(0, dataset_size, batch_size):
            idx = perm[i:i+batch_size]
            batch_x = x_tensor[idx]
            batch_y = y_tensor[idx]

            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            # criterion returns the batch mean, so weight it by the batch
            # size: a short final batch then counts proportionally
            epoch_loss += loss.item() * idx.shape[0]

        # Mean per-sample loss for the epoch. Dividing by
        # dataset_size // batch_size would drop the partial last batch from
        # the count and divide by zero when batch_size > dataset_size.
        losses.append(epoch_loss / dataset_size)

    # Plot the loss curve
    plt.figure(figsize=(6,4))
    plt.plot(losses, label=f'{optimizer_name} (lr={lr})')
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training Loss")
    plt.legend()
    plt.show()

    # Decision boundary visualization: evaluate the model on a 100x100 grid
    # covering the data range plus a 0.2 margin
    model.eval()
    x_min, x_max = x_data[:, 0].min() - 0.2, x_data[:, 0].max() + 0.2
    y_min, y_max = x_data[:, 1].min() - 0.2, x_data[:, 1].max() + 0.2
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    grid = torch.from_numpy(np.c_[xx.ravel(), yy.ravel()]).float()
    with torch.no_grad():  # inference only, no autograd graph needed
        preds = model(grid).numpy().reshape(xx.shape)

    plt.figure(figsize=(6,4))
    # levels split the sigmoid output at 0.5, i.e. the predicted class
    plt.contourf(xx, yy, preds, levels=[0, 0.5, 1], alpha=0.5, cmap="RdBu")
    plt.scatter(x_data[:,0], x_data[:,1], c=y_data, edgecolors='k', cmap="RdBu")
    plt.title("Decision Regions")
    plt.show()

In [None]:
# ---------------------
# Interactive widget
# ---------------------
def interactive_demo(optimizer_name, lr, epochs, batch_size):
    """Widget callback: run one training session with the current control values."""
    train_model(
        optimizer_name=optimizer_name,
        lr=lr,
        epochs=epochs,
        batch_size=batch_size,
    )

optimizer_options = ["SGD", "Adam", "RMSProp"]
# step equals min so the slider grid (min + k*step) lands on round values such
# as 0.001 and 0.01. The readout uses four decimals because the default '.2f'
# would display small learning rates as 0.00.
lr_slider = FloatSlider(value=0.01, min=0.0001, max=0.1, step=0.0001,
                        readout_format='.4f', description='Learning Rate')
epoch_slider = IntSlider(value=30, min=1, max=200, step=1, description='Epochs')
batch_slider = IntSlider(value=32, min=1, max=128, step=1, description='Batch Size')

# Each keyword maps a control onto the interactive_demo parameter of the same name
demo = interactive(
    interactive_demo,
    optimizer_name=Dropdown(options=optimizer_options, value="SGD", description='Optimizer'),
    lr=lr_slider,
    epochs=epoch_slider,
    batch_size=batch_slider
)

# Last expression of the cell: the notebook renders the widget
demo

---

#### Adam Visualizer

# Adam Optimizer Parameters Explained

## Optimizer Parameters

| Parameter | Description | Default | Effect | When to Adjust |
|-----------|-------------|---------|--------|----------------|
| **Learning Rate (lr)** | Controls the step size for parameter updates | 0.001 | **Too high**: Oscillation or divergence<br>**Too low**: Slow convergence, may get stuck<br>**Just right**: Steady progress | Increase if loss decreases too slowly; decrease if training is unstable |
| **Beta1 (β₁)** | Decay rate for the first moment estimate (momentum) | 0.9 | **Higher values** (closer to 1): Stronger momentum, smoother path<br>**Lower values**: More responsive to recent gradients | Increase when loss curve is noisy; decrease when stuck in plateaus |
| **Beta2 (β₂)** | Decay rate for the second moment estimate (adaptive learning rates) | 0.999 | **Higher values**: More stable, slower adaptation<br>**Lower values**: Faster adaptation, potentially less stable | Rarely need to modify; lower if features have very different scales |
| **Epsilon (ε)** | Small constant for numerical stability | 1e-8 | Prevents division by zero<br>Higher values reduce effective learning rate for small gradients | Rarely needs changing; increase if experiencing numerical instability |
| **Weight Decay** | L2 regularization term to prevent overfitting | 0.0 | **Higher values**: Stronger regularization, simpler boundaries<br>**Lower values**: Less regularization, more complex models | Increase when overfitting; decrease when underfitting |

## Data and Network Details

| Component | Description | Details |
|-----------|-------------|---------|
| **Synthetic Dataset** | Binary classification with XOR-like pattern | • **Features**: 2D input space (x,y coordinates)<br>• **Samples**: 200 data points<br>• **Pattern**: Classification based on product of coordinates<br>• **Noise**: Random Gaussian noise added<br>• **Difficulty**: Non-linearly separable |
| **Neural Network** | Simple Multi-Layer Perceptron (MLP) | • **Input layer**: 2 neurons<br>• **Hidden layer**: 16 neurons with ReLU activation<br>• **Output layer**: 1 neuron with Sigmoid activation<br>• **Parameters**: 65 trainable parameters (48 in the hidden layer, 17 in the output layer)<br>• **Loss Function**: Binary Cross-Entropy (BCE) |
| **Training Process** | Batch training over multiple epochs | • **Batch Size**: Controls samples per update<br>  - Larger: Smoother updates, better estimates<br>  - Smaller: Noisier updates, can escape local minima<br>• **Epochs**: Complete passes through dataset |

## Visualization Panels

| Panel | Description | What to Look For |
|-------|-------------|------------------|
| **Training Loss** | Shows loss decrease over epochs | Smooth decreasing curve indicates good learning |
| **First Moment** | Shows momentum term evolution | Stabilizes as training progresses |
| **Second Moment** | Shows the running average of squared gradients, which scales the adaptive step size | Grows when gradients are large; shrinks as gradients get smaller near convergence |
| **Decision Regions** | Visualizes classification boundary | Should separate the XOR-like pattern correctly |

## Tips for Experimentation

| Tip | Description |
|-----|-------------|
| 1 | Start with default Adam parameters (lr=0.001, β₁=0.9, β₂=0.999) |
| 2 | Modify learning rate first if training is too slow or unstable |
| 3 | For this toy problem, 30-50 epochs are usually sufficient |
| 4 | Watch how the decision boundary evolves with different parameters |
| 5 | Note how moment magnitudes correlate with learning efficiency |

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interactive, FloatSlider, IntSlider, fixed, VBox, HBox, Output
from IPython.display import display, clear_output

# ---------------------
# Generate synthetic data
# ---------------------
def generate_data(num_samples=200, noise=0.1):
    """Build a noisy binary-classification dataset with an XOR-like pattern.

    Coordinates are drawn uniformly from [-1, 1). The label is 1.0 where the
    two coordinates multiply to a positive number and 0.0 elsewhere. Labels
    are assigned before the coordinates are perturbed with Gaussian noise
    scaled by ``noise``.

    Args:
        num_samples: How many points to generate.
        noise: Multiplier for the standard-normal perturbation.

    Returns:
        ``(x, y)`` where ``x`` has shape ``(num_samples, 2)`` and ``y`` is a
        float32 vector of length ``num_samples``.
    """
    shape = (num_samples, 2)
    points = 2 * np.random.rand(*shape) - 1
    same_sign = points[:, 0] * points[:, 1] > 0
    targets = same_sign.astype(np.float32)
    # Perturb only after labelling, so labels reflect the clean positions
    perturbation = noise * np.random.randn(*shape)
    noisy_points = points + perturbation
    return noisy_points, targets

# ---------------------
# Define a small neural network
# ---------------------
class SimpleMLP(nn.Module):
    """Small MLP classifier with one ReLU hidden layer and a sigmoid output.

    Args:
        input_dim: Size of each input vector.
        hidden_dim: Number of hidden units.
        output_dim: Number of output units, each squashed into (0, 1).
    """

    def __init__(self, input_dim=2, hidden_dim=16, output_dim=1):
        super().__init__()
        # Layers are created in forward order so parameter initialisation
        # consumes the RNG in the same sequence as building them inline.
        hidden_layer = nn.Linear(input_dim, hidden_dim)
        output_layer = nn.Linear(hidden_dim, output_dim)
        self.net = nn.Sequential(hidden_layer, nn.ReLU(), output_layer, nn.Sigmoid())

    def forward(self, x):
        """Return the network's sigmoid output for ``x``."""
        result = self.net(x)
        return result

# ---------------------
# Training function with Adam-specific tracking
# ---------------------
def train_model(lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0.0,
                epochs=30, batch_size=32, seed=42):
    """Train a SimpleMLP with Adam and visualise the training run.

    Seeds torch and numpy, generates 200 noisy samples, trains with shuffled
    mini-batches and binary cross-entropy loss, and records the norms of
    Adam's first and second moment estimates for one tracked parameter after
    the first optimizer step of every epoch. It then shows a 2x2 figure
    (training loss, first moment, second moment, decision regions) followed
    by a text panel summarising the settings used.

    Args:
        lr: Adam learning rate.
        beta1: Decay rate for the first moment estimate.
        beta2: Decay rate for the second moment estimate.
        eps: Adam's numerical-stability term.
        weight_decay: Weight decay passed to Adam.
        epochs: Number of passes over the dataset.
        batch_size: Samples per optimizer step; the last batch of an epoch
            may be smaller.
        seed: Seed applied to both torch and numpy.

    Returns:
        None. The results are shown as matplotlib figures.
    """
    # Set seed for reproducibility
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Data
    x_data, y_data = generate_data(num_samples=200, noise=0.1)
    x_tensor = torch.from_numpy(x_data).float()
    y_tensor = torch.from_numpy(y_data).float().view(-1, 1)

    model = SimpleMLP()
    criterion = nn.BCELoss()

    # Initialize Adam optimizer with specified parameters
    optimizer = optim.Adam(
        model.parameters(),
        lr=lr,
        betas=(beta1, beta2),
        eps=eps,
        weight_decay=weight_decay
    )

    # Optimizer state is tracked for a single parameter: the first one
    # yielded by model.parameters()
    tracked_param = next(model.parameters())

    losses = []
    first_moments = []
    second_moments = []

    model.train()
    dataset_size = x_tensor.shape[0]

    for epoch in range(epochs):
        # Mini-batch updates over a fresh shuffle of the dataset
        perm = torch.randperm(dataset_size)
        epoch_loss = 0.0

        for i in range(0, dataset_size, batch_size):
            idx = perm[i:i+batch_size]
            batch_x = x_tensor[idx]
            batch_y = y_tensor[idx]

            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            # Track optimizer state after the step, first batch of each
            # epoch only, so each curve has one point per epoch
            if i == 0:
                adam_state = optimizer.state[tracked_param]

                if 'exp_avg' in adam_state:
                    # First moment estimate (momentum)
                    first_moments.append(torch.norm(adam_state['exp_avg']).item())
                    # Second moment estimate
                    second_moments.append(torch.norm(adam_state['exp_avg_sq']).item())

            # criterion returns the batch mean, so weight it by the batch
            # size: a short final batch then counts proportionally
            epoch_loss += loss.item() * idx.shape[0]

        # Mean per-sample loss for the epoch. Dividing by
        # dataset_size // batch_size would drop the partial last batch from
        # the count and divide by zero when batch_size > dataset_size.
        losses.append(epoch_loss / dataset_size)

    # Create multi-part figure for visualization
    fig = plt.figure(figsize=(15, 10))

    # Plot 1: Training Loss
    ax1 = fig.add_subplot(2, 2, 1)
    ax1.plot(losses)
    ax1.set_xlabel("Epoch")
    ax1.set_ylabel("Loss")
    ax1.set_title(f"Training Loss (Adam lr={lr}, β1={beta1}, β2={beta2})")

    # Plot 2: Adam's First Moment (momentum) Magnitude
    ax2 = fig.add_subplot(2, 2, 2)
    if first_moments:
        ax2.plot(first_moments, label="First Moment (m)")
        ax2.set_xlabel("Epoch")
        ax2.set_ylabel("Magnitude")
        ax2.set_title("Adam's First Moment Magnitude")
        ax2.legend()

    # Plot 3: Adam's Second Moment Magnitude
    ax3 = fig.add_subplot(2, 2, 3)
    if second_moments:
        ax3.plot(second_moments, label="Second Moment (v)")
        ax3.set_xlabel("Epoch")
        ax3.set_ylabel("Magnitude")
        ax3.set_title("Adam's Second Moment Magnitude")
        ax3.legend()

    # Plot 4: Decision boundary visualization on a 100x100 grid covering the
    # data range plus a 0.2 margin
    ax4 = fig.add_subplot(2, 2, 4)
    model.eval()
    x_min, x_max = x_data[:, 0].min() - 0.2, x_data[:, 0].max() + 0.2
    y_min, y_max = x_data[:, 1].min() - 0.2, x_data[:, 1].max() + 0.2
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    grid = torch.from_numpy(np.c_[xx.ravel(), yy.ravel()]).float()
    with torch.no_grad():  # inference only, no autograd graph needed
        preds = model(grid).numpy().reshape(xx.shape)

    # levels split the sigmoid output at 0.5, i.e. the predicted class
    ax4.contourf(xx, yy, preds, levels=[0, 0.5, 1], alpha=0.5, cmap="RdBu")
    ax4.scatter(x_data[:,0], x_data[:,1], c=y_data, edgecolors='k', cmap="RdBu")
    ax4.set_title("Decision Regions")

    plt.tight_layout()
    plt.show()

    # Create a text description of Adam's algorithm and parameters
    plt.figure(figsize=(12, 3))
    plt.axis('off')
    adam_description = (
        f"Adam Optimizer Parameters Used:\n"
        f"  - Learning Rate (lr): {lr}\n"
        f"  - Beta1 (β₁): {beta1} (decay rate for first moment estimate)\n"
        f"  - Beta2 (β₂): {beta2} (decay rate for second moment estimate)\n"
        f"  - Epsilon (ε): {eps} (numerical stability term)\n"
        f"  - Weight Decay: {weight_decay}\n\n"
        f"Adam Algorithm Summary:\n"
        f"  1. Calculates first moment (mean) of gradients with decay rate β₁\n"
        f"  2. Calculates second moment (variance) of gradients with decay rate β₂\n"
        f"  3. Applies bias correction to both moments\n"
        f"  4. Updates parameters using the bias-corrected moments"
    )
    plt.text(0.05, 0.5, adam_description, fontsize=12, va='center', ha='left')
    plt.tight_layout()
    plt.show()

# ---------------------
# Interactive widget
# ---------------------
def interactive_demo():
    """Build and display the Adam playground UI.

    Creates one slider per train_model hyperparameter, wires them to
    train_model through ``interactive`` so any change retrains and redraws
    into a shared Output area, lays the controls out in rows, and renders
    once with the initial slider values.
    """
    output = Output()

    # Define the interactive widgets.
    # Slider grids are anchored at `min` (values are min + k*step), so step
    # is chosen to keep round values and the defaults reachable. Readout
    # formats carry enough decimals to show the real value instead of the
    # default two-decimal rounding.
    lr_slider = FloatSlider(value=0.01, min=0.0001, max=0.1, step=0.0001,
                            readout_format='.4f', description='Learning Rate')
    beta1_slider = FloatSlider(value=0.9, min=0.5, max=0.999, step=0.01, description='Beta1 (β₁)')
    beta2_slider = FloatSlider(value=0.999, min=0.9, max=0.9999, step=0.0001,
                               readout_format='.4f', description='Beta2 (β₂)')
    eps_slider = FloatSlider(value=1e-8, min=1e-10, max=1e-6, step=1e-10,
                             description='Epsilon (ε)', readout_format='.1e')
    weight_decay_slider = FloatSlider(value=0.0, min=0.0, max=0.1, step=0.001,
                                      readout_format='.3f', description='Weight Decay')
    epochs_slider = IntSlider(value=30, min=5, max=200, step=5, description='Epochs')
    # step=1: with min=1 a step of 8 gives the grid 1, 9, 17, ..., which
    # never includes the default of 32
    batch_slider = IntSlider(value=32, min=1, max=128, step=1, description='Batch Size')

    # Function to handle updating the output
    def update_output(**kwargs):
        with output:
            # wait=True defers clearing until new output arrives
            clear_output(wait=True)
            train_model(**kwargs)

    # Create the interactive widget. It is never displayed itself; it is
    # created so that slider changes call update_output.
    widget = interactive(
        update_output,
        lr=lr_slider,
        beta1=beta1_slider,
        beta2=beta2_slider,
        eps=eps_slider,
        weight_decay=weight_decay_slider,
        epochs=epochs_slider,
        batch_size=batch_slider,
        seed=fixed(42)  # Fixed seed for reproducibility
    )

    # Layout the widgets in a nice format
    controls = VBox([
        HBox([lr_slider, epochs_slider]),
        HBox([beta1_slider, beta2_slider]),
        HBox([eps_slider, weight_decay_slider]),
        batch_slider
    ])

    # Create the main UI
    main_ui = VBox([controls, output])
    display(main_ui)

    # Initial call to display the visualization
    update_output(
        lr=lr_slider.value,
        beta1=beta1_slider.value,
        beta2=beta2_slider.value,
        eps=eps_slider.value,
        weight_decay=weight_decay_slider.value,
        epochs=epochs_slider.value,
        batch_size=batch_slider.value,
        seed=42
    )

# Run the interactive demo. The guard keeps the UI from launching if this code
# is imported as a module instead of being executed directly.
if __name__ == "__main__":
    interactive_demo()