## Setup

This cell just imports PyTorch.

If you need a terminal, install one with `!pip install colab-xterm`, load the extension with `%reload_ext colabxterm`, and launch it with `%xterm`. `%lsmagic` is useful too: it lists the available magic commands.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Report the installed PyTorch version so that output from later cells can be tied to it.
print(f"PyTorch Version: {torch.__version__}")

In [None]:
# Cell 2: Simple Model Definition
class SimpleModel(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    Args:
        in_features: Size of each input sample (default 10).
        hidden_features: Width of the hidden layer (default 5).
        out_features: Size of each output sample (default 1).

    The defaults give the 10 -> 5 -> 1 network used throughout this notebook,
    so ``SimpleModel()`` with no arguments is all the later cells need.
    """

    def __init__(self, in_features=10, hidden_features=5, out_features=1):
        super().__init__()
        # Attribute names are kept stable because later cells access
        # `model.linear1.weight` directly.
        self.linear1 = nn.Linear(in_features=in_features, out_features=hidden_features)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(in_features=hidden_features, out_features=out_features)

    def forward(self, x):
        """Apply linear1, ReLU and linear2 in sequence and return the result."""
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        return x

print("SimpleModel class defined.")

In [None]:
# Cell 3: Model Instantiation and Initial Parameter Inspection
# Printing an nn.Module lists its sub-layers.
model = SimpleModel()
print("Model Instantiated:\n", model)

# Keep a handle on the first layer's weight parameter; a later cell compares
# against this exact object to track changes.
initial_linear1_weight_param = model.linear1.weight

# Show the starting values of that parameter (first two rows only).
print("\nInitial weights of linear1 (first 2 rows):")
print(initial_linear1_weight_param.data[:2])

# The object's identity, for later comparison.
print(f"\nID of linear1.weight tensor: {id(initial_linear1_weight_param)}")

In [None]:
# Cell 4: Optimizer Instantiation and Initial Learning Rate
# Plain SGD over every parameter of the model, starting at this learning rate.
initial_lr = 0.1
optimizer = optim.SGD(params=model.parameters(), lr=initial_lr)
print("Optimizer Instantiated (SGD).")

# The optimizer stores its hyper-parameters per parameter group; all parameters
# were passed in together, so the learning rate is read from group 0.
print(f"Initial Learning Rate in Optimizer: {optimizer.param_groups[0]['lr']}")

In [None]:
# Cell 5: Scheduler Instantiation and Effect on Optimizer's LR
# StepLR multiplies the learning rate by 'gamma' once every 'step_size' scheduler steps (epochs).
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)
print("StepLR Scheduler Instantiated (step_size=2, gamma=0.5).")

# Clear any gradients first so that the optimizer.step() calls below are no-ops for
# this plain SGD optimizer (no momentum, no weight decay): the weights stay untouched
# and only the learning rate changes.
optimizer.zero_grad()

print("\n--- Simulating Epochs/Scheduler Steps ---")
for i in range(5):
    # PyTorch expects optimizer.step() to be called before scheduler.step(); calling the
    # scheduler first emits a UserWarning. Keeping the real-loop order here avoids the
    # warning and shows the pattern you would use in actual training.
    optimizer.step()
    scheduler.step()  # Simulate the end of an epoch
    # The scheduler writes the new learning rate straight into the optimizer's param group.
    current_lr_in_optimizer = optimizer.param_groups[0]['lr']
    print(f"After scheduler step {i+1}: Optimizer LR = {current_lr_in_optimizer:.4f}")

In [None]:
# Cell 6: A Single Training Step - Loss, Backward, Optimizer Step

# 0. Reset model and optimizer for a clean demonstration for this cell
model = SimpleModel()  # Re-initialize model to get fresh parameters
initial_lr_for_step_demo = 0.1
optimizer = optim.SGD(model.parameters(), lr=initial_lr_for_step_demo)
# This is a brand-new optimizer, so its LR is the fixed value above; the scheduler
# from Cell 5 was built around the previous optimizer and does not affect this one.
print(f"Model and Optimizer reset. Current Optimizer LR: {optimizer.param_groups[0]['lr']}\n")

# 1. Create dummy input data and target
dummy_input = torch.randn(1, 10)  # Batch size 1, 10 input features
dummy_target = torch.randn(1, 1)  # Batch size 1, 1 output feature

# 2. Define a loss function
criterion = nn.MSELoss()  # Mean Squared Error Loss

# --- Before any updates ---
print("--- BEFORE BACKWARD & OPTIMIZER STEP ---")
param_to_watch = model.linear1.weight  # The actual nn.Parameter object the optimizer will update
# .detach() gives a view of the values without autograd tracking, which is all we need for printing.
print(f"Value of linear1.weight[0,0] BEFORE: {param_to_watch.detach()[0, 0].item():.6f}")
if param_to_watch.grad is None:
    print("Gradient of linear1.weight[0,0] BEFORE: None (as expected)")
else:
    print(f"Gradient of linear1.weight[0,0] BEFORE: {param_to_watch.grad[0, 0].item():.6f}")

# 3. Forward pass: Get model's prediction
output = model(dummy_input)

# 4. Calculate loss
loss = criterion(output, dummy_target)
print(f"\nCalculated Loss: {loss.item():.6f}")

# 5. Zero previous gradients (important!) - backward() accumulates into .grad,
#    so stale gradients must be cleared before computing new ones.
optimizer.zero_grad()
print("Optimizer gradients zeroed.")
if param_to_watch.grad is None:  # Should be None or zeros after zero_grad()
    print("Gradient of linear1.weight[0,0] after zero_grad: None")
else:
    print(f"Gradient of linear1.weight[0,0] after zero_grad: {param_to_watch.grad[0, 0].item()}")


# 6. Backward pass: Compute gradients
loss.backward()
print("\n--- AFTER loss.backward() ---")
print(f"Gradient of linear1.weight[0,0] AFTER backward(): {param_to_watch.grad[0, 0].item():.6f}")
print(f"Value of linear1.weight[0,0] (still unchanged): {param_to_watch.detach()[0, 0].item():.6f}")


# 7. Optimizer step: Update parameters
optimizer.step()
print("\n--- AFTER optimizer.step() ---")
# step() only reads the gradients; it does not clear them. They stay in .grad until the next zero_grad().
print(f"Gradient of linear1.weight[0,0] AFTER step (may be unchanged or None): {param_to_watch.grad[0, 0].item() if param_to_watch.grad is not None else 'None'}")
print(f"Value of linear1.weight[0,0] AFTER step (SHOULD BE DIFFERENT): {param_to_watch.detach()[0, 0].item():.6f}")

# Confirm the optimizer updated the parameter in place: model.linear1.weight must still be
# the very object captured in `param_to_watch` before the step.
# The model was re-created at the top of this cell, so the comparison is made against that
# reference rather than the one stored in Cell 3, which belongs to the previous model
# instance and could never match.
print(f"\nID of linear1.weight tensor now: {id(model.linear1.weight)}")
print(f"Matches initial ID: {id(model.linear1.weight) == id(param_to_watch)}")
# The key is that model.linear1.weight is the object the optimizer updates.