# Check for Understanding — Autograded (PyTorch)
Run each cell. **Do not delete the asserts.**

**Passing condition:** all asserts pass.

Tip: If an assert fails, read its message, fix your code, and rerun the cell.


In [32]:
# Setup
import torch
import torch.nn as nn

torch.manual_seed(42)

def _is_close(a, b, tol=1e-5):
    return torch.allclose(a, b, atol=tol, rtol=0)

print("PyTorch version:", torch.__version__)


PyTorch version: 2.9.1+cpu


## Part 1 — Tensors & Representations

In [33]:
# Exercise 1: Tensor basics
# TODO:
# 1) Create a 2x3 tensor of random values called X
# 2) Print X, X.shape, X.dtype
# 3) Compute the mean of all elements and store it in x_mean (a 0-d tensor)

# A 2x3 tensor of standard-normal samples, and the mean over all of its entries.
X = torch.randn((2, 3))
x_mean = torch.mean(X)

# Show the tensor, its metadata, and the computed mean.
print("X=\n", X)
print("shape:", X.shape)
print("dtype:", X.dtype)
print("mean:", x_mean)

# --- autograder asserts (do not delete) ---
assert isinstance(X, torch.Tensor), "X must be a torch.Tensor"
assert X.shape == (2, 3), f"X must have shape (2,3), got {tuple(X.shape)}"
assert X.dtype in (torch.float32, torch.float64), f"X should be float32/float64, got {X.dtype}"
assert isinstance(x_mean, torch.Tensor) and x_mean.shape == (), "x_mean must be a scalar (0-d) tensor"
assert _is_close(x_mean, X.sum() / X.numel()), "x_mean should equal X.sum()/X.numel()"


X=
 tensor([[ 0.3367,  0.1288,  0.2345],
        [ 0.2303, -1.1229, -0.1863]])
shape: torch.Size([2, 3])
dtype: torch.float32
mean: tensor(-0.0631)


In [34]:
# Exercise 2: Manual vector operations
# TODO:
# 1) Create v1 and v2 as 1-D tensors of length 5
# 2) Compute element-wise sum: v_sum
# 3) Compute dot product: v_dot (scalar tensor)

# v1 holds the fixed values 1..5 as float32; v2 is a random vector of the same length.
v1 = torch.arange(1, 6, dtype=torch.float32)
v2 = torch.randn(5)

# Element-wise sum, and the dot product (1-D @ 1-D yields a 0-d tensor).
v_sum = torch.add(v1, v2)
v_dot = v1 @ v2

# Show both vectors and the two results.
print("v1:", v1)
print("v2:", v2)
print("v_sum:", v_sum)
print("v_dot:", v_dot)

# --- autograder asserts (do not delete) ---
assert v1.shape == (5,) and v2.shape == (5,), "v1 and v2 must both be shape (5,)"
assert v_sum.shape == (5,), "v_sum must be a length-5 vector"
assert v_dot.shape == (), "v_dot must be a scalar (0-d) tensor"
manual_dot = (v1 * v2).sum()
assert _is_close(v_dot, manual_dot), "v_dot must equal (v1*v2).sum()"


v1: tensor([1., 2., 3., 4., 5.])
v2: tensor([ 2.2082, -0.6380,  0.4617,  0.2674,  0.5349])
v_sum: tensor([3.2082, 1.3620, 3.4617, 4.2674, 5.5349])
v_dot: tensor(6.0611)


## Part 2 — Embeddings

In [35]:
# Exercise 3: Simple embedding lookup
# TODO:
# 1) Create an nn.Embedding called emb with vocab_size=10 and emb_dim=4
# 2) Create token_ids as a LongTensor of shape (3,) with values in [0, 9]
# 3) Lookup embeddings: E = emb(token_ids)
# 4) Print E and E.shape

vocab_size, emb_dim = 10, 4
# Lookup table with one learnable row of length `emb_dim` per token id.
emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)

# Three token ids in [0, vocab_size - 1]; the integer dtype is stated
# explicitly because the autograder checks for torch.long.
token_ids = torch.tensor([1, 4, 5], dtype=torch.long)
E = emb(token_ids)

# Show the ids, the looked-up rows, and their shape.
print("token_ids:", token_ids)
print("E=\n", E)
print("E.shape:", E.shape)

# --- autograder asserts (do not delete) ---
assert isinstance(emb, nn.Embedding), "emb must be an nn.Embedding"
assert token_ids.dtype == torch.long, "token_ids must be torch.long"
assert token_ids.shape == (3,), f"token_ids must be shape (3,), got {tuple(token_ids.shape)}"
assert E.shape == (3, 4), f"E must have shape (3,4), got {tuple(E.shape)}"
assert E.requires_grad, "Embedding output should require gradients by default"


token_ids: tensor([1, 4, 5])
E=
 tensor([[-1.1109,  0.0915, -2.3169, -0.2168],
        [ 0.0349,  0.3211,  1.5736, -0.8455],
        [ 1.3123,  0.6872, -1.0892, -0.3553]], grad_fn=<EmbeddingBackward0>)
E.shape: torch.Size([3, 4])


In [36]:
# Exercise 4: From embeddings to a prediction
# NOTE: This exercise depends on Exercise 3 — complete that first.
# TODO:
# 1) Compute mean embedding across tokens: mean_E of shape (4,)
# 2) Create a Linear layer (4 -> 1) called head
# 3) Produce y_pred as shape (1,) or scalar

# Average over the token axis (dim 0) to pool the embeddings into a single vector.
mean_E = torch.mean(E, dim=0)
# Linear head mapping the pooled 4-d vector to one prediction.
head = nn.Linear(in_features=4, out_features=1)

y_pred = head(mean_E)

# Show the pooled shape and the prediction.
print("mean_E.shape:", mean_E.shape)
print("y_pred:", y_pred, "shape:", y_pred.shape)

# --- autograder asserts (do not delete) ---
assert mean_E.shape == (4,), f"mean_E must be shape (4,), got {tuple(mean_E.shape)}"
assert isinstance(head, nn.Linear) and head.in_features == 4 and head.out_features == 1, "head must be Linear(4->1)"
assert y_pred.numel() == 1, "y_pred must have exactly 1 element"
assert y_pred.requires_grad, "y_pred should require gradients"


mean_E.shape: torch.Size([4])
y_pred: tensor([0.0916], grad_fn=<ViewBackward0>) shape: torch.Size([1])


## Part 3 — Build a Tiny Network

In [37]:
# Exercise 5: Define a simple feed-forward network
# Requirements:
# - input_dim = 6
# - hidden_dim = 8
# - output_dim = 1
# - 1 hidden layer + ReLU
# Implement SimpleNet so forward(x) returns shape (batch, 1)

class SimpleNet(nn.Module):
    """Feed-forward network with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of features in each input row.
        hidden_dim: Width of the single hidden layer.
        output_dim: Number of values produced per input row.
    """

    def __init__(self, input_dim=6, hidden_dim=8, output_dim=1):
        super().__init__()
        # Size the layers from the constructor arguments. They were previously
        # hard-coded to 6/8/1, so any other dimensions passed in were ignored.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map `x` of shape (batch, input_dim) to shape (batch, output_dim)."""
        return self.fc2(self.relu(self.fc1(x)))

model = SimpleNet()
print(model)

# --- autograder asserts (do not delete) ---
assert isinstance(model, nn.Module), "model must be an nn.Module"
params = dict(model.named_parameters())
assert "fc1.weight" in params and "fc2.weight" in params, "Model must have two Linear layers (fc1, fc2)"


SimpleNet(
  (fc1): Linear(in_features=6, out_features=8, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=8, out_features=1, bias=True)
)


In [38]:
# Exercise 6: Forward pass with dummy data
# NOTE: This exercise depends on Exercise 5 — complete that first.
# TODO:
# 1) Create dummy input x of shape (4, 6)
# 2) Run out = model(x)
# 3) Print out and out.shape

# Dummy batch: 4 rows with 6 features each.
x = torch.randn(4, 6)
out = model(x)

# Show the model output and its shape.
print("out=\n", out)
print("out.shape:", out.shape)

# --- autograder asserts (do not delete) ---
assert x.shape == (4, 6), f"x must be shape (4,6), got {tuple(x.shape)}"
assert out.shape == (4, 1), f"out must be shape (4,1), got {tuple(out.shape)}"


out=
 tensor([[ 0.2229],
        [-0.3347],
        [-0.2475],
        [-0.4379]], grad_fn=<AddmmBackward0>)
out.shape: torch.Size([4, 1])


## Part 4 — One Training Step

In [39]:
# Exercise 7: One training step
# NOTE: This exercise depends on Exercise 5 — complete that first.
# TODO:
# 1) Create inputs x_train (batch=8, input_dim=6) and targets y_train (shape (8,1))
# 2) Define loss_fn = MSELoss and opt = SGD(model.parameters(), lr=0.1)
# 3) Perform exactly one update step and print loss_before and loss_after

torch.manual_seed(123)  # deterministic for this part

# Create training data (provided for you)
x_train = torch.randn(8, 6)
true_w = torch.tensor([[0.5], [-1.0], [0.3], [0.0], [1.2], [-0.7]])
y_train = x_train @ true_w + 0.01 * torch.randn(8, 1)

loss_fn = nn.MSELoss()
opt = torch.optim.SGD(model.parameters(), lr=0.1)

# Loss on the current weights, before any update.
pred_b = model(x_train)
loss_before = loss_fn(pred_b, y_train)

# Exactly one SGD update. Gradients are cleared first so that anything left
# in .grad by an earlier cell cannot be mixed into this step.
opt.zero_grad()
loss_before.backward()
opt.step()

# Re-evaluate on the same batch. This is measurement only, so autograd
# tracking is switched off.
with torch.no_grad():
    pred_a = model(x_train)
    loss_after = loss_fn(pred_a, y_train)

# Show the loss before and after the update.
print("loss_before:", float(loss_before))
print("loss_after :", float(loss_after))

# --- autograder asserts (do not delete) ---
assert loss_before.shape == (), "loss_before must be a scalar tensor"
assert loss_after.shape == (), "loss_after must be a scalar tensor"
assert float(loss_after) < float(loss_before), "loss_after should be < loss_before after one SGD step"


loss_before: 1.8027592897415161
loss_after : 1.4821903705596924


## Optional Stretch (No grade)
If you finish early:
1. Add a second training step and show loss keeps decreasing.
2. Change activation to Tanh and compare loss curves.


## Note on Stretch Goal Code

The stretch goal code example below was provided by the AI assistant (Cursor) as a learning reference. I asked to see a complete implementation to better understand:
- How multiple training steps work in practice
- How to compare different activation functions (ReLU vs Tanh)
- The structure and flow of a complete training comparison

This was for educational purposes to help me learn the concepts, not as a way to have the assistant complete the assignment for me.

In [46]:
# Optional Stretch: Multiple Training Steps + ReLU vs Tanh Comparison

import torch
import torch.nn as nn

# Re-seed so the random draws below are reproducible.
torch.manual_seed(123)

# Rebuild the Exercise 7 training set (same seed, same order of random draws)
# so this cell does not depend on variables left over from that cell.
x_train = torch.randn(8, 6)
true_w = torch.tensor([0.5, -1.0, 0.3, 0.0, 1.2, -0.7]).reshape(6, 1)
y_train = torch.matmul(x_train, true_w) + 0.01 * torch.randn(8, 1)

# ============================================================================
# PART 1: Multiple Training Steps with ReLU
# ============================================================================

print("=" * 60)
print("PART 1: Multiple Training Steps (ReLU Model)")
print("=" * 60)

# ReLU model (same layer layout as SimpleNet from Exercise 5)
class SimpleNetReLU(nn.Module):
    """One-hidden-layer network with a ReLU activation: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of features in each input row.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of values produced per input row.
    """

    def __init__(self, input_dim=6, hidden_dim=8, output_dim=1):
        super().__init__()
        # Build the layers from the arguments; they used to be hard-coded to
        # 6/8/1, which silently ignored whatever the caller passed.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map `x` of shape (batch, input_dim) to shape (batch, output_dim)."""
        return self.fc2(self.relu(self.fc1(x)))

# Model, loss function, and optimizer for the ReLU run
model_relu = SimpleNetReLU()
loss_fn = nn.MSELoss()
opt_relu = torch.optim.SGD(params=model_relu.parameters(), lr=0.1)

# Loss recorded at every step (measured before that step's weight update)
losses_relu = []

# Perform 5 training steps
print("\nTraining ReLU Model:")
for step in range(5):
    # Clear old gradients, then run the forward pass
    opt_relu.zero_grad()
    y_pred = model_relu(x_train)
    loss = loss_fn(y_pred, y_train)

    # Backpropagate and apply the SGD update
    loss.backward()
    opt_relu.step()

    # Record and report this step's loss
    losses_relu.append(loss.item())
    print(f"  Step {step+1}: Loss = {loss:.4f}")

# ============================================================================
# PART 2: Tanh Model for Comparison
# ============================================================================

print("\n" + "=" * 60)
print("PART 2: Training Tanh Model for Comparison")
print("=" * 60)

# Tanh model (same layer layout, different activation)
class SimpleNetTanh(nn.Module):
    """One-hidden-layer network with a Tanh activation: Linear -> Tanh -> Linear.

    Args:
        input_dim: Number of features in each input row.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of values produced per input row.
    """

    def __init__(self, input_dim=6, hidden_dim=8, output_dim=1):
        super().__init__()
        # Build the layers from the arguments; they used to be hard-coded to
        # 6/8/1, which silently ignored whatever the caller passed.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.tanh = nn.Tanh()  # Tanh instead of ReLU
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map `x` of shape (batch, input_dim) to shape (batch, output_dim)."""
        return self.fc2(self.tanh(self.fc1(x)))

# Model and optimizer for the Tanh run.
# NOTE: the RNG is not re-seeded before this model is built, so it starts from
# different random weights than the ReLU model. Part of the gap between the two
# loss curves therefore comes from initialization, not only from the activation.
model_tanh = SimpleNetTanh()
opt_tanh = torch.optim.SGD(params=model_tanh.parameters(), lr=0.1)

# Loss recorded at every step (measured before that step's weight update)
losses_tanh = []

# Perform 5 training steps with Tanh model
print("\nTraining Tanh Model:")
for step in range(5):
    # Clear old gradients, then run the forward pass
    opt_tanh.zero_grad()
    y_pred = model_tanh(x_train)
    loss = loss_fn(y_pred, y_train)

    # Backpropagate and apply the SGD update
    loss.backward()
    opt_tanh.step()

    # Record and report this step's loss
    losses_tanh.append(loss.item())
    print(f"  Step {step+1}: Loss = {loss:.4f}")

# ============================================================================
# PART 3: Text-Based Comparison
# ============================================================================

print("\n" + "=" * 60)
print("PART 3: Loss Comparison")
print("=" * 60)

# Print side-by-side comparison
print("\nStep-by-Step Comparison:")
print(f"{'Step':<8} {'ReLU Loss':<15} {'Tanh Loss':<15} {'Difference':<15}")
print("-" * 60)
# Walk the two recorded loss histories in parallel rather than indexing with a
# hard-coded step count, so the table follows however many steps were recorded
# and cannot raise IndexError if the training loops are changed.
for i, (relu_loss, tanh_loss) in enumerate(zip(losses_relu, losses_tanh)):
    diff = relu_loss - tanh_loss
    print(f"{i+1:<8} {relu_loss:<15.4f} {tanh_loss:<15.4f} {diff:<15.4f}")

# Print summary: first vs. last recorded loss, absolute and relative drop
print("\n" + "=" * 60)
print("Summary:")
print("=" * 60)
print(f"ReLU - Initial Loss: {losses_relu[0]:.4f}, Final Loss: {losses_relu[-1]:.4f}")
relu_improvement = losses_relu[0] - losses_relu[-1]
relu_percent = (relu_improvement / losses_relu[0]) * 100
print(f"      Improvement: {relu_improvement:.4f} ({relu_percent:.1f}% reduction)")

print(f"\nTanh - Initial Loss: {losses_tanh[0]:.4f}, Final Loss: {losses_tanh[-1]:.4f}")
tanh_improvement = losses_tanh[0] - losses_tanh[-1]
tanh_percent = (tanh_improvement / losses_tanh[0]) * 100
print(f"      Improvement: {tanh_improvement:.4f} ({tanh_percent:.1f}% reduction)")
print("\n" + "=" * 60)

PART 1: Multiple Training Steps (ReLU Model)

Training ReLU Model:
  Step 1: Loss = 1.2374
  Step 2: Loss = 1.0672
  Step 3: Loss = 0.9489
  Step 4: Loss = 0.8531
  Step 5: Loss = 0.7742

PART 2: Training Tanh Model for Comparison

Training Tanh Model:
  Step 1: Loss = 1.9452
  Step 2: Loss = 1.3961
  Step 3: Loss = 1.1174
  Step 4: Loss = 0.9450
  Step 5: Loss = 0.8250

PART 3: Loss Comparison

Step-by-Step Comparison:
Step     ReLU Loss       Tanh Loss       Difference     
------------------------------------------------------------
1        1.2374          1.9452          -0.7078        
2        1.0672          1.3961          -0.3289        
3        0.9489          1.1174          -0.1685        
4        0.8531          0.9450          -0.0919        
5        0.7742          0.8250          -0.0508        

Summary:
ReLU - Initial Loss: 1.2374, Final Loss: 0.7742
      Improvement: 0.4632 (37.4% reduction)

Tanh - Initial Loss: 1.9452, Final Loss: 0.8250
      Improvement: 1.1