# Check for Understanding — Autograded (PyTorch)
Run each cell. **Do not delete the asserts.**

**Passing condition:** all asserts pass.

Tip: If an assert fails, read its message, fix your code, and rerun the cell.


In [5]:
# Install PyTorch into the running kernel's environment (%pip targets the kernel, unlike !pip).
# NOTE(review): the version is unpinned; the recorded run below resolved torch 2.10.0.
# Consider pinning if the recorded outputs must be reproducible — confirm against the
# environments students are expected to use before doing so.
%pip install torch

Collecting torch
  Using cached torch-2.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (31 kB)
Using cached torch-2.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (915.6 MB)
Installing collected packages: torch
Successfully installed torch-2.10.0

[notice] A new release of pip is available: 24.0 -> 26.0.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Install NumPy into the running kernel's environment.
# NOTE(review): none of the cells shown import numpy directly — presumably it is installed
# for PyTorch's optional NumPy interop; confirm it is still needed.
# The version is unpinned; the recorded run below resolved numpy 2.4.2.
%pip install numpy

Collecting numpy
  Downloading numpy-2.4.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Downloading numpy-2.4.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.9/16.9 MB 2.0 MB/s eta 0:00:00
Installing collected packages: numpy
Successfully installed numpy-2.4.2

[notice] A new release of pip is available: 24.0 -> 26.0.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Setup: imports, global RNG seed, and the tolerance helper used by the autograder asserts.
import torch
import torch.nn as nn

# Seed PyTorch's global RNG so random tensors and layer initialisations are reproducible
# when the notebook is run top to bottom.
torch.manual_seed(42)

def _is_close(a, b, tol=1e-5):
    """Return True if `a` and `b` agree element-wise within an absolute tolerance.

    Args:
        a: Tensor, or anything `torch.as_tensor` accepts (Python number, list, ...).
        b: Tensor or tensor-like value compared against `a`; must be broadcastable with it.
        tol: Absolute tolerance. The relative tolerance is fixed at 0.

    Returns:
        bool: True when every element satisfies |a - b| <= tol.

    `torch.allclose` requires both operands to share a dtype, so the inputs are promoted
    to a common dtype first. This lets an assert comparing, for example, an int64 result
    with a float32 result produce a plain True/False (and the assert's own message)
    instead of a RuntimeError.
    """
    a = torch.as_tensor(a)
    b = torch.as_tensor(b, device=a.device)
    common_dtype = torch.promote_types(a.dtype, b.dtype)
    # .to() is a no-op when the dtype already matches, so same-dtype calls are unaffected.
    return torch.allclose(a.to(common_dtype), b.to(common_dtype), atol=tol, rtol=0)

print("PyTorch version:", torch.__version__)


PyTorch version: 2.10.0+cu128


## Part 1 — Tensors & Representations

In [3]:
# Exercise 1: Tensor basics
# Goal:
# 1) Build a 2x3 tensor of random values named X
# 2) Show X together with its shape and dtype
# 3) Average all elements into x_mean (a 0-d tensor)

print("starting")

X = torch.rand((2, 3))  # random 2x3 tensor
x_mean = torch.mean(X)  # no dim given -> reduces over every element to a 0-d tensor

# Report the tensor and its metadata
print("X=\n", X)
print("shape:", X.shape)
print("dtype:", X.dtype)
print("mean:", x_mean)

# --- autograder asserts (do not delete) ---
assert isinstance(X, torch.Tensor), "X must be a torch.Tensor"
assert X.shape == (2, 3), f"X must have shape (2,3), got {tuple(X.shape)}"
assert X.dtype in (torch.float32, torch.float64), f"X should be float32/float64, got {X.dtype}"
assert isinstance(x_mean, torch.Tensor) and x_mean.shape == (), "x_mean must be a scalar (0-d) tensor"
assert _is_close(x_mean, X.sum() / X.numel()), "x_mean should equal X.sum()/X.numel()"


starting
X=
 tensor([[0.8823, 0.9150, 0.3829],
        [0.9593, 0.3904, 0.6009]])
shape: torch.Size([2, 3])
dtype: torch.float32
mean: tensor(0.6885)


In [4]:
# Exercise 2: Manual vector operations
# Goal:
# 1) Build two 1-D tensors of length 5, v1 and v2
# 2) Add them element-wise into v_sum
# 3) Take their dot product into v_dot (scalar tensor)

v1 = torch.arange(5)      # tensor([0, 1, 2, 3, 4]), integer dtype
v2 = torch.arange(5, 10)  # tensor([5, 6, 7, 8, 9]), integer dtype

v_sum = torch.add(v1, v2)  # element-wise addition
v_dot = v1.dot(v2)         # 1-D dot product -> 0-d tensor

# Report the operands and results
print("v1:", v1)
print("v2:", v2)
print("v_sum:", v_sum)
print("v_dot:", v_dot)

# --- autograder asserts (do not delete) ---
assert v1.shape == (5,) and v2.shape == (5,), "v1 and v2 must both be shape (5,)"
assert v_sum.shape == (5,), "v_sum must be a length-5 vector"
assert v_dot.shape == (), "v_dot must be a scalar (0-d) tensor"
manual_dot = (v1 * v2).sum()
assert _is_close(v_dot, manual_dot), "v_dot must equal (v1*v2).sum()"


v1: tensor([0, 1, 2, 3, 4])
v2: tensor([5, 6, 7, 8, 9])
v_sum: tensor([ 5,  7,  9, 11, 13])
v_dot: tensor(80)


## Part 2 — Embeddings

In [6]:
# Exercise 3: Simple embedding lookup
# Goal:
# 1) Build an nn.Embedding named emb with vocab_size=10 and emb_dim=4
# 2) Build token_ids, a LongTensor of shape (3,) with values in [0, 9]
# 3) Look the ids up: E = emb(token_ids)
# 4) Show E and E.shape

vocab_size, emb_dim = 10, 4
emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)  # one emb_dim-vector per id

token_ids = torch.tensor([1, 5, 8], dtype=torch.long)  # three ids inside [0, vocab_size)
E = emb(token_ids)  # one embedding row per id -> shape (3, emb_dim)

# Report the ids and the looked-up vectors
print("token_ids:", token_ids)
print("E=\n", E)
print("E.shape:", E.shape)

# --- autograder asserts (do not delete) ---
assert isinstance(emb, nn.Embedding), "emb must be an nn.Embedding"
assert token_ids.dtype == torch.long, "token_ids must be torch.long"
assert token_ids.shape == (3,), f"token_ids must be shape (3,), got {tuple(token_ids.shape)}"
assert E.shape == (3, 4), f"E must have shape (3,4), got {tuple(E.shape)}"
assert E.requires_grad, "Embedding output should require gradients by default"


token_ids: tensor([1, 5, 8])
E=
 tensor([[ 2.2181e+00,  5.2317e-01,  3.4665e-01, -1.9733e-01],
        [-1.3407e+00, -5.8537e-01,  5.3619e-01,  5.2462e-01],
        [-1.3109e-03, -3.0360e-01, -9.8644e-01,  1.2330e-01]],
       grad_fn=<EmbeddingBackward0>)
E.shape: torch.Size([3, 4])


In [7]:
# Exercise 4: From embeddings to a prediction
# NOTE: This exercise depends on Exercise 3 — complete that first.
# Goal:
# 1) Average the embeddings across tokens: mean_E of shape (4,)
# 2) Build a Linear layer (4 -> 1) named head
# 3) Produce y_pred with shape (1,) or as a scalar

mean_E = E.mean(dim=0)  # collapse the token axis of E (defined in Exercise 3)
head = nn.Linear(in_features=4, out_features=1)

y_pred = head(mean_E)  # 1-D input of length 4 -> output of shape (1,)

# Report the pooled shape and the prediction
print("mean_E.shape:", mean_E.shape)
print("y_pred:", y_pred, "shape:", y_pred.shape)

# --- autograder asserts (do not delete) ---
assert mean_E.shape == (4,), f"mean_E must be shape (4,), got {tuple(mean_E.shape)}"
assert isinstance(head, nn.Linear) and head.in_features == 4 and head.out_features == 1, "head must be Linear(4->1)"
assert y_pred.numel() == 1, "y_pred must have exactly 1 element"
assert y_pred.requires_grad, "y_pred should require gradients"


mean_E.shape: torch.Size([4])
y_pred: tensor([-0.0891], grad_fn=<ViewBackward0>) shape: torch.Size([1])


## Part 3 — Build a Tiny Network

In [8]:
# Exercise 5: Define a simple feed-forward network
# Requirements:
# - input_dim = 6
# - hidden_dim = 8
# - output_dim = 1
# - 1 hidden layer + ReLU
# Implement SimpleNet so forward(x) returns shape (batch, 1)

class SimpleNet(nn.Module):
    """Feed-forward network with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer.
        output_dim: Size of each output vector.
    """

    def __init__(self, input_dim=6, hidden_dim=8, output_dim=1):
        super().__init__()
        # Sub-modules are registered in this order: fc1, fc2, activation.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Map x of shape (batch, input_dim) to an output of shape (batch, output_dim)."""
        hidden = self.activation(self.fc1(x))
        return self.fc2(hidden)

model = SimpleNet()
print(model)

# --- autograder asserts (do not delete) ---
assert isinstance(model, nn.Module), "model must be an nn.Module"
params = dict(model.named_parameters())
assert "fc1.weight" in params and "fc2.weight" in params, "Model must have two Linear layers (fc1, fc2)"


SimpleNet(
  (fc1): Linear(in_features=6, out_features=8, bias=True)
  (fc2): Linear(in_features=8, out_features=1, bias=True)
  (activation): ReLU()
)


In [9]:
# Exercise 6: Forward pass with dummy data
# NOTE: This exercise depends on Exercise 5 — complete that first.
# Goal:
# 1) Build a dummy input x of shape (4, 6)
# 2) Compute out = model(x)
# 3) Show out and out.shape

x = torch.ones((4, 6))  # four identical all-ones rows, so every output row is the same
out = model(x)          # `model` is the SimpleNet instance from Exercise 5

# Report the forward-pass result
print("out=\n", out)
print("out.shape:", out.shape)

# --- autograder asserts (do not delete) ---
assert x.shape == (4, 6), f"x must be shape (4,6), got {tuple(x.shape)}"
assert out.shape == (4, 1), f"out must be shape (4,1), got {tuple(out.shape)}"


out=
 tensor([[0.0162],
        [0.0162],
        [0.0162],
        [0.0162]], grad_fn=<AddmmBackward0>)
out.shape: torch.Size([4, 1])


## Part 4 — One Training Step

In [11]:
# Exercise 7: One training step
# NOTE: This exercise depends on Exercise 5 — complete that first.
# TODO:
# 1) Create inputs x_train (batch=8, input_dim=6) and targets y_train (shape (8,1))
# 2) Define loss_fn = MSELoss and opt = SGD(model.parameters(), lr=0.1)
# 3) Perform exactly one update step and print loss_before and loss_after
#
# NOTE: opt.step() updates `model` in place, so re-running this cell continues training
# from the already-updated weights and the printed losses will differ between runs.

torch.manual_seed(123)  # deterministic for this part

# Create training data (provided for you): a noisy linear function of the inputs
x_train = torch.randn(8, 6)
true_w = torch.tensor([[0.5], [-1.0], [0.3], [0.0], [1.2], [-0.7]])
y_train = x_train @ true_w + 0.01 * torch.randn(8, 1)

loss_fn = nn.MSELoss()
opt = torch.optim.SGD(model.parameters(), lr=0.1)

# Forward pass + loss with the current weights
y_pred_before = model(x_train)
loss_before = loss_fn(y_pred_before, y_train)

# Clear any gradients already stored on the parameters before backpropagating:
# backward() accumulates into .grad, so stale values would otherwise leak into this step.
opt.zero_grad()
loss_before.backward()
opt.step()
# Leave the parameters with cleared gradients so a follow-up step starts clean.
opt.zero_grad()

# Forward pass + loss with the updated weights
y_pred_after = model(x_train)
loss_after = loss_fn(y_pred_after, y_train)

# Report the loss on the same batch before and after the update
print("loss_before:", float(loss_before))
print("loss_after :", float(loss_after))

# --- autograder asserts (do not delete) ---
assert loss_before.shape == (), "loss_before must be a scalar tensor"
assert loss_after.shape == (), "loss_after must be a scalar tensor"
assert float(loss_after) < float(loss_before), "loss_after should be < loss_before after one SGD step"


loss_before: 1.315673828125
loss_after : 0.9555887579917908


## Optional Stretch (No grade)
If you finish early:
1. Add a second training step and show that the loss keeps decreasing.
2. Change the activation to Tanh (`nn.Tanh`) and compare the loss curves.
