# Nested Learning Quickstart (Notebook)

A fast, notebook-friendly way to sanity-check the HOPE model. This notebook prints device info and runs a single tiny forward/backward pass plus one optimizer step on random tokens. For full training, use `train_hope.py` as described in the README.

## Install dependencies (optional)

If your environment does not already have the project installed, uncomment the line below and run it. Colab users can run the uncommented line as-is; local users with `uv` can skip this step and use the CLI instead.

```bash
# !pip install -q -r requirements.txt
```


In [None]:
import pathlib, sys, os

# Locate the repo root so that `src` is importable. The notebook may be
# started from the root itself or from notebooks/, so accept a `src/`
# directory either in the current directory or in its parent.
ROOT = pathlib.Path().resolve()
if not (ROOT / "src").exists():
    # Not in the root: the only other supported location is one level down.
    if not (ROOT.parent / "src").exists():
        raise RuntimeError("Run this notebook from the repo root or the notebooks/ directory.")
    ROOT = ROOT.parent
sys.path.append(str(ROOT))


In [None]:
import torch

# Pick the compute device once and report the environment.
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda") if cuda_ready else torch.device("cpu")

print("Torch version:", torch.__version__)
print("CUDA available:", cuda_ready)
if not cuda_ready:
    print("Using CPU (works for quick smoke tests)")
else:
    # Report the name of device index 0.
    print("GPU:", torch.cuda.get_device_name(0))


In [None]:
from src.models.hope import Hope, HopeConfig

# Seed the RNG before anything random happens (model construction and the
# dummy batch below both come after this call).
torch.manual_seed(0)

# Deliberately small hyperparameters so the step runs quickly on CPU or a
# small GPU.
tiny_hparams = dict(
    d_model=64,
    d_hidden=256,
    d_key=16,
    d_value=16,
    num_heads=4,
    num_layers=1,
    vocab_size=256,
    max_seq_len=128,
    cms_num_levels=2,
    cms_base_chunk_size=4,
)
config = HopeConfig(**tiny_hparams)

model = Hope(config)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# Random token ids stand in for real data. Inputs are drawn before labels so
# the random stream is consumed in a fixed order.
batch_size, seq_len = 2, 16
token_shape = (batch_size, seq_len)
input_ids = torch.randint(0, config.vocab_size, token_shape, device=device)
labels = torch.randint(0, config.vocab_size, token_shape, device=device)

# One training step: forward with labels, backward, parameter update, then
# clear the gradients.
model.train()
out = model(input_ids, labels=labels)
loss = out["loss"]
loss.backward()
optimizer.step()
optimizer.zero_grad()

# Report the loss and a small slice of the logits as a sanity check.
logits = out["logits"]
print("Loss:", loss.item())
print("Logits shape:", logits.shape)
print("First token logits (5 dims):", logits[0, 0, :5].detach().cpu())
