# Colab-First ML Training Pipeline

This notebook trains a simple CNN on CIFAR-10 in Google Colab with GPU acceleration.

**Architecture:**
- **Code:** Stored in GitHub (`/content/ml-colab-agentic` in Colab VM) — edit locally with Copilot, push to GitHub
- **Data/Runs/Checkpoints:** Stored in Google Drive (`MyDrive/ml-colab-agentic/`) — persist across sessions

## Section A — Setup (Drive + Repo)

In [None]:
# A0) Mount Google Drive
# Makes the user's Drive visible inside the Colab VM at /content/drive; the
# path constants defined in the next cell are all rooted under this mount.
from google.colab import drive
# NOTE(review): force_remount=True presumably re-attaches the mount even when
# one is already present (e.g. after a reconnect) — confirm against Colab docs.
drive.mount('/content/drive', force_remount=True)
print("✅ Google Drive mounted at /content/drive")

In [None]:
# A1) Paths: code in Colab VM; data/runs in Drive
import os, sys, pathlib, time, shutil, subprocess, yaml, pandas as pd

# ---- Drive anchors ----
# Everything persistent lives under <GDRIVE_ROOT>/<PROJECT_NAME>/.
GDRIVE_ROOT   = "/content/drive/MyDrive"
PROJECT_NAME  = "ml-colab-agentic"   # change if you fork/rename
PROJECT_DRIVE = f"{GDRIVE_ROOT}/{PROJECT_NAME}"

# Storage in Drive
DATA_DIR   = f"{PROJECT_DRIVE}/data"      # data/raw, data/processed
RUNS_DIR   = f"{PROJECT_DRIVE}/runs"      # one folder per training run
LATEST_DIR = f"{PROJECT_DRIVE}/latest"    # points to latest run

# Ensure Drive skeleton; folders that already exist are left as they are,
# so re-running this cell is harmless.
_drive_skeleton = (f"{DATA_DIR}/raw", f"{DATA_DIR}/processed", RUNS_DIR, LATEST_DIR)
for folder in _drive_skeleton:
    os.makedirs(folder, exist_ok=True)

# Echo the resolved locations so a wrong PROJECT_NAME is spotted early.
print("📁 DATA_DIR:", DATA_DIR)
print("📁 RUNS_DIR:", RUNS_DIR)
print("📁 LATEST_DIR:", LATEST_DIR)

In [None]:
# A2) Clone/Update code from GitHub → /content (code stays in the VM)
REPO_OWNER = "armanfeili"                              # <— your GitHub username
REPO_NAME  = "ml-colab-agentic"
REPO_URL   = f"https://github.com/{REPO_OWNER}/{REPO_NAME}.git"
REPO_PATH  = f"/content/{REPO_NAME}"

# Token-free form of the URL. This is the only form that may be printed or
# placed in an exception, because cell output travels with the notebook file.
REPO_URL_PUBLIC = REPO_URL

# Optional: for private repos set a PAT in GITHUB_TOKEN (Colab session env)
token = os.environ.get("GITHUB_TOKEN", "").strip()
if token:
    # NOTE(review): git records the clone URL as the `origin` remote, so the
    # token also ends up in <REPO_PATH>/.git/config for the life of the VM.
    REPO_URL = f"https://{token}:x-oauth-basic@github.com/{REPO_OWNER}/{REPO_NAME}.git"

if os.path.exists(REPO_PATH):
    # Existing checkout: refresh it in place. check=True aborts the cell on
    # the first failing git command; --ff-only refuses to create merge commits.
    print(f"{REPO_PATH} exists → pulling latest...")
    subprocess.run(["git", "-C", REPO_PATH, "fetch", "--prune"], check=True)
    subprocess.run(["git", "-C", REPO_PATH, "checkout", "main"], check=True)
    subprocess.run(["git", "-C", REPO_PATH, "pull", "--ff-only"], check=True)
else:
    # Fresh VM: shallow clone. Log the token-free URL, never REPO_URL itself.
    print(f"Cloning {REPO_URL_PUBLIC} → {REPO_PATH} ...")
    clone_result = subprocess.run(["git", "clone", "--depth=1", REPO_URL, REPO_PATH])
    if clone_result.returncode != 0:
        # Same exception type check=True would raise, but built from the
        # token-free command so the traceback cannot leak the PAT.
        raise subprocess.CalledProcessError(
            clone_result.returncode,
            ["git", "clone", "--depth=1", REPO_URL_PUBLIC, REPO_PATH],
        )

print("✅ Repository ready at:", REPO_PATH)

In [None]:
# A3) Install Python deps (from the repo's requirements.txt)
# pip is invoked as a module of the running interpreter (sys.executable) so the
# packages land in the environment this kernel imports from, regardless of
# which `pip` executable happens to be first on PATH.
# check=True stops the notebook here if any requirement fails to install.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", "-r", f"{REPO_PATH}/requirements.txt"],
    check=True,
)
print("✅ Dependencies installed")

In [None]:
# A4) Add repo to import path and verify imports
# The cloned repo goes to the front of sys.path so that `src` below is looked
# up there before any later sys.path entry. This must run before the imports.
sys.path.insert(0, REPO_PATH)

# Project helpers used by the training cells. Importing them here surfaces a
# broken clone or a missing dependency before any training work starts.
from src.utils import (
    set_seed, get_device, prepare_dataloaders_cifar10, SimpleNet,
    train_one_epoch, evaluate, save_checkpoint, append_metrics_csv
)
import torch, torch.optim as optim
from tqdm import tqdm

# Printing the torch version records which build this session is running.
print("✅ Imports OK | torch:", torch.__version__)

In [None]:
# A5) (Optional) GPU info
# The leading `!` is IPython/Colab shell-escape syntax, not Python, so this
# line only works inside a notebook cell. It is informational: none of the
# cells shown here read its output.
!nvidia-smi

In [None]:
## Section B — Run Config (frozen to Drive)

In [None]:
# B0) Create a timestamped run folder in Drive and freeze config
def new_run_id(dataset, model, note=""):
    """Build a run identifier from the current local time and descriptive tags.

    The id starts with a minute-resolution timestamp (YYYY-MM-DD_HH-MM),
    followed by ``dataset``, ``model`` and ``note``, all joined with "_".
    Empty tags are left out, so no doubled or trailing underscore appears.
    Two calls within the same minute with the same tags return the same id.
    """
    parts = [time.strftime("%Y-%m-%d_%H-%M")]
    for tag in (dataset, model, note):
        if tag:
            parts.append(tag)
    return "_".join(parts)

# Identify this run and derive its folder under RUNS_DIR.
RUN_ID  = new_run_id("cifar10", "simplenet", "amp")
RUN_DIR = f"{RUNS_DIR}/{RUN_ID}"

# Per-run skeleton: checkpoints, per-split plots and artifacts, and a cache.
subfolders = [
    "checkpoints",
    "plots/train","plots/val","plots/test","plots/calib",
    "artifacts/train","artifacts/val","artifacts/test","artifacts/calib",
    "cache",
]
# Intermediate folders (RUN_DIR itself, plots/, artifacts/) are created on
# the way; folders that already exist are not an error.
for s in subfolders:
    os.makedirs(f"{RUN_DIR}/{s}", exist_ok=True)

# Hyper-parameters and data location for this run; read by the training cells.
CFG = {
    "seed": 42,
    "epochs": 5,
    "batch_size": 128,
    "lr": 1e-3,
    "dataset": "CIFAR10",
    # IMPORTANT: dataset root is in Drive (cloud storage), not in /content
    "data_root": f"{DATA_DIR}/raw",
    "num_workers": 2,
    "amp": True,
}

# Freeze the config next to the run's outputs so the run can be reproduced.
pathlib.Path(f"{RUN_DIR}/cfg.yaml").write_text(yaml.safe_dump(CFG))

# Point "latest" → this run (copy if symlink fails on Drive)
def safe_point_latest(src, dst):
    """Make ``dst`` refer to the run folder ``src``.

    Whatever currently sits at ``dst`` (a symlink, a regular file or a
    directory tree) is removed first. A symlink ``dst -> src`` is then
    attempted; if the filesystem refuses it, ``src`` is copied to ``dst``
    instead. In that fallback ``dst`` is a snapshot of ``src`` taken at call
    time, not a live pointer.

    Args:
        src: Path of the run directory to point at.
        dst: Path of the pointer to (re)create.

    Raises:
        OSError: If the old ``dst`` cannot be removed or the fallback copy fails.
    """
    # Clear the old pointer outside the try block: a cleanup failure must be
    # reported as such rather than sending us into copytree on top of an
    # existing path (which can only fail with FileExistsError).
    if os.path.islink(dst) or os.path.isfile(dst):
        # islink is checked first so a link to a directory is unlinked, never
        # followed; this branch also covers dangling links and plain files.
        os.unlink(dst)
    elif os.path.isdir(dst):
        shutil.rmtree(dst)

    try:
        os.symlink(src, dst)
    except (OSError, NotImplementedError):
        # Symlinks are not available here (the case this helper exists for on
        # Drive): fall back to a plain copy. dst is known to be absent now.
        shutil.copytree(src, dst)

# Refresh the "latest run" pointer at <LATEST_DIR>/run.
# NOTE(review): if safe_point_latest cannot create a symlink and copies RUN_DIR
# instead, <LATEST_DIR>/run is a snapshot taken at this moment. Checkpoints and
# metrics that later cells write go to RUN_DIR only and will not appear there.
safe_point_latest(RUN_DIR, f"{LATEST_DIR}/run")

# Summary of where this run lives.
print("🏷️ RUN_ID :", RUN_ID)
print("📁 RUN_DIR:", RUN_DIR)
print("✅ Config frozen at:", f"{RUN_DIR}/cfg.yaml")

# B1) Metrics logger (long-form CSV on Drive)
METRICS_CSV = f"{RUN_DIR}/metrics.csv"

def log_metrics(rows):
    """Append metric records to this run's long-form CSV (METRICS_CSV).

    rows: list of dicts, one per record, with the keys split, epoch, metric
    and value. The column header is written only when the CSV file does not
    exist yet, so repeated calls keep extending the same table.
    """
    write_header = not os.path.exists(METRICS_CSV)
    frame = pd.DataFrame(rows, columns=["split","epoch","metric","value"])
    frame.to_csv(METRICS_CSV, mode="a", header=write_header, index=False)

print("✅ Metrics will be appended to:", METRICS_CSV)

In [None]:
## Section C — Train (data & artifacts on Drive)

# C0) Seed & device
# The seed comes from the frozen run config. set_seed and get_device are
# project helpers imported from src.utils.
# NOTE(review): which RNGs set_seed covers, and how get_device chooses a
# device, is not visible in this notebook — check src/utils.
set_seed(CFG["seed"])
device = get_device()
# The CUDA flag is printed next to the chosen device so a CPU-only session is
# easy to spot before training starts.
print(f"Device: {device} | CUDA available: {torch.cuda.is_available()}")

In [None]:
# C1) DataLoaders — downloads cached to Drive (CFG['data_root'])
print(f"Loading {CFG['dataset']} from {CFG['data_root']} ...")
# All three arguments come from the frozen run config.
# NOTE(review): the second loader is named test_dl, and the training cell
# evaluates on it every epoch and logs the result under the "val" split —
# confirm whether the helper can also provide a separate validation split.
train_dl, test_dl = prepare_dataloaders_cifar10(
    root=CFG["data_root"],
    batch_size=CFG["batch_size"],
    num_workers=CFG["num_workers"],
)
# len() of each loader is reported as its number of batches.
print(f"✅ Train batches: {len(train_dl)} | Test batches: {len(test_dl)}")

In [None]:
# C2) Model + Optimizer
# Ten output classes; the model is moved to the device chosen in C0 before
# the optimizer is built over its parameters.
model = SimpleNet(num_classes=10).to(device)
# Adam with the learning rate from the frozen run config.
opt   = optim.Adam(model.parameters(), lr=CFG["lr"])
print("✅ Model initialized on", device)
print("📦 Outputs will be stored in:", RUN_DIR)

In [None]:
# C3) Training loop → saves checkpoints & metrics to Drive
# Each epoch: train, evaluate, write an epoch checkpoint, refresh best.pt when
# accuracy improves, append four metric rows, print a one-line summary.
# NOTE(review): the "val" numbers are computed on test_dl, so best.pt is
# selected on the test split.
E = CFG["epochs"]
best_val_acc = 0.0

print(f"\n🚀 Training for {E} epochs ...\n")
for epoch in range(1, E + 1):
    train_loss, train_acc = train_one_epoch(model, train_dl, opt, device)
    val_loss, val_acc = evaluate(model, test_dl, device)

    # Save epoch checkpoint in Drive (zero-padded so files sort by epoch)
    ckpt_path = f"{RUN_DIR}/checkpoints/epoch_{epoch:03d}.pt"
    save_checkpoint(model, ckpt_path)

    # Update best: strictly better accuracy replaces best.pt with this epoch's file
    improved = val_acc > best_val_acc
    if improved:
        best_val_acc = val_acc
        shutil.copy2(ckpt_path, f"{RUN_DIR}/checkpoints/best.pt")

    # Log metrics to Drive: one long-form row per (split, metric) pair
    epoch_stats = (
        ("train", "loss", train_loss),
        ("train", "acc", train_acc),
        ("val", "loss", val_loss),
        ("val", "acc", val_acc),
    )
    log_metrics([
        {"split": split, "epoch": epoch, "metric": metric, "value": value}
        for split, metric, value in epoch_stats
    ])

    print(
        f"Epoch {epoch:02d}/{E} | "
        f"Train: loss={train_loss:.4f} acc={train_acc:.4f} | "
        f"Val: loss={val_loss:.4f} acc={val_acc:.4f}"
    )

print(f"\n✅ Training complete! Best val acc: {best_val_acc:.4f}")
print("📁 Run saved at:", RUN_DIR)

In [None]:
## Section D — Inspect (everything is already on Drive)

# D0) Pretty-print run contents (Drive)
# Walk RUN_DIR and print an indented tree: two spaces per depth level,
# directories with a trailing "/", files with their size in KB.
for root, dirs, files in os.walk(RUN_DIR):
    # Depth below RUN_DIR = number of path separators left after stripping it
    level = root.replace(RUN_DIR, '').count(os.sep)
    indent = '  ' * level
    subindent = '  ' * (level + 1)
    print(f"{indent}{os.path.basename(root)}/")
    for file in files:
        size_kb = os.path.getsize(os.path.join(root, file)) / 1024
        print(f"{subindent}{file} ({size_kb:.1f} KB)")

# Where to find the same files in the Drive web UI.
print("\n✅ All artifacts are in Google Drive:")
print(f"   MyDrive → {PROJECT_NAME} → runs → {RUN_ID}")
print("🔗 Latest run pointer:", f"{LATEST_DIR}/run")

In [None]:
# D1) Show metrics (Drive CSV)
# Re-imported so this cell can run on its own after a kernel restart.
import pandas as pd

if not os.path.exists(METRICS_CSV):
    # No training cell has logged anything for this run yet.
    print("No metrics file found at:", METRICS_CSV)
else:
    df = pd.read_csv(METRICS_CSV)
    print("\nTraining Metrics (long-form):")
    display(df)

    # One row per (split, epoch), one column per metric. pivot_table
    # aggregates duplicate (split, epoch, metric) rows with its default
    # aggfunc (the mean).
    print("\nPivoted view:")
    pivot = df.pivot_table(values='value', index=['split','epoch'], columns='metric')
    display(pivot)