# Colab-First Minimal Trainer

This notebook trains a simple CNN on CIFAR-10 in Google Colab with GPU acceleration.

## Section A: Setup & Mount

In [None]:
# Mount Google Drive at /content/drive; later cells keep data and run outputs there.
from google.colab import drive

# force_remount=True asks Colab to re-mount even if Drive is already attached.
drive.mount('/content/drive', force_remount=True)
print("✅ Google Drive mounted at /content/drive")

In [None]:
import os, pathlib, time, yaml, shutil, pandas as pd

# ---- Project anchors ----
GDRIVE_ROOT = "/content/drive/MyDrive"
PROJECT_NAME = "ml-colab-agentic"        # change if you fork/rename
PROJECT_DRIVE = f"{GDRIVE_ROOT}/{PROJECT_NAME}"

# Code lives in Colab VM from GitHub clone; data/runs live in Drive:
DATA_DIR = f"{PROJECT_DRIVE}/data"       # data/raw, data/processed
RUNS_DIR = f"{PROJECT_DRIVE}/runs"       # one folder per training run
LATEST_DIR = f"{PROJECT_DRIVE}/latest"   # optional: points to latest run

# Ensure minimal skeleton on Drive. exist_ok=True keeps this cell safe to
# re-run: folders that already exist are left untouched.
for skeleton_dir in (
    f"{DATA_DIR}/raw",
    f"{DATA_DIR}/processed",
    RUNS_DIR,
    LATEST_DIR,
):
    pathlib.Path(skeleton_dir).mkdir(parents=True, exist_ok=True)

print("📁 DATA_DIR:", DATA_DIR)
print("📁 RUNS_DIR:", RUNS_DIR)
print("📁 LATEST_DIR:", LATEST_DIR)

In [None]:
def new_run_id(dataset, model, note=""):
    """Build a run identifier of the form ``<timestamp>_<dataset>_<model>[_<note>]``.

    The timestamp is the local time at minute resolution
    (``YYYY-MM-DD_HH-MM``). Empty components are left out, so no doubled or
    trailing underscores are produced.

    Args:
        dataset: Dataset tag, e.g. ``"cifar10"``.
        model: Model tag, e.g. ``"simplenet"``.
        note: Optional free-form suffix; omitted when empty.

    Returns:
        The underscore-joined identifier string.
    """
    stamp = time.strftime("%Y-%m-%d_%H-%M")
    kept = []
    for piece in (stamp, dataset, model, note):
        # Skip empty/falsy components rather than emitting "__".
        if piece:
            kept.append(piece)
    return "_".join(kept)

RUN_ID  = new_run_id("cifar10", "simplenet", "amp")
RUN_DIR = f"{RUNS_DIR}/{RUN_ID}"

# Per-run folder layout on Drive: checkpoints, then plots/ and artifacts/
# (each split by train/val/test/calib), then a cache folder.
subs = [
    "checkpoints",
    *(
        f"{kind}/{split}"
        for kind in ("plots", "artifacts")
        for split in ("train", "val", "test", "calib")
    ),
    "cache",
]
for sub_path in subs:
    pathlib.Path(RUN_DIR, sub_path).mkdir(parents=True, exist_ok=True)

# Freeze the hyperparameters next to the run outputs for reproducibility
CFG = dict(
    seed=42,
    epochs=5,
    batch_size=128,
    lr=1e-3,
    dataset="CIFAR10",
    data_root=f"{DATA_DIR}/raw",
    num_workers=2,
    amp=True,
)
with open(f"{RUN_DIR}/cfg.yaml", "w") as cfg_file:
    yaml.safe_dump(CFG, cfg_file)

# Convenience: point a "latest" folder to this run (copy if symlink fails)
def safe_point_latest(src, dst):
    """Make ``dst`` refer to the run directory ``src``.

    Any existing ``dst`` (symlink, regular file, or directory tree) is removed
    first. A symlink ``dst -> src`` is then attempted; if symlinks are not
    supported or not permitted, ``src`` is copied to ``dst`` instead.

    Args:
        src: Path of the run directory to expose.
        dst: Path that should point at (or mirror) ``src``.

    Raises:
        OSError: If a stale ``dst`` cannot be removed, or the fallback copy fails.

    Note:
        The copy fallback is a snapshot taken at call time; files written to
        ``src`` afterwards will not appear under ``dst``.
    """
    # Clear the old target up front so neither the symlink nor the fallback
    # copy can trip over a leftover entry.
    if os.path.islink(dst):
        # Checked first: also covers dangling links, where os.path.exists() is False.
        os.unlink(dst)
    elif os.path.isdir(dst):
        shutil.rmtree(dst)
    elif os.path.exists(dst):
        # A plain file cannot be removed with rmtree.
        os.remove(dst)

    try:
        os.symlink(src, dst)
    except (OSError, NotImplementedError):
        # Symlinks unavailable here: fall back to copying the directory.
        shutil.copytree(src, dst)

# NOTE(review): if safe_point_latest falls back to copying (symlink not
# possible), the copy is taken here, before training, so it only holds what
# RUN_DIR contains right now — consider calling it again after training.
safe_point_latest(RUN_DIR, f"{LATEST_DIR}/run")
print("🏷️ RUN_ID:", RUN_ID)
print("📁 RUN_DIR:", RUN_DIR)

In [None]:
# Long-form metrics log for this run; log_metrics() appends to this file.
METRICS_CSV = f"{RUN_DIR}/metrics.csv"

def log_metrics(rows):
    """Append metric records to ``METRICS_CSV``.

    rows: list of dicts with keys: split, epoch, metric, value

    The CSV header is written only when the file does not exist yet, so
    repeated calls produce a single header followed by all appended rows.
    """
    is_first_write = not os.path.exists(METRICS_CSV)
    frame = pd.DataFrame(rows, columns=["split", "epoch", "metric", "value"])
    frame.to_csv(METRICS_CSV, mode="a", header=is_first_write, index=False)

In [None]:
# Check GPU availability
# The leading "!" runs the NVIDIA `nvidia-smi` CLI in a shell; its output is
# shown below the cell and no Python state is changed.
!nvidia-smi

In [None]:
# Clone/update the repo into /content/
import os
import subprocess

REPO_PATH = '/content/ml-colab-agentic'
REPO_URL = 'https://github.com/armanfeili/ml-colab-agentic.git'

# Choose the git invocation: fresh clone when the folder is missing,
# otherwise pull into the existing checkout.
if not os.path.exists(REPO_PATH):
    print(f"Cloning {REPO_URL}...")
    git_cmd = ['git', 'clone', REPO_URL, REPO_PATH]
else:
    print(f"{REPO_PATH} already exists. Updating...")
    git_cmd = ['git', '-C', REPO_PATH, 'pull']

# check=True raises CalledProcessError if git exits with a non-zero status.
subprocess.run(git_cmd, check=True)

print(f"Repository ready at {REPO_PATH}")

In [None]:
# Install dependencies from requirements.txt
!pip install -q -r /content/ml-colab-agentic/requirements.txt
print("Dependencies installed.")

In [None]:
# Add repo to path and verify imports
import sys

# Guard the insert so re-running this cell does not stack duplicate entries
# on sys.path.
if REPO_PATH not in sys.path:
    sys.path.insert(0, REPO_PATH)

from src.utils import (
    set_seed,
    get_device,
    prepare_dataloaders_cifar10,
    SimpleNet,
    train_one_epoch,
    evaluate,
    save_checkpoint,
    append_metrics_csv,
)
import torch
import torch.optim as optim

print("✅ All imports successful!")

## Section B: Config

In [None]:
# Display current configuration (already saved to cfg.yaml)
print("Current Configuration:")
print("=" * 50)
for key, val in CFG.items():
    # Keys are left-aligned and padded to 20 characters so values line up.
    print(f"  {key:20s}: {val}")
print("=" * 50)
print(f"\n✅ Config saved to: {RUN_DIR}/cfg.yaml")

## Section C: Train

In [None]:
# Apply the configured seed, then pick the compute device via the repo helpers
set_seed(CFG["seed"])
device = get_device()

cuda_available = torch.cuda.is_available()
print(f"Device: {device}")
print(f"CUDA available: {cuda_available}")

In [None]:
# Prepare dataloaders from the frozen config (data root, batch size, workers)
print(f"Loading {CFG['dataset']} from {CFG['data_root']}...")
train_dl, test_dl = prepare_dataloaders_cifar10(
    root=CFG["data_root"],
    batch_size=CFG["batch_size"],
    num_workers=CFG["num_workers"],
)
print(f"✅ Train batches: {len(train_dl)}, Test batches: {len(test_dl)}")

In [None]:
# Build the 10-class network on the selected device and attach an Adam optimizer
learning_rate = CFG["lr"]
model = SimpleNet(num_classes=10).to(device)
opt = optim.Adam(model.parameters(), lr=learning_rate)

print(f"Model initialized on {device}")
print(f"Results will be saved to: {RUN_DIR}")

In [None]:
# Training loop: per epoch -> train, evaluate, checkpoint, track best, log metrics
import shutil

from tqdm import tqdm

print(f"\nTraining for {CFG['epochs']} epochs...\n")
best_val_acc = 0.0
best_ckpt = f"{RUN_DIR}/checkpoints/best.pt"

for epoch in range(CFG["epochs"]):
    epoch_num = epoch + 1  # 1-based index used in filenames, logs, and display

    # Train
    train_loss, train_acc = train_one_epoch(model, train_dl, opt, device)

    # Evaluate
    # NOTE(review): the "val" metrics are computed on test_dl, and they also
    # decide which checkpoint becomes best.pt — a separate validation split
    # would avoid selecting the model on the test set.
    val_loss, val_acc = evaluate(model, test_dl, device)

    # Save checkpoint (one file per epoch, zero-padded for stable sorting)
    epoch_ckpt = f"{RUN_DIR}/checkpoints/epoch_{epoch_num:03d}.pt"
    save_checkpoint(model, epoch_ckpt)

    # Update best checkpoint only on a strict improvement in validation accuracy
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        shutil.copy2(epoch_ckpt, best_ckpt)

    # Log metrics (long-form: one row per split/metric pair)
    log_metrics([
        {"split": "train", "epoch": epoch_num, "metric": "loss", "value": train_loss},
        {"split": "train", "epoch": epoch_num, "metric": "acc", "value": train_acc},
        {"split": "val", "epoch": epoch_num, "metric": "loss", "value": val_loss},
        {"split": "val", "epoch": epoch_num, "metric": "acc", "value": val_acc},
    ])

    print(
        f"Epoch {epoch_num}/{CFG['epochs']} | "
        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}"
    )

print(f"\n✅ Training complete! Best val acc: {best_val_acc:.4f}")
print(f"Results saved to: {RUN_DIR}")

## Section D: Save Artifacts

In [None]:
# Show run directory contents
import os

print(f"Run directory: {RUN_DIR}\n")

# Walk the run directory and print it as an indented tree; each file is
# listed with its size in KB.
for current_dir, _subdirs, filenames in os.walk(RUN_DIR):
    # Depth = number of path separators left after stripping the RUN_DIR prefix.
    depth = current_dir.replace(RUN_DIR, '').count(os.sep)
    dir_indent = ' ' * 2 * depth
    print(f'{dir_indent}{os.path.basename(current_dir)}/')

    file_indent = ' ' * 2 * (depth + 1)
    for filename in filenames:
        size_kb = os.path.getsize(os.path.join(current_dir, filename)) / 1024
        print(f'{file_indent}{filename} ({size_kb:.1f} KB)')

In [None]:
# All artifacts are already on Google Drive!
# (RUN_DIR lives under the mounted Drive, so nothing needs to be copied here.)
print("✅ All artifacts saved to Google Drive:")
print(f"   {RUN_DIR}")
print(f"\n📍 Access from any device:")
print(f"   Google Drive → MyDrive → {PROJECT_NAME} → runs → {RUN_ID}")
print(f"\n🔗 Latest run always available at:")
print(f"   {LATEST_DIR}/run")

In [None]:
# Display metrics table
import pandas as pd

if not os.path.exists(METRICS_CSV):
    print("No metrics file found.")
else:
    df = pd.read_csv(METRICS_CSV)
    print("Training Metrics (long-form):")
    print(df.to_string(index=False))

    # Pivot for easier viewing: one row per (split, epoch), one column per metric
    print("\n\nPivoted view:")
    pivot = df.pivot_table(
        values='value',
        index=['split', 'epoch'],
        columns='metric',
    )
    print(pivot)