# Colab-First Minimal Trainer

This notebook trains a simple CNN on CIFAR-10 in Google Colab with GPU acceleration.

## Section A: Setup & Mount

In [None]:
# Check GPU availability: nvidia-smi prints the NVIDIA driver / GPU status table.
# If the command is not found or lists no device, switch the runtime to GPU
# (Runtime → Change runtime type → GPU) before continuing.
!nvidia-smi

In [None]:
# Make sure a copy of the repo is available under /content/
import os
import subprocess

REPO_PATH = '/content/ml-colab-agentic'
REPO_URL = 'https://github.com/armanfeili/ml-colab-agentic.git'

# An existing path is treated as a previous clone and updated with `git pull`;
# otherwise the repo is cloned. check=True makes a failing git command raise
# CalledProcessError instead of passing silently.
repo_present = os.path.exists(REPO_PATH)
if not repo_present:
    print(f"Cloning {REPO_URL}...")
    git_args = ['git', 'clone', REPO_URL, REPO_PATH]
else:
    print(f"{REPO_PATH} already exists. Updating...")
    git_args = ['git', '-C', REPO_PATH, 'pull']
subprocess.run(git_args, check=True)

print(f"Repository ready at {REPO_PATH}")

In [None]:
# Mount Google Drive (optional but recommended). The artifact-copy step writes
# to CFG["drive_dir"], which by default lives under this mount point, so run
# this cell whenever CFG["save_to_drive"] is True.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
print(f"Google Drive mounted at {DRIVE_MOUNT_POINT}")

In [None]:
# Install dependencies from requirements.txt
!pip install -q -r /content/ml-colab-agentic/requirements.txt
print("Dependencies installed.")

In [None]:
# Add repo to path and verify imports
import sys

# Guarded so that re-running this cell does not stack duplicate entries on
# sys.path; the repo root must be importable for `src.utils` to resolve.
if REPO_PATH not in sys.path:
    sys.path.insert(0, REPO_PATH)

from src.utils import (
    set_seed,
    get_device,
    prepare_dataloaders_cifar10,
    SimpleNet,
    train_one_epoch,
    evaluate,
    save_checkpoint,
    append_metrics_csv,
)
import torch
import torch.optim as optim

# Reaching this line means every name above imported without error.
print("✅ All imports successful!")

## Section B: Config

In [None]:
# Run configuration: every value the later cells read lives in CFG.
CFG = dict(
    seed=42,
    epochs=5,
    batch_size=128,
    lr=1e-3,
    num_workers=2,
    dataset="CIFAR10",
    data_root="/content/data",
    save_to_drive=True,
    drive_dir="/content/drive/MyDrive/ml-outputs",
)

# Echo the settings so they are recorded in the cell output.
print("Config:", *(f"  {name}: {value}" for name, value in CFG.items()), sep="\n")

## Section C: Train

In [None]:
# Apply the configured seed via set_seed, then ask get_device for the device
# the model and batches will be placed on.
set_seed(CFG["seed"])
device = get_device()
print(
    f"Device: {device}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

In [None]:
# Build the train/test loaders from the values held in CFG
print(f"Loading {CFG['dataset']} from {CFG['data_root']}...")
loader_kwargs = {
    "root": CFG["data_root"],
    "batch_size": CFG["batch_size"],
    "num_workers": CFG["num_workers"],
}
train_dl, test_dl = prepare_dataloaders_cifar10(**loader_kwargs)
print(f"✅ Train batches: {len(train_dl)}, Test batches: {len(test_dl)}")

In [None]:
# Build the network and its optimizer, and fix where artifacts get written
model = SimpleNet(num_classes=10)
model = model.to(device)
opt = optim.Adam(model.parameters(), lr=CFG["lr"])

# Both artifact paths live inside the cloned repo.
metrics_path = os.path.join(REPO_PATH, "outputs", "metrics.csv")
checkpoint_path = os.path.join(REPO_PATH, "checkpoints", "last.pt")

for message in (
    f"Model initialized on {device}",
    f"Metrics will be saved to: {metrics_path}",
    f"Checkpoint will be saved to: {checkpoint_path}",
):
    print(message)

In [None]:
# Training loop
# tqdm.auto picks the notebook widget when available and falls back to the
# console bar otherwise.
from tqdm.auto import tqdm

# One (epoch, train_loss, train_acc, val_loss, val_acc) tuple per finished epoch.
all_metrics = []

print(f"\nTraining for {CFG['epochs']} epochs...\n")
for epoch in tqdm(range(CFG["epochs"]), desc="Epochs"):
    # Train
    train_loss, train_acc = train_one_epoch(model, train_dl, opt, device)

    # Evaluate (the test split doubles as the validation set here)
    val_loss, val_acc = evaluate(model, test_dl, device)

    # Save checkpoint: checkpoint_path is the same on every epoch
    save_checkpoint(model, checkpoint_path)

    # Build the metrics row once so the in-memory list and the CSV cannot
    # drift apart; epochs are reported 1-based.
    row = (epoch + 1, train_loss, train_acc, val_loss, val_acc)
    all_metrics.append(row)
    append_metrics_csv(metrics_path, [row])

    # tqdm.write prints the line without corrupting the progress bar.
    tqdm.write(
        f"Epoch {epoch+1}/{CFG['epochs']} | "
        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}"
    )

print("\n✅ Training complete!")

## Section D: Save Artifacts

In [None]:
# Show saved files
import os


def _print_dir_listing(title, directory):
    """Print `title`, then one "name (size MB)" line per entry of `directory`.

    Entries are listed in sorted order. "  (none)" is printed when the
    directory does not exist, is not a directory, or is empty.
    """
    print(title)
    names = sorted(os.listdir(directory)) if os.path.isdir(directory) else []
    if not names:
        print("  (none)")
        return
    for name in names:
        size_mb = os.path.getsize(os.path.join(directory, name)) / (1024 * 1024)
        print(f"  {name} ({size_mb:.2f} MB)")


outputs_dir = f"{REPO_PATH}/outputs"
checkpoints_dir = f"{REPO_PATH}/checkpoints"

_print_dir_listing("Outputs:", outputs_dir)
# The leading newline leaves a blank line between the two listings.
_print_dir_listing("\nCheckpoints:", checkpoints_dir)

In [None]:
# Copy artifacts to Google Drive (if enabled)
if CFG["save_to_drive"]:
    import shutil
    from pathlib import Path

    # NOTE(review): mkdir(parents=True) also succeeds when Drive is not
    # mounted, in which case this is an ordinary local folder — confirm the
    # mount cell ran before trusting the messages below.
    drive_dir = Path(CFG["drive_dir"])
    drive_dir.mkdir(parents=True, exist_ok=True)

    # (label used in messages, source path, file name inside drive_dir)
    artifacts = (
        ("metrics", metrics_path, "metrics.csv"),
        ("checkpoint", checkpoint_path, "last.pt"),
    )

    missing = []
    for label, src_path, dest_name in artifacts:
        if os.path.exists(src_path):
            dest_path = drive_dir / dest_name
            shutil.copy(src_path, dest_path)
            print(f"✅ Copied {label} to {dest_path}")
        else:
            # Report the gap instead of skipping silently.
            missing.append(label)
            print(f"⚠️ No {label} file found at {src_path}; skipped.")

    # Only claim success when every artifact was actually copied.
    if not missing:
        print(f"\n📁 All artifacts saved to Drive: {CFG['drive_dir']}")
    else:
        print(f"\n⚠️ Not copied to Drive: {', '.join(missing)}")
else:
    print(f"📁 Artifacts saved locally in repo: {REPO_PATH}")

In [None]:
# Optional: print the metrics CSV as a plain-text table
import pandas as pd

if not os.path.exists(metrics_path):
    print("No metrics file found.")
else:
    df = pd.read_csv(metrics_path)
    # index=False drops pandas' row-number column from the printed table.
    print("Training Metrics:", df.to_string(index=False), sep="\n")

# ML Training Notebook

This notebook is designed to run on **Google Colab** with a **GPU runtime** (T4, A100, etc.).

## Quick setup
1. Go to **Runtime → Change runtime type → GPU**
2. Run the cells below in order
3. Monitor GPU usage with `!nvidia-smi`

## Cell 1: Check GPU

In [None]:
# Print the NVIDIA driver / GPU status table to confirm a GPU runtime is attached.
!nvidia-smi

## Cell 2: Install dependencies from repo

In [None]:
!pip -q install -r https://raw.githubusercontent.com/armanfeili/ml-colab-agentic/main/requirements.txt

## Cell 3: Quick sanity check with PyTorch

In [None]:
import torch

from src.utils import set_seed, to_device

# NOTE: this cell rebinds `device` (to a plain string) and `opt`, which the
# CIFAR-10 training cells earlier in this file also define.
set_seed(0)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

# Round-trip a small tensor through to_device and report where it ended up.
x = torch.randn(3, 3)
y = to_device(x)
print(f"Tensor device: {y.device}")

# Tiny training loop (toy example): five SGD steps on the mean of w·wᵀ
w = torch.randn(3, 3, requires_grad=True, device=device)
opt = torch.optim.SGD([w], lr=0.1)

for epoch in range(5):
    opt.zero_grad()
    loss = torch.matmul(w, w.t()).mean()
    loss.backward()
    opt.step()
    # The reported loss is the value computed before this step's update.
    print(f"Epoch {epoch}: loss={loss.item():.4f}")

print("\n✅ Toy training loop completed!")

## Cell 4: Create outputs folder and write metrics

In [None]:
import pathlib

# "outputs" is relative to the current working directory; mkdir is a no-op
# when the folder already exists.
out_dir = pathlib.Path("outputs")
out_dir.mkdir(exist_ok=True)

# Header line followed by five placeholder rows: "0,0.0" … "4,0.4".
rows = ["epoch,loss\n"] + [f"{i},0.{i}\n" for i in range(5)]
with open(out_dir / "metrics.csv", "w") as f:
    f.writelines(rows)

print("✅ Wrote outputs/metrics.csv")

# List the outputs folder (long format, hidden entries included) to confirm
# the metrics file was written.
!ls -la outputs/