# EU4 AI Training on Colab

Train LoRA adapters for EU4 AI using CUDA GPUs.

**One-time setup:**
1. Create a GitHub PAT at https://github.com/settings/tokens (classic token, `repo` scope)
2. In Colab sidebar: 🔑 Secrets → Add new secret → Name: `GITHUB_TOKEN`, Value: your PAT
3. Create a folder in Google Drive (e.g., `eu4_training/`) for training data

**Per-session:**
1. Upload your `.cpb.zip` training data to your Drive folder
2. Run all cells in order — scripts are pulled fresh from GitHub

**Tip:** To get a file path, mount Drive first, then right-click the file in the sidebar → "Copy path"

In [None]:
# Check GPU availability
# Runs the `nvidia-smi` command in a subshell and shows its output in the cell.
# NOTE(review): if the command is not found, the runtime presumably has no GPU
# attached — confirm the Colab runtime type before running the training cells.
!nvidia-smi

In [None]:
# Mount Google Drive
# The mount point is /content/drive; the data and output paths configured
# later in this notebook both live under /content/drive/MyDrive.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
# Clone repo from GitHub (uses PAT from Colab Secrets)
import os
from google.colab import userdata

# Get token from Colab Secrets (ðŸ”‘ icon in sidebar)
token = userdata.get("GITHUB_TOKEN")

# Clone fresh (remove old clone if exists)
if os.path.exists("eu4rs"):
    !rm -rf eu4rs

!git clone --depth 1 https://{token}@github.com/atvrager/eu4rs.git

print("âœ“ Cloned eu4rs from GitHub")

In [None]:
# Install dependencies and disable W&B prompts
import os

os.environ["WANDB_DISABLED"] = "true"

!pip install -q transformers peft trl datasets pycapnp safetensors

print("âœ“ Dependencies installed")

In [None]:
# Configuration - EDIT THESE
# ===========================

# Path to your training data in Google Drive
# Tip: Right-click file in sidebar → "Copy path"
DATA_PATH = "/content/drive/MyDrive/eu4_training/run_10yr_1.cpb.zip"

# Where to save the trained adapter (in Drive for persistence)
OUTPUT_DIR = "/content/drive/MyDrive/eu4_training/adapters/run1"

# Model settings
BASE_MODEL = "HuggingFaceTB/SmolLM2-360M"  # or "google/gemma-2-2b-it" for larger
MAX_STEPS = 10000  # Adjust based on dataset size
BATCH_SIZE = 4  # T4 handles 4-8 well
GRAD_ACCUM = 2  # Effective batch = BATCH_SIZE * GRAD_ACCUM
SAVE_STEPS = 2500  # Checkpoint every N steps

# Repo paths (from GitHub clone)
REPO_DIR = "/content/eu4rs"
SCRIPTS_DIR = f"{REPO_DIR}/scripts"

# Verify paths exist
import os

# Explicit exceptions instead of `assert`: the checks always run and the
# error type says what is wrong.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Data not found: {DATA_PATH}")
if not os.path.exists(SCRIPTS_DIR):
    raise FileNotFoundError(f"Repo not cloned: {SCRIPTS_DIR}")

# The output directory is created on demand so the first run needs no manual setup.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"✓ Data: {DATA_PATH}")
print(f"✓ Scripts: {SCRIPTS_DIR}")
print(f"✓ Output: {OUTPUT_DIR}")
print(f"✓ Effective batch size: {BATCH_SIZE * GRAD_ACCUM}")

In [None]:
# Verify data can be loaded (streaming - doesn't load full dataset)
import sys

# Make the cloned repo's scripts importable. The guard keeps sys.path from
# accumulating duplicates when this cell is re-run.
if SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, SCRIPTS_DIR)

from load_training_data import iter_batches_raw

print("Checking first batch...")

# Pull only the first batch. The None default distinguishes "iterator yielded
# nothing" from a real batch, so an empty archive is reported as an error
# instead of falling through to the success message.
batch = next(iter(iter_batches_raw(DATA_PATH)), None)
if batch is None:
    raise ValueError(f"No batches could be read from {DATA_PATH}")

sample_count = len(batch.samples)
print(f"✓ Loaded batch with {sample_count} samples")

# len() is used rather than truthiness because the container type of
# `samples` is defined by the loader, not by this notebook.
if sample_count == 0:
    raise ValueError(f"First batch in {DATA_PATH} contains no samples")

sample = batch.samples[0]
print(f"  First sample: {sample.country} @ tick {sample.tick}")
print("✓ Data format verified")

In [None]:
# Run training!
!cd {SCRIPTS_DIR} && python train_ai.py \
    --data "{DATA_PATH}" \
    --base-model "{BASE_MODEL}" \
    --output "{OUTPUT_DIR}" \
    --max-steps {MAX_STEPS} \
    --save-steps {SAVE_STEPS} \
    --batch-size {BATCH_SIZE} \
    --grad-accum {GRAD_ACCUM} \
    --prefetch 1000

In [None]:
# Verify output
# Prints each entry found directly inside OUTPUT_DIR together with its size
# in kilobytes, as reported by os.path.getsize.
import os

files = os.listdir(OUTPUT_DIR)
print(f"Adapter files in {OUTPUT_DIR}:")
for entry_name in files:
    entry_path = os.path.join(OUTPUT_DIR, entry_name)
    kilobytes = os.path.getsize(entry_path) / 1024
    print(f"  {entry_name}: {kilobytes:.1f} KB")

## Resume Training

If Colab disconnects, you can resume from the last checkpoint:

In [None]:
# Find latest checkpoint
import os
import re

# Only directories named exactly "checkpoint-<step>" are considered. Stray
# files, or names such as "checkpoint-tmp" that carry no step number, are
# skipped instead of crashing the step lookup.
checkpoint_pattern = re.compile(r"checkpoint-(\d+)")
checkpoint_steps = {}
for entry in os.listdir(OUTPUT_DIR):
    matched = checkpoint_pattern.fullmatch(entry)
    if matched and os.path.isdir(os.path.join(OUTPUT_DIR, entry)):
        checkpoint_steps[entry] = int(matched.group(1))

checkpoints = list(checkpoint_steps)
if checkpoints:
    # Compare by numeric step so "checkpoint-10000" beats "checkpoint-2500".
    latest = max(checkpoints, key=checkpoint_steps.get)
    RESUME_FROM = os.path.join(OUTPUT_DIR, latest)
    print(f"Latest checkpoint: {RESUME_FROM}")
else:
    print("No checkpoints found")
    # The resume cell tests RESUME_FROM for truthiness, so None disables it.
    RESUME_FROM = None

In [None]:
# Resume training from checkpoint (run this cell to continue)
if RESUME_FROM:
    !cd {SCRIPTS_DIR} && python train_ai.py \
        --data "{DATA_PATH}" \
        --base-model "{BASE_MODEL}" \
        --output "{OUTPUT_DIR}" \
        --max-steps {MAX_STEPS} \
        --save-steps {SAVE_STEPS} \
        --batch-size {BATCH_SIZE} \
        --grad-accum {GRAD_ACCUM} \
        --prefetch 1000 \
        --resume-from "{RESUME_FROM}"
else:
    print("No checkpoint to resume from. Run initial training first.")