# EU4 AI Training on Colab

Train LoRA adapters for EU4 AI using CUDA GPUs.

**Setup:**
1. Create a folder in Google Drive (e.g., `eu4_training/`)
2. Upload your `.cpb.zip` training data to that folder
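3. Upload these files from the repo directly into that same folder (flat — do not recreate the `scripts/` or `schemas/` subfolders in Drive; the copy cell below looks for each file by its bare name):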
   - `scripts/train_ai.py`
   - `scripts/load_training_data.py`
   - `schemas/training.capnp`
4. Run all cells in order

**Tip:** To get a file path, mount Drive first, then right-click the file in the sidebar → "Copy path"

In [None]:
# Check GPU availability: nvidia-smi reports the NVIDIA GPUs visible to this
# runtime. If it errors or lists no device, this runtime has no CUDA GPU
# attached (in Colab the accelerator is chosen under
# Runtime → Change runtime type).
!nvidia-smi

In [None]:
# Attach Google Drive to this runtime. Everything under /content/drive is then
# readable and writable from the cells below (training data in, adapters out).
from google.colab import drive

drive.mount(mountpoint="/content/drive")

In [None]:
# Install dependencies and disable W&B prompts
import os

# Set this in the kernel's environment before any training process is started:
# commands launched from the notebook afterwards inherit the variable.
os.environ["WANDB_DISABLED"] = "true"

# -q keeps pip's output short.
!pip install -q transformers peft trl datasets pycapnp safetensors

In [None]:
# Stage the training scripts and the Cap'n Proto schema from Drive into the
# runtime's working directory, recreating the scripts/ + schemas/ layout.
import os
import shutil

# Path to your scripts folder in Drive (same folder as your data)
# Right-click folder in sidebar → "Copy path"
SCRIPTS_FOLDER = "/content/drive/MyDrive/eu4_training"

# (file name in the Drive folder, local directory it is copied into).
# The schema goes to schemas/ (not schema/) because load_training_data.py
# expects ../schemas/training.capnp relative to scripts/.
COPY_PLAN = [
    ("train_ai.py", "scripts"),
    ("load_training_data.py", "scripts"),
    ("training.capnp", "schemas"),
]

# Create both target directories up front, before anything is copied.
for local_dir in ("scripts", "schemas"):
    os.makedirs(local_dir, exist_ok=True)

for file_name, local_dir in COPY_PLAN:
    shutil.copy(os.path.join(SCRIPTS_FOLDER, file_name), f"{local_dir}/{file_name}")
    print(f"✓ Copied {file_name}")

In [None]:
# Configuration - EDIT THESE
# ===========================
import os

# Path to your training data in Google Drive
# Tip: Right-click file in sidebar → "Copy path"
DATA_PATH = "/content/drive/MyDrive/eu4_training/run_10yr_1.cpb.zip"

# Where to save the trained adapter (in Drive for persistence)
OUTPUT_DIR = "/content/drive/MyDrive/eu4_training/adapters/run1"

# Model settings
BASE_MODEL = "HuggingFaceTB/SmolLM2-360M"  # or "google/gemma-2-2b-it" for larger
MAX_STEPS = 10000  # Adjust based on dataset size
BATCH_SIZE = 4  # T4 handles 4-8 well
GRAD_ACCUM = 2  # Effective batch = BATCH_SIZE * GRAD_ACCUM
SAVE_STEPS = 2500  # Checkpoint every N steps

# Verify paths exist.
# An explicit raise is used instead of `assert`: assert statements are removed
# when Python runs with -O, which would silently skip this check.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Data not found: {DATA_PATH}")

# The output directory is created on demand so the training script can write
# into it straight away.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"✓ Data: {DATA_PATH}")
print(f"✓ Output: {OUTPUT_DIR}")
print(f"✓ Effective batch size: {BATCH_SIZE * GRAD_ACCUM}")

In [None]:
# Verify data can be loaded (streaming - doesn't load full dataset)
# Only the first batch is pulled from the iterator; the check fails loudly if
# the archive yields no batch at all or if that first batch has no samples.
import sys

# Make the scripts copied into scripts/ importable. Guarded so that re-running
# this cell does not keep adding duplicate entries to sys.path.
if "scripts" not in sys.path:
    sys.path.insert(0, "scripts")

from load_training_data import iter_batches_raw

print("Checking first batch...")
for batch in iter_batches_raw(DATA_PATH):
    sample_count = len(batch.samples)
    print(f"✓ Loaded batch with {sample_count} samples")
    if sample_count == 0:
        raise ValueError(f"First batch in {DATA_PATH} contains no samples")
    sample = batch.samples[0]
    print(f"  First sample: {sample.country} @ tick {sample.tick}")
    break
else:
    # The loop body never ran: the iterator produced nothing, so there is
    # nothing to verify and reporting success would be misleading.
    raise ValueError(f"No batches found in {DATA_PATH}")
print("✓ Data format verified")

In [None]:
# Run training!
# The command runs from inside scripts/ and forwards the settings from the
# configuration cell: IPython substitutes each {NAME} with the value of that
# Python variable before handing the line to the shell. Path and model values
# are wrapped in double quotes so a value containing spaces stays one argument.
!cd scripts && python train_ai.py \
    --data "{DATA_PATH}" \
    --base-model "{BASE_MODEL}" \
    --output "{OUTPUT_DIR}" \
    --max-steps {MAX_STEPS} \
    --save-steps {SAVE_STEPS} \
    --batch-size {BATCH_SIZE} \
    --grad-accum {GRAD_ACCUM} \
    --prefetch 1000

In [None]:
# Verify output: list what training wrote to OUTPUT_DIR.
import os

# Sorted so the listing is stable between runs (os.listdir order is arbitrary).
entries = sorted(os.listdir(OUTPUT_DIR))
print(f"Adapter files in {OUTPUT_DIR}:")
if not entries:
    print("  (empty - no files written yet)")
for entry in entries:
    entry_path = os.path.join(OUTPUT_DIR, entry)
    if os.path.isdir(entry_path):
        # getsize() on a directory reports the directory entry itself, not its
        # contents, so sub-directories (e.g. checkpoint-*) are only labelled.
        print(f"  {entry}/ (directory)")
    else:
        size = os.path.getsize(entry_path)
        print(f"  {entry}: {size / 1024:.1f} KB")

## Resume Training

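If Colab disconnects, you can resume from the last checkpoint. If the runtime was reset, first re-run the Drive mount, dependency install, script copy, and configuration cells above (the cells below rely on the variables and local files they create), then run the two cells below: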

In [None]:
# Find latest checkpoint
# Sets RESUME_FROM to the checkpoint directory in OUTPUT_DIR with the highest
# step number, or to None when no checkpoint exists.
import os
import re

# Only names of the form "checkpoint-<digits>..." are considered; the captured
# digits are the step number used for ordering. Entries without a step number
# (e.g. "checkpoint-final") are skipped instead of crashing the cell.
checkpoint_pattern = re.compile(r"checkpoint-(\d+)")

checkpoint_steps = {}
for entry in os.listdir(OUTPUT_DIR):
    match = checkpoint_pattern.match(entry)
    # Checkpoints are directories; ignore stray files that share the prefix.
    if match and os.path.isdir(os.path.join(OUTPUT_DIR, entry)):
        checkpoint_steps[entry] = int(match.group(1))

if checkpoint_steps:
    latest = max(checkpoint_steps, key=checkpoint_steps.get)
    RESUME_FROM = os.path.join(OUTPUT_DIR, latest)
    print(f"Latest checkpoint: {RESUME_FROM}")
else:
    print("No checkpoints found")
    RESUME_FROM = None

In [None]:
# Resume training from checkpoint (run this cell to continue)
# Requires RESUME_FROM from the "Find latest checkpoint" cell. When it is set,
# the same command as the initial training run is issued with one extra
# argument, --resume-from, pointing at that checkpoint directory. When it is
# None (no checkpoint found), nothing is launched and a hint is printed.
if RESUME_FROM:
    !cd scripts && python train_ai.py \
        --data "{DATA_PATH}" \
        --base-model "{BASE_MODEL}" \
        --output "{OUTPUT_DIR}" \
        --max-steps {MAX_STEPS} \
        --save-steps {SAVE_STEPS} \
        --batch-size {BATCH_SIZE} \
        --grad-accum {GRAD_ACCUM} \
        --prefetch 1000 \
        --resume-from "{RESUME_FROM}"
else:
    print("No checkpoint to resume from. Run initial training first.")