<a href="https://colab.research.google.com/github/Yeye-He/science-fair-NEO/blob/main/colab_yolo_2_succ_repro_with_resume.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%bash
# ==== YOLOv7 install + PyTorch ≥2.6 fix for Google Colab ====
# Re-clones the YOLOv7 repository from scratch under /content and installs its
# requirements with the numpy / protobuf pins stripped out.
#
# Fail fast: without strict mode a failed `cd /content` would let the
# `rm -rf yolov7` below run in whatever directory the cell started in, and a
# failed clone or pip step would be silently ignored by the following commands.
set -euo pipefail

cd /content
# Always start from a pristine checkout (this also discards any files that
# were previously written inside the old checkout).
rm -rf yolov7
git clone https://github.com/WongKinYiu/yolov7.git
cd yolov7

# 1️⃣  Upgrade pip and core deps (protobuf 5.x works best with Google libs)
pip install -U pip setuptools wheel "protobuf>=5.29.1,<6"

# 2️⃣  Remove legacy pins for numpy / protobuf
# Deletes any requirements line whose package name is exactly numpy / protobuf
# (followed by a version operator, a space, or end of line).
sed -i -E '/^[[:space:]]*numpy([<=>~! ]|$).*/d' requirements.txt
sed -i -E '/^[[:space:]]*protobuf([<=>~! ]|$).*/d' requirements.txt

# 3️⃣  Install remaining dependencies (no transitive dep changes)
pip install -U "jedi>=0.16"
pip install -r requirements.txt --no-deps



Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Collecting setuptools
  Downloading setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 92.1 MB/s eta 0:00:00
Downloading setuptools-80.9.0-py3-none-any.whl (1.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 86.2 MB/s eta 0:00:00
Installing collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 75.2.0
    Uninstalling setuptools-75.2.0:
      Successfully uninstalled setuptools-75.2.0
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3 setuptools-80.9.0
Collecting jedi>=0.16
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━

Cloning into 'yolov7'...
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.


In [3]:
%%bash
# Downloads the Zenodo archive, validates it, and unpacks the nested
# StreaksYoloDataset.zip into $EXTRACT_DIR.
set -e

# === Paths
ROOT=/content
DATA_DIR=${ROOT}/datasets
ZIP_URL="https://zenodo.org/api/records/14047944/files-archive?download=1"
ZIP_FILE=${DATA_DIR}/zenodo_yolo.zip
EXTRACT_DIR=${DATA_DIR}/zenodo
# Exported so the embedded Python snippets read the very same paths instead of
# carrying their own hard-coded copies.
export ZIP_FILE EXTRACT_DIR

mkdir -p "$DATA_DIR"

echo "start..."

# === Download the Zenodo archive robustly
echo "Downloading dataset from Zenodo..."
if [ -s "$ZIP_FILE" ]; then
  echo "✅ ZIP file already exists, skipping download: $ZIP_FILE"
else
  # Download to a side file and rename only on success, so an interrupted
  # transfer can never be mistaken for a complete archive by the -s check above.
  TMP_ZIP="${ZIP_FILE}.part"
  rm -f -- "$TMP_ZIP"
  if command -v curl >/dev/null 2>&1; then
    curl -fL "$ZIP_URL" -o "$TMP_ZIP"
  else
    wget -q --show-progress --content-disposition -O "$TMP_ZIP" "$ZIP_URL"
  fi
  mv -f -- "$TMP_ZIP" "$ZIP_FILE"
fi

# Quick sanity check
if [ ! -s "$ZIP_FILE" ]; then
  echo "❌ Download failed or empty file: $ZIP_FILE"
  exit 1
fi

# === Validate the ZIP (avoid unzip error noise)
# Exit status of the checker: 0 = OK, 2 = corrupt member, 3 = not a ZIP.
rc=0
python3 - <<'PY' || rc=$?
import os, sys, zipfile
p = os.environ["ZIP_FILE"]
try:
    with zipfile.ZipFile(p) as zf:
        bad = zf.testzip()
        if bad is not None:
            print(f"❌ Corrupt file in zip: {bad}", file=sys.stderr)
            sys.exit(2)
    print("✅ ZIP integrity OK")
except zipfile.BadZipFile:
    print("❌ Not a ZIP file (or corrupted).", file=sys.stderr)
    sys.exit(3)
PY
if [ "$rc" -ne 0 ]; then
  # Drop the bad archive; otherwise every re-run would skip the download and
  # fail validation again on the same file.
  rm -f -- "$ZIP_FILE"
  echo "Removed invalid archive; re-run this cell to download it again: $ZIP_FILE" >&2
  exit "$rc"
fi

# === Extract
echo "Extracting to $EXTRACT_DIR ..."
# ${VAR:?} aborts instead of expanding to an empty path.
rm -rf -- "${EXTRACT_DIR:?}"
mkdir -p "$EXTRACT_DIR"
python3 - <<'PY'
import os, zipfile
with zipfile.ZipFile(os.environ["ZIP_FILE"]) as zf:
    zf.extractall(os.environ["EXTRACT_DIR"])
print("✅ Extracted")
PY

# 3) extract the StreaksYoloDataset.zip folder
cd "$EXTRACT_DIR"
unzip -o StreaksYoloDataset.zip


start...
Downloading dataset from Zenodo...
✅ ZIP integrity OK
Extracting to /content/datasets/zenodo ...
✅ Extracted
Archive:  StreaksYoloDataset.zip
   creating: StreaksYoloDataset/
  inflating: StreaksYoloDataset/data.yaml  
   creating: StreaksYoloDataset/test/
   creating: StreaksYoloDataset/test/images/
  inflating: StreaksYoloDataset/test/images/6.jpeg  
  inflating: StreaksYoloDataset/test/images/60.jpeg  
  inflating: StreaksYoloDataset/test/images/600.jpeg  
  inflating: StreaksYoloDataset/test/images/601.jpeg  
  inflating: StreaksYoloDataset/test/images/602.jpeg  
  inflating: StreaksYoloDataset/test/images/603.jpeg  
  inflating: StreaksYoloDataset/test/images/604.jpeg  
  inflating: StreaksYoloDataset/test/images/605.jpeg  
  inflating: StreaksYoloDataset/test/images/606.jpeg  
  inflating: StreaksYoloDataset/test/images/607.jpeg  
  inflating: StreaksYoloDataset/test/images/608.jpeg  
  inflating: StreaksYoloDataset/test/images/609.jpeg  
  inflating: StreaksYoloDataset/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  336k    0  336k    0     0   215k      0 --:--:--  0:00:01 --:--:--  214k100 1022k    0 1022k    0     0   399k      0 --:--:--  0:00:02 --:--:--  399k100 1724k    0 1724k    0     0   485k      0 --:--:--  0:00:03 --:--:--  485k100 9678k    0 9678k    0     0  2122k      0 --:--:--  0:00:04 --:--:-- 2122k100 19.9M    0 19.9M    0     0  3671k      0 --:--:--  0:00:05 --:--:-- 4086k100 23.5M    0 23.5M    0     0  3664k      0 --:--:--  0:00:06 --:--:-- 4739k100 26.0M    0 26.0M    0     0  3537k      0 --:--:--  0:00:07 --:--:-- 5145k100 29.9M    0 29.9M    0     0  3589k      0 --:--:--  0:00:08 --:--:-- 5798k100 34.2M    0 34.2M    0     0  3670k      0 --:--

In [4]:
%%bash
# Ensures a validation split exists, makes sure the YOLOv7 checkout and the
# pretrained weights are present, and writes data/custom.yaml.
set -e

# === Paths you provided ===
DATASET_ROOT="/content/datasets/zenodo/StreaksYoloDataset"
TRAIN_IMG="$DATASET_ROOT/train/images"
TRAIN_LBL="$DATASET_ROOT/train/labels"
VAL_IMG="$DATASET_ROOT/val/images"
VAL_LBL="$DATASET_ROOT/val/labels"
# Exported so the embedded Python uses exactly these paths rather than a
# second, hard-coded copy that could drift out of sync.
export TRAIN_IMG TRAIN_LBL VAL_IMG VAL_LBL

# === Sanity checks ===
if [ ! -d "$TRAIN_IMG" ] || [ ! -d "$TRAIN_LBL" ]; then
  echo "❌ Expected folders not found:"
  echo "   $TRAIN_IMG"
  echo "   $TRAIN_LBL"
  exit 1
fi

# === If no val set, create one by copying ~10% of train into val (non-destructive) ===
# NOTE(review): images are copied, not moved, so every validation image is also
# a training image; validation metrics will be optimistic.
if [ ! -d "$VAL_IMG" ] || [ ! -d "$VAL_LBL" ]; then
  echo "ℹ️  No val/ found. Creating a small validation split (~10%) from train/ (copy only)."
  python3 - <<'PY'
import glob, os, random, shutil

TRAIN_IMG = os.environ["TRAIN_IMG"]
TRAIN_LBL = os.environ["TRAIN_LBL"]
VAL_IMG   = os.environ["VAL_IMG"]
VAL_LBL   = os.environ["VAL_LBL"]

# Collect image files
exts = (".jpg",".jpeg",".png",".bmp",".webp",".JPG",".JPEG",".PNG",".BMP",".WEBP")
imgs = [p for p in glob.glob(os.path.join(TRAIN_IMG, "*")) if p.endswith(exts)]
imgs.sort()
if not imgs:
    raise SystemExit("❌ No images found in train/images")

# Sample ~10%, at least 50 and at most 1000 (tweakable), but never more than
# the number of images available (random.sample raises ValueError otherwise).
n = min(len(imgs), max(50, min(1000, int(0.10 * len(imgs)))))
# Fixed seed over the sorted list: the same split is produced every time the
# dataset is re-created, which keeps validation comparable across resumed runs.
sample = random.Random(0).sample(imgs, n)

def lbl_for(img_path):
    """Return the YOLO .txt label path for an image, or None if it has none."""
    base = os.path.splitext(os.path.basename(img_path))[0]
    p = os.path.join(TRAIN_LBL, base + ".txt")
    return p if os.path.isfile(p) else None

# Create val/ only once we know there is something to put in it, so a failure
# above does not leave empty folders that the next run would accept as a split.
os.makedirs(VAL_IMG, exist_ok=True)
os.makedirs(VAL_LBL, exist_ok=True)

copied = 0
for img in sample:
    lbl = lbl_for(img)
    # copy image
    shutil.copy2(img, os.path.join(VAL_IMG, os.path.basename(img)))
    # copy label if exists (YOLO allows images with no labels)
    if lbl:
        shutil.copy2(lbl, os.path.join(VAL_LBL, os.path.basename(lbl)))
    copied += 1

print(f"✅ Validation split created with {copied} images")
PY
else
  echo "✅ Found existing val/:"
  echo "   $VAL_IMG"
  echo "   $VAL_LBL"
fi

# === Ensure YOLOv7 repo is present ===
cd /content
if [ ! -d "yolov7" ]; then
  git clone https://github.com/WongKinYiu/yolov7.git
fi
cd yolov7

# === Ensure weights exist ===
# -s (non-empty) rather than -f, and download via a side file, so a truncated
# or empty yolov7.pt from an interrupted download is never reused.
if [ ! -s "yolov7.pt" ]; then
  echo "Downloading yolov7.pt weights..."
  rm -f -- yolov7.pt.part
  wget -q -O yolov7.pt.part https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt
  mv -f -- yolov7.pt.part yolov7.pt
fi

# === Write data/custom.yaml (single class 'streak' as per your use-case) ===
mkdir -p data
# Unquoted EOF on purpose: $TRAIN_IMG / $VAL_IMG must expand into the YAML.
cat > data/custom.yaml <<EOF
train: $TRAIN_IMG
val: $VAL_IMG
nc: 1
names: ['streak']
EOF
echo "✅ Wrote data/custom.yaml"
echo "train: $TRAIN_IMG"
echo "val:   $VAL_IMG"

ℹ️  No val/ found. Creating a small validation split (~10%) from train/ (copy only).
✅ Validation split created with 172 images
Downloading yolov7.pt weights...
✅ Wrote data/custom.yaml
train: /content/datasets/zenodo/StreaksYoloDataset/train/images
val:   /content/datasets/zenodo/StreaksYoloDataset/val/images


In [5]:
%%bash
# Replaces YOLOv7's utils/wandb_logging/wandb_utils.py with a no-op stub so
# training runs without Weights & Biases.
set -e
REPO=/content/yolov7
STUB_DIR="$REPO/utils/wandb_logging"
STUB="$STUB_DIR/wandb_utils.py"

mkdir -p "$STUB_DIR"

# Quoted delimiter: the Python source is written literally, no shell expansion.
cat > "$STUB" <<'PY'
# Robust no-op WandB stub for YOLOv7 (PyTorch >=2.6 safe)
# Provides every attribute/method YOLOv7 may call so training never crashes.

class _NoOp:
    """Absorbs any call, attribute access or context-manager use."""
    def __call__(self, *a, **k): return None
    def __getattr__(self, name): return self
    def __setattr__(self, name, value): object.__setattr__(self, name, value); return None
    def __enter__(self): return self
    def __exit__(self, *exc): return False

def _arg(args, kwargs, index, key, default):
    """Fetch an argument whether it was passed by keyword or by position."""
    if key in kwargs:
        return kwargs[key]
    return args[index] if len(args) > index else default

class WandbLogger:
    def __init__(self, *args, **kwargs):
        # Common fields YOLOv7 might access
        self.wandb = None
        self.wandb_run = None
        # Assumes the upstream call shape WandbLogger(opt, name, run_id,
        # data_dict[, job_type]) -- TODO confirm against the checked-out
        # train.py. Reading only kwargs would drop a positionally passed dict.
        self.data_dict = _arg(args, kwargs, 3, "data_dict", None)
        self.job_type = _arg(args, kwargs, 4, "job_type", "train")
        self.val_artifact = None
        self.train_artifact = None
        self.result_artifact = None
        self.val_table = None
        self.train_table = None
        self.result_table = None
        self.kwargs = kwargs

    # YOLOv7 hooks that sometimes get called
    def log_training_progress(self, *a, **k): pass
    def log_training_results(self, *a, **k): pass
    def log_validation_results(self, *a, **k): pass
    def log_model(self, *a, **k): pass
    def update_dataset_artifact(self, *a, **k): pass
    def log_dataset_artifact(self, *a, **k): pass
    def checkpoint_artifact(self, *a, **k): pass
    def watch(self, *a, **k): pass
    def unwatch(self, *a, **k): pass
    def end_epoch(self, *a, **k): pass           # <- missing in your error
    def finish_run(self, *a, **k): pass
    def end_run(self, *a, **k): pass
    def setup_training(self, *a, **k): pass
    def setup_validation(self, *a, **k): pass

    # act like a dict of loggers in some places
    def __getitem__(self, key): return _NoOp()
    def __setitem__(self, key, val): return None

# YOLOv7 imports this name from the module, so the stub must define it.
def check_wandb_resume(opt):
    """
    Report that there is no WandB run to resume.

    Returns None (falsy). NOTE(review): upstream train.py is understood to
    test the result for truthiness before taking its --resume path; confirm
    against the checked-out revision.
    """
    return None

# Provide a module-level "wandb" shim that won't crash if imported
wandb = _NoOp()
PY

# Clear bytecode caches so Python loads the new stub
# Best-effort: errors from find are deliberately ignored.
find "$REPO/utils" -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
echo "✅ Installed no-op WandB stub with end_epoch and cleared __pycache__."


✅ Installed no-op WandB stub with end_epoch and cleared __pycache__.


In [6]:
%%bash
# Rewrites YOLOv7's utils/wandb_logging/wandb_utils.py with a no-op stub that
# also provides check_wandb_resume, so training runs without Weights & Biases.
set -e
REPO=/content/yolov7
STUB_DIR="$REPO/utils/wandb_logging"
STUB="$STUB_DIR/wandb_utils.py"

mkdir -p "$STUB_DIR"

# Quoted delimiter: the Python source is written literally, no shell expansion.
cat > "$STUB" <<'PY'
# Robust no-op WandB stub for YOLOv7

class _NoOp:
    """Absorbs any call, attribute access or context-manager use."""
    def __call__(self, *a, **k): return None
    def __getattr__(self, name): return self
    def __setattr__(self, name, value): object.__setattr__(self, name, value); return None
    def __enter__(self): return self
    def __exit__(self, *exc): return False

def _arg(args, kwargs, index, key, default):
    """Fetch an argument whether it was passed by keyword or by position."""
    if key in kwargs:
        return kwargs[key]
    return args[index] if len(args) > index else default

class WandbLogger:
    def __init__(self, *args, **kwargs):
        self.wandb = None
        self.wandb_run = None
        # Assumes the upstream call shape WandbLogger(opt, name, run_id,
        # data_dict[, job_type]) -- TODO confirm against the checked-out
        # train.py. Reading only kwargs would drop a positionally passed dict.
        self.data_dict = _arg(args, kwargs, 3, "data_dict", None)
        self.job_type = _arg(args, kwargs, 4, "job_type", "train")
        self.kwargs = kwargs

    # hooks YOLOv7 may call
    def log_training_progress(self, *a, **k): pass
    def log_training_results(self, *a, **k): pass
    def log_validation_results(self, *a, **k): pass
    def log_model(self, *a, **k): pass
    def update_dataset_artifact(self, *a, **k): pass
    def log_dataset_artifact(self, *a, **k): pass
    def checkpoint_artifact(self, *a, **k): pass
    def watch(self, *a, **k): pass
    def unwatch(self, *a, **k): pass
    def end_epoch(self, *a, **k): pass
    def finish_run(self, *a, **k): pass
    def end_run(self, *a, **k): pass
    def setup_training(self, *a, **k): pass
    def setup_validation(self, *a, **k): pass
    def __getitem__(self, key): return _NoOp()
    def __setitem__(self, key, val): return None

# YOLOv7 imports this at module level:
def check_wandb_resume(opt):
    """
    Report that there is no WandB run to resume.

    Returns None (falsy). A non-empty tuple such as (None, False) is truthy,
    which would make a caller that checks `not check_wandb_resume(opt)`
    believe a WandB resume is in progress. NOTE(review): upstream train.py is
    understood to perform exactly that truthiness check before taking its
    --resume path; confirm against the checked-out revision.
    """
    return None

# Provide a module-level shim so `import wandb` inside their code won't crash
wandb = _NoOp()
PY

# Clear caches so Python reloads the new stub
# Best-effort: errors from find are deliberately ignored.
find "$REPO/utils" -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
echo "✅ Updated WandB stub (added check_wandb_resume) and cleared __pycache__."


✅ Updated WandB stub (added check_wandb_resume) and cleared __pycache__.


In [10]:
%%bash
# Removes cached dataset index files so they are rebuilt on the next run.
set -e
DATASET_ROOT="/content/datasets/zenodo/StreaksYoloDataset"

# Pass 1: sweep the dataset tree (up to three levels deep) for cache files,
# printing each one as it is deleted. Failures (e.g. missing root) are ignored.
find "$DATASET_ROOT" -maxdepth 3 -type f \( -name "*.cache" -o -name "*.cache.*" \) -print -delete || true

# Pass 2: per-folder caches stored next to the folder itself
# (e.g. train/labels -> train/labels.cache*).
for sub in train/images val/images train/labels val/labels; do
  target="$DATASET_ROOT/$sub"
  if [ -d "$target" ]; then
    rm -f "$target".cache* || true
  fi
done

echo "✅ Cleared dataset cache files."


/content/datasets/zenodo/StreaksYoloDataset/train/labels.cache
/content/datasets/zenodo/StreaksYoloDataset/val/labels.cache
✅ Cleared dataset cache files.


In [11]:
# Make YOLOv7 load data/custom.yaml directly into data_dict, keep PyTorch 2.6 fixes,
# patch strip_optimizer for torch>=2.6, and TRAIN with live streaming + resumable checkpoints
import os, re, subprocess, sys, glob, shutil, datetime

# Locations of the YOLOv7 checkout and the two upstream files patched below.
REPO = "/content/yolov7"
TRAIN = os.path.join(REPO, "train.py")
UTILS_GENERAL = os.path.join(REPO, "utils/general.py")

assert os.path.isdir(REPO), "YOLOv7 repo not found at /content/yolov7"

# === Optional: mount Google Drive for persistent checkpoints
# Any failure (module missing, mount refused, ...) falls back to local storage.
try:
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive", force_remount=False)
except Exception:
    print("ℹ️ Not running in Colab or Drive mount skipped; will save locally only.")
    DRIVE_OK = False
else:
    DRIVE_OK = True

# Persistent runs root (Drive if available, otherwise local)
if DRIVE_OK:
    RUNS_ROOT = "/content/drive/MyDrive/yolov7_runs"
else:
    RUNS_ROOT = os.path.join(REPO, "runs_persistent")
os.makedirs(RUNS_ROOT, exist_ok=True)

# --- Restore pristine targets we modify (keep any WandB stub file) ---
# Only these two tracked files are reset, so the patches below always start
# from unmodified upstream text.
subprocess.run(
    ["git", "checkout", "--", "train.py", "utils/general.py"],
    cwd=REPO,
    check=True,
)

# --- Patch train.py: safe-unpickler + YAML-loaded data_dict + full unpickle for weights ---
# Reads train.py into memory, applies four independent text patches, and
# writes the file back only if at least one patch set `changed`.
# NOTE(review): every patch is a textual match against upstream source; a patch
# whose pattern does not match is skipped silently.
with open(TRAIN, "r", encoding="utf-8") as f:
    src = f.read()

changed = False

# 1) Add safe allow-list for YOLOv7 Model (PyTorch 2.6)
inject_train_safe = (
    "from torch.serialization import add_safe_globals\n"
    "from models.yolo import Model as _YOLOModel\n"
    "add_safe_globals([_YOLOModel])\n"
)
if "add_safe_globals([_YOLOModel])" not in src:
    lines = src.splitlines(True)
    # Insert right after the first blank line (other than line 0) found in the
    # first 250 lines; if there is none, insert_at stays 0 (top of the file).
    insert_at = 0
    for i, ln in enumerate(lines[:250]):
        if ln.strip() == "" and i > 0:
            insert_at = i + 1
            break
    src = "".join(lines[:insert_at]) + inject_train_safe + "".join(lines[insert_at:])
    changed = True

# 2) Skip fragile run_id unpickle (wandb id) on torch>=2.6
# Replaces the whole `run_id = torch.load(...).get('wandb_id') if ... else None`
# statement with a plain `run_id = None`.
src2 = re.sub(
    r"run_id\s*=\s*torch\.load\([^)]*\)\.get\(\s*['\"]wandb_id['\"]\s*\)\s*if[^\n]*else\s*None",
    "run_id = None  # skip wandb_id unpickle for PyTorch>=2.6",
    src,
)
# re.sub returns the input unchanged when nothing matched.
if src2 != src:
    src = src2
    changed = True

# 3) Force full checkpoint unpickling when loading weights
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects;
# only load checkpoints from a trusted source.
src2 = re.sub(
    r"torch\.load\(\s*weights\s*,\s*map_location\s*=\s*device\s*\)",
    "torch.load(weights, map_location=device, weights_only=False)",
    src,
)
if src2 != src:
    src = src2
    changed = True

# 4) Ensure data_dict loaded from YAML if `opt.data` is a path
if "import yaml" not in src:
    # `changed` is set even if "import math" is absent and nothing was replaced.
    src = src.replace("import math", "import math\nimport yaml")
    changed = True
# Matches the `names = ['item'] ... data_dict['names'] ...` statement; the
# guard below is inserted immediately before it (first occurrence only).
pattern = r"(\n\s*names\s*=\s*\['item'\].*?data_dict\['names'\].*?\n)"
if re.search(pattern, src, flags=re.S):
    src = re.sub(
        pattern,
        "\n    # Ensure data_dict is loaded from YAML path\n"
        "    if not isinstance(data_dict, dict):\n"
        "        with open(opt.data, 'r') as f:\n"
        "            data_dict = yaml.safe_load(f)\n"
        r"\1",
        src,
        count=1,
        flags=re.S,
    )
    changed = True

# NOTE(review): the success message lists all patches even when only some of
# them were actually applied.
if changed:
    with open(TRAIN, "w", encoding="utf-8") as f:
        f.write(src)
    print("✅ train.py patched (safe-unpickler + YAML data_dict + weights_only=False).")
else:
    print("ℹ️ train.py already contained required patches.")

# --- Patch utils/general.py: allow-list NumPy reconstruct + weights_only=False in strip_optimizer ---
# Reads utils/general.py, applies three text patches, and writes the file back
# only if at least one of them changed the source.
with open(UTILS_GENERAL, "r", encoding="utf-8") as f:
    gen = f.read()

g_changed = False

# 1) Make add_safe_globals available at module level.
# Anchored to a whole line so that `import torchvision` can never be matched
# (a plain substring replace of "import torch" would corrupt that line).
if "add_safe_globals" not in gen:
    gen, n_subs = re.subn(
        r"(?m)^import torch[ \t]*$",
        "import torch\nfrom torch.serialization import add_safe_globals",
        gen,
        count=1,
    )
    g_changed = g_changed or n_subs > 0

# 2) Add NumPy reconstruct to safe allow-list (needed by older checkpoints).
# The injected snippet imports everything it needs itself: it is placed after
# `from pathlib import Path`, which may precede the torch import patched
# above, and a NameError there would be swallowed by its own `except`.
# The marker comment doubles as the idempotence check, so it must be text that
# the snippet really contains.
np_hook_marker = "# PyTorch 2.6 safe unpickling: allow-list NumPy reconstruct"
if np_hook_marker not in gen:
    insert_hook = "from pathlib import Path"
    if insert_hook in gen:
        gen = gen.replace(
            insert_hook,
            insert_hook + "\n" + np_hook_marker + "\n"
                          "try:\n"
                          "    from torch.serialization import add_safe_globals as _add_safe_globals\n"
                          "    try:\n"
                          "        from numpy._core.multiarray import _reconstruct as _np_reconstruct\n"
                          "    except ImportError:\n"
                          "        from numpy.core.multiarray import _reconstruct as _np_reconstruct\n"
                          "    _add_safe_globals([_np_reconstruct])\n"
                          "except Exception:\n"
                          "    pass\n",
            1
        )
        g_changed = True
    else:
        print("⚠️ utils/general.py: anchor 'from pathlib import Path' not found; NumPy allow-list not added.")

# 3) In strip_optimizer, force full unpickle
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects;
# only use it on checkpoints produced by this training run.
gen2 = re.sub(
    r"x\s*=\s*torch\.load\(\s*f\s*,\s*map_location\s*=\s*torch\.device\('cpu'\)\s*\)",
    "x = torch.load(f, map_location=torch.device('cpu'), weights_only=False)",
    gen,
)
# re.sub returns the input unchanged when nothing matched.
if gen2 != gen:
    gen = gen2
    g_changed = True

if g_changed:
    with open(UTILS_GENERAL, "w", encoding="utf-8") as f:
        f.write(gen)
    print("✅ utils/general.py patched (NumPy allow-list + weights_only=False in strip_optimizer).")
else:
    print("ℹ️ utils/general.py already contained required patches.")

# --- Verify pretrained weights existence ---
# Fail early with a clear message instead of letting the training subprocess
# discover the missing file.
weights_path = os.path.join(REPO, "yolov7.pt")
if not os.path.exists(weights_path):
    missing_msg = (
        f"Expected pretrained weights at {weights_path}. "
        "Re-run the setup cell or adjust weights_path."
    )
    raise FileNotFoundError(missing_msg)

# === Resume helper ===
def find_latest_last_pt(run_name: str, root: str) -> str | None:
    pattern = os.path.join(root, run_name + "*", "weights", "last.pt")
    candidates = sorted(glob.glob(pattern), key=os.path.getmtime, reverse=True)
    return candidates[0] if candidates else None

from collections import deque

RUN_NAME = "streaks-yolov7"
latest_last = find_latest_last_pt(RUN_NAME, RUNS_ROOT)

# === Build YOLOv7 command (note: --save_period uses underscore; --img-size is correct flag)
# Each entry is one option: a (flag, value) pair or a lone (flag,) switch.
# They are flattened in order into the argv list below.
train_options = [
    ("--img-size", "640"),
    ("--data", "data/custom.yaml"),
    ("--single-cls",),
    ("--hyp", "data/hyp.scratch.p5.yaml"),
    ("--cfg", "cfg/training/yolov7.yaml"),
    ("--workers", "2"),       # was 0
    ("--batch-size", "32"),   # was 4
    ("--epochs", "200"),
    ("--name", RUN_NAME),
    ("--device", "0"),
    ("--project", RUNS_ROOT),
    ("--exist-ok",),
    ("--save_period", "5"),   # <- underscore (YOLOv7 arg name)
]
cmd = ["/usr/bin/python3", "-u", "train.py"]
for option in train_options:
    cmd.extend(option)

# Resume from the newest last.pt when one exists; otherwise start from the
# pretrained weights.
can_resume = bool(latest_last) and os.path.exists(latest_last)
if not can_resume:
    cmd.extend(["--weights", weights_path])
else:
    print(f"↩️  Resuming from {latest_last}")
    cmd.extend(["--resume", latest_last])

print("\n🚀", " ".join(cmd))

# Rolling buffer of the most recent output lines, shown if training fails.
tail = deque(maxlen=120)

def safe_backup_last(run_root: str, run_name: str):
    """Copy the newest run's ``last.pt`` to a timestamped sibling file.

    The newest run is the most recently modified entry matching
    ``<run_root>/<run_name>*``. Does nothing if no run matches or if that run
    has no ``weights/last.pt``. Copy errors are reported, not raised.

    Args:
        run_root: Directory containing the run folders.
        run_name: Run folder name prefix.
    """
    run_dirs = glob.glob(os.path.join(run_root, run_name + "*"))
    if not run_dirs:
        return

    newest_run = max(run_dirs, key=os.path.getmtime)
    weights_dir = os.path.join(newest_run, "weights")
    source = os.path.join(weights_dir, "last.pt")
    if not os.path.exists(source):
        return

    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    backup = os.path.join(weights_dir, f"last_{timestamp}.pt")
    try:
        shutil.copy2(source, backup)
    except Exception as e:
        # Best-effort: a failed backup must not mask the training outcome.
        print(f"⚠️ Backup skipped: {e}")
    else:
        print(f"📦 Backed up latest last.pt -> {backup}")

# Run train.py as a child process, echo its combined stdout/stderr line by
# line, and back up last.pt on every exit path (success, failure, interrupt).
env = os.environ.copy()
env["PYTHONUNBUFFERED"] = "1"
env.setdefault("WANDB_SILENT", "true")

# Bound up-front so the interrupt handler can tell "never started" apart from
# "started" without relying on a swallowed NameError.
proc = None
try:
    with subprocess.Popen(
        cmd,
        cwd=REPO,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # merge stderr so ordering is preserved
        bufsize=1,                 # line-buffered (text mode)
        text=True,
        env=env,
    ) as proc:
        for line in proc.stdout:
            print(line, end="")
            sys.stdout.flush()
            tail.append(line)

        ret = proc.wait()

    if ret != 0:
        print("\n❌ Training failed. Last 120 lines:\n" + "".join(tail))
        safe_backup_last(RUNS_ROOT, RUN_NAME)
        raise SystemExit(ret)

    print("✅ Training completed.")
    safe_backup_last(RUNS_ROOT, RUN_NAME)
    print(f"📁 Checkpoints at: {RUNS_ROOT}")

except KeyboardInterrupt:
    print("\n⏹️ Interrupted by user. Attempting to terminate training...")
    if proc is not None and proc.poll() is None:
        try:
            proc.terminate()
            try:
                # Wait for the trainer to exit before touching last.pt, so the
                # backup is not taken while the file may still be written.
                proc.wait(timeout=30)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()
        except OSError:
            # Best-effort shutdown: the process may already be gone.
            pass
    safe_backup_last(RUNS_ROOT, RUN_NAME)
    raise


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   158/199     30.6G   0.03916  0.003436         0    0.0426        50       640:  41%|████      | 22/54 [00:17<00:27,  1.18it/s]
   158/199     30.6G   0.03896  0.003404         0   0.04236        45       640:  41%|████      | 22/54 [00:18<00:27,  1.18it/s]
   158/199     30.6G   0.03896  0.003404         0   0.04236        45       640:  43%|████▎     | 23/54 [00:18<00:27,  1.14it/s]
   158/199     30.6G     0.039  0.003448         0   0.04245        48       640:  43%|████▎     | 23/54 [00:19<00:27,  1.14it/s]
   158/199     30.6G     0.039  0.003448         0   0.04245        48       640:  44%|████▍     | 24/54 [00:19<00:26,  1.11it/s]
   158/199     30.6G   0.03907  0.003467         0   0.04254        46       640:  44%|████▍     | 24/54 [00:19<00:26,  1.11it/s]
   158/199     30.6G   0.03907  0.003467         0   0.04254        46       640:  46%|████▋     | 25/54 [00:19<00:24,  1.16it/s]
   158/199     30.6G   0.