# LeNet-5 Canonical Training + Phase 3 Export (Colab)

**Model:** LeNet-5 (adapted for 32x32 RGB images)

**Objective:**
- Phase 1: Canonical splits (load from Drive)
- Phase 2: Canonical classes (27 classes, fp=cdfa70b13f7390e6)
- Phase 3: Export contract (.npz + _meta.json) with strict validation

**Expected outputs:**
- `STORE/artifacts/exports/lenet_canonical/val.npz`
- `STORE/artifacts/exports/lenet_canonical/val_meta.json`

**Validation:**
- split_signature must match ResNet50: `cf53f8eb169b3531`
- classes_fp must equal canonical: `cdfa70b13f7390e6`
- idx order must align with ResNet50 for fusion compatibility

In [1]:
from pathlib import Path
import os

from google.colab import drive
drive.mount("/content/drive")

# --- EDIT THESE PATHS ONCE ---
DRIVE_CODE_SNAPSHOT = Path("/content/drive/MyDrive/DS_rakuten_colab")
DRIVE_STORE = Path("/content/drive/MyDrive/DS_rakuten_store")
DRIVE_SPLITS_SRC = DRIVE_STORE / "splits"   # expects train_idx.txt / val_idx.txt / test_idx.txt
# ----------------------------

assert DRIVE_CODE_SNAPSHOT.exists(), f"Missing code snapshot: {DRIVE_CODE_SNAPSHOT}"
DRIVE_STORE.mkdir(parents=True, exist_ok=True)

os.environ["DS_RAKUTEN_STORE"] = str(DRIVE_STORE)

print("✓ DRIVE_CODE_SNAPSHOT:", DRIVE_CODE_SNAPSHOT)
print("✓ DRIVE_STORE:", DRIVE_STORE)
print("✓ DRIVE_SPLITS_SRC:", DRIVE_SPLITS_SRC)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ DRIVE_CODE_SNAPSHOT: /content/drive/MyDrive/DS_rakuten_colab
✓ DRIVE_STORE: /content/drive/MyDrive/DS_rakuten_store
✓ DRIVE_SPLITS_SRC: /content/drive/MyDrive/DS_rakuten_store/splits


In [2]:
import shutil
import sys
from pathlib import Path

RUNTIME_ROOT = Path("/content/DS_rakuten")

# Clean and copy for deterministic imports
if RUNTIME_ROOT.exists():
    shutil.rmtree(RUNTIME_ROOT)

shutil.copytree(DRIVE_CODE_SNAPSHOT, RUNTIME_ROOT)

sys.path.insert(0, str(RUNTIME_ROOT))

print("✓ Runtime code ready:", RUNTIME_ROOT)
print("✓ sys.path[0]:", sys.path[0])

✓ Runtime code ready: /content/DS_rakuten
✓ sys.path[0]: /content/DS_rakuten


In [3]:
from pathlib import Path
import shutil

runtime_splits_dir = Path("/content/DS_rakuten/data/splits")
runtime_splits_dir.mkdir(parents=True, exist_ok=True)

# Copy txt files from Drive persistent store into /content runtime repo
src_files = ["train_idx.txt", "val_idx.txt", "test_idx.txt"]
for fn in src_files:
    src = DRIVE_SPLITS_SRC / fn
    dst = runtime_splits_dir / fn
    assert src.exists(), f"Missing split file in Drive: {src}"
    shutil.copy2(src, dst)

print("✓ Splits synced to:", runtime_splits_dir)
print("✓ Contents:", list(runtime_splits_dir.glob("*.txt"))[:10])

✓ Splits synced to: /content/DS_rakuten/data/splits
✓ Contents: [PosixPath('/content/DS_rakuten/data/splits/val_idx.txt'), PosixPath('/content/DS_rakuten/data/splits/test_idx.txt'), PosixPath('/content/DS_rakuten/data/splits/train_idx.txt')]


In [4]:
# Install wandb for experiment tracking
!pip -q install wandb

# Uncomment if your session is missing other packages:
# !pip -q install gdown
# !pip -q install scikit-learn

import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mxiaosong-dev[0m ([33mxiaosong-dev-formation-data-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
from pathlib import Path

IMAGE_FILE_ID = "15ZkS0iTQ7j3mHpxil4mABlXwP-jAN_zi"

BASE_DIR = Path("/content/images")
TMP_DIR = Path("/content/tmp")
ZIP_PATH = TMP_DIR / "images.zip"

BASE_DIR.mkdir(parents=True, exist_ok=True)
TMP_DIR.mkdir(parents=True, exist_ok=True)

if not ZIP_PATH.exists():
    print("Downloading images zip...")
    !gdown --id $IMAGE_FILE_ID -O {str(ZIP_PATH)}
else:
    print("Zip already present:", ZIP_PATH)

print("Unzipping images...")
!unzip -q -o {str(ZIP_PATH)} -d {str(BASE_DIR)}

def count_jpgs(p: Path, limit: int = 2000) -> int:
    if not p.exists():
        return 0
    n = 0
    for _ in p.rglob("*.jpg"):
        n += 1
        if n >= limit:
            break
    return n

# Common candidates
candidates = [
    BASE_DIR / "images" / "image_train",
    BASE_DIR / "image_train",
    BASE_DIR / "images" / "images" / "image_train",
]

best = None
best_count = 0
for c in candidates:
    n = count_jpgs(c)
    if n > best_count:
        best, best_count = c, n

# Fallback: search any folder named image_train
if best_count == 0:
    for c in BASE_DIR.rglob("image_train"):
        if c.is_dir():
            n = count_jpgs(c)
            if n > best_count:
                best, best_count = c, n

assert best is not None and best_count > 0, (
    "Could not find an image_train directory with jpg files under /content/images. "
    "Check zip content and unzip path."
)

IMG_ROOT = best
sample_jpg = next(IMG_ROOT.rglob("*.jpg"))

print("✓ IMG_ROOT detected:", IMG_ROOT)
print("✓ sample jpg:", sample_jpg)

Zip already present: /content/tmp/images.zip
Unzipping images...
✓ IMG_ROOT detected: /content/images/images/image_train
✓ sample jpg: /content/images/images/image_train/image_1068762146_product_1199752710.jpg


In [6]:
from src.data.image_dataset import RakutenImageDataset
from src.train.image_lenet import LeNetConfig, run_lenet_canonical

print("✓ RakutenImageDataset:", RakutenImageDataset)
print("✓ LeNetConfig:", LeNetConfig)
print("✓ run_lenet_canonical:", run_lenet_canonical)

✓ RakutenImageDataset: <class 'src.data.image_dataset.RakutenImageDataset'>
✓ LeNetConfig: <class 'src.train.image_lenet.LeNetConfig'>
✓ run_lenet_canonical: <function run_lenet_canonical at 0x7c90b43e4360>


In [7]:
from src.data.split_manager import load_splits, split_signature

splits = load_splits(verbose=True)
sig = split_signature(splits)

print("✓ signature:", sig)
print({k: len(v) for k, v in splits.items()})

[split_manager] Loading canonical splits from /content/DS_rakuten/data/splits
✓ signature: cf53f8eb169b3531
{'train_idx': 61351, 'val_idx': 10827, 'test_idx': 12738}


In [8]:
import os
from pathlib import Path

STORE = Path(os.environ["DS_RAKUTEN_STORE"])

wandb.init(
    project="rakuten_image",
    name="lenet",
    config={
        "model": "LeNet-5",
        "batch_size": 128,
        "lr": 1e-3,
        "epochs": 30,
    }
)

cfg = LeNetConfig(
    raw_dir=str(STORE / "data_raw"),
    img_dir=str(IMG_ROOT),  # must be /content local disk for speed
    out_dir=str(STORE / "artifacts" / "exports"),
    ckpt_dir=str(STORE / "checkpoints" / "image_lenet"),

    img_size=32,
    batch_size=128,
    num_workers=4,
    num_epochs=30,
    lr=1e-3,
    weight_decay=1e-4,

    use_amp=True,
    dropout_rate=0.5,

    force_colab_loader=True,  # Force Colab data loader

    model_name="lenet_canonical",
    export_split="val",
)

wandb.config.update(cfg.__dict__)

try:

    result = run_lenet_canonical(cfg)

    print("EXPORT:", result["export_result"])
    print("VERIFY:", result["verify_metadata"])
    print("probs_shape:", result["probs_shape"])
    print("best_val_f1:", result["best_val_f1"])

    wandb.log({"best_val_f1": result["best_val_f1"]})

finally:
    wandb.finish()

[INFO] Using Colab data loader (forced via force_colab_loader=True)
[load_data_colab] raw_dir: /content/drive/MyDrive/DS_rakuten_store/data_raw
[load_data_colab] img_root: /content/images/images/image_train
[load_data_colab] X: /content/drive/MyDrive/DS_rakuten_store/data_raw/X_train_update.csv
[load_data_colab] Y: /content/drive/MyDrive/DS_rakuten_store/data_raw/Y_train_CVw08PX.csv




[split_manager] Loading canonical splits from /content/DS_rakuten/data/splits




Epoch 1/30 | train_loss=2.9326 train_f1=0.1055 | val_loss=2.6894 val_f1=0.1713 | lr=1.00e-03




Epoch 2/30 | train_loss=2.7103 train_f1=0.1647 | val_loss=2.5358 val_f1=0.1844 | lr=1.00e-03




Epoch 3/30 | train_loss=2.6318 train_f1=0.1848 | val_loss=2.4539 val_f1=0.2171 | lr=1.00e-03




Epoch 4/30 | train_loss=2.5864 train_f1=0.1972 | val_loss=2.4320 val_f1=0.2267 | lr=1.00e-03




Epoch 5/30 | train_loss=2.5505 train_f1=0.2112 | val_loss=2.3794 val_f1=0.2436 | lr=1.00e-03




Epoch 6/30 | train_loss=2.5203 train_f1=0.2184 | val_loss=2.3701 val_f1=0.2463 | lr=1.00e-03




Epoch 7/30 | train_loss=2.5039 train_f1=0.2245 | val_loss=2.3696 val_f1=0.2481 | lr=1.00e-03




Epoch 8/30 | train_loss=2.4888 train_f1=0.2297 | val_loss=2.3338 val_f1=0.2667 | lr=1.00e-03




Epoch 9/30 | train_loss=2.4782 train_f1=0.2306 | val_loss=2.3414 val_f1=0.2679 | lr=1.00e-03




Epoch 10/30 | train_loss=2.4666 train_f1=0.2349 | val_loss=2.3221 val_f1=0.2575 | lr=1.00e-03




Epoch 11/30 | train_loss=2.4506 train_f1=0.2383 | val_loss=2.3213 val_f1=0.2626 | lr=1.00e-03




Epoch 12/30 | train_loss=2.4481 train_f1=0.2403 | val_loss=2.3033 val_f1=0.2730 | lr=1.00e-03




Epoch 13/30 | train_loss=2.4430 train_f1=0.2431 | val_loss=2.2823 val_f1=0.2805 | lr=1.00e-03




Epoch 14/30 | train_loss=2.4398 train_f1=0.2436 | val_loss=2.2899 val_f1=0.2745 | lr=1.00e-03




Epoch 15/30 | train_loss=2.4384 train_f1=0.2458 | val_loss=2.2834 val_f1=0.2820 | lr=1.00e-03




Epoch 16/30 | train_loss=2.4275 train_f1=0.2491 | val_loss=2.2693 val_f1=0.2853 | lr=1.00e-03




Epoch 17/30 | train_loss=2.4238 train_f1=0.2511 | val_loss=2.2621 val_f1=0.2836 | lr=1.00e-03




Epoch 18/30 | train_loss=2.4132 train_f1=0.2522 | val_loss=2.2840 val_f1=0.2844 | lr=1.00e-03




Epoch 19/30 | train_loss=2.4148 train_f1=0.2538 | val_loss=2.2696 val_f1=0.2917 | lr=1.00e-03




Epoch 20/30 | train_loss=2.4158 train_f1=0.2522 | val_loss=2.2746 val_f1=0.2863 | lr=1.00e-03




Epoch 21/30 | train_loss=2.4166 train_f1=0.2537 | val_loss=2.2741 val_f1=0.2821 | lr=1.00e-03




Epoch 22/30 | train_loss=2.4065 train_f1=0.2548 | val_loss=2.2546 val_f1=0.2891 | lr=1.00e-03




Epoch 23/30 | train_loss=2.4052 train_f1=0.2565 | val_loss=2.2568 val_f1=0.2929 | lr=1.00e-03




Epoch 24/30 | train_loss=2.4024 train_f1=0.2605 | val_loss=2.2497 val_f1=0.2930 | lr=1.00e-03




Epoch 25/30 | train_loss=2.3998 train_f1=0.2575 | val_loss=2.2441 val_f1=0.2913 | lr=1.00e-03




Epoch 26/30 | train_loss=2.3974 train_f1=0.2583 | val_loss=2.2438 val_f1=0.2966 | lr=1.00e-03




Epoch 27/30 | train_loss=2.3942 train_f1=0.2613 | val_loss=2.2380 val_f1=0.2978 | lr=1.00e-03




Epoch 28/30 | train_loss=2.3909 train_f1=0.2630 | val_loss=2.2381 val_f1=0.2949 | lr=1.00e-03




Epoch 29/30 | train_loss=2.3891 train_f1=0.2604 | val_loss=2.2326 val_f1=0.2967 | lr=1.00e-03




Epoch 30/30 | train_loss=2.3842 train_f1=0.2613 | val_loss=2.2349 val_f1=0.2921 | lr=1.00e-03




[OK] Exported model=lenet_canonical split=val npz=/content/drive/MyDrive/DS_rakuten_store/artifacts/exports/lenet_canonical/val.npz sig=cf53f8eb169b3531 fp=cdfa70b13f7390e6 n=10827
EXPORT: {'npz_path': '/content/drive/MyDrive/DS_rakuten_store/artifacts/exports/lenet_canonical/val.npz', 'meta_json_path': '/content/drive/MyDrive/DS_rakuten_store/artifacts/exports/lenet_canonical/val_meta.json', 'classes_fp': 'cdfa70b13f7390e6', 'split_signature': 'cf53f8eb169b3531', 'num_samples': 10827}
VERIFY: {'model_name': 'lenet_canonical', 'split_name': 'val', 'split_signature': 'cf53f8eb169b3531', 'classes_fp': 'cdfa70b13f7390e6', 'num_classes': 27, 'num_samples': 10827, 'has_y_true': True, 'probs_shape': [10827, 27], 'probs_dtype': 'float32', 'created_at': '2026-01-10T14:06:28.408471', 'extra': {'source': 'src/train/image_lenet.py', 'model_architecture': 'LeNet-5', 'img_dir': '/content/images/images/image_train', 'img_size': 32, 'batch_size': 128, 'num_epochs': 30, 'lr': 0.001, 'weight_decay': 0.

0,1
best_val_f1,▁
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
lr,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_acc,▁▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇██████████████
train_f1,▁▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████
train_loss,█▅▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▃▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████
val_f1,▁▂▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇█▇▇█████████
val_loss,█▆▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
best_val_f1,0.29784
epoch,29.0
lr,0.001
train_acc,0.31243
train_f1,0.26132
train_loss,2.38424
val_acc,0.35042
val_f1,0.29209
val_loss,2.23493


In [9]:
import os
from pathlib import Path

STORE = Path(os.environ["DS_RAKUTEN_STORE"])
export_dir = STORE / "artifacts" / "exports" / "lenet_canonical"

print("Export dir:", export_dir)
print("Contents:", [p.name for p in export_dir.glob("*")])

assert (export_dir / "val.npz").exists(), "Missing val.npz"
assert (export_dir / "val_meta.json").exists(), "Missing val_meta.json"
print("✓ Export files exist.")

Export dir: /content/drive/MyDrive/DS_rakuten_store/artifacts/exports/lenet_canonical
Contents: ['val.npz', 'val_meta.json']
✓ Export files exist.


In [10]:
!python -m apps.image_app.scripts.validate_exports --split val --exports-root "$DS_RAKUTEN_STORE/artifacts/exports" --strict

/usr/bin/python3: Error while finding module specification for 'apps.image_app.scripts.validate_exports' (ModuleNotFoundError: No module named 'apps')


In [11]:
import json
from pathlib import Path
import os

STORE = Path(os.environ["DS_RAKUTEN_STORE"])
meta_path = STORE / "artifacts" / "exports" / "lenet_canonical" / "val_meta.json"

meta = json.loads(meta_path.read_text())
keys = [
    "model_name", "split_name", "split_signature",
    "classes_fp", "num_samples", "probs_shape"
]
for k in keys:
    print(f"{k}: {meta.get(k)}")

model_name: lenet_canonical
split_name: val
split_signature: cf53f8eb169b3531
classes_fp: cdfa70b13f7390e6
num_samples: 10827
probs_shape: [10827, 27]


In [12]:
import shutil
from pathlib import Path
from src.export.model_exporter import load_predictions
from src.data.label_mapping import CANONICAL_CLASSES_FP
from src.data.split_manager import load_splits, split_signature

splits = load_splits(verbose=False)
sig = split_signature(splits)

CACHE = Path("/content/cache_exports")
CACHE.mkdir(parents=True, exist_ok=True)

export_result = result["export_result"]
npz_src = Path(export_result["npz_path"])
meta_src = npz_src.with_name(npz_src.stem + "_meta.json")

npz_local = CACHE / npz_src.name
meta_local = CACHE / meta_src.name

# Copy both files (npz + meta)
if (not npz_local.exists()) or (npz_local.stat().st_size != npz_src.stat().st_size):
    shutil.copy2(npz_src, npz_local)

if (not meta_local.exists()) or (meta_local.stat().st_size != meta_src.stat().st_size):
    shutil.copy2(meta_src, meta_local)

loaded = load_predictions(
    npz_path=str(npz_local),
    verify_split_signature=sig,
    verify_classes_fp=CANONICAL_CLASSES_FP,
    require_y_true=True,
)

print("✓ loaded ok")
print("model:", loaded["metadata"]["model_name"])
print("split:", loaded["metadata"]["split_name"])
print("sig:", loaded["metadata"]["split_signature"])
print("fp:", loaded["metadata"]["classes_fp"])
print("probs:", loaded["probs"].shape)

✓ loaded ok
model: lenet_canonical
split: val
sig: cf53f8eb169b3531
fp: cdfa70b13f7390e6
probs: (10827, 27)


In [13]:
import os
from pathlib import Path

STORE = Path(os.environ["DS_RAKUTEN_STORE"])
export_dir = STORE / "artifacts" / "exports" / "lenet_canonical"

print("Export dir:", export_dir)
print("Files:", [p.name for p in export_dir.glob("*")])

assert (export_dir / "val.npz").exists(), "Missing val.npz"
assert (export_dir / "val_meta.json").exists(), "Missing val_meta.json"
print("✓ Export files exist")

Export dir: /content/drive/MyDrive/DS_rakuten_store/artifacts/exports/lenet_canonical
Files: ['val.npz', 'val_meta.json']
✓ Export files exist
