# FlauBERT Canonical Training + Phase 3 Export (Colab)

**Model:** FlauBERT-Base-Cased (via HuggingFace Transformers)

**Objective:** Text classification with canonical pipeline alignment:
- Phase 1: Canonical splits (load from Drive) - **SAME AS IMAGE MODELS**
- Phase 2: Canonical classes (27 classes, fp=cdfa70b13f7390e6) - **SAME AS IMAGE MODELS**
- Phase 3: Export contract (.npz + _meta.json) with strict validation

**Expected outputs:**
- `STORE/artifacts/exports/flaubert_canonical/val.npz`
- `STORE/artifacts/exports/flaubert_canonical/val_meta.json`

**Validation:**
- split_signature must match image models: `cf53f8eb169b3531`
- classes_fp must equal canonical: `cdfa70b13f7390e6`
- idx order must align with image models for fusion compatibility

**⚠️ CRITICAL ALIGNMENT:**
- Uses `load_splits()` to load SAME indices as image models
- NO independent train_test_split - ensures text & image use identical samples

In [1]:
from pathlib import Path
import os

from google.colab import drive
# Mount Google Drive; every path below lives under the mount point.
drive.mount("/content/drive")

# --- EDIT THESE PATHS ONCE ---
_MYDRIVE = Path("/content/drive/MyDrive")
DRIVE_CODE_SNAPSHOT = _MYDRIVE / "DS_rakuten_colab"
DRIVE_STORE = _MYDRIVE / "DS_rakuten_store"
# Expects train_idx.txt / val_idx.txt / test_idx.txt inside this folder.
DRIVE_SPLITS_SRC = DRIVE_STORE.joinpath("splits")
# ----------------------------

# The code snapshot must already exist; the store is created on demand.
assert DRIVE_CODE_SNAPSHOT.exists(), f"Missing code snapshot: {DRIVE_CODE_SNAPSHOT}"
DRIVE_STORE.mkdir(parents=True, exist_ok=True)

# Publish the store location through the environment.
os.environ["DS_RAKUTEN_STORE"] = str(DRIVE_STORE)

for _label, _path in (
    ("DRIVE_CODE_SNAPSHOT", DRIVE_CODE_SNAPSHOT),
    ("DRIVE_STORE", DRIVE_STORE),
    ("DRIVE_SPLITS_SRC", DRIVE_SPLITS_SRC),
):
    print(f"‚úì {_label}:", _path)

Mounted at /content/drive
‚úì DRIVE_CODE_SNAPSHOT: /content/drive/MyDrive/DS_rakuten_colab
‚úì DRIVE_STORE: /content/drive/MyDrive/DS_rakuten_store
‚úì DRIVE_SPLITS_SRC: /content/drive/MyDrive/DS_rakuten_store/splits


In [2]:
import shutil
import sys
from pathlib import Path

RUNTIME_ROOT = Path("/content/DS_rakuten")

# Always start from an empty runtime directory so that imports resolve
# against a fresh copy of the Drive snapshot (deterministic imports).
if RUNTIME_ROOT.exists():
    shutil.rmtree(RUNTIME_ROOT)
shutil.copytree(DRIVE_CODE_SNAPSHOT, RUNTIME_ROOT)

# Give the runtime copy the highest import priority.
_runtime_root_str = str(RUNTIME_ROOT)
sys.path.insert(0, _runtime_root_str)

print("‚úì Runtime code ready:", RUNTIME_ROOT)
print("‚úì sys.path[0]:", sys.path[0])

‚úì Runtime code ready: /content/DS_rakuten
‚úì sys.path[0]: /content/DS_rakuten


In [3]:
from pathlib import Path
import shutil

# Destination for the split index files inside the runtime code copy.
runtime_splits_dir = Path("/content/DS_rakuten/data/splits")
runtime_splits_dir.mkdir(parents=True, exist_ok=True)

# CRITICAL: copy the SAME split files used by the image models.
src_files = ["train_idx.txt", "val_idx.txt", "test_idx.txt"]
for fn in src_files:
    src, dst = DRIVE_SPLITS_SRC / fn, runtime_splits_dir / fn
    # Stop immediately if Drive does not hold the expected split file.
    assert src.exists(), f"Missing split file in Drive: {src}"
    shutil.copy2(src, dst)
    _n_bytes = src.stat().st_size
    print(f"‚úì Copied {fn}: {_n_bytes} bytes")

print("‚úì Splits synced to:", runtime_splits_dir)
_synced_files = list(runtime_splits_dir.glob("*.txt"))
print("‚úì Contents:", _synced_files)

‚úì Copied train_idx.txt: 421445 bytes
‚úì Copied val_idx.txt: 74328 bytes
‚úì Copied test_idx.txt: 87529 bytes
‚úì Splits synced to: /content/DS_rakuten/data/splits
‚úì Contents: [PosixPath('/content/DS_rakuten/data/splits/test_idx.txt'), PosixPath('/content/DS_rakuten/data/splits/val_idx.txt'), PosixPath('/content/DS_rakuten/data/splits/train_idx.txt')]


In [4]:
# Install the third-party packages this notebook relies on.
# The `!` lines are IPython shell escapes, so this cell only runs inside a
# notebook kernel (not as a plain Python script).
!pip -q install transformers datasets wandb
# NOTE(review): sacremoses is presumably needed by the slow (non-fast)
# FlauBERT tokenizer used during training -- confirm.
!pip install sacremoses
# Verify installation: import the packages and report their versions.
import transformers
import datasets
print(f"‚úì transformers version: {transformers.__version__}")
print(f"‚úì datasets version: {datasets.__version__}")

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m897.5/897.5 kB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1
‚úì transformers version: 4.57.3
‚úì datasets version: 4.0.0


In [5]:
# WandB Authentication (REQUIRED for logging!)
import wandb

# Login to WandB. wandb.login() returns a bool, so only report success when
# it is truthy; otherwise warn instead of claiming the login worked.
if wandb.login():
    print("‚úì WandB login successful")
else:
    print("WARNING: wandb.login() did not configure an API key; "
          "training metrics may not be logged to W&B")

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mxiaosong-dev[0m ([33mxiaosong-dev-formation-data-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


‚úì WandB login successful


In [6]:
# Smoke-test that the training entry points can be imported, and show what
# they resolve to.
from src.train.text_flaubert import FlauBERTConfig, run_flaubert_canonical

for _name, _obj in (
    ("FlauBERTConfig", FlauBERTConfig),
    ("run_flaubert_canonical", run_flaubert_canonical),
):
    print(f"‚úì {_name}:", _obj)

‚úì FlauBERTConfig: <class 'src.train.text_flaubert.FlauBERTConfig'>
‚úì run_flaubert_canonical: <function run_flaubert_canonical at 0x798e663f5bc0>


In [7]:
# VERIFY SPLIT SIGNATURE (must match image models)
from src.data.split_manager import load_splits, split_signature

# Signature reported for the image models' canonical splits. Kept in one
# place so the printed expectation and the actual check cannot drift apart.
EXPECTED_SPLIT_SIGNATURE = "cf53f8eb169b3531"

splits = load_splits(verbose=True)
sig = split_signature(splits)
signature_matches = sig == EXPECTED_SPLIT_SIGNATURE

print("="*60)
print("SPLIT SIGNATURE VERIFICATION")
print("="*60)
print(f"‚úì Signature: {sig}")
print(f"‚úì Expected: {EXPECTED_SPLIT_SIGNATURE} (from image models)")
print(f"‚úì Match: {signature_matches}")
print()
print("Split sizes:")
for k, v in splits.items():
    print(f"  {k}: {len(v)}")
print("="*60)

# CRITICAL: This signature MUST match image models for fusion!
# Stop here rather than training on splits that cannot be fused later.
if not signature_matches:
    raise ValueError(
        f"Split signature mismatch: got {sig}, "
        f"expected {EXPECTED_SPLIT_SIGNATURE} (image models)"
    )

[split_manager] Loading canonical splits from /content/DS_rakuten/data/splits
SPLIT SIGNATURE VERIFICATION
‚úì Signature: cf53f8eb169b3531
‚úì Expected: cf53f8eb169b3531 (from image models)
‚úì Match: True

Split sizes:
  train_idx: 61351
  val_idx: 10827
  test_idx: 12738


In [8]:
import os
from pathlib import Path

# Resolve the persistent store from the environment variable.
STORE = Path(os.environ["DS_RAKUTEN_STORE"])
_raw_dir = STORE / "data_raw"
_exports_root = STORE / "artifacts" / "exports"
_ckpt_dir = STORE / "checkpoints" / "text_flaubert"

# Full training / export configuration handed to run_flaubert_canonical.
cfg = FlauBERTConfig(
    raw_dir=str(_raw_dir),
    out_dir=str(_exports_root),
    ckpt_dir=str(_ckpt_dir),

    # Text config
    text_col="designation",
    text_col2="description",
    max_length=384,

    # Training config
    batch_size=128,  # Updated for Colab GPU
    num_epochs=20,
    lr=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,

    # Regularization
    label_smoothing=0.0,
    dropout=0.15,
    patience=2,

    # Model (use_fast=False is handled automatically in the training script)
    model_name="flaubert/flaubert_base_cased",  # or "flaubert/flaubert_large_cased"

    force_colab_loader=True,

    export_name="flaubert_canonical",
    export_split="val",
)

# Echo the most important settings so the recorded output documents the run.
_rule = "=" * 80
print(_rule)
print("FlauBERT Training Configuration")
print(_rule)
for _label, _attr in (
    ("Raw dir", "raw_dir"),
    ("Export dir", "out_dir"),
    ("Checkpoint dir", "ckpt_dir"),
    ("Model", "model_name"),
    ("Max length", "max_length"),
    ("Batch size", "batch_size"),
    ("Epochs", "num_epochs"),
    ("Export split", "export_split"),
):
    print(f"{_label}: {getattr(cfg, _attr)}")
print(_rule)

FlauBERT Training Configuration
Raw dir: /content/drive/MyDrive/DS_rakuten_store/data_raw
Export dir: /content/drive/MyDrive/DS_rakuten_store/artifacts/exports
Checkpoint dir: /content/drive/MyDrive/DS_rakuten_store/checkpoints/text_flaubert
Model: flaubert/flaubert_base_cased
Max length: 384
Batch size: 128
Epochs: 20
Export split: val


In [9]:
# RUN TRAINING: train, evaluate and export in one call, then summarise the
# returned dictionary.
# NOTE(review): in the recorded run the W&B summary shows test/* values equal
# to eval/* -- presumably both are computed on the val split inside
# run_flaubert_canonical; confirm against the training script.
result = run_flaubert_canonical(cfg)

_rule = "=" * 80
print(f"\n{_rule}")
print("TRAINING COMPLETE")
print(_rule)

print("\nValidation Metrics:")
for k, v in result["val_metrics"].items():
    # Floats are shown with four decimals; anything else is printed as-is.
    _shown = f"{v:.4f}" if isinstance(v, float) else v
    print(f"  {k}: {_shown}")

print("\nExport Result:")
_export = result["export_result"]
print(f"  NPZ path: {_export['npz_path']}")
print(f"  Meta path: {_export['meta_json_path']}")
print(f"  Probs shape: {result['probs_shape']}")

print("\nVerification:")
_verified = result["verify_metadata"]
print(f"  Model name: {_verified['model_name']}")
print(f"  Split signature: {_verified['split_signature']}")
print(f"  Classes fingerprint: {_verified['classes_fp']}")
print(_rule)



[INFO] Using Colab data loader (forced via force_colab_loader=True)
[load_data_colab] raw_dir: /content/drive/MyDrive/DS_rakuten_store/data_raw
[load_data_colab] img_root: None
[load_data_colab] X: /content/drive/MyDrive/DS_rakuten_store/data_raw/X_train_update.csv
[load_data_colab] Y: /content/drive/MyDrive/DS_rakuten_store/data_raw/Y_train_CVw08PX.csv
[split_manager] Loading canonical splits from /content/DS_rakuten/data/splits
‚úì Split signature: cf53f8eb169b3531
‚úì Split sizes: train=61351, val=10827, test=12738
‚úì Train size: 61351
‚úì Val size: 10827
‚úì Test size: 12738
[INFO] Loading tokenizer: flaubert/flaubert_base_cased
[INFO] Using fast tokenizer: False


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

[INFO] Loading model: flaubert/flaubert_base_cased


model.safetensors:   0%|          | 0.00/553M [00:00<?, ?B/s]

Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[INFO] Tokenizing datasets...


Map:   0%|          | 0/61351 [00:00<?, ? examples/s]

Map:   0%|          | 0/10827 [00:00<?, ? examples/s]

Map:   0%|          | 0/12738 [00:00<?, ? examples/s]

[INFO] Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.1039,0.84874,0.740925,0.687319,0.732844
2,0.6977,0.584813,0.824236,0.796218,0.822359
3,0.5452,0.516395,0.844093,0.823313,0.8431
4,0.4425,0.48096,0.851667,0.826627,0.85055
5,0.392,0.451166,0.86469,0.84805,0.865139
6,0.3443,0.412334,0.877067,0.861676,0.877204
7,0.3027,0.41763,0.880115,0.866417,0.87969
8,0.2566,0.416284,0.886857,0.874401,0.8867
9,0.2425,0.407463,0.889258,0.87653,0.889093
10,0.2111,0.403835,0.888797,0.874778,0.888585


[INFO] Evaluating on validation set...


‚úì Validation metrics: {'eval_loss': 0.40746307373046875, 'eval_accuracy': 0.8892583356423756, 'eval_f1_macro': 0.87652950558684, 'eval_f1_weighted': 0.8890930153200882, 'eval_runtime': 11.8666, 'eval_samples_per_second': 912.391, 'eval_steps_per_second': 3.624, 'epoch': 11.0}
[INFO] Generating predictions for val set...
[OK] Exported model=flaubert_canonical split=val npz=/content/drive/MyDrive/DS_rakuten_store/artifacts/exports/flaubert_canonical/val.npz sig=cf53f8eb169b3531 fp=cdfa70b13f7390e6 n=10827
‚úì Export verified: flaubert_canonical
‚úì Split signature matches: True
‚úì Classes fingerprint matches: True


0,1
eval/accuracy,‚ñÅ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
eval/f1_macro,‚ñÅ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
eval/f1_weighted,‚ñÅ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
eval/loss,‚ñà‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
eval/runtime,‚ñÜ‚ñÉ‚ñÉ‚ñá‚ñà‚ñá‚ñÖ‚ñá‚ñÖ‚ñÅ‚ñÑ‚ñÇ
eval/samples_per_second,‚ñÉ‚ñÜ‚ñÜ‚ñÇ‚ñÅ‚ñÇ‚ñÑ‚ñÇ‚ñÑ‚ñà‚ñÖ‚ñá
eval/steps_per_second,‚ñÉ‚ñÜ‚ñÜ‚ñÇ‚ñÅ‚ñÇ‚ñÑ‚ñÇ‚ñÑ‚ñà‚ñÖ‚ñá
test/accuracy,‚ñÅ
test/f1_macro,‚ñÅ
test/f1_weighted,‚ñÅ

0,1
eval/accuracy,0.88926
eval/f1_macro,0.87653
eval/f1_weighted,0.88909
eval/loss,0.40746
eval/runtime,11.8666
eval/samples_per_second,912.391
eval/steps_per_second,3.624
test/accuracy,0.88926
test/f1_macro,0.87653
test/f1_weighted,0.88909



TRAINING COMPLETE

Validation Metrics:
  eval_loss: 0.4075
  eval_accuracy: 0.8893
  eval_f1_macro: 0.8765
  eval_f1_weighted: 0.8891
  eval_runtime: 11.8666
  eval_samples_per_second: 912.3910
  eval_steps_per_second: 3.6240
  epoch: 11.0000

Export Result:
  NPZ path: /content/drive/MyDrive/DS_rakuten_store/artifacts/exports/flaubert_canonical/val.npz
  Meta path: /content/drive/MyDrive/DS_rakuten_store/artifacts/exports/flaubert_canonical/val_meta.json
  Probs shape: (10827, 27)

Verification:
  Model name: flaubert_canonical
  Split signature: cf53f8eb169b3531
  Classes fingerprint: cdfa70b13f7390e6


In [10]:
# Verify export files exist
import os
from pathlib import Path

STORE = Path(os.environ["DS_RAKUTEN_STORE"])
export_dir = STORE / "artifacts" / "exports" / cfg.export_name

# Derive the expected file names from the configured export split instead of
# hard-coding "val", so this check follows cfg.export_split.
# NOTE(review): assumes the exporter names files "<split>.npz" and
# "<split>_meta.json", as observed for the "val" split -- confirm.
npz_name = f"{cfg.export_split}.npz"
meta_name = f"{cfg.export_split}_meta.json"

print("Export dir:", export_dir)
# glob() yields entries in arbitrary order; sort for a stable listing.
print("Contents:", sorted(p.name for p in export_dir.glob("*")))

assert (export_dir / npz_name).exists(), f"Missing {npz_name}"
assert (export_dir / meta_name).exists(), f"Missing {meta_name}"
print("‚úì Export files exist.")

Export dir: /content/drive/MyDrive/DS_rakuten_store/artifacts/exports/flaubert_canonical
Contents: ['val.npz', 'val_meta.json']
‚úì Export files exist.


In [11]:
# CRITICAL: Verify alignment with image models
import json
from pathlib import Path
import os

STORE = Path(os.environ["DS_RAKUTEN_STORE"])
_exports_root = STORE / "artifacts" / "exports"
text_meta = _exports_root / cfg.export_name / "val_meta.json"

# Compare with image model export (if exists). Candidates are tried in this
# order and only the first one found on disk is compared.
image_meta_candidates = [
    _exports_root / _model_dir / "val_meta.json"
    for _model_dir in ("convnext_canonical_smoke", "swin_canonical", "vit_canonical")
]

_rule = "=" * 80

# Show the text model's own export metadata.
text_data = json.loads(text_meta.read_text())
print(_rule)
print("TEXT MODEL METADATA")
print(_rule)
for _label, _key in (
    ("Model", "model_name"),
    ("Split", "split_name"),
    ("Split signature", "split_signature"),
    ("Classes fingerprint", "classes_fp"),
    ("Num samples", "num_samples"),
    ("Probs shape", "probs_shape"),
):
    print(f"{_label}: {text_data[_key]}")
print()

# Fields that have to agree between the text export and the image export.
_compared_fields = (
    ("Split signature", "split_signature"),
    ("Classes FP", "classes_fp"),
    ("Num samples", "num_samples"),
)

for img_meta_path in image_meta_candidates:
    if not img_meta_path.exists():
        continue
    img_data = json.loads(img_meta_path.read_text())
    print(_rule)
    print(f"COMPARISON WITH {img_data['model_name'].upper()}")
    print(_rule)
    for _label, _key in _compared_fields:
        _same = text_data[_key] == img_data[_key]
        if _same:
            print(f"{_label} match: {_same} ‚úì")
        else:
            print(f"{_label} MISMATCH! ‚úó")
    print()
    break
else:
    # Reached only when no candidate metadata file exists.
    print("‚ö†Ô∏è No image model exports found for comparison")
    print("   Train an image model first to verify alignment")

TEXT MODEL METADATA
Model: flaubert_canonical
Split: val
Split signature: cf53f8eb169b3531
Classes fingerprint: cdfa70b13f7390e6
Num samples: 10827
Probs shape: [10827, 27]

COMPARISON WITH CONVNEXT_CANONICAL_SMOKE
Split signature match: True ‚úì
Classes FP match: True ‚úì
Num samples match: True ‚úì



In [12]:
# Reload the exported predictions through the project loader, passing the
# current split signature and canonical class fingerprint for verification,
# then print what was loaded.
from src.export.model_exporter import load_predictions
from src.data.label_mapping import CANONICAL_CLASSES_FP
from src.data.split_manager import load_splits, split_signature

splits = load_splits(verbose=False)
sig = split_signature(splits)

_npz_path = str(result["export_result"]["npz_path"])
loaded = load_predictions(
    npz_path=_npz_path,
    verify_split_signature=sig,
    verify_classes_fp=CANONICAL_CLASSES_FP,
    require_y_true=True,
)

print("‚úì Loaded predictions successfully")
for _label, _key in (
    ("Model", "model_name"),
    ("Split", "split_name"),
    ("Signature", "split_signature"),
    ("Fingerprint", "classes_fp"),
):
    print(f"  {_label}: {loaded['metadata'][_key]}")
for _label, _key in (("Probs", "probs"), ("Y_true", "y_true"), ("Idx", "idx")):
    print(f"  {_label} shape: {loaded[_key].shape}")

‚úì Loaded predictions successfully
  Model: flaubert_canonical
  Split: val
  Signature: cf53f8eb169b3531
  Fingerprint: cdfa70b13f7390e6
  Probs shape: (10827, 27)
  Y_true shape: (10827,)
  Idx shape: (10827,)
