In [None]:
# ============================================================================
# PHASE 3: EXPORT VAL PREDICTIONS (ALIGNMENT-SAFE)
# ============================================================================
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Banner marking the start of the validation-prediction export cell.
print("\n" + "="*80, "EXPORTING VAL PREDICTIONS FOR FUSION", "="*80, sep="\n")

from src.data.label_mapping import CANONICAL_CLASSES, reorder_probs_to_canonical
from src.export.model_exporter import export_predictions

# Dataset wrapper that reports which row of the full dataset each sample came from
class IndexedDataset(Dataset):
    """Subset view that yields ``(image, real_idx)`` instead of ``(image, label)``.

    The second element is the position of the sample in the full dataset, so
    predictions collected from a DataLoader can be checked against the
    expected split indices afterwards.

    Args:
        base_dataset_full: Dataset over the full frame; indexing it must
            return a 2-tuple whose first element is the image.
        indices: Positions into ``base_dataset_full`` that make up this subset
            (e.g., splits["val_idx"]).
    """

    def __init__(self, base_dataset_full, indices):
        self.indices = indices
        self.base_dataset = base_dataset_full

    def __len__(self):
        # One sample per subset index.
        return len(self.indices)

    def __getitem__(self, i):
        # Translate the subset position into a plain-int position in the full dataset.
        position_in_full = int(self.indices[i])
        image, _discarded_label = self.base_dataset[position_in_full]
        return image, position_in_full

# Build a dataset over ALL rows of df_full so that positions in it coincide
# with the canonical split indices; the dataset class needs 'encoded_label'.
df_full_labeled = df_full.assign(
    encoded_label=le.transform(df_full['prdtypecode'])
)

full_dataset_for_export = RakutenImageDataset(
    df=df_full_labeled, img_root=IMG_ROOT, transform=val_transform
)

# Restrict to the val split while keeping track of each sample's real index.
val_dataset_indexed = IndexedDataset(full_dataset_for_export, splits["val_idx"])

# shuffle=False keeps loader order equal to splits["val_idx"] order.
_export_loader_options = dict(
    batch_size=CONFIG["batch_size"],
    shuffle=False,
    num_workers=CONFIG.get("num_workers", 0),
    pin_memory=True,
)
val_loader_indexed = DataLoader(val_dataset_indexed, **_export_loader_options)

# Load best model from checkpoint.
# Requires `model`, `device` and `CONFIG` from the setup cells and the
# "convnext_best.pth" file written by the training loop, so this cell must be
# run after training even though it appears first in the notebook.
# NOTE(review): weights_only=False unpickles arbitrary Python objects; only
# load checkpoints produced by this notebook.
print("Loading best model from convnext_best.pth")
checkpoint = torch.load("convnext_best.pth", map_location=device, weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Collect predictions and indices
val_probs_list = []
val_seen_idx_list = []

print("Running inference on val split...")
with torch.no_grad():
    for images, indices in tqdm(val_loader_indexed, desc="Val Inference"):
        images = images.to(device)

        if CONFIG["use_amp"]:
            with torch.amp.autocast(device_type="cuda"):
                outputs = model(images)
        else:
            outputs = model(images)

        # The softmax runs outside the autocast region, so with AMP it would
        # operate on (possibly float16) logits and export half-precision
        # probabilities. Cast to float32 first so the exported rows keep full
        # precision for downstream fusion.
        probs = torch.softmax(outputs.float(), dim=-1)
        val_probs_list.append(probs.cpu().numpy())
        val_seen_idx_list.append(indices.cpu().numpy())

# Concatenate per-batch results into one (num_samples, num_classes) array of
# probabilities and one matching array of real indices.
val_probs = np.concatenate(val_probs_list, axis=0)
val_idx = np.concatenate(val_seen_idx_list)

# Defensive assertions: the loader must have produced exactly the canonical
# val indices, and in the canonical order.
expected_val_idx = splits["val_idx"]
n_collected = len(val_idx)
n_expected = len(expected_val_idx)

print("\nAlignment verification:")
print(f"  Collected {n_collected} samples")
print(f"  Expected {n_expected} samples")

assert n_collected == n_expected, f"Sample count mismatch: {n_collected} != {n_expected}"
assert np.array_equal(np.sort(val_idx), np.sort(expected_val_idx)), "Index set mismatch"
assert np.array_equal(val_idx, expected_val_idx), "Index order mismatch (shuffle=False violation)"
print("  ‚úì Alignment verified")

# Ground-truth product type codes, looked up by the indices actually seen.
val_labels = df_full.iloc[val_idx]["prdtypecode"].values

# The encoder must know exactly as many classes as the model emits columns.
n_encoder_classes = len(le.classes_)
n_prob_columns = val_probs.shape[1]
assert n_encoder_classes == n_prob_columns, f"Encoder classes ({n_encoder_classes}) != probs shape[1] ({n_prob_columns})"

# Put the probability columns into the canonical class order.
print("\nReordering probabilities to canonical class order...")
val_probs_aligned = reorder_probs_to_canonical(val_probs, le.classes_, CANONICAL_CLASSES)
print(f"  Input shape: {val_probs.shape} ‚Üí Output shape: {val_probs_aligned.shape}")

# Metadata stored alongside the exported predictions.
export_extra_meta = {
    "model_architecture": CONFIG["model_name"],
    "checkpoint": "convnext_best.pth",
    "image_size": CONFIG["img_size"],
    "batch_size": CONFIG["batch_size"],
    "drop_path_rate": CONFIG["drop_path_rate"],
    "use_ema": CONFIG["use_ema"],
}

# Write the predictions through the project's exporter.
print("\nExporting predictions...")
export_result = export_predictions(
    out_dir="artifacts/exports",
    model_name="convnext",
    split_name="val",
    idx=val_idx,
    split_signature=sig,
    probs=val_probs_aligned,
    classes=CANONICAL_CLASSES,
    y_true=val_labels,
    extra_meta=export_extra_meta,
)

# Summary of what the exporter reported back.
print("\n" + "="*80)
print("EXPORT SUMMARY")
print("="*80)
for _summary_label, _summary_key in (
    ("NPZ file:", "npz_path"),
    ("Metadata:", "meta_json_path"),
    ("Classes_fp:", "classes_fp"),
    ("Split_sig:", "split_signature"),
    ("Num samples:", "num_samples"),
):
    # Labels are left-padded to 14 columns so the values line up.
    print(f"{_summary_label:<14}{export_result[_summary_key]}")
print("="*80)

# Rakuten Image Classification - ConvNeXt

## ConvNeXt Model Exploration
This notebook evaluates the ConvNeXt architecture for comparison with the Swin Transformer baseline (best: 74.64%).

**ConvNeXt Features:**
- Modernized CNN architecture inspired by Vision Transformers
- Efficient training with larger batch sizes and higher resolutions
- Strong performance on image classification tasks

**Training Strategy:**
- Higher resolution: 384x384 (vs 224x224 for Swin)
- Anti-overfitting: Mixup, CutMix, Stochastic Depth, **EMA**
- Optimizer: AdamW with LayerNorm-aware weight decay
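- Data: Colab-friendly Google Drive loading (85% dev / 15% holdout); train/val/test rows are selected with the canonical split indices loaded from `src.data.split_manager`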
- WandB tracking for experiment monitoring

## 1. Setup Environment & Dependencies

In [None]:
# @title Install Dependencies!pip install -q timm gdown pandas scikit-learn matplotlib torch torchvision tqdmimport osimport gcfrom datetime import datetime  # ‚úÖ Fixed: Only import datetime classimport torchimport torch.nn as nnimport pandas as pdimport numpy as npimport gdownimport timmfrom torch.utils.data import Dataset, DataLoaderfrom PIL import Imagefrom sklearn.model_selection import train_test_splitfrom sklearn.preprocessing import LabelEncoderfrom sklearn.metrics import f1_score, accuracy_score, classification_reportfrom tqdm.auto import tqdmfrom torchvision import transformsfrom torch.cuda.amp import GradScalerfrom timm.utils import ModelEmaV2  # EMA supportimport matplotlib.pyplot as plt# Set seed for reproducibilitydef set_seed(seed=42):    torch.manual_seed(seed)    torch.cuda.manual_seed(seed)    torch.cuda.manual_seed_all(seed)    np.random.seed(seed)    torch.backends.cudnn.deterministic = True    torch.backends.cudnn.benchmark = Trueset_seed(42)device = torch.device("cuda" if torch.cuda.is_available() else "cpu")print(f"‚úÖ Environment setup complete. Using device: {device}")if torch.cuda.is_available():    print(f"   GPU: {torch.cuda.get_device_name(0)}")    print(f"   CUDA Version: {torch.version.cuda}")

## 2. Download Data with Proper Split

In [None]:
# @title Download CSV Data
def load_csv_from_gdrive(share_url: str, **read_csv_kwargs) -> pd.DataFrame:
    """Read a CSV from a Google Drive share link of the form ``.../d/<file_id>/...``.

    Args:
        share_url: Share URL containing a ``/d/<file_id>`` segment.
        **read_csv_kwargs: Forwarded unchanged to ``pd.read_csv``.

    Returns:
        The parsed DataFrame, or ``None`` (after printing a message) when the
        URL has no ``/d/`` segment to extract the file id from.
    """
    try:
        # Everything after the first "/d/" up to the next "/" is the file id;
        # a URL without "/d/" has no second piece and raises IndexError.
        pieces_around_marker = share_url.split("/d/")
        drive_file_id = pieces_around_marker[1].split("/")[0]
        direct_url = "https://drive.google.com/uc?id=" + drive_file_id
        frame = pd.read_csv(direct_url, **read_csv_kwargs)
    except IndexError:
        print(f"Error parsing URL: {share_url}")
        frame = None
    return frame

# Download the feature and label CSVs and combine them into `df_full`.
print("Downloading CSV data...")
X_train_url = "https://drive.google.com/file/d/1geSiJTTjamysiSbJ8-W9gR1kv-x6HyEd/view?usp=drive_link"
y_train_url = "https://drive.google.com/file/d/16czWmLR5Ff0s5aYIqy1rHT7hc6Gcpfw3/view?usp=sharing"

try:
    X_train_full = load_csv_from_gdrive(X_train_url)
    y_train_full = load_csv_from_gdrive(y_train_url)

    # load_csv_from_gdrive returns None when it cannot parse the share URL.
    if X_train_full is None or y_train_full is None:
        raise ValueError("Failed to load DataFrames")

    # The label column is attached by index alignment, so a row-count mismatch
    # would silently produce missing labels.
    if len(X_train_full) != len(y_train_full):
        raise ValueError(
            f"Row count mismatch: X has {len(X_train_full)} rows, y has {len(y_train_full)} rows"
        )

    df_full = X_train_full.copy()
    df_full['prdtypecode'] = y_train_full['prdtypecode']
    print(f"Total data loaded: {len(df_full):,} samples")
    print(f"Classes: {df_full['prdtypecode'].nunique()}")
except Exception as e:
    print(f"CSV download failed: {e}")
    # Every later cell needs `df_full`; stop here instead of failing later
    # with an unrelated NameError.
    raise

In [None]:
# @title Download Images
IMAGE_FILE_ID = "15ZkS0iTQ7j3mHpxil4mABlXwP-jAN_zi"

if not os.path.exists("/content/images"):
    print("\nDownloading images...")
    os.makedirs("/content/tmp", exist_ok=True)
    os.makedirs("/content/images", exist_ok=True)
    !gdown --id $IMAGE_FILE_ID -O /content/tmp/images.zip

    print("Unzipping images...")
    !unzip -q -o /content/tmp/images.zip -d /content/images
    print("Images unzipped")
else:
    print("\nImages already exist, skipping download")

IMG_ROOT = "/content/images/images/image_train"
print(f"Image Root: {IMG_ROOT}")

In [None]:
# @title Load Canonical Splits
import sys
from pathlib import Path

def find_repo_root(start: Path) -> Path:
    """Walk upwards from ``start`` to the first directory holding both ``src`` and ``data``.

    Args:
        start: Directory to begin the search from (resolved before searching).

    Returns:
        The nearest directory, ``start`` itself included, that contains both
        a ``src`` entry and a ``data`` entry.

    Raises:
        RuntimeError: If no such directory exists up to the filesystem root.
    """
    candidate = start.resolve()
    while True:
        has_src = (candidate / "src").exists()
        if has_src and (candidate / "data").exists():
            return candidate
        parent = candidate.parent
        if parent == candidate:
            # The filesystem root is its own parent: nothing left to check.
            break
        candidate = parent
    raise RuntimeError("Repo root not found. Ensure the DS_rakuten repo is present with 'src/' and 'data/'.")

# Make the repository importable, then load the canonical split indices.
repo_root = find_repo_root(Path.cwd())
_repo_root_entry = str(repo_root)
if _repo_root_entry not in sys.path:
    sys.path.insert(0, _repo_root_entry)

# Must come after the sys.path update above so that `src` resolves.
from src.data.split_manager import load_splits, split_signature

splits = load_splits(verbose=True)
sig = split_signature(splits)

# Report how many indices each split holds.
_n_train, _n_val, _n_test = (
    len(splits[_name]) for _name in ("train_idx", "val_idx", "test_idx")
)

print("\nCanonical split sizes:")
print(f"  Train: {_n_train:,}")
print(f"  Val:   {_n_val:,}")
print(f"  Test:  {_n_test:,}")
print(f"  Total: {_n_train + _n_val + _n_test:,}")
print(f"\nSplit signature: {sig}")
print(f"Repo root: {repo_root}")

# @title Label Encoding
import hashlib, json

print("="*80)
print("LABEL ENCODING (FIT ON DEV ONLY)")
print("="*80)

# DEV = train + val; the encoder never sees the test rows while fitting.
dev_idx = np.concatenate([splits["train_idx"], splits["val_idx"]])
_dev_codes = df_full.iloc[dev_idx]["prdtypecode"]

le = LabelEncoder().fit(_dev_codes)

NUM_CLASSES = len(le.classes_)
print("LabelEncoder fitted on DEV only (train+val).")
print(f"Number of classes: {NUM_CLASSES}")
assert NUM_CLASSES == 27, f"Expected 27 classes, got {NUM_CLASSES}"

# Short hash of the encoder's class order, used to check alignment when
# predictions from several models are fused.
_classes_as_json = json.dumps(le.classes_.tolist())
_classes_digest = hashlib.sha256(_classes_as_json.encode("utf-8")).hexdigest()
classes_fingerprint = _classes_digest[:16]
print(f"Classes fingerprint: {classes_fingerprint}")

print("="*80)


# @title Create Train/Val/Test Splits from Canonical Indices
print("="*80)
print("CREATING TRAIN/VAL/TEST SPLITS FROM CANONICAL INDICES")
print("="*80)

# Slice df_full positionally with the canonical indices (no train_test_split).
df_train, df_val, df_test = (
    df_full.iloc[splits[_split_key]].copy()
    for _split_key in ("train_idx", "val_idx", "test_idx")
)

# Add the integer class id expected by the dataset class to every split.
for _split_frame in (df_train, df_val, df_test):
    _split_frame["encoded_label"] = le.transform(_split_frame["prdtypecode"])

# Sizes and the share of the total each split represents.
total_samples = sum(len(_frame) for _frame in (df_train, df_val, df_test))
pct_train, pct_val, pct_test = (
    100 * len(_frame) / total_samples for _frame in (df_train, df_val, df_test)
)

print(f"Training:   {len(df_train):6,} samples ({pct_train:.1f}%)")
print(f"Validation: {len(df_val):6,} samples ({pct_val:.1f}%)")
print(f"Test:       {len(df_test):6,} samples ({pct_test:.1f}%)")
print(f"Total:      {total_samples:6,}")
assert total_samples == 84916, f"Expected 84916 total samples, got {total_samples}"

print(f"\nSplit signature: {sig}")
print(f"Classes fingerprint: {classes_fingerprint}")

print("\nModel selection will use Train/Val ONLY")
print("Test set will be evaluated at the END")
print("="*80)


## 5. Dataset Definition

In [None]:
# @title Dataset Class (Optimized for Performance)
class RakutenImageDataset(Dataset):
    """Image dataset backed by a DataFrame of Rakuten product rows.

    Each item is ``(image, label)``: the image is read from
    ``<img_root>/image_<imageid>_product_<productid>.jpg`` and passed through
    ``transform`` when one is given; the label is the row's ``encoded_label``
    as a ``torch.long`` tensor.

    Args:
        df: Frame with ``imageid``, ``productid`` and ``encoded_label`` columns.
        img_root: Directory containing the image files.
        transform: Optional callable applied to the PIL image.
        fallback_size: ``(width, height)`` of the black placeholder image used
            when a file is missing or unreadable. Defaults to ``(384, 384)``.
    """

    def __init__(self, df, img_root, transform=None, fallback_size=(384, 384)):
        # Pre-convert to lists for faster access (avoid .iloc performance issue)
        self.image_ids = df['imageid'].tolist()
        self.product_ids = df['productid'].tolist()
        self.labels = df['encoded_label'].tolist()
        self.img_root = img_root
        self.transform = transform
        self.fallback_size = tuple(fallback_size)

        print(f"‚úì Dataset initialized with {len(self.labels):,} samples (optimized)")

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Direct list access
        imageid = self.image_ids[idx]
        productid = self.product_ids[idx]
        label = self.labels[idx]

        # Image Processing
        img_name = f"image_{imageid}_product_{productid}.jpg"
        img_path = os.path.join(self.img_root, img_name)
        try:
            # The context manager closes the file once the pixel data has been
            # converted; convert() returns a new image that stays usable.
            with Image.open(img_path) as opened:
                image = opened.convert("RGB")
        except (FileNotFoundError, OSError):
            # Best-effort fallback for missing/corrupt images: a black image,
            # so a single bad file does not abort a whole epoch.
            image = Image.new('RGB', self.fallback_size, (0, 0, 0))

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.long)

print(f"Dataset class ready.")

## 6. Model Definition - ConvNeXt

In [None]:
# @title ConvNeXt Model
class RakutenConvNeXt(nn.Module):
    """
    timm ConvNeXt backbone followed by a small MLP classification head.

    Regularisation built into the model:
    - Stochastic Depth in the backbone (``drop_path_rate``)
    - LayerNorm and two Dropout layers in the head

    Args:
        model_name: timm model identifier for the backbone.
        num_classes: Number of output logits.
        pretrained: Whether timm should load pretrained backbone weights.
        drop_path_rate: Stochastic-depth rate passed to timm.
    """

    def __init__(
        self,
        model_name: str = 'convnext_base',
        num_classes: int = 27,
        pretrained: bool = True,
        drop_path_rate: float = 0.3
    ):
        super().__init__()

        # Plain bookkeeping attributes.
        self.num_classes = num_classes
        self.model_name = model_name
        self.drop_path_rate = drop_path_rate

        # Backbone without timm's own classifier (num_classes=0) and with
        # average pooling, so it returns one feature vector per image.
        # img_size is deliberately not passed to the ConvNeXt factory.
        backbone_options = dict(
            pretrained=pretrained,
            num_classes=0,
            global_pool='avg',
            drop_path_rate=drop_path_rate,
        )
        self.backbone = timm.create_model(model_name, **backbone_options)

        feature_dim = self.backbone.num_features

        # Head: normalise -> dropout -> 512-unit hidden layer -> dropout -> logits.
        head_layers = [
            nn.LayerNorm(feature_dim),
            nn.Dropout(p=0.5),
            nn.Linear(feature_dim, 512),
            nn.GELU(),
            nn.Dropout(p=0.3),
            nn.Linear(512, num_classes),
        ]
        self.head = nn.Sequential(*head_layers)

        print(f"‚úì RakutenConvNeXt initialized:")
        print(f"  - Model: {model_name}")
        print(f"  - Drop Path Rate: {drop_path_rate}")
        print(f"  - Head: {feature_dim} ‚Üí 512 ‚Üí {num_classes}")

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits for a batch of images."""
        return self.head(self.backbone(x))

print("‚úì RakutenConvNeXt class defined")

## 7. Configuration

In [None]:
# @title Training Configuration
# All tunable hyper-parameters for this run live in one dictionary.
CONFIG = dict(
    # Model
    model_name="convnext_base",   # ConvNeXt Base
    img_size=384,                 # higher resolution for better performance
    num_classes=NUM_CLASSES,

    # Training - tuned for an A100 on Colab Pro
    batch_size=64,                # adjusted for 384x384 on A100
    num_epochs=30,
    learning_rate=1e-4,           # ConvNeXt works well with higher LR
    weight_decay=0.05,

    # Anti-overfitting
    drop_path_rate=0.3,
    mixup_alpha=0.8,
    cutmix_alpha=1.0,
    label_smoothing=0.1,
    use_ema=True,                 # EMA enabled
    ema_decay=0.9999,             # EMA decay rate

    # Other
    early_stopping_patience=5,
    use_amp=True,
    num_workers=2,
)

# Echo the configuration so it is recorded in the cell output.
print("="*80)
print("CONVNEXT TRAINING CONFIGURATION (Colab A100)")
print("="*80)
print(f"Model: {CONFIG['model_name']}")
print(f"Image Size: {CONFIG['img_size']}x{CONFIG['img_size']} (higher resolution)")
print(f"Batch Size: {CONFIG['batch_size']} (optimized for A100 + 384x384)")
print(f"Epochs: {CONFIG['num_epochs']}")
print(f"Learning Rate: {CONFIG['learning_rate']}")
print(f"Weight Decay: {CONFIG['weight_decay']}")
print("\nAnti-Overfitting:")
print(f"  - Drop Path: {CONFIG['drop_path_rate']}")
print(f"  - Mixup Alpha: {CONFIG['mixup_alpha']}")
print(f"  - CutMix Alpha: {CONFIG['cutmix_alpha']}")
print(f"  - Label Smoothing: {CONFIG['label_smoothing']}")
print(f"  - EMA: {CONFIG['use_ema']} (decay={CONFIG['ema_decay']})")
print(f"\nAMP: {CONFIG['use_amp']}")
print("="*80)

## 8. Data Transforms & Loaders

In [None]:
# @title Data Transforms
# Training pipeline: random crop/flip/RandAugment, then ImageNet normalisation.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(384, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandAugment(num_ops=2, magnitude=9),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Evaluation pipeline: deterministic resize + centre crop, same normalisation.
val_transform = transforms.Compose([
    transforms.Resize(438),
    transforms.CenterCrop(384),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create Datasets
# The split frames are named df_train / df_val / df_test where they are
# created; the previous train_df / val_df names were never defined.
train_dataset = RakutenImageDataset(df_train, IMG_ROOT, transform=train_transform)
val_dataset = RakutenImageDataset(df_val, IMG_ROOT, transform=val_transform)
holdout_dataset = RakutenImageDataset(df_test, IMG_ROOT, transform=val_transform)

# Create DataLoaders
# Only the training loader shuffles and drops the last incomplete batch.
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG["batch_size"],
    shuffle=True,
    num_workers=CONFIG["num_workers"],
    pin_memory=True,
    drop_last=True
)
val_loader = DataLoader(
    val_dataset,
    batch_size=CONFIG["batch_size"],
    shuffle=False,
    num_workers=CONFIG["num_workers"],
    pin_memory=True
)
holdout_loader = DataLoader(
    holdout_dataset,
    batch_size=CONFIG["batch_size"],
    shuffle=False,
    num_workers=CONFIG["num_workers"],
    pin_memory=True
)

print(f"‚úì Train batches: {len(train_loader):,}")
print(f"‚úì Val batches: {len(val_loader):,}")
print(f"‚úì Holdout batches: {len(holdout_loader):,}")

## 9. Model Initialization

In [None]:
# @title Initialize Model
# Build the network and move it to the selected device in one step.
model = RakutenConvNeXt(
    model_name=CONFIG["model_name"],
    num_classes=NUM_CLASSES,
    pretrained=True,
    drop_path_rate=CONFIG["drop_path_rate"],
).to(device)

# Optional exponential moving average of the weights (None when disabled).
model_ema = ModelEmaV2(model, decay=CONFIG["ema_decay"]) if CONFIG["use_ema"] else None
if model_ema is not None:
    print(f"‚úì EMA initialized with decay={CONFIG['ema_decay']}")

# Count all parameters and the trainable ones in a single pass.
total_params = 0
trainable_params = 0
for _parameter in model.parameters():
    _count = _parameter.numel()
    total_params += _count
    if _parameter.requires_grad:
        trainable_params += _count

print(f"\nüìä Model Statistics:")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")

## 10. Training Setup (Mixup/CutMix + Optimizer)

In [None]:
# @title Mixup/CutMix & Optimizer Setup
from timm.data.mixup import Mixup
from timm.loss import SoftTargetCrossEntropy

# Initialize Mixup/CutMix
mixup_fn = Mixup(
    mixup_alpha=CONFIG["mixup_alpha"],
    cutmix_alpha=CONFIG["cutmix_alpha"],
    cutmix_minmax=None,
    prob=1.0,  # Apply to all batches
    switch_prob=0.5,  # 50% Mixup, 50% CutMix
    mode='batch',
    label_smoothing=CONFIG["label_smoothing"],
    num_classes=NUM_CLASSES
)

# Loss functions
criterion_train = SoftTargetCrossEntropy()  # For Mixup (soft labels)
criterion_val = nn.CrossEntropyLoss()       # For validation (hard labels)

print("‚úì Mixup & CutMix initialized")
print(f"  Mixup alpha: {CONFIG['mixup_alpha']}")
print(f"  CutMix alpha: {CONFIG['cutmix_alpha']}")

# Optimizer - AdamW with LayerNorm-aware weight decay.
# Passing model.parameters() directly decays every tensor, including
# normalisation scales/offsets and biases. Split the parameters so that only
# weight matrices / conv kernels (ndim > 1) are decayed; 1-D tensors and
# biases go into a group with weight_decay=0.
_decay_params = []
_no_decay_params = []
for _param_name, _param in model.named_parameters():
    if not _param.requires_grad:
        continue  # frozen tensors do not need an optimizer slot
    if _param.ndim <= 1 or _param_name.endswith(".bias"):
        _no_decay_params.append(_param)
    else:
        _decay_params.append(_param)

optimizer = torch.optim.AdamW(
    [
        {"params": _decay_params, "weight_decay": CONFIG["weight_decay"]},
        {"params": _no_decay_params, "weight_decay": 0.0},
    ],
    lr=CONFIG["learning_rate"],
)

# Scheduler - Cosine Annealing over the configured number of epochs
# (stepped once per epoch by the training loop).
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=CONFIG["num_epochs"],
    eta_min=1e-6
)

# AMP Scaler (None when mixed precision is disabled)
scaler = GradScaler() if CONFIG["use_amp"] else None

print("‚úì Optimizer: AdamW with Cosine Annealing")
print(f"‚úì AMP: {CONFIG['use_amp']}")

## 11. Training Loop with WandB & EMA

In [None]:
# @title Training Loop with EMA
# Best-so-far trackers. Only best_val_acc and patience_counter are updated by
# the loop below; the other three keep their initial value.
best_val_acc = 0.0
best_val_f1 = 0.0
best_ema_acc = 0.0
best_ema_f1 = 0.0
patience_counter = 0

# Per-epoch metrics, consumed later by the plotting cell.
history = {
    "train_loss": [],
    "val_loss": [],
    "val_acc": [],
    "val_f1": [],
    "ema_val_acc": [],
    "ema_val_f1": []
}


def evaluate(model, loader, criterion):
    """Run one evaluation pass over ``loader``.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        loader: Iterable of ``(images, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(avg_loss, accuracy_percent, weighted_f1)`` where ``avg_loss``
        is the mean of the per-batch losses.
    """
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in tqdm(loader, desc="Evaluating", leave=False):
            images = images.to(device)
            labels = labels.to(device)

            if CONFIG["use_amp"]:
                with torch.amp.autocast(device_type="cuda"):
                    outputs = model(images)
                    loss = criterion(outputs, labels)
            else:
                outputs = model(images)
                loss = criterion(outputs, labels)

            val_loss += loss.item()
            predictions = torch.argmax(outputs, dim=-1)
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = val_loss / len(loader)
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    return avg_loss, acc * 100, f1


print("="*80)
print("üöÄ STARTING CONVNEXT TRAINING WITH EMA")
print("="*80)

for epoch in range(CONFIG["num_epochs"]):
    print(f"\nEpoch {epoch + 1}/{CONFIG['num_epochs']}")
    print("="*80)

    # ========================================================================
    # TRAINING with Mixup/CutMix
    # ========================================================================
    model.train()
    train_loss = 0.0
    train_pbar = tqdm(train_loader, desc="Training")

    for images, labels in train_pbar:
        images, labels = images.to(device), labels.to(device)

        # Apply Mixup/CutMix (labels become soft targets)
        images, labels = mixup_fn(images, labels)

        optimizer.zero_grad()

        if CONFIG["use_amp"]:
            with torch.amp.autocast(device_type="cuda"):
                outputs = model(images)
                loss = criterion_train(outputs, labels)
            scaler.scale(loss).backward()
            # Unscale before clipping so the norm threshold applies to the
            # true gradient magnitudes.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(images)
            loss = criterion_train(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()

        # Update EMA
        if model_ema is not None:
            model_ema.update(model)

        train_loss += loss.item()
        train_pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    avg_train_loss = train_loss / len(train_loader)

    # ========================================================================
    # VALIDATION (no Mixup) - Regular Model
    # ========================================================================
    val_loss, val_acc, val_f1 = evaluate(model, val_loader, criterion_val)

    # ========================================================================
    # VALIDATION - EMA Model
    # ========================================================================
    ema_val_acc, ema_val_f1 = 0.0, 0.0
    if model_ema is not None:
        _, ema_val_acc, ema_val_f1 = evaluate(model_ema.module, val_loader, criterion_val)

    # ========================================================================
    # LOGGING & CHECKPOINTING
    # ========================================================================
    history["train_loss"].append(avg_train_loss)
    history["val_loss"].append(val_loss)
    history["val_acc"].append(val_acc)
    history["val_f1"].append(val_f1)
    history["ema_val_acc"].append(ema_val_acc)
    history["ema_val_f1"].append(ema_val_f1)

    print(f"\nüìä Results:")
    print(f"  Train Loss: {avg_train_loss:.4f}")
    print(f"  Val (Regular): Acc={val_acc:.2f}%, F1={val_f1:.4f}")
    if model_ema is not None:
        print(f"  Val (EMA):     Acc={ema_val_acc:.2f}%, F1={ema_val_f1:.4f}")

    # Save best model (use EMA if better)
    current_best_acc = max(val_acc, ema_val_acc)
    if current_best_acc > best_val_acc:
        best_val_acc = current_best_acc
        patience_counter = 0

        # Save the better model; 'is_ema' records which variant was stored.
        if ema_val_acc > val_acc and model_ema is not None:
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model_ema.module.state_dict(),
                'val_acc': ema_val_acc,
                'val_f1': ema_val_f1,
                'is_ema': True
            }, "convnext_best.pth")
            print(f"  ‚úÖ Best EMA model saved! (Acc: {ema_val_acc:.2f}%, F1: {ema_val_f1:.4f})")
        else:
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'val_acc': val_acc,
                'val_f1': val_f1,
                'is_ema': False
            }, "convnext_best.pth")
            print(f"  ‚úÖ Best model saved! (Acc: {val_acc:.2f}%, F1: {val_f1:.4f})")
    else:
        patience_counter += 1
        print(f"  ‚è≥ No improvement ({patience_counter}/{CONFIG['early_stopping_patience']})")

    scheduler.step()

    if patience_counter >= CONFIG["early_stopping_patience"]:
        print(f"\n‚ö†Ô∏è Early stopping triggered after {epoch + 1} epochs")
        break

print("\n" + "="*80)
print("üéâ TRAINING COMPLETE")
print("="*80)
print(f"Best Val Acc: {best_val_acc:.2f}%")
print("="*80)

## 12. Final Evaluation on Holdout Set

In [None]:
# @title Detailed Classification Report
# Evaluate the checkpoint selected on validation accuracy. Without this
# reload, `model` still holds the weights of the last training epoch, which
# may be several non-improving epochs past the best one (or not the EMA
# weights when those were the ones saved).
best_checkpoint = torch.load("convnext_best.pth", map_location=device, weights_only=False)
model.load_state_dict(best_checkpoint["model_state_dict"])
print(
    f"Evaluating best checkpoint (epoch {best_checkpoint['epoch']}, "
    f"is_ema={best_checkpoint['is_ema']})"
)

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in tqdm(holdout_loader, desc="Final Prediction"):
        images = images.to(device)

        if CONFIG["use_amp"]:
            with torch.amp.autocast(device_type="cuda"):
                outputs = model(images)
        else:
            outputs = model(images)

        # Predicted class = index of the largest logit; labels stay on the CPU.
        predictions = torch.argmax(outputs, dim=-1)
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.numpy())

# Rows of the report are the encoded class ids.
print("\nClassification Report (Holdout):")
print(classification_report(all_labels, all_preds, digits=4, zero_division=0))

## 13. Visualizations

In [None]:
# @title Training Curves
# Two panels side by side: losses on the left, validation accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
_loss_ax, _acc_ax = axes[0], axes[1]

# Loss curves
_loss_ax.plot(history["train_loss"], label='Train Loss', marker='o', linewidth=2)
_loss_ax.plot(history["val_loss"], label='Val Loss', marker='s', linewidth=2)

# Accuracy curves - regular weights, plus EMA weights when enabled
_acc_ax.plot(history["val_acc"], label='Val Acc (Regular)', marker='o', linewidth=2)
if CONFIG["use_ema"]:
    _acc_ax.plot(history["ema_val_acc"], label='Val Acc (EMA)', marker='s', linewidth=2, linestyle='--')

# Shared cosmetics; only the y-label and title differ between the panels.
for _panel, _ylabel, _title in (
    (_loss_ax, 'Loss', 'Training and Validation Loss'),
    (_acc_ax, 'Accuracy (%)', 'Validation Accuracy (Regular vs EMA)'),
):
    _panel.set_xlabel('Epoch', fontsize=12)
    _panel.set_ylabel(_ylabel, fontsize=12)
    _panel.set_title(_title, fontsize=14, fontweight='bold')
    _panel.legend(fontsize=11)
    _panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('convnext_training_curves.png', dpi=150, bbox_inches='tight')
plt.show()

print("Plot saved: convnext_training_curves.png")

## 14. Summary

## 15. Save to Google Drive (Optional)

In [None]:
# @title Save Model to Google Drive
from google.colab import drive
import shutil

# Copy the best checkpoint to Google Drive under a timestamped name.
drive.mount('/content/drive')

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
target_dir = "/content/drive/MyDrive/Rakuten_models"
os.makedirs(target_dir, exist_ok=True)

# `is_ema` exists only as a key inside the checkpoint written by the training
# loop, never as a notebook variable, so read it back from the file.
# NOTE(review): weights_only=False unpickles arbitrary objects; acceptable
# here because the file was produced by this notebook.
_saved_checkpoint = torch.load("convnext_best.pth", map_location="cpu", weights_only=False)
is_ema = bool(_saved_checkpoint.get("is_ema", False))
del _saved_checkpoint  # free the loaded weights; only the flag is needed

model_type = "ema" if is_ema else "regular"
target_file = os.path.join(target_dir, f"convnext_{model_type}_{timestamp}.pth")
shutil.copy("convnext_best.pth", target_file)
print(f"‚úì Model saved to: {target_file}")