# Classification Rakuten - ConvNeXt (Colab)

In [None]:
!pip install -q timm gdown pandas scikit-learn torch torchvision tqdm

import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import gdown
import timm
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report
from tqdm.auto import tqdm
from torchvision import transforms
from torch.cuda.amp import GradScaler
from timm.utils import ModelEmaV2
import json

# Seed the random generators used by the pipeline so runs are repeatable.
# torch alone is not enough: timm's Mixup samples its mixing coefficients
# from NumPy's global generator.
torch.manual_seed(42)
np.random.seed(42)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

## 1. Téléchargement des données

In [None]:
# Fetch the CSV files from Google Drive with gdown
print("Downloading CSV data...")

# Drive identifiers of the feature table and of the label table
X_TRAIN_FILE_ID = "1geSiJTTjamysiSbJ8-W9gR1kv-x6HyEd"
Y_TRAIN_FILE_ID = "16czWmLR5Ff0s5aYIqy1rHT7hc6Gcpfw3"


def _fetch_csv(file_id, destination):
    """Download one Drive file to `destination` and parse it as CSV."""
    gdown.download(id=file_id, output=destination, quiet=False)
    return pd.read_csv(destination)


# Features first, then labels (same order as before)
X_train_full = _fetch_csv(X_TRAIN_FILE_ID, '/content/X_train.csv')
y_train_full = _fetch_csv(Y_TRAIN_FILE_ID, '/content/y_train.csv')

print(f"Total data loaded: {len(X_train_full):,} samples")

# Constants of the unified splits (identical to the rest of the project)
SEED = 42
TEST_SIZE = 0.15
VAL_SIZE = 0.15

_rule = "=" * 80
print("\n" + _rule)
print("SPLITTING DATA - Unified Project Splits")
print(_rule)

# Build the unified splits (same logic as generate_splits)
y = y_train_full['prdtypecode'].to_numpy()
indices = np.arange(len(y))

# Stage 1: hold out TEST_SIZE of all rows as the test set, stratified on the label
full_train_idx, test_idx = train_test_split(
    indices,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=y,
)

# Stage 2: hold out VAL_SIZE of the remaining rows as the validation set
train_idx, val_idx = train_test_split(
    full_train_idx,
    test_size=VAL_SIZE,
    random_state=SEED,
    stratify=y[full_train_idx],
)


def _labelled_subset(row_idx):
    """Copy the rows of X_train_full at `row_idx` and attach their label column."""
    subset = X_train_full.iloc[row_idx].copy()
    subset['prdtypecode'] = y[row_idx]
    return subset


# Materialise one DataFrame per split
train_df = _labelled_subset(train_idx)
val_df = _labelled_subset(val_idx)
df_test = _labelled_subset(test_idx)

for _split_name, _split_df in (("Train", train_df), ("Val", val_df), ("Test", df_test)):
    _share = len(_split_df) / len(y) * 100
    print(f"✓ {_split_name}: {len(_split_df):,} samples ({_share:.2f}%)")
print(f"✓ Classes: {train_df['prdtypecode'].nunique()}")
print("\n⚠️  CRITICAL: Test set will ONLY be used for final evaluation!")
print(_rule)

In [None]:
IMAGE_FILE_ID = "15ZkS0iTQ7j3mHpxil4mABlXwP-jAN_zi"
IMG_ROOT = "/content/images/images/image_train"

# The extracted image folder is the "already done" marker. Testing
# /content/images instead would skip the download forever after a failed or
# interrupted first attempt, because that directory is created up front.
# NOTE(review): an extraction interrupted half-way still leaves IMG_ROOT in
# place; delete /content/images manually in that case.
if not os.path.isdir(IMG_ROOT):
    import zipfile

    os.makedirs("/content/tmp", exist_ok=True)
    os.makedirs("/content/images", exist_ok=True)

    archive_path = "/content/tmp/images.zip"
    # Same gdown Python API as the CSV downloads (no shell magic needed)
    downloaded = gdown.download(id=IMAGE_FILE_ID, output=archive_path, quiet=False)
    if downloaded is None:
        raise RuntimeError(f"Image archive download failed (file id: {IMAGE_FILE_ID})")

    # Existing files are overwritten, like `unzip -o`
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall("/content/images")

print(f"Images: {IMG_ROOT}")

## 2. Préparation des données

In [None]:
# Encodage des labels (sur train uniquement)
le = LabelEncoder()
le.fit(train_df['prdtypecode'])

train_df['encoded_label'] = le.transform(train_df['prdtypecode'])
val_df['encoded_label'] = le.transform(val_df['prdtypecode'])
df_test['encoded_label'] = le.transform(df_test['prdtypecode'])

NUM_CLASSES = len(le.classes_)

print(f"Nombre de classes: {NUM_CLASSES}")

In [None]:
class RakutenImageDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs for Rakuten product images.

    Files are looked up as ``image_<imageid>_product_<productid>.jpg`` inside
    ``img_root``. An image that cannot be read is replaced by a black
    placeholder so one bad file does not abort an epoch.

    Args:
        df: Table with ``imageid``, ``productid`` and ``encoded_label`` columns.
        img_root: Directory that contains the image files.
        transform: Optional callable applied to the PIL image before returning.
        fallback_size: ``(width, height)`` of the black placeholder image.
    """

    def __init__(self, df, img_root, transform=None, fallback_size=(384, 384)):
        self.image_ids = df['imageid'].tolist()
        self.product_ids = df['productid'].tolist()
        self.labels = df['encoded_label'].tolist()
        self.img_root = img_root
        self.transform = transform
        self.fallback_size = tuple(fallback_size)

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image and its label tensor."""
        img_name = f"image_{self.image_ids[idx]}_product_{self.product_ids[idx]}.jpg"
        img_path = os.path.join(self.img_root, img_name)
        try:
            # The context manager closes the file handle once pixels are decoded
            with Image.open(img_path) as img_file:
                image = img_file.convert("RGB")
        except (OSError, ValueError):
            # Only missing/unreadable/corrupt files fall back to the placeholder;
            # anything else (KeyboardInterrupt, programming errors) propagates.
            image = Image.new('RGB', self.fallback_size, (0, 0, 0))

        if self.transform is not None:
            image = self.transform(image)

        return image, torch.tensor(self.labels[idx], dtype=torch.long)

## 3. Modèle ConvNeXt

In [None]:
class RakutenConvNeXt(nn.Module):
    """timm backbone followed by a small MLP classification head.

    Args:
        model_name: Name passed to ``timm.create_model``.
        num_classes: Size of the output layer.
        pretrained: Forwarded to ``timm.create_model``.
        drop_path_rate: Forwarded to ``timm.create_model``.
    """

    def __init__(self, model_name='convnext_base', num_classes=27,
                 pretrained=True, drop_path_rate=0.3):
        super().__init__()
        self.num_classes = num_classes

        # The backbone is created with num_classes=0 and average pooling;
        # the classifier is the separate head defined below.
        self.backbone = timm.create_model(
            model_name,
            pretrained=pretrained,
            num_classes=0,
            global_pool='avg',
            drop_path_rate=drop_path_rate,
        )

        # Head: normalise -> dropout -> 512-unit hidden layer -> dropout -> logits
        width = self.backbone.num_features
        head_layers = [
            nn.LayerNorm(width),
            nn.Dropout(p=0.5),
            nn.Linear(width, 512),
            nn.GELU(),
            nn.Dropout(p=0.3),
            nn.Linear(512, num_classes),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the head's output for the backbone features of `x`."""
        features = self.backbone(x)
        return self.head(features)

print("Modèle ConvNeXt défini")

## 4. Configuration

In [None]:
from datetime import datetime

# Run identifier used to version every artifact name
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Hyper-parameters of the run, read by the cells below
CONFIG = dict(
    model_name="convnext_base",
    img_size=384,
    num_classes=NUM_CLASSES,
    batch_size=64,  # Tuned for A100 + 384x384
    num_epochs=30,
    learning_rate=1e-4,
    weight_decay=0.05,
    drop_path_rate=0.3,
    mixup_alpha=0.8,
    cutmix_alpha=1.0,
    label_smoothing=0.1,
    use_ema=True,
    ema_decay=0.9999,
    early_stopping_patience=5,
    use_amp=True,
    num_workers=2,
    timestamp=timestamp,
)

# Summary banner, printed line by line
_rule = "=" * 80
for _line in (
    _rule,
    "CONVNEXT TRAINING CONFIGURATION (Colab A100)",
    _rule,
    f"Model: {CONFIG['model_name']}",
    f"Image Size: {CONFIG['img_size']}x{CONFIG['img_size']}",
    f"Batch Size: {CONFIG['batch_size']} (optimized for A100 + 384x384)",
    f"Epochs: {CONFIG['num_epochs']}",
    f"Learning Rate: {CONFIG['learning_rate']}",
    f"EMA: {CONFIG['use_ema']} (decay={CONFIG['ema_decay']})",
    f"AMP: {CONFIG['use_amp']}",
    f"Timestamp: {timestamp}",
    _rule,
):
    print(_line)

In [None]:
# The input resolution comes from CONFIG so that changing "img_size" there
# actually changes the pipeline (it used to be hard-coded to 384 here).
IMG_SIZE = CONFIG["img_size"]
# Evaluation resizes the short side a bit larger, then center-crops.
# A crop ratio of 0.875 reproduces the previous 438 -> 384 pair.
EVAL_RESIZE = int(IMG_SIZE / 0.875)

# Channel statistics shared by the training and evaluation pipelines
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training: random crop/flip plus RandAugment
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandAugment(num_ops=2, magnitude=9),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)
])

# Validation/test: deterministic resize + center crop
val_transform = transforms.Compose([
    transforms.Resize(EVAL_RESIZE),
    transforms.CenterCrop(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)
])

train_dataset = RakutenImageDataset(train_df, IMG_ROOT, train_transform)
val_dataset = RakutenImageDataset(val_df, IMG_ROOT, val_transform)
test_dataset = RakutenImageDataset(df_test, IMG_ROOT, val_transform)

# Settings common to the three loaders
_loader_kwargs = dict(
    batch_size=CONFIG["batch_size"],
    num_workers=CONFIG["num_workers"],
    pin_memory=True,
)

# Only the training loader shuffles and drops the incomplete last batch
train_loader = DataLoader(train_dataset, shuffle=True, drop_last=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

print(f"Batches - Train: {len(train_loader)} | Val: {len(val_loader)} | Test: {len(test_loader)}")

## 5. Initialisation

In [None]:
from timm.data.mixup import Mixup
from timm.loss import SoftTargetCrossEntropy

# Network, moved to the selected device
model = RakutenConvNeXt(
    model_name=CONFIG["model_name"],
    num_classes=NUM_CLASSES,
    drop_path_rate=CONFIG["drop_path_rate"],
)
model = model.to(device)

# Optional EMA copy of the model, controlled by CONFIG["use_ema"]
if CONFIG["use_ema"]:
    model_ema = ModelEmaV2(model, decay=CONFIG["ema_decay"])
else:
    model_ema = None

# Batch-level Mixup/CutMix, applied to every batch (prob=1.0)
_mixup_settings = dict(
    mixup_alpha=CONFIG["mixup_alpha"],
    cutmix_alpha=CONFIG["cutmix_alpha"],
    prob=1.0,
    switch_prob=0.5,
    mode='batch',
    label_smoothing=CONFIG["label_smoothing"],
    num_classes=NUM_CLASSES,
)
mixup_fn = Mixup(**_mixup_settings)

# Training loss takes the soft targets produced by mixup_fn;
# evaluation uses plain cross-entropy.
criterion_train = SoftTargetCrossEntropy()
criterion_val = nn.CrossEntropyLoss()

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=CONFIG["learning_rate"],
    weight_decay=CONFIG["weight_decay"],
)
# Cosine schedule spanning num_epochs, with a floor of 1e-6
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=CONFIG["num_epochs"],
    eta_min=1e-6,
)

# A gradient scaler is only created when mixed precision is requested
if CONFIG["use_amp"]:
    scaler = GradScaler()
else:
    scaler = None

print("Initialisation terminée")

## 6. Entraînement

In [None]:
def evaluate(model, loader, criterion):
    """Run `model` over `loader` without gradients and score its predictions.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        loader: Iterable of ``(images, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(avg_loss, accuracy_percent, weighted_f1)``. ``avg_loss`` is
        the mean of the per-batch losses.
    """
    model.eval()
    # Mixed precision is only enabled on CUDA. On the CPU fallback, asking for
    # CUDA autocast would just warn on every batch.
    amp_enabled = bool(CONFIG["use_amp"]) and device.type == "cuda"
    val_loss = 0.0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for images, labels in tqdm(loader, desc="Évaluation", leave=False):
            images, labels = images.to(device), labels.to(device)

            # One forward path; autocast is a no-op when disabled
            with torch.amp.autocast(device_type=device.type, enabled=amp_enabled):
                outputs = model(images)
                loss = criterion(outputs, labels)

            val_loss += loss.item()
            all_preds.extend(torch.argmax(outputs, dim=-1).cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    avg_loss = val_loss / len(loader)
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return avg_loss, acc * 100, f1

In [None]:
best_val_acc = 0.0
patience_counter = 0
# Per-epoch curves: training loss, validation loss and both validation accuracies
history = {"train_loss": [], "val_loss": [], "val_acc": [], "ema_val_acc": []}

# Checkpoint file name carries the run timestamp
model_filename = f"convnext_best_{CONFIG['timestamp']}.pth"

for epoch in range(CONFIG["num_epochs"]):
    print(f"\nEpoch {epoch + 1}/{CONFIG['num_epochs']}")

    model.train()
    train_loss = 0.0

    for images, labels in tqdm(train_loader, desc="Entrainement"):
        images, labels = images.to(device), labels.to(device)
        # Mixup/CutMix mixes the batch; its targets feed the soft-target loss
        images, labels = mixup_fn(images, labels)

        optimizer.zero_grad()

        if CONFIG["use_amp"]:
            with torch.amp.autocast(device_type="cuda"):
                outputs = model(images)
                loss = criterion_train(outputs, labels)
            scaler.scale(loss).backward()
            # Unscale before clipping so max_norm applies to the true gradients
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(images)
            loss = criterion_train(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()

        # The EMA copy is updated after each training step
        if model_ema is not None:
            model_ema.update(model)

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)

    # Score the raw weights and, when enabled, the EMA weights
    val_loss, val_acc, val_f1 = evaluate(model, val_loader, criterion_val)
    ema_val_acc = 0.0
    if model_ema is not None:
        _, ema_val_acc, _ = evaluate(model_ema.module, val_loader, criterion_val)

    history["train_loss"].append(avg_train_loss)
    history["val_loss"].append(val_loss)
    history["val_acc"].append(val_acc)
    history["ema_val_acc"].append(ema_val_acc)

    print(f"Train Loss: {avg_train_loss:.4f}")
    print(f"Val Acc: {val_acc:.2f}% | EMA: {ema_val_acc:.2f}%")

    current_best_acc = max(val_acc, ema_val_acc)
    if current_best_acc > best_val_acc:
        best_val_acc = current_best_acc
        patience_counter = 0

        # Save whichever weights scored best. The flag is computed once and
        # cast to a builtin bool: the accuracies may be NumPy scalars
        # (depending on the scikit-learn version), and a numpy.bool_ stored in
        # the checkpoint cannot be written to JSON afterwards.
        is_ema = model_ema is not None and bool(ema_val_acc > val_acc)
        save_model = model_ema.module if is_ema else model
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': save_model.state_dict(),
            'val_acc': float(current_best_acc),
            'is_ema': is_ema,
            'timestamp': CONFIG['timestamp']
        }, model_filename)
        print(f"Meilleur modele sauvegarde: {model_filename} (EMA: {is_ema})")
    else:
        # No improvement: stop once the patience budget is used up
        patience_counter += 1
        if patience_counter >= CONFIG["early_stopping_patience"]:
            print(f"Arret precoce apres {epoch + 1} epochs")
            break

    scheduler.step()

print(f"\nEntrainement termine. Meilleure Val Acc: {best_val_acc:.2f}%")

## 7. Évaluation finale

In [None]:
import numpy as np
from google.colab import drive
import shutil


def _predict_proba(net, loader, desc):
    """Return ``(probabilities, labels)`` arrays for every sample of `loader`.

    The network is put in eval mode and run without gradients. Probabilities
    are computed in float32 even when the forward pass ran under autocast.
    """
    probs_chunks, label_chunks = [], []

    net.eval()
    with torch.no_grad():
        for images, labels in tqdm(loader, desc=desc):
            images = images.to(device)

            if CONFIG["use_amp"]:
                with torch.amp.autocast(device_type="cuda"):
                    outputs = net(images)
            else:
                outputs = net(images)

            # Autocast can return half-precision logits; upcast before the
            # softmax so the exported probabilities keep full precision.
            probs = torch.softmax(outputs.float(), dim=-1).cpu().numpy()
            probs_chunks.append(probs)
            label_chunks.append(labels.numpy())

    return np.vstack(probs_chunks), np.concatenate(label_chunks)


# Load the best checkpoint
model_filename = f"convnext_best_{CONFIG['timestamp']}.pth"
# NOTE: weights_only=False unpickles arbitrary objects. This is acceptable
# only because the file was written by this notebook; never use it on an
# untrusted checkpoint.
checkpoint = torch.load(model_filename, map_location=device, weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])

# Builtin bool so the flag can be serialised to JSON further down
is_ema = bool(checkpoint.get('is_ema', False))
print(f"Modele charge: {model_filename} (EMA: {is_ema})")
print(f"Epoch: {checkpoint['epoch']} | Val Acc: {checkpoint['val_acc']:.2f}%")

# =========================================================================
# Evaluation on the VALIDATION set and export of the predictions
# =========================================================================
print("\n" + "="*70)
print("EVALUATION SUR VALIDATION SET")
print("="*70)

val_probs, val_labels = _predict_proba(model, val_loader, "Validation")
val_preds = val_probs.argmax(axis=1)

val_acc = 100.0 * accuracy_score(val_labels, val_preds)
val_f1 = f1_score(val_labels, val_preds, average='weighted')

print(f"Val Acc: {val_acc:.2f}% | Val F1: {val_f1:.4f}")

# Export validation predictions
np.save(f"convnext_probs_val_{CONFIG['timestamp']}.npy", val_probs)
np.save(f"convnext_labels_val_{CONFIG['timestamp']}.npy", val_labels)
np.save(f"convnext_preds_val_{CONFIG['timestamp']}.npy", val_preds)
print("Predictions validation exportees (.npy)")

# =========================================================================
# Evaluation on the TEST set and export of the predictions
# =========================================================================
print("\n" + "="*70)
print("EVALUATION SUR TEST SET")
print("="*70)

test_probs, test_labels = _predict_proba(model, test_loader, "Test")
test_preds = test_probs.argmax(axis=1)

test_acc = 100.0 * accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds, average='weighted')

print(f"Test Acc: {test_acc:.2f}% | Test F1: {test_f1:.4f}")
print("\nRapport de classification (Test):")
print(classification_report(test_labels, test_preds, digits=4))

# Export test predictions
np.save(f"convnext_probs_test_{CONFIG['timestamp']}.npy", test_probs)
np.save(f"convnext_labels_test_{CONFIG['timestamp']}.npy", test_labels)
np.save(f"convnext_preds_test_{CONFIG['timestamp']}.npy", test_preds)
print("Predictions test exportees (.npy)")

# =========================================================================
# Save the results summary (JSON)
# =========================================================================
results = {
    "timestamp": CONFIG["timestamp"],
    "model_name": CONFIG["model_name"],
    "best_epoch": int(checkpoint['epoch']),
    "val_acc": float(val_acc),
    "val_f1": float(val_f1),
    "test_acc": float(test_acc),
    "test_f1": float(test_f1),
    "is_ema": is_ema,
    "num_classes": int(CONFIG["num_classes"]),
    "train_samples": len(train_df),
    "val_samples": len(val_df),
    "test_samples": len(df_test)
}

results_filename = f"convnext_results_{CONFIG['timestamp']}.json"
with open(results_filename, "w") as f:
    json.dump(results, f, indent=2)

print(f"\nResultats sauvegardes: {results_filename}")

# =========================================================================
# Copy every artifact to Google Drive
# =========================================================================
print("\n" + "="*70)
print("SAUVEGARDE SUR GOOGLE DRIVE")
print("="*70)

drive.mount('/content/drive')

target_dir = "/content/drive/MyDrive/Rakuten_models"
os.makedirs(target_dir, exist_ok=True)

# Copy the model checkpoint
model_target = os.path.join(target_dir, model_filename)
shutil.copy(model_filename, model_target)
print(f"Modele copie: {model_target}")

# Copy the .npy exports: validation files first, then test files
for split in ("val", "test"):
    for suffix in ["probs", "labels", "preds"]:
        src = f"convnext_{suffix}_{split}_{CONFIG['timestamp']}.npy"
        dst = os.path.join(target_dir, src)
        shutil.copy(src, dst)
        print(f"NPY copie: {dst}")

# Copy the JSON summary
results_target = os.path.join(target_dir, results_filename)
shutil.copy(results_filename, results_target)
print(f"JSON copie: {results_target}")

print("="*70)
print("SAUVEGARDE TERMINEE")
print("="*70)