# MLP LOPO-CV : Robustesse Sémantique via Similarité Cosine

**Objectif** : Validation croisée Leave-One-Product-Out sur MLP + analyse de robustesse liée à la similarité cosine des embeddings (768D et 16D)

## 1. Imports et Configuration

In [6]:
import sys
sys.path.insert(0, '../')
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
import warnings
warnings.filterwarnings('ignore')

# Import src modules
import src.models as md
import src.import_data as id
import src.utils_preprocessing as up
import src.utils_deep as ud
import src.plot_deep as plt_d

# Random seed
np.random.seed(42)
torch.manual_seed(42)

# GPU check
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f" Device: {device}")
if device.type == 'cuda':
    print(f" GPU: {torch.cuda.get_device_name(0)}")
    print(f" VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
print(" Libraries OK")

 Device: cuda
 GPU: NVIDIA GeForce RTX 4070 Laptop GPU
 VRAM: 8.59 GB
 Libraries OK


## 2. Chargement des données IA_only

In [7]:
DATA_DIR = '../data/'

# Charger IA_only (données d'entraînement cohérentes)
embeddings_768 = np.load(DATA_DIR + 'embeddings_IA_only_all.npy')
df_ia_only = pd.read_excel(DATA_DIR + 'data_IA_only_with_embeddings_index.xlsx')

# Variables depuis src.import_data
vars_expl = id.vars_expl
vars_cibles = id.vars_cibles

X_vars = df_ia_only[vars_expl].values
y_data = df_ia_only[vars_cibles].values

print(f" Dim Embeddings 768D: {embeddings_768.shape}")
print(f" Dim Variables explicatives: {X_vars.shape}")
print(f" Dim Cibles: {y_data.shape}")
print(f" Produits uniques: {len(df_ia_only['Nom'].unique())}")

 Dim Embeddings 768D: (6352, 768)
 Dim Variables explicatives: (6352, 10)
 Dim Cibles: (6352, 11)
 Produits uniques: 73


## 3. Autoencoder (768D → 16D)

In [8]:
# Créer dataloaders pour l'autoencoder (60/20/20 split)
train_loader_emb, val_loader_emb, test_loader_emb, scaler_X_emb, scaler_y_emb = up.split_2_DataLoader(
    embeddings_768, embeddings_768, train_size=0.6, val_size=0.25
)

print(f" ✓ AutoEncoder DataLoaders created")
print(f"   Train: {len(train_loader_emb.dataset)} | Val: {len(val_loader_emb.dataset)} | Test: {len(test_loader_emb.dataset)}")

# Initialiser autoencoder
model_auto_encoder = md.MirrorAutoEncoder().to(device)
optimizer_auto = torch.optim.AdamW(model_auto_encoder.parameters(), lr=1e-3, weight_decay=1e-4)
criterion_auto = F.smooth_l1_loss

# Entraîner avec la fonction du module
print(f"\n Training AutoEncoder (768 → 16)...")
model_auto_encoder, history_auto = ud.train_model(
    model_auto_encoder, train_loader_emb, val_loader_emb, optimizer_auto, criterion_auto, device
)

train_loss_auto = history_auto["train_loss"]
val_loss_auto = history_auto["val_loss"]

# Sauvegarder poids
ud.save_weights("../models/model_auto_Ba.pth", model_auto_encoder)

# Créer encoder gelé
frozen_encoder = md.FrozenEncoder(model_auto_encoder.encoder).to(device)

print(f" ✓ AutoEncoder training done")

 ✓ AutoEncoder DataLoaders created
   Train: 3811 | Val: 635 | Test: 1906

 Training AutoEncoder (768 → 16)...


  0%|          | 1/500 [00:01<11:46,  1.42s/it]

Epoch   1 | Train                 0.1448 | Val 0.0342


  2%|▏         | 11/500 [00:13<10:13,  1.25s/it]

Epoch  11 | Train                 0.0004 | Val 0.0007


  4%|▍         | 21/500 [00:24<06:12,  1.28it/s]

Epoch  21 | Train                 0.0000 | Val 0.0000


  6%|▌         | 31/500 [00:28<03:28,  2.25it/s]

Epoch  31 | Train                 0.0007 | Val 0.0011


  8%|▊         | 41/500 [00:32<03:04,  2.49it/s]

Epoch  41 | Train                 0.0000 | Val 0.0000


 10%|█         | 51/500 [00:36<02:50,  2.64it/s]

Epoch  51 | Train                 0.0001 | Val 0.0003


 12%|█▏        | 61/500 [00:40<02:56,  2.49it/s]

Epoch  61 | Train                 0.0001 | Val 0.0002


 14%|█▍        | 71/500 [00:44<02:47,  2.56it/s]

Epoch  71 | Train                 0.0000 | Val 0.0001


 16%|█▌        | 81/500 [00:48<02:39,  2.63it/s]

Epoch  81 | Train                 0.0002 | Val 0.0002


 18%|█▊        | 91/500 [00:52<02:34,  2.64it/s]

Epoch  91 | Train                 0.0094 | Val 0.0248


 20%|██        | 101/500 [00:56<02:38,  2.51it/s]

Epoch 101 | Train                 0.0000 | Val 0.0000


 22%|██▏       | 111/500 [01:00<02:37,  2.47it/s]

Epoch 111 | Train                 0.0003 | Val 0.0002


 23%|██▎       | 117/500 [01:03<03:26,  1.86it/s]


Early stopping at epoch 118
Training complete. Best val loss: 0.0000
Weights saved to ../models/model_auto_Ba.pth
 ✓ AutoEncoder training done





## 4. Encodage des embeddings + Similarité Cosine

In [9]:
# Encode all embeddings using frozen encoder (src.models.FrozenEncoder)
frozen_encoder.eval()

# Transform embeddings using the scaler from autoencoder training
X_emb_768_scaled = scaler_X_emb.transform(embeddings_768)
X_emb_768_tensor = torch.FloatTensor(X_emb_768_scaled).to(device)

with torch.no_grad():
    embeddings_16 = frozen_encoder(X_emb_768_tensor).cpu().numpy()

# Combine embeddings + variables explicatives
X_combined = np.hstack([embeddings_16, X_vars])

print(f" ✓ Embeddings 16D: {embeddings_16.shape}")
print(f" ✓ Features combined: {X_combined.shape} (16 latent + 10 vars)")

 ✓ Embeddings 16D: (6352, 16)
 ✓ Features combined: (6352, 26) (16 latent + 10 vars)


In [10]:
# Compute cosine similarity matrices
# Get unique product indices
unique_products = df_ia_only['Nom'].unique()
product_indices = {}
for nom in unique_products:
    product_indices[nom] = np.where(df_ia_only['Nom'].values == nom)[0][0]  # First occurrence

n_products = len(unique_products)

# Similarity matrices (unique products only)
unique_emb_768 = embeddings_768[[product_indices[nom] for nom in unique_products]]
unique_emb_16 = embeddings_16[[product_indices[nom] for nom in unique_products]]

sim_matrix_768 = cosine_similarity(unique_emb_768)
sim_matrix_16 = cosine_similarity(unique_emb_16)

# Remove diagonal
np.fill_diagonal(sim_matrix_768, 0)
np.fill_diagonal(sim_matrix_16, 0)

print(f" ✓ Similarity matrix 768D: {sim_matrix_768.shape}")
print(f" ✓ Similarity matrix 16D: {sim_matrix_16.shape}")

 ✓ Similarity matrix 768D: (73, 73)
 ✓ Similarity matrix 16D: (73, 73)


## 5. LOPO-CV avec MLP (src.models.MLPRegressor)

In [17]:
from tqdm import tqdm
import sys

results_lopo = []
all_y_true = []
all_y_pred = []

print(f"\n Starting LOPO-CV on {n_products} products...\n")
sys.stdout.flush()

for idx_prod, nom_test in tqdm(enumerate(unique_products), total=n_products, desc="LOPO-CV Progress"):
    # Afficher le produit en cours
    print(f"\n[{idx_prod+1:2d}/{n_products}] Training on: {nom_test:40s}", end='', flush=True)
    
    # Get product index in similarity matrices
    prod_sim_idx = list(unique_products).index(nom_test)
    
    # Max similarity in 768D and 16D
    max_sim_768 = np.max(sim_matrix_768[prod_sim_idx])
    max_sim_16 = np.max(sim_matrix_16[prod_sim_idx])
    
    # Mean distance (1 - similarity)
    semantic_distance_768 = 1 - np.mean(sim_matrix_768[prod_sim_idx])
    semantic_distance_16 = 1 - np.mean(sim_matrix_16[prod_sim_idx])
    
    # Split train/test
    mask_test = df_ia_only['Nom'].values == nom_test
    mask_train = ~mask_test
    
    X_train = X_combined[mask_train]
    X_test = X_combined[mask_test]
    y_train = y_data[mask_train]
    y_test = y_data[mask_test]
    
    # Normalize
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)
    y_train_scaled = scaler_y.fit_transform(y_train)
    y_test_scaled = scaler_y.transform(y_test)
    
    # DataLoaders
    train_dataset = TensorDataset(
        torch.FloatTensor(X_train_scaled),
        torch.FloatTensor(y_train_scaled)
    )
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    
    # Create val loader for early stopping
    val_dataset = TensorDataset(
        torch.FloatTensor(X_test_scaled),
        torch.FloatTensor(y_test_scaled)
    )
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    
    # Train MLP using src.models.MLPRegressor with early stopping
    mlp = md.MLPRegressor(input_size=26, hidden_sizes=[128, 256, 64], output_size=11, dropout_rate=0.2).to(device)
    optimizer = optim.Adam(mlp.parameters(), lr=1e-3, weight_decay=1e-5)
    criterion = nn.MSELoss()
    
    # Early stopping
    best_val_loss = float('inf')
    patience = 10
    patience_counter = 0
    
    for epoch in range(50):  # Réduit de 100 à 50
        # Train
        mlp.train()
        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            optimizer.zero_grad()
            y_pred = mlp(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
        
        # Validate pour early stopping
        mlp.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = mlp(X_batch)
                loss = criterion(y_pred, y_batch)
                val_loss += loss.item() * X_batch.size(0)
        val_loss /= len(val_loader.dataset)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            best_state = mlp.state_dict().copy()
        else:
            patience_counter += 1
            if patience_counter >= patience:
                mlp.load_state_dict(best_state)
                break
    
    # Test
    mlp.eval()
    with torch.no_grad():
        X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
        y_pred_scaled = mlp(X_test_tensor).cpu().numpy()
    
    # Inverse scaling
    y_pred = scaler_y.inverse_transform(y_pred_scaled)
    
    # Metrics per target
    r2_targets = []
    mae_targets = []
    rmse_targets = []
    
    for col in range(y_test.shape[1]):
        if len(y_test) > 1 and np.std(y_test[:, col]) > 0:
            r2_targets.append(r2_score(y_test[:, col], y_pred[:, col]))
            mae_targets.append(mean_absolute_error(y_test[:, col], y_pred[:, col]))
            rmse_targets.append(np.sqrt(mean_squared_error(y_test[:, col], y_pred[:, col])))
        else:
            r2_targets.append(np.nan)
            mae_targets.append(np.nan)
            rmse_targets.append(np.nan)
    
    all_y_true.append(y_test)
    all_y_pred.append(y_pred)
    
    results_lopo.append({
        'Produit': nom_test,
        'Nb_Echantillons_Test': len(y_test),
        'Sim_768D': max_sim_768,
        'Sim_16D': max_sim_16,
        'Dist_Semantic_768D': semantic_distance_768,
        'Dist_Semantic_16D': semantic_distance_16,
        'R2_Targets': r2_targets,
        'MAE_Targets': mae_targets,
        'RMSE_Targets': rmse_targets,
        'R2_Local': np.nanmean(r2_targets)
    })
    
    print(f" → R²={np.nanmean(r2_targets):.4f}", flush=True)

df_lopo = pd.DataFrame(results_lopo)
print(f"\n\n ✓ LOPO-CV complete")


 Starting LOPO-CV on 73 products...



LOPO-CV Progress:   0%|          | 0/73 [00:00<?, ?it/s]


[ 1/73] Training on: Avoine                                   → R²=-10.3553


LOPO-CV Progress:   1%|▏         | 1/73 [00:03<04:33,  3.79s/it]


[ 2/73] Training on: Blé tendre                               → R²=-2.3697


LOPO-CV Progress:   3%|▎         | 2/73 [00:07<04:32,  3.84s/it]


[ 3/73] Training on: Concentré protéique de luzerne           → R²=-118.7923


LOPO-CV Progress:   4%|▍         | 3/73 [00:22<10:05,  8.64s/it]


[ 4/73] Training on: Coproduits de biscuiterie                → R²=0.3445


LOPO-CV Progress:   5%|▌         | 4/73 [00:32<10:53,  9.47s/it]


[ 5/73] Training on: Coques de soja                           → R²=-65.6707


LOPO-CV Progress:   7%|▋         | 5/73 [00:40<09:49,  8.67s/it]


[ 6/73] Training on: Corn gluten feed                         → R²=-3.0012


LOPO-CV Progress:   8%|▊         | 6/73 [00:45<08:17,  7.42s/it]


[ 7/73] Training on: Corn gluten meal                         → R²=-41.9811


LOPO-CV Progress:  10%|▉         | 7/73 [00:51<07:43,  7.03s/it]


[ 8/73] Training on: Drêches de blé de distillerie, amidon < 7 %  → R²=-1.3593


LOPO-CV Progress:  11%|█         | 8/73 [01:07<10:46,  9.95s/it]


[ 9/73] Training on: Drêches de blé de distillerie, amidon > 7 %  → R²=-5.2161


LOPO-CV Progress:  12%|█▏        | 9/73 [01:14<09:46,  9.16s/it]


[10/73] Training on: Farine basse de blé tendre               → R²=-1.2857


LOPO-CV Progress:  14%|█▎        | 10/73 [01:19<08:00,  7.63s/it]


[11/73] Training on: Farine de poisson (toutes)               → R²=-16.7071


LOPO-CV Progress:  15%|█▌        | 11/73 [01:23<07:00,  6.78s/it]


[12/73] Training on: Farine de viande (toutes)                → R²=-2.1417


LOPO-CV Progress:  16%|█▋        | 12/73 [01:28<06:07,  6.02s/it]


[13/73] Training on: Féverole à fleurs blanches               → R²=-12.0767


LOPO-CV Progress:  18%|█▊        | 13/73 [01:39<07:30,  7.51s/it]


[14/73] Training on: Gluten feed de blé, type 20% amidon      → R²=-1.8685


LOPO-CV Progress:  19%|█▉        | 14/73 [01:43<06:35,  6.70s/it]


[15/73] Training on: Graine de lin                            → R²=-0.9362


LOPO-CV Progress:  21%|██        | 15/73 [01:48<05:46,  5.98s/it]


[16/73] Training on: Graine de soja extrudée                  → R²=-13.6733


LOPO-CV Progress:  22%|██▏       | 16/73 [01:54<05:42,  6.00s/it]


[17/73] Training on: Graine de tournesol                      → R²=-39.5602


LOPO-CV Progress:  23%|██▎       | 17/73 [02:08<07:47,  8.35s/it]


[18/73] Training on: Lupin générique                          → R²=-42.1448


LOPO-CV Progress:  25%|██▍       | 18/73 [02:11<06:23,  6.97s/it]


[19/73] Training on: Luzerne déshydratée, générique           → R²=-1.1679


LOPO-CV Progress:  26%|██▌       | 19/73 [02:15<05:22,  5.97s/it]


[20/73] Training on: Maïs                                     → R²=-19.5605


LOPO-CV Progress:  27%|██▋       | 20/73 [02:19<04:42,  5.33s/it]


[21/73] Training on: Manioc, amidon 70-74 %                   → R²=-20.6027


LOPO-CV Progress:  29%|██▉       | 21/73 [02:26<05:06,  5.90s/it]


[22/73] Training on: Orge                                     → R²=-18.9851


LOPO-CV Progress:  30%|███       | 22/73 [02:30<04:32,  5.35s/it]


[23/73] Training on: Pois                                     → R²=-5.1278


LOPO-CV Progress:  32%|███▏      | 23/73 [02:36<04:30,  5.41s/it]


[24/73] Training on: Pois chiche, type Kabuli                 → R²=-3.6806


LOPO-CV Progress:  33%|███▎      | 24/73 [02:40<04:13,  5.18s/it]


[25/73] Training on: Pulpe d'agrumes déshydratée              → R²=-59.4850


LOPO-CV Progress:  34%|███▍      | 25/73 [02:44<03:44,  4.67s/it]


[26/73] Training on: Pulpe de betterave déshydratée           → R²=-61.0056


LOPO-CV Progress:  36%|███▌      | 26/73 [02:47<03:21,  4.29s/it]


[27/73] Training on: Radicelles d'orge de brasserie déshydratées  → R²=-0.9524


LOPO-CV Progress:  37%|███▋      | 27/73 [02:55<03:58,  5.18s/it]


[28/73] Training on: Remoulage de blé tendre (tous)           → R²=0.4396


LOPO-CV Progress:  38%|███▊      | 28/73 [03:06<05:20,  7.13s/it]


[29/73] Training on: Seigle                                   → R²=-7.6622


LOPO-CV Progress:  40%|███▉      | 29/73 [03:10<04:31,  6.16s/it]


[30/73] Training on: Son de blé tendre                        → R²=-24.0603


LOPO-CV Progress:  41%|████      | 30/73 [03:17<04:36,  6.44s/it]


[31/73] Training on: Sorgho                                   → R²=-6.8218


LOPO-CV Progress:  42%|████▏     | 31/73 [03:24<04:31,  6.46s/it]


[32/73] Training on: Tourteau d'arachide, huile < 5 %, cellulose brute > 9 %  → R²=-2.2029


LOPO-CV Progress:  44%|████▍     | 32/73 [03:30<04:22,  6.40s/it]


[33/73] Training on: Tourteau de colza, huile < 5 %           → R²=-25.8389


LOPO-CV Progress:  45%|████▌     | 33/73 [03:41<05:08,  7.70s/it]


[34/73] Training on: Tourteau de colza, huile 5-20 %          → R²=-5.4074


LOPO-CV Progress:  47%|████▋     | 34/73 [03:50<05:21,  8.25s/it]


[35/73] Training on: Tourteau de coprah, huile 5-20%          → R²=-1.0986


LOPO-CV Progress:  48%|████▊     | 35/73 [03:58<05:13,  8.24s/it]


[36/73] Training on: Tourteau de germes de maïs, huile < 5 %  → R²=-2.2970


LOPO-CV Progress:  49%|████▉     | 36/73 [04:06<04:56,  8.02s/it]


[37/73] Training on: Tourteau de germes de maïs, huile 5-20 %  → R²=-7.6410


LOPO-CV Progress:  51%|█████     | 37/73 [04:14<04:52,  8.14s/it]


[38/73] Training on: Tourteau de lin, huile > 5 %             → R²=-3.3121


LOPO-CV Progress:  52%|█████▏    | 38/73 [04:22<04:37,  7.94s/it]


[39/73] Training on: Tourteau de palmiste, huile 5-20%        → R²=-7.3771


LOPO-CV Progress:  53%|█████▎    | 39/73 [04:26<03:49,  6.75s/it]


[40/73] Training on: Tourteau de soja, huile < 5 %, 48 % protéine + huile  → R²=-32.3708


LOPO-CV Progress:  55%|█████▍    | 40/73 [04:31<03:30,  6.38s/it]


[41/73] Training on: Tourteau de soja, huile < 5 %, 50 % protéine + huile  → R²=-0.2209


LOPO-CV Progress:  56%|█████▌    | 41/73 [04:43<04:15,  7.97s/it]


[42/73] Training on: Tourteau de tournesol, huile < 5 %, décortiqué  → R²=-57.9989


LOPO-CV Progress:  58%|█████▊    | 42/73 [04:48<03:39,  7.09s/it]


[43/73] Training on: Tourteau de tournesol, huile < 5 %, non décortiqué  → R²=-17.0963


LOPO-CV Progress:  59%|█████▉    | 43/73 [04:52<03:06,  6.21s/it]


[44/73] Training on: Triticale                                → R²=-1.4209


LOPO-CV Progress:  60%|██████    | 44/73 [04:57<02:45,  5.72s/it]


[45/73] Training on: Riz                                      → R²=-0.0960


LOPO-CV Progress:  62%|██████▏   | 45/73 [05:01<02:28,  5.31s/it]


[46/73] Training on: Drêches de maïs de distillerie avec solubles, déshydratées, huile < 6 %, protéines < 30 % → R²=-5.9912


LOPO-CV Progress:  63%|██████▎   | 46/73 [05:05<02:13,  4.96s/it]


[47/73] Training on: Drêches de maïs de distillerie avec solubles, déshydratées, huile > 6 % → R²=-0.9236


LOPO-CV Progress:  64%|██████▍   | 47/73 [05:12<02:26,  5.64s/it]


[48/73] Training on: Farine fourragère de maïs                → R²=-3.1038


LOPO-CV Progress:  66%|██████▌   | 48/73 [05:21<02:45,  6.63s/it]


[49/73] Training on: Germes de maïs                           → R²=-3.4311


LOPO-CV Progress:  67%|██████▋   | 49/73 [05:30<02:54,  7.28s/it]


[50/73] Training on: Son de maïs                              → R²=-10.5050


LOPO-CV Progress:  68%|██████▊   | 50/73 [05:34<02:23,  6.23s/it]


[51/73] Training on: Coques de sarrasin                       → R²=-20.1777


LOPO-CV Progress:  70%|██████▉   | 51/73 [05:42<02:32,  6.91s/it]


[52/73] Training on: Drêches d'orge de brasserie déshydratées → R²=-2.8664


LOPO-CV Progress:  71%|███████   | 52/73 [05:47<02:11,  6.26s/it]


[53/73] Training on: Farine basse de riz                      → R²=0.3706


LOPO-CV Progress:  73%|███████▎  | 53/73 [05:53<02:00,  6.04s/it]


[54/73] Training on: Son de riz, huile < 5 %                  → R²=0.6255


LOPO-CV Progress:  74%|███████▍  | 54/73 [06:01<02:04,  6.56s/it]


[55/73] Training on: Son de riz, huile > 5 %                  → R²=0.6429


LOPO-CV Progress:  75%|███████▌  | 55/73 [06:15<02:39,  8.87s/it]


[56/73] Training on: Graine de colza                          → R²=-8.9372


LOPO-CV Progress:  77%|███████▋  | 56/73 [06:18<02:02,  7.19s/it]


[57/73] Training on: Graine de coton                          → R²=-0.3029


LOPO-CV Progress:  78%|███████▊  | 57/73 [06:26<02:00,  7.53s/it]


[58/73] Training on: Grignon d'olive                          → R²=-13.4707


LOPO-CV Progress:  79%|███████▉  | 58/73 [06:30<01:34,  6.30s/it]


[59/73] Training on: Tourteau de coton, huile < 5 %           → R²=-0.3623


LOPO-CV Progress:  81%|████████  | 59/73 [06:34<01:18,  5.61s/it]


[60/73] Training on: Tourteau de coton, huile 5-20 %          → R²=0.2631


LOPO-CV Progress:  82%|████████▏ | 60/73 [06:42<01:22,  6.36s/it]


[61/73] Training on: Tourteau de tournesol, huile > 5 %       → R²=-1.0603


LOPO-CV Progress:  84%|████████▎ | 61/73 [06:49<01:19,  6.64s/it]


[62/73] Training on: Amidon de maïs                           → R²=-4.1663


LOPO-CV Progress:  85%|████████▍ | 62/73 [06:54<01:06,  6.05s/it]


[63/73] Training on: Marc ou pulpe de raisin déshydraté       → R²=-8.1766


LOPO-CV Progress:  86%|████████▋ | 63/73 [06:58<00:53,  5.38s/it]


[64/73] Training on: Mélasse de canne ou de betterave         → R²=-8.5498


LOPO-CV Progress:  88%|████████▊ | 64/73 [07:02<00:45,  5.07s/it]


[65/73] Training on: Vinasse de levurerie                     → R²=-748.1454


LOPO-CV Progress:  89%|████████▉ | 65/73 [07:12<00:51,  6.41s/it]


[66/73] Training on: Concentré protéique de soja              → R²=-1.6529


LOPO-CV Progress:  90%|█████████ | 66/73 [07:23<00:55,  8.00s/it]


[67/73] Training on: Farine de gousse de caroube              → R²=-51.1963


LOPO-CV Progress:  92%|█████████▏| 67/73 [07:26<00:39,  6.53s/it]


[68/73] Training on: Levure de brasserie déshydratée          → R²=-16.0158


LOPO-CV Progress:  93%|█████████▎| 68/73 [07:37<00:39,  7.84s/it]


[69/73] Training on: Paille de blé                            → R²=-444.4343


LOPO-CV Progress:  95%|█████████▍| 69/73 [07:41<00:26,  6.63s/it]


[70/73] Training on: Lactosérum écrèmé déshydraté (doux ou acide) → R²=-7.5939


LOPO-CV Progress:  96%|█████████▌| 70/73 [07:48<00:19,  6.64s/it]


[71/73] Training on: Lactosérum réengraissé                   → R²=-2.2207


LOPO-CV Progress:  97%|█████████▋| 71/73 [07:54<00:13,  6.55s/it]


[72/73] Training on: Poudre de lait entier                    → R²=-11.9163


LOPO-CV Progress:  99%|█████████▊| 72/73 [07:58<00:05,  5.87s/it]


[73/73] Training on: Farine de plumes                         → R²=-13.2894


LOPO-CV Progress: 100%|██████████| 73/73 [08:04<00:00,  6.64s/it]



 ✓ LOPO-CV complete





## 6. Analyse des résultats globaux

In [18]:
# Global metrics
y_true_all = np.vstack(all_y_true)
y_pred_all = np.vstack(all_y_pred)

r2_global = r2_score(y_true_all, y_pred_all, multioutput='raw_values')
mae_global = mean_absolute_error(y_true_all, y_pred_all, multioutput='raw_values')
rmse_global = np.sqrt(mean_squared_error(y_true_all, y_pred_all, multioutput='raw_values'))

r2_global_weighted = r2_score(y_true_all, y_pred_all, multioutput='variance_weighted')
r2_global_uniform = r2_score(y_true_all, y_pred_all, multioutput='uniform_average')

print("\n" + "="*70)
print(" RÉSULTATS LOPO-CV (MLP)")
print("="*70)

print(f"\n R² GLOBAL (ALL SAMPLES):")
print(f"  Weighted Average: {r2_global_weighted:.4f}")
print(f"  Uniform Average:  {r2_global_uniform:.4f}")
print(f"  Mean per-target:  {np.mean(r2_global):.4f} ± {np.std(r2_global):.4f}")

print(f"\n R² PER TARGET (GLOBAL):")
summary_df = pd.DataFrame({
    'Cible': [v.split(' (')[0] for v in vars_cibles],
    'R2': r2_global,
    'MAE': mae_global,
    'RMSE': rmse_global
}).sort_values('R2', ascending=False)

print(summary_df.to_string(index=False))

print(f"\n R² LOCAL (PER PRODUCT):")
print(f"  Mean:   {df_lopo['R2_Local'].mean():.4f}")
print(f"  Std:    {df_lopo['R2_Local'].std():.4f}")
print(f"  Min:    {df_lopo['R2_Local'].min():.4f}")
print(f"  Max:    {df_lopo['R2_Local'].max():.4f}")


 RÉSULTATS LOPO-CV (MLP)

 R² GLOBAL (ALL SAMPLES):
  Weighted Average: 0.8412
  Uniform Average:  0.7924
  Mean per-target:  0.7924 ± 0.0685

 R² PER TARGET (GLOBAL):
                  Cible       R2        MAE       RMSE
     EN porc croissance 0.895167 157.341846 228.907921
                     EB 0.878748 158.797146 218.324046
               EMAn coq 0.845679 243.772248 320.517238
            EMAn poulet 0.841427 238.392784 315.240959
     EM porc croissance 0.811403 239.752689 349.665384
     ED porc croissance 0.809315 250.836143 365.543339
   UFL 2018 par kg brut 0.777427   0.098385   0.136018
   UFV 2018 par kg brut 0.760870   0.115703   0.159409
BalProRu 2018 g/kg brut 0.704931  35.350971  53.699817
     PDI 2018 g/kg brut 0.700980  24.357815  43.695870
    PDIA 2018 g/kg brut 0.690103  25.144386  45.150460

 R² LOCAL (PER PRODUCT):
  Mean:   -29.5676
  Std:    101.1520
  Min:    -748.1454
  Max:    0.6429


## 7. Corrélation R² vs Similarité Cosine

In [19]:
# Correlations
corr_r2_sim768 = np.corrcoef(df_lopo['R2_Local'], df_lopo['Sim_768D'])[0, 1]
corr_r2_sim16 = np.corrcoef(df_lopo['R2_Local'], df_lopo['Sim_16D'])[0, 1]
corr_r2_dist768 = np.corrcoef(df_lopo['R2_Local'], df_lopo['Dist_Semantic_768D'])[0, 1]
corr_r2_dist16 = np.corrcoef(df_lopo['R2_Local'], df_lopo['Dist_Semantic_16D'])[0, 1]

print("\n" + "="*70)
print(" CORRÉLATIONS R² LOCAL vs SIMILARITÉ")
print("="*70)

print(f"\n R² vs Similarité:")
print(f"  768D: {corr_r2_sim768:.4f}")
print(f"  16D:  {corr_r2_sim16:.4f}")

print(f"\n R² vs Distance sémantique (1 - Sim):")
print(f"  768D: {corr_r2_dist768:.4f}")
print(f"  16D:  {corr_r2_dist16:.4f}")


 CORRÉLATIONS R² LOCAL vs SIMILARITÉ

 R² vs Similarité:
  768D: 0.1052
  16D:  0.1164

 R² vs Distance sémantique (1 - Sim):
  768D: -0.0135
  16D:  -0.1038


## 8. Top/Flop Produits

In [20]:
print("\n" + "="*70)
print(" TOP 10 - Meilleurs R² locaux")
print("="*70)

top_10 = df_lopo.nlargest(10, 'R2_Local')[['Produit', 'R2_Local', 'Sim_768D', 'Dist_Semantic_768D']]
print(top_10.to_string(index=False))

print("\n" + "="*70)
print(" FLOP 10 - Pires R² locaux")
print("="*70)

flop_10 = df_lopo.nsmallest(10, 'R2_Local')[['Produit', 'R2_Local', 'Sim_768D', 'Dist_Semantic_768D']]
print(flop_10.to_string(index=False))


 TOP 10 - Meilleurs R² locaux
                                              Produit  R2_Local  Sim_768D  Dist_Semantic_768D
                              Son de riz, huile > 5 %  0.642948  0.929852            0.554623
                              Son de riz, huile < 5 %  0.625501  0.929852            0.560494
                      Remoulage de blé tendre (tous)   0.439569  0.837774            0.508948
                                  Farine basse de riz  0.370590  0.670092            0.555215
                           Coproduits de biscuiterie   0.344519  0.612149            0.649767
                      Tourteau de coton, huile 5-20 %  0.263074  0.709508            0.715295
                                                  Riz -0.095955  0.663984            0.671180
Tourteau de soja, huile < 5 %, 50 % protéine + huile  -0.220868  0.919070            0.518383
                                      Graine de coton -0.302937  0.608560            0.635546
                       Tourte

## 9. Visualisations

In [21]:
# Scatter plot: R² vs Similarité
fig = sp.make_subplots(
    rows=1, cols=2,
    subplot_titles=(
        f"R² vs Similarity 768D (r={corr_r2_sim768:.3f})",
        f"R² vs Similarity 16D (r={corr_r2_sim16:.3f})"
    )
)

fig.add_trace(
    go.Scatter(
        x=df_lopo['Sim_768D'], y=df_lopo['R2_Local'],
        mode='markers',
        marker=dict(size=8, color='steelblue', opacity=0.6),
        text=df_lopo['Produit'],
        hovertemplate='<b>%{text}</b><br>Sim: %{x:.3f}<br>R²: %{y:.3f}<extra></extra>',
        showlegend=False
    ),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(
        x=df_lopo['Sim_16D'], y=df_lopo['R2_Local'],
        mode='markers',
        marker=dict(size=8, color='coral', opacity=0.6),
        text=df_lopo['Produit'],
        hovertemplate='<b>%{text}</b><br>Sim: %{x:.3f}<br>R²: %{y:.3f}<extra></extra>',
        showlegend=False
    ),
    row=1, col=2
)

fig.update_xaxes(title_text="Similarity", row=1, col=1)
fig.update_xaxes(title_text="Similarity", row=1, col=2)
fig.update_yaxes(title_text="R² Local", row=1, col=1)
fig.update_yaxes(title_text="R² Local", row=1, col=2)

fig.update_layout(height=500, width=1000, template='plotly_white',
                  title_text="<b>Robustesse sémantique</b> : R² Local vs Similarité Cosine")
fig.show()

In [23]:
import plotly.graph_objects as go
import plotly.subplots as sp
import numpy as np

# 1. Traitement des données pour l'affichage (Clipping)
# On limite les R² très bas à -1 pour que les points positifs (0 à 1) 
# occupent 50% de l'espace visuel au lieu d'être écrasés par un outlier à -50.
df_plot = df_lopo.copy()
min_visu = -1.0  # Seuil de visualisation pour ne pas écraser l'échelle
df_plot['R2_Visu'] = df_plot['R2_Local'].clip(lower=min_visu)

# Scatter plot: R² vs Similarité
fig = sp.make_subplots(
    rows=1, cols=2,
    subplot_titles=(
        f"R² vs Similarity 768D (r={corr_r2_sim768:.3f})",
        f"R² vs Similarity 16D (r={corr_r2_sim16:.3f})"
    )
)

# --- Subplot 1: 768D ---
fig.add_trace(
    go.Scatter(
        x=df_plot['Sim_768D'], 
        y=df_plot['R2_Visu'],
        mode='markers',
        marker=dict(
            size=10, 
            color=df_plot['R2_Local'], # Couleur basée sur la vraie valeur
            colorscale='RdBu', 
            showscale=True,
            colorbar=dict(title="R² Réel", x=1.1)
        ),
        text=df_plot['Produit'],
        # Le hover affiche la vraie valeur de R², pas la valeur clippée
        customdata=df_plot['R2_Local'],
        hovertemplate='<b>%{text}</b><br>Sim: %{x:.3f}<br>R² Réel: %{customdata:.3f}<extra></extra>',
        showlegend=False
    ),
    row=1, col=1
)

# --- Subplot 2: 16D ---
fig.add_trace(
    go.Scatter(
        x=df_plot['Sim_16D'], 
        y=df_plot['R2_Visu'],
        mode='markers',
        marker=dict(
            size=10, 
            color=df_plot['R2_Local'], 
            colorscale='RdBu'
        ),
        text=df_plot['Produit'],
        customdata=df_plot['R2_Local'],
        hovertemplate='<b>%{text}</b><br>Sim: %{x:.3f}<br>R² Réel: %{customdata:.3f}<extra></extra>',
        showlegend=False
    ),
    row=1, col=2
)

# --- Configuration des axes ---
# On force l'affichage de la zone positive
fig.update_yaxes(title_text="R² Local (clippé à -1)", range=[min_visu - 0.1, 1.1], row=1, col=1)
fig.update_yaxes(title_text="R² Local (clippé à -1)", range=[min_visu - 0.1, 1.1], row=1, col=2)

fig.update_xaxes(title_text="Similarity", row=1, col=1)
fig.update_xaxes(title_text="Similarity", row=1, col=2)

# Ajout d'une ligne horizontale à 0 pour bien séparer les succès des échecs
fig.add_hline(y=0, line_dash="dash", line_color="black", opacity=0.5)

fig.update_layout(
    height=600, width=1100, 
    template='plotly_white',
    title_text="<b>Robustesse sémantique</b> : Focus sur les R² positifs vs négatifs",
    margin=dict(r=150) # Espace pour la colorbar
)

fig.show()

In [22]:
# True vs Pred per target
fig = sp.make_subplots(
    rows=4, cols=3,
    subplot_titles=[v.split(' (')[0] for v in vars_cibles],
    vertical_spacing=0.08,
    horizontal_spacing=0.08
)

for i, name in enumerate(vars_cibles):
    row = (i // 3) + 1
    col = (i % 3) + 1
    
    fig.add_trace(
        go.Scatter(
            x=y_true_all[:, i], y=y_pred_all[:, i],
            mode='markers',
            marker=dict(size=4, color='royalblue', opacity=0.5),
            showlegend=False
        ),
        row=row, col=col
    )
    
    # Diagonal line
    min_v = min(y_true_all[:, i].min(), y_pred_all[:, i].min())
    max_v = max(y_true_all[:, i].max(), y_pred_all[:, i].max())
    fig.add_trace(
        go.Scatter(
            x=[min_v, max_v], y=[min_v, max_v],
            mode='lines',
            line=dict(color='red', dash='dash'),
            showlegend=False
        ),
        row=row, col=col
    )
    
    fig.update_xaxes(title_text="Vrai", row=row, col=col)
    fig.update_yaxes(title_text="Prédit", row=row, col=col)

fig.update_layout(height=1200, width=1000, template='plotly_white',
                  title_text="<b>LOPO-CV Results</b> : True vs Predicted Values")
fig.show()