In [1]:
import numpy as np
import pandas as pd
import os
import time
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
import torch
import gc

# je definis les chemins
path_root = r"C:\Users\amisf\Desktop\datascientest_projet"
path_out = os.path.join(path_root, "implementation", "outputs")

# je verifie si le gpu est dispo pour xgboost
# xgboost a besoin de savoir si cuda est la
use_gpu = False
try:
    if torch.cuda.is_available():
        print("gpu detecte on va accelerer xgboost")
        use_gpu = True
    else:
        print("pas de gpu on reste sur cpu")
except:
    print("erreur detection gpu passage cpu")

# je cree le dossier sortie si besoin
if not os.path.exists(path_out):
    os.makedirs(path_out)

gpu detecte on va accelerer xgboost


In [2]:
print("chargement des donnees converties")

# je tente de charger les features extraites par le reseau
# c est ce qui donne les meilleurs resultats en ml classique
try:
    path_x = os.path.join(path_out, 'train_features_resnet50_augmented.npy')
    path_y = os.path.join(path_out, 'train_labels_augmented.npy')
    
    if not os.path.exists(path_x):
        # plan b si le fichier augmente n est pas la je cherche le simple
        path_x = os.path.join(path_out, 'train_features_resnet50.npy')
        path_y = os.path.join(path_out, 'train_labels.npy')

    x_data = np.load(path_x)
    y_data = np.load(path_y)
    
    print("donnees chargees en memoire")
    print(x_data.shape)

except Exception as e:
    print("erreur critique impossible de trouver les fichiers npy")
    print("tu dois avoir extrait les features avant de lancer xgboost")
    raise e

# encodage des cibles
le = LabelEncoder()
y_encoded = le.fit_transform(y_data)

# split train val
# je garde 20% pour verifier si on overfit pas
x_train, x_val, y_train, y_val = train_test_split(
    x_data, y_encoded, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_encoded
)

print("split termine pret a entrainer")

chargement des donnees converties
donnees chargees en memoire
(405000, 2048)
split termine pret a entrainer


In [4]:
print("lancement optimisation xgboost")

# config du modele champion
# je pousse les parametres pour exploiter ta machine
params = {
    'objective': 'multi:softmax',
    'num_class': len(le.classes_),
    'n_estimators': 3000, # on vise haut l arret auto coupera avant
    'max_depth': 8, # profondeur pour capter les nuances
    'learning_rate': 0.05, # vitesse lente pour precision
    'subsample': 0.8, # evite overfit
    'colsample_bytree': 0.8,
    'early_stopping_rounds': 50, # securite anti overfit
    'eval_metric': 'mlogloss'
}

# gestion hardware
if use_gpu:
    params['tree_method'] = 'hist'
    params['device'] = 'cuda'
else:
    params['tree_method'] = 'hist'
    params['device'] = 'cpu'
    params['n_jobs'] = -1

model = xgb.XGBClassifier(**params)

t_start = time.time()
print("demarrage entrainement surveille")

try:
    # je lance l apprentissage
    # le verbose affiche le score regulierement
    model.fit(
        x_train, y_train,
        eval_set=[(x_train, y_train), (x_val, y_val)],
        verbose=10
    )
    print("entrainement gpu termine")

except Exception as e:
    print("echec gpu detection saturation memoire")
    print("bascule automatique sur cpu ram 128go")
    
    # nettoyage avant reprise
    if 'model' in globals(): del model
    torch.cuda.empty_cache()
    gc.collect()
    
    # reconfig cpu forcee
    params['device'] = 'cpu'
    params['n_jobs'] = -1
    model = xgb.XGBClassifier(**params)
    
    model.fit(
        x_train, y_train,
        eval_set=[(x_train, y_train), (x_val, y_val)],
        verbose=10
    )
    print("entrainement cpu termine")

duration = time.time() - t_start
print(f"temps total {duration:.1f} sec")

lancement optimisation xgboost
demarrage entrainement surveille
[0]	validation_0-mlogloss:3.12367	validation_1-mlogloss:3.14457
[10]	validation_0-mlogloss:2.31332	validation_1-mlogloss:2.45704
[20]	validation_0-mlogloss:1.91287	validation_1-mlogloss:2.12983
[30]	validation_0-mlogloss:1.64494	validation_1-mlogloss:1.91686
[40]	validation_0-mlogloss:1.44806	validation_1-mlogloss:1.76370
[50]	validation_0-mlogloss:1.29666	validation_1-mlogloss:1.64825
[60]	validation_0-mlogloss:1.17473	validation_1-mlogloss:1.55718
[70]	validation_0-mlogloss:1.07402	validation_1-mlogloss:1.48375
[80]	validation_0-mlogloss:0.99035	validation_1-mlogloss:1.42363
[90]	validation_0-mlogloss:0.91929	validation_1-mlogloss:1.37334
[100]	validation_0-mlogloss:0.85748	validation_1-mlogloss:1.33024
[110]	validation_0-mlogloss:0.80232	validation_1-mlogloss:1.29228
[120]	validation_0-mlogloss:0.75441	validation_1-mlogloss:1.25961
[130]	validation_0-mlogloss:0.71083	validation_1-mlogloss:1.23014
[140]	validation_0-mlog

In [5]:
print("analyse des performances")

# predictions
preds = model.predict(x_val)
score = f1_score(y_val, preds, average='weighted')

print(f"score f1 final obtenu {score:.4f}")

# rapport detaille
print("rapport par classe")
print(classification_report(y_val, preds))

# sauvegarde intelligente
# je ne sauvegarde que si le modele est valide
save_path = os.path.join(path_out, "best_xgboost_gpu_model.json")
model.save_model(save_path)
print(f"modele sauvegarde sous {save_path}")

# generation csv pour comparaison
results = pd.DataFrame({
    'y_true': le.inverse_transform(y_val),
    'y_pred': le.inverse_transform(preds)
})
csv_path = os.path.join(path_out, "resultats_xgboost_detail.csv")
results.to_csv(csv_path, index=False)
print("fichier csv detaille genere")

analyse des performances


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


score f1 final obtenu 0.8532
rapport par classe
              precision    recall  f1-score   support

           0       0.76      0.79      0.77      3000
           1       0.78      0.78      0.78      3000
           2       0.89      0.89      0.89      3000
           3       0.93      0.98      0.95      3000
           4       0.84      0.86      0.85      3000
           5       0.93      0.93      0.93      3000
           6       0.95      0.96      0.95      3000
           7       0.76      0.62      0.68      3000
           8       0.81      0.77      0.79      3000
           9       0.78      0.85      0.82      3000
          10       0.96      0.99      0.97      3000
          11       0.86      0.87      0.86      3000
          12       0.84      0.80      0.82      3000
          13       0.83      0.73      0.78      3000
          14       0.88      0.89      0.88      3000
          15       0.93      0.99      0.96      3000
          16       0.74      0.75

In [None]:
print("analyse des performances")

# predictions
preds = model.predict(x_val)
score = f1_score(y_val, preds, average='weighted')

print(f"score f1 final obtenu {score:.4f}")

# rapport detaille
print("rapport par classe")
print(classification_report(y_val, preds))

# sauvegarde intelligente
# je ne sauvegarde que si le modele est valide
save_path = os.path.join(path_out, "best_xgboost_gpu_model.json")
model.save_model(save_path)
print(f"modele sauvegarde sous {save_path}")

# generation csv pour comparaison
results = pd.DataFrame({
    'y_true': le.inverse_transform(y_val),
    'y_pred': le.inverse_transform(preds)
})
csv_path = os.path.join(path_out, "resultats_xgboost_detail.csv")
results.to_csv(csv_path, index=False)
print("fichier csv detaille genere")

In [2]:
import pandas as pd
import os
import joblib
from sklearn.preprocessing import LabelEncoder

print(">>> generation de l'encodeur manquant pour m2 <<<")

# chemins
base_dir = r"C:\Users\amisf\Desktop\datascientest_projet"
output_dir = os.path.join(base_dir, "implementation", "outputs")

# on recharge juste les y pour refaire l'encodeur à l'identique
path_y = os.path.join(base_dir, "data", "raw", "Y_train_CVw08PX.csv")

if os.path.exists(path_y):
    # lecture
    y = pd.read_csv(path_y)
    
    # correction nom colonne si besoin
    if 'prdtypecode' not in y.columns: 
        y = y.rename(columns={y.columns[1]: 'prdtypecode'})
        
    # creation encodeur
    le = LabelEncoder()
    le.fit(y['prdtypecode'])
    
    # sauvegarde immediate avec le bon nom m2
    save_path = os.path.join(output_dir, "M2_IMAGE_XGBoost_Encoder.pkl")
    joblib.dump(le, save_path)
    
    print(f"succes ! fichier généré ici : {save_path}")
    
    
else:
    print("erreur : fichier y_train introuvable. verifie tes dossiers.")

>>> generation de l'encodeur manquant pour m2 <<<
succes ! fichier généré ici : C:\Users\amisf\Desktop\datascientest_projet\implementation\outputs\M2_IMAGE_XGBoost_Encoder.pkl


In [3]:
import torch
import torch.nn as nn
import xgboost as xgb
import os
import pandas as pd
import numpy as np
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import cv2
from tqdm import tqdm
import joblib

print(">>> operation sauvetage m2 : re-alignement total <<<")

# config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
path_data = r"C:\Users\amisf\Desktop\datascientest_projet"
path_out = os.path.join(path_data, "implementation", "outputs")
batch_size = 64

# 1. repérage des images (le radar infaillible)
print("scan des images...")
real_path_img = None
candidates = [
    os.path.join(path_data, "data", "raw", "images", "images", "image_train"),
    r"C:\Users\amisf\Desktop\datascientest_projet\data\raw\images\images\image_train"
]
for p in candidates:
    if os.path.exists(p) and len(os.listdir(p)) > 100:
        real_path_img = p; break

if not real_path_img: raise FileNotFoundError("dossier images introuvable")

# 2. preparation dataframe
df = pd.read_csv(os.path.join(path_data, "data", "raw", "X_train_update.csv"))
y = pd.read_csv(os.path.join(path_data, "data", "raw", "Y_train_CVw08PX.csv"))
if 'prdtypecode' not in y.columns: y = y.rename(columns={y.columns[1]: 'prdtypecode'})
df = df.merge(y, left_index=True, right_index=True)
df['path'] = df.apply(lambda r: os.path.join(real_path_img, f"image_{r['imageid']}_product_{r['productid']}.jpg"), axis=1)
# filtre securite
df = df[df['path'].apply(os.path.exists)]

# 3. encodage labels
le = preprocessing.LabelEncoder()
y_encoded = le.fit_transform(df['prdtypecode'])
# on sauvegarde cet encodeur, c'est la reference absolue
joblib.dump(le, os.path.join(path_out, "M2_IMAGE_XGBoost_Encoder.pkl"))
print("encodeur m2 mis a jour.")

# 4. extracteur resnet (standard pytorch)
print("chargement resnet50...")
resnet = models.resnet50(weights="IMAGENET1K_V1")
# on coupe la tete pour garder les 2048 features
extractor = nn.Sequential(*list(resnet.children())[:-1])
extractor.to(device).eval()

# 5. extraction features (c'est la partie un peu longue, ~10min)
print("extraction des features pour tout le dataset...")

class FeatDS(Dataset):
    def __init__(self, paths): self.paths = paths
    def __len__(self): return len(self.paths)
    def __getitem__(self, i):
        try:
            img = cv2.imread(self.paths[i]); img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (224, 224))
            img = img / 255.0
            img = (img - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225])
            return torch.tensor(img.transpose(2,0,1), dtype=torch.float32)
        except: return torch.zeros((3, 224, 224))

ds = FeatDS(df['path'].values)
loader = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=0)

features_list = []
with torch.no_grad():
    for batch in tqdm(loader, desc="extraction"):
        batch = batch.to(device)
        # squeeze pour virer les dimensions 1x1 inutiles
        f = extractor(batch).squeeze(-1).squeeze(-1)
        features_list.append(f.cpu().numpy())

X_features = np.concatenate(features_list)
print(f"features extraites : {X_features.shape}")

# 6. entrainement xgboost express
print("entrainement du nouveau xgboost...")
# split train/val
X_train, X_val, y_train, y_val = train_test_split(X_features, y_encoded, test_size=0.15, stratify=y_encoded, random_state=42)

model_xgb = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    n_estimators=1000,     # suffisant pour etre bon
    max_depth=6,           # standard
    learning_rate=0.05,
    tree_method='hist',    # acceleration gpu si dispo
    device='cuda',
    early_stopping_rounds=20
)

model_xgb.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)

# 7. sauvegarde du sauveur
path_model = os.path.join(path_out, "M2_IMAGE_Classic_XGBoost.json")
model_xgb.save_model(path_model)
print(f">>> M2 REPARÉ ET SAUVEGARDÉ ICI : {path_model}")

# petit check score
acc = accuracy_score(y_val, model_xgb.predict(X_val))
print(f"score de validation du nouveau m2 : {acc:.4f}")

>>> operation sauvetage m2 : re-alignement total <<<
scan des images...
encodeur m2 mis a jour.
chargement resnet50...
extraction des features pour tout le dataset...


extraction: 100%|██████████| 1327/1327 [10:49<00:00,  2.04it/s]


features extraites : (84916, 2048)
entrainement du nouveau xgboost...
[0]	validation_0-mlogloss:3.11062
[100]	validation_0-mlogloss:1.46294
[200]	validation_0-mlogloss:1.33674
[300]	validation_0-mlogloss:1.29742
[400]	validation_0-mlogloss:1.28092
[500]	validation_0-mlogloss:1.27525
[572]	validation_0-mlogloss:1.27521
>>> M2 REPARÉ ET SAUVEGARDÉ ICI : C:\Users\amisf\Desktop\datascientest_projet\implementation\outputs\M2_IMAGE_Classic_XGBoost.json


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


score de validation du nouveau m2 : 0.6267


In [4]:
print("on le re entraine avec nos spécificité ") 

on le re entraine avec nos spécificité 


In [2]:
import torch
import torch.nn as nn
import xgboost as xgb
import os
import pandas as pd
import numpy as np
from torchvision import models
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import cv2
from tqdm import tqdm
import time

print(">>> etape 1 : generation des features obligatoires <<<")

# config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
path_data = r"C:\Users\amisf\Desktop\datascientest_projet"

# 1. radar a images
print("je cherche les images...")
real_path_img = None
cands = [
    os.path.join(path_data, "data", "raw", "images", "images", "image_train"),
    r"C:\Users\amisf\Desktop\datascientest_projet\data\raw\images\images\image_train"
]
for p in cands:
    if os.path.exists(p) and len(os.listdir(p)) > 100: real_path_img = p; break

if not real_path_img: raise FileNotFoundError("pas d images trouvees")

# 2. dataframe
df = pd.read_csv(os.path.join(path_data, "data", "raw", "X_train_update.csv"))
y = pd.read_csv(os.path.join(path_data, "data", "raw", "Y_train_CVw08PX.csv"))
if 'prdtypecode' not in y.columns: y = y.rename(columns={y.columns[1]: 'prdtypecode'})
df = df.merge(y, left_index=True, right_index=True)
df['path'] = df.apply(lambda r: os.path.join(real_path_img, f"image_{r['imageid']}_product_{r['productid']}.jpg"), axis=1)
df = df[df['path'].apply(os.path.exists)] # secu

# encodage
le = preprocessing.LabelEncoder()
y_encoded = le.fit_transform(df['prdtypecode'])

# 3. extracteur resnet
print("chargement extracteur...")
resnet = models.resnet50(weights="IMAGENET1K_V1")
extractor = nn.Sequential(*list(resnet.children())[:-1]) # on vire la fin
extractor.to(device).eval()

# 4. boucle extraction
print("extraction features en cours (patience)...")
class FeatDS(Dataset):
    def __init__(self, p): self.p = p
    def __len__(self): return len(self.p)
    def __getitem__(self, i):
        try:
            im = cv2.imread(self.p[i]); im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
            im = cv2.resize(im, (224, 224))
            im = im / 255.0
            im = (im - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225])
            return torch.tensor(im.transpose(2,0,1), dtype=torch.float32)
        except: return torch.zeros((3,224,224))

loader = DataLoader(FeatDS(df['path'].values), batch_size=64, shuffle=False, num_workers=0)

feats = []
with torch.no_grad():
    for b in tqdm(loader, desc="extract"):
        b = b.to(device)
        f = extractor(b).squeeze(-1).squeeze(-1)
        feats.append(f.cpu().numpy())

X_features = np.concatenate(feats)
print(f"features pretes : {X_features.shape}")

print("\n>>> etape 2 : banc d essai vitesse xgboost <<<")

# split test rapide
X_sub, _, y_sub, _ = train_test_split(X_features, y_encoded, train_size=0.2, random_state=42)

configs = [
    {"name": "RAPIDE (Depth 6)", "depth": 6, "lr": 0.05},
    {"name": "MOYEN (Depth 8)", "depth": 8, "lr": 0.03},
    {"name": "TITAN (Depth 10)", "depth": 10, "lr": 0.01},
]

best_conf = None

for c in configs:
    print(f"\n--- test config : {c['name']} ---")
    
    mod = xgb.XGBClassifier(
        objective='multi:softmax', 
        num_class=len(le.classes_),
        n_estimators=10, 
        max_depth=c['depth'],
        learning_rate=c['lr'],
        tree_method='hist',
        device='cuda',
        subsample=0.8,
        colsample_bytree=0.8
    )
    
    t0 = time.time()
    mod.fit(X_sub, y_sub, verbose=False)
    dt = time.time() - t0
    
    t_tree = dt / 10
    est_h = (t_tree * 5000) / 3600
    
    print(f"temps 10 arbres : {dt:.2f}s")
    print(f"estim 5000 arbres : ~{est_h:.2f} h")
    
    if est_h < 2.5: # tolerance 2h30
        print("ok valide")
        best_conf = c
    else:
        print("trop lent")

print("-" * 30)
if best_conf:
    print(f"recommandation : {best_conf['name']}")
else:
    print("tout est trop lent, reste sur depth 6")

>>> etape 1 : generation des features obligatoires <<<
je cherche les images...
chargement extracteur...
extraction features en cours (patience)...


extract: 100%|██████████| 1327/1327 [09:40<00:00,  2.28it/s]


features pretes : (84916, 2048)

>>> etape 2 : banc d essai vitesse xgboost <<<

--- test config : RAPIDE (Depth 6) ---
temps 10 arbres : 7.29s
estim 5000 arbres : ~1.01 h
ok valide

--- test config : MOYEN (Depth 8) ---
temps 10 arbres : 12.43s
estim 5000 arbres : ~1.73 h
ok valide

--- test config : TITAN (Depth 10) ---
temps 10 arbres : 16.77s
estim 5000 arbres : ~2.33 h
ok valide
------------------------------
recommandation : TITAN (Depth 10)


In [3]:
import xgboost as xgb
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score
import joblib
import torch

print(">>> lancement titan depth 10 (validé par test) <<<")

# config
device = torch.device("cuda")
path_data = r"C:\Users\amisf\Desktop\datascientest_projet"
path_out = os.path.join(path_data, "implementation", "outputs")

# 1. verif features en memoire
try:
    if 'X_features' not in locals(): raise NameError("pas de features")
    print(f"features ok : {X_features.shape}")
except:
    raise SystemExit("erreur : relance l extraction avant le titan")

# 2. split (identique voting)
print("split train val...")
X_train, X_val, y_train, y_val = train_test_split(
    X_features, y_encoded, 
    test_size=0.15, 
    stratify=y_encoded, 
    random_state=42
)

# 3. config titan
print("config depth 10 activee...")
model_xgb = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    n_estimators=5000,       # endurance max
    max_depth=10,            # mode titan validé
    learning_rate=0.01,      # precision fine
    subsample=0.8,           
    colsample_bytree=0.8,    
    gamma=0.2,               
    tree_method='hist',      # gpu
    device='cuda',
    early_stopping_rounds=200 # patience augmentée car lr faible
)

# 4. entrainement
print("demarrage (environ 2h30)...")
model_xgb.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=10 # affiche progression tous les 10 arbres
)

# 5. resultats
print("calcul scores...")
y_pred = model_xgb.predict(X_val)
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='weighted')

print(f">>> TITAN FINI - acc: {acc:.4f} | f1: {f1:.4f}")

# sauvegarde
path_model = os.path.join(path_out, "M2_IMAGE_Classic_XGBoost.json")
model_xgb.save_model(path_model)
print(f"sauvegardé : {path_model}")

>>> lancement titan depth 10 (validé par test) <<<
features ok : (84916, 2048)
split train val...
config depth 10 activee...
demarrage (environ 2h30)...
[0]	validation_0-mlogloss:3.24352
[10]	validation_0-mlogloss:2.96486
[20]	validation_0-mlogloss:2.77645
[30]	validation_0-mlogloss:2.63168
[40]	validation_0-mlogloss:2.51359
[50]	validation_0-mlogloss:2.41472
[60]	validation_0-mlogloss:2.32949
[70]	validation_0-mlogloss:2.25483
[80]	validation_0-mlogloss:2.18865
[90]	validation_0-mlogloss:2.12914
[100]	validation_0-mlogloss:2.07503
[110]	validation_0-mlogloss:2.02639
[120]	validation_0-mlogloss:1.98197
[130]	validation_0-mlogloss:1.94093
[140]	validation_0-mlogloss:1.90310
[150]	validation_0-mlogloss:1.86784
[160]	validation_0-mlogloss:1.83531
[170]	validation_0-mlogloss:1.80492
[180]	validation_0-mlogloss:1.77670
[190]	validation_0-mlogloss:1.75038
[200]	validation_0-mlogloss:1.72573
[210]	validation_0-mlogloss:1.70271
[220]	validation_0-mlogloss:1.68098
[230]	validation_0-mlogloss:1.

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


>>> TITAN FINI - acc: 0.6319 | f1: 0.6212
sauvegardé : C:\Users\amisf\Desktop\datascientest_projet\implementation\outputs\M2_IMAGE_Classic_XGBoost.json


In [3]:
import torch
import torch.nn as nn
import xgboost as xgb
import os
import pandas as pd
import numpy as np
from torchvision import models
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score
import joblib
import cv2
from tqdm import tqdm
import time

print(">>> ETAPE 1 : EXTRACTION DES FEATURES (OBLIGATOIRE) <<<")

# config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
path_data = r"C:\Users\amisf\Desktop\datascientest_projet"
path_out = os.path.join(path_data, "implementation", "outputs")

# 1. radar a images
print("Recherche des images...")
real_path_img = None
cands = [
    os.path.join(path_data, "data", "raw", "images", "images", "image_train"),
    r"C:\Users\amisf\Desktop\datascientest_projet\data\raw\images\images\image_train"
]
for p in cands:
    if os.path.exists(p) and len(os.listdir(p)) > 100: real_path_img = p; break

if not real_path_img: raise FileNotFoundError("Pas d'images trouvées.")

# 2. dataframe
df = pd.read_csv(os.path.join(path_data, "data", "raw", "X_train_update.csv"))
y = pd.read_csv(os.path.join(path_data, "data", "raw", "Y_train_CVw08PX.csv"))
if 'prdtypecode' not in y.columns: y = y.rename(columns={y.columns[1]: 'prdtypecode'})
df = df.merge(y, left_index=True, right_index=True)
df['path'] = df.apply(lambda r: os.path.join(real_path_img, f"image_{r['imageid']}_product_{r['productid']}.jpg"), axis=1)
df = df[df['path'].apply(os.path.exists)]

# encodage
le = preprocessing.LabelEncoder()
y_encoded = le.fit_transform(df['prdtypecode'])

# 3. extracteur resnet
print("Chargement ResNet50...")
resnet = models.resnet50(weights="IMAGENET1K_V1")
extractor = nn.Sequential(*list(resnet.children())[:-1]) # on vire la fin
extractor.to(device).eval()

# 4. boucle extraction
print("Extraction des features en cours (Patience ~10min)...")
class FeatDS(Dataset):
    def __init__(self, p): self.p = p
    def __len__(self): return len(self.p)
    def __getitem__(self, i):
        try:
            im = cv2.imread(self.p[i]); im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
            im = cv2.resize(im, (224, 224))
            im = im / 255.0
            im = (im - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225])
            return torch.tensor(im.transpose(2,0,1), dtype=torch.float32)
        except: return torch.zeros((3,224,224))

loader = DataLoader(FeatDS(df['path'].values), batch_size=64, shuffle=False, num_workers=0)

feats = []
with torch.no_grad():
    for b in tqdm(loader, desc="Extraction"):
        b = b.to(device)
        f = extractor(b).squeeze(-1).squeeze(-1)
        feats.append(f.cpu().numpy())

X_features = np.concatenate(feats)
print(f"Features prêtes : {X_features.shape}")

# ICI COMMENCE LA PARTIE XGBOOST (SEPARÉE PROPREMENT)


print("\n>>> ETAPE 2 : PROTOCOLE COMMANDO (RECHERCHE DU SCORE ELITE) <<<")

# 1. split stratifié
print("Préparation du terrain (Split)...")
X_train, X_val, y_train, y_val = train_test_split(
    X_features, y_encoded, 
    test_size=0.15, 
    stratify=y_encoded, 
    random_state=42
)

# 2. definition des 3 profils
configs = [
    {
        "name": "COMMANDO 1 (SNIPER)",
        "params": {
            'max_depth': 12,        # Tres profond
            'learning_rate': 0.02,  # Lent et precis
            'n_estimators': 4000,
            'gamma': 0.2,           # Securite anti-overfit
            'subsample': 0.85
        }
    },
    {
        "name": "COMMANDO 2 (RUSHER - EX 85%)",
        "params": {
            'max_depth': 8,         # Standard efficace
            'learning_rate': 0.05,  # Rapide
            'n_estimators': 3000,
            'gamma': 0.1,           
            'subsample': 0.8
        }
    },
    {
        "name": "COMMANDO 3 (TANK)",
        "params": {
            'max_depth': 10,
            'learning_rate': 0.03,
            'n_estimators': 3500,
            'gamma': 0,             # AUCUN FREIN
            'subsample': 0.9
        }
    }
]

best_score = 0
best_model = None
best_name = ""

# 3. execution de la bataille
for cfg in configs:
    print(f"\n--- MISSION : {cfg['name']} ---")
    p = cfg['params']
    
    model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=len(le.classes_),
        tree_method='hist',      # Moteur GPU
        device='cuda',
        early_stopping_rounds=200, # Patience
        **p
    )
    
    start = time.time()
    # J'ai mis verbose=50 pour eviter de faire laguer ton navigateur
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=50  
    )
    duration = time.time() - start
    
    # eval
    preds = model.predict(X_val)
    f1 = f1_score(y_val, preds, average='weighted')
    acc = accuracy_score(y_val, preds)
    
    print(f"RESULTAT {cfg['name']} : F1 = {f1:.4f} | Acc = {acc:.4f} | Temps = {duration:.0f}s")
    
    # selection du champion
    if f1 > best_score:
        best_score = f1
        best_model = model
        best_name = cfg['name']
        print(f">>> NOUVEAU LEADER : {best_name}")

# 4. sauvegarde du vainqueur
print("\n" + "="*30)
print(f"VICTOIRE FINALE : {best_name}")
print(f"MEILLEUR SCORE F1 : {best_score:.4f}")
print("="*30)

path_model = os.path.join(path_out, "M2_IMAGE_Classic_XGBoost.json")
if best_model:
    best_model.save_model(path_model)
    # sauvegarde encodeur obligatoire
    joblib.dump(le, os.path.join(path_out, "M2_IMAGE_XGBoost_Encoder.pkl"))
    print(f"Modèle Champion sauvegardé sous : {path_model}")
else:
    print("Echec de l'entrainement.")

print("Prêt pour le Voting.")

>>> ETAPE 1 : EXTRACTION DES FEATURES (OBLIGATOIRE) <<<
Recherche des images...
Chargement ResNet50...
Extraction des features en cours (Patience ~10min)...


Extraction: 100%|██████████| 1327/1327 [07:02<00:00,  3.14it/s]


Features prêtes : (84916, 2048)

>>> ETAPE 2 : PROTOCOLE COMMANDO (RECHERCHE DU SCORE ELITE) <<<
Préparation du terrain (Split)...

--- MISSION : COMMANDO 1 (SNIPER) ---
[0]	validation_0-mlogloss:3.20469
[50]	validation_0-mlogloss:2.07340
[100]	validation_0-mlogloss:1.73100
[150]	validation_0-mlogloss:1.55275
[200]	validation_0-mlogloss:1.44954
[250]	validation_0-mlogloss:1.38642
[300]	validation_0-mlogloss:1.34732
[350]	validation_0-mlogloss:1.32220
[400]	validation_0-mlogloss:1.30536
[450]	validation_0-mlogloss:1.29403
[500]	validation_0-mlogloss:1.28605
[550]	validation_0-mlogloss:1.28043
[600]	validation_0-mlogloss:1.27693
[650]	validation_0-mlogloss:1.27398
[700]	validation_0-mlogloss:1.27215
[750]	validation_0-mlogloss:1.27076
[800]	validation_0-mlogloss:1.26946
[850]	validation_0-mlogloss:1.26887
[900]	validation_0-mlogloss:1.26827
[950]	validation_0-mlogloss:1.26799
[1000]	validation_0-mlogloss:1.26774
[1050]	validation_0-mlogloss:1.26762
[1100]	validation_0-mlogloss:1.26752
[1

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


RESULTAT COMMANDO 1 (SNIPER) : F1 = 0.6192 | Acc = 0.6297 | Temps = 26640s
>>> NOUVEAU LEADER : COMMANDO 1 (SNIPER)

--- MISSION : COMMANDO 2 (RUSHER - EX 85%) ---
[0]	validation_0-mlogloss:3.10034
[50]	validation_0-mlogloss:1.63326
[100]	validation_0-mlogloss:1.40853
[150]	validation_0-mlogloss:1.33477
[200]	validation_0-mlogloss:1.30185
[250]	validation_0-mlogloss:1.28627
[300]	validation_0-mlogloss:1.27729
[350]	validation_0-mlogloss:1.27367
[400]	validation_0-mlogloss:1.27150
[450]	validation_0-mlogloss:1.27150
[500]	validation_0-mlogloss:1.27130
[550]	validation_0-mlogloss:1.27106
[600]	validation_0-mlogloss:1.27193
[650]	validation_0-mlogloss:1.27250
[700]	validation_0-mlogloss:1.27327
[749]	validation_0-mlogloss:1.27380
RESULTAT COMMANDO 2 (RUSHER - EX 85%) : F1 = 0.6213 | Acc = 0.6312 | Temps = 960s
>>> NOUVEAU LEADER : COMMANDO 2 (RUSHER - EX 85%)

--- MISSION : COMMANDO 3 (TANK) ---
[0]	validation_0-mlogloss:3.16680
[50]	validation_0-mlogloss:1.86255
[100]	validation_0-mloglo

In [4]:
import torch
import torch.nn as nn
import xgboost as xgb
import os
import pandas as pd
import numpy as np
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score
import joblib
import cv2
from tqdm import tqdm
from PIL import Image

print(">>> OPÉRATION SAUVETAGE : AUGMENTATION COMPATIBLE VOTING <<<")

# config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
path_data = r"C:\Users\amisf\Desktop\datascientest_projet"
path_out = os.path.join(path_data, "implementation", "outputs")

# 1. Chargement Dataframe
df = pd.read_csv(os.path.join(path_data, "data", "raw", "X_train_update.csv"))
y = pd.read_csv(os.path.join(path_data, "data", "raw", "Y_train_CVw08PX.csv"))
if 'prdtypecode' not in y.columns: y = y.rename(columns={y.columns[1]: 'prdtypecode'})
df = df.merge(y, left_index=True, right_index=True)

# Recherche images
path_img = os.path.join(path_data, "data", "raw", "images", "images", "image_train")
if not os.path.exists(path_img): # fallback
    path_img = r"C:\Users\amisf\Desktop\datascientest_projet\data\raw\images\images\image_train"

df['path'] = df.apply(lambda r: os.path.join(path_img, f"image_{r['imageid']}_product_{r['productid']}.jpg"), axis=1)
df = df[df['path'].apply(os.path.exists)]

# Encodage labels
le = preprocessing.LabelEncoder()
y_encoded = le.fit_transform(df['prdtypecode'])

# 2. Préparation Augmentation + ResNet
# C'est ici que la magie opère. On garde le meme ResNet que le voting.
resnet = models.resnet50(weights="IMAGENET1K_V1")
extractor = nn.Sequential(*list(resnet.children())[:-1])
extractor.to(device).eval()

# Transformation de base (celle du voting)
transform_base = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Transformation Miroir (Augmentation 1)
transform_flip = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=1.0), # Force flip
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Transformation Rotation (Augmentation 2)
transform_rot = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomRotation(degrees=15),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

class AugmentDS(Dataset):
    def __init__(self, paths, labels, transform):
        self.paths = paths
        self.labels = labels
        self.transform = transform
    def __len__(self): return len(self.paths)
    def __getitem__(self, i):
        try:
            # On utilise PIL pour etre compatible avec transforms
            img = Image.open(self.paths[i]).convert('RGB')
            img = self.transform(img)
            return img, self.labels[i]
        except:
            return torch.zeros((3,224,224)), self.labels[i]

# 3. Extraction Massive (x3)
all_features = []
all_labels = []

configs_aug = [
    ("Original", transform_base),
    ("Miroir", transform_flip),
    ("Rotation", transform_rot)
]

print("\n--- DEBUT EXTRACTION MULTIPLE ---")
for name, trans in configs_aug:
    print(f"Extraction version : {name}")
    ds = AugmentDS(df['path'].values, y_encoded, trans)
    loader = DataLoader(ds, batch_size=64, shuffle=False, num_workers=0)
    
    current_feats = []
    with torch.no_grad():
        for bx, by in tqdm(loader):
            bx = bx.to(device)
            f = extractor(bx).squeeze(-1).squeeze(-1)
            current_feats.append(f.cpu().numpy())
            
    all_features.append(np.concatenate(current_feats))
    all_labels.append(y_encoded) # On duplique les labels aussi

# Fusion
X_final = np.concatenate(all_features)
y_final = np.concatenate(all_labels)

print(f"\nDONNÉES PRÊTES : {X_final.shape} (C'est ça la puissance !)")

# 4. Entraînement XGBoost (Config Rusher)
print("\n>>> LANCEMENT XGBOOST SUR DONNÉES AUGMENTÉES <<<")

X_train, X_val, y_train, y_val = train_test_split(
    X_final, y_final, test_size=0.1, stratify=y_final, random_state=42
)

model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    tree_method='hist',
    device='cuda',
    # Parametres Rusher optimisés
    max_depth=8,
    learning_rate=0.05,
    n_estimators=3000,
    subsample=0.8,
    colsample_bytree=0.8,
    early_stopping_rounds=100
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=50
)

# 5. Sauvegarde et Verdict
preds = model.predict(X_val)
f1 = f1_score(y_val, preds, average='weighted')
print(f"\n>>> SCORE FINAL AVEC AUGMENTATION : {f1:.4f} <<<")

path_model = os.path.join(path_out, "M2_IMAGE_Classic_XGBoost.json")
model.save_model(path_model)
joblib.dump(le, os.path.join(path_out, "M2_IMAGE_XGBoost_Encoder.pkl"))
print("Sauvegardé. Ce modèle est compatible Voting (il mange du ResNet50).")

>>> OPÉRATION SAUVETAGE : AUGMENTATION COMPATIBLE VOTING <<<

--- DEBUT EXTRACTION MULTIPLE ---
Extraction version : Original


100%|██████████| 1327/1327 [09:58<00:00,  2.22it/s]


Extraction version : Miroir


100%|██████████| 1327/1327 [10:51<00:00,  2.04it/s]


Extraction version : Rotation


100%|██████████| 1327/1327 [12:04<00:00,  1.83it/s]



DONNÉES PRÊTES : (254748, 2048) (C'est ça la puissance !)

>>> LANCEMENT XGBOOST SUR DONNÉES AUGMENTÉES <<<
[0]	validation_0-mlogloss:3.10264
[50]	validation_0-mlogloss:1.55982
[100]	validation_0-mlogloss:1.28229
[150]	validation_0-mlogloss:1.16014
[200]	validation_0-mlogloss:1.08543
[250]	validation_0-mlogloss:1.03192
[300]	validation_0-mlogloss:0.98979
[350]	validation_0-mlogloss:0.95513
[400]	validation_0-mlogloss:0.92646
[450]	validation_0-mlogloss:0.90150
[500]	validation_0-mlogloss:0.87918
[550]	validation_0-mlogloss:0.85988
[600]	validation_0-mlogloss:0.84320
[650]	validation_0-mlogloss:0.82833
[700]	validation_0-mlogloss:0.81535
[750]	validation_0-mlogloss:0.80411
[800]	validation_0-mlogloss:0.79489
[850]	validation_0-mlogloss:0.78649
[900]	validation_0-mlogloss:0.77934
[950]	validation_0-mlogloss:0.77321
[1000]	validation_0-mlogloss:0.76771
[1050]	validation_0-mlogloss:0.76292
[1100]	validation_0-mlogloss:0.75869
[1150]	validation_0-mlogloss:0.75509
[1200]	validation_0-mloglo

In [5]:
import xgboost as xgb
import os
import joblib
from sklearn.metrics import f1_score, accuracy_score
import time

print(">>> LANCEMENT DU FINAL PUSH (8000 ARBRES) <<<")

# Verif que les données augmentées sont là
if 'X_final' not in locals() or 'y_final' not in locals():
    raise SystemExit("ERREUR : Les données augmentées (X_final) ne sont plus en mémoire. Ne relance pas tout, relance juste la fin de l'extraction précédente.")

print(f"Entraînement sur {X_final.shape[0]} images (Original + Miroir + Rotation)")

# Config "Marathon"
# On baisse le learning rate a 0.04 pour profiter des 8000 arbres
# On garde la depth 8 qui marche bien
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    tree_method='hist',
    device='cuda',
    max_depth=8,            # On garde ce qui marche
    learning_rate=0.04,     # Un poil plus fin car on a le temps
    n_estimators=8000,      # On pousse le bouchon
    subsample=0.85,         # On augmente un peu car on a beaucoup de data
    colsample_bytree=0.8,
    early_stopping_rounds=300 # On lui laisse le temps de respirer
)

start = time.time()
print("Démarrage... (Va prendre un moment, observe le loss)")

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100 # Point tous les 100
)

duration = time.time() - start

# Verdict
preds = model.predict(X_val)
f1 = f1_score(y_val, preds, average='weighted')
acc = accuracy_score(y_val, preds)

print(f"\n>>> RESULTAT FINAL PUSH : F1 = {f1:.4f} | Acc = {acc:.4f}")
print(f"Temps : {duration:.0f}s")

# Si on dépasse 80%, c'est champagne.
path_model = os.path.join(path_out, "M2_IMAGE_Classic_XGBoost.json")
model.save_model(path_model)
print(f"Modèle écrasé et sauvegardé : {path_model}")

>>> LANCEMENT DU FINAL PUSH (8000 ARBRES) <<<
Entraînement sur 254748 images (Original + Miroir + Rotation)
Démarrage... (Va prendre un moment, observe le loss)
[0]	validation_0-mlogloss:3.13731
[100]	validation_0-mlogloss:1.35791
[200]	validation_0-mlogloss:1.13826
[300]	validation_0-mlogloss:1.03754
[400]	validation_0-mlogloss:0.97148
[500]	validation_0-mlogloss:0.92240
[600]	validation_0-mlogloss:0.88376
[700]	validation_0-mlogloss:0.85240
[800]	validation_0-mlogloss:0.82677
[900]	validation_0-mlogloss:0.80617
[1000]	validation_0-mlogloss:0.78994
[1100]	validation_0-mlogloss:0.77718
[1200]	validation_0-mlogloss:0.76691
[1300]	validation_0-mlogloss:0.75855
[1400]	validation_0-mlogloss:0.75155
[1500]	validation_0-mlogloss:0.74576
[1600]	validation_0-mlogloss:0.74103
[1700]	validation_0-mlogloss:0.73698
[1800]	validation_0-mlogloss:0.73342
[1900]	validation_0-mlogloss:0.73017
[2000]	validation_0-mlogloss:0.72733
[2100]	validation_0-mlogloss:0.72458
[2200]	validation_0-mlogloss:0.72197


In [7]:
import pandas as pd
import os
import joblib
from sklearn import preprocessing

print(">>> reation du fichier pkl manquant <<<")

# config
path_data = r"C:\Users\amisf\Desktop\datascientest_projet"
path_out = os.path.join(path_data, "implementation", "outputs")

# 1 chargement des cibles originales
df_y = pd.read_csv(os.path.join(path_data, "data", "raw", "Y_train_CVw08PX.csv"))

# secu nom colonne
if 'prdtypecode' not in df_y.columns: 
    df_y = df_y.rename(columns={df_y.columns[1]: 'prdtypecode'})

# 2 recreation encodeur
le = preprocessing.LabelEncoder()
le.fit(df_y['prdtypecode'])

print(f"encodeur pret. classes detectees : {len(le.classes_)}")

# 3 sauvegarde
path_pkl = os.path.join(path_out, "M2_IMAGE_XGBoost_Encoder.pkl")
joblib.dump(le, path_pkl)

print(f"sauvegarde ok sous : {path_pkl}")

>>> reation du fichier pkl manquant <<<
encodeur pret. classes detectees : 27
sauvegarde ok sous : C:\Users\amisf\Desktop\datascientest_projet\implementation\outputs\M2_IMAGE_XGBoost_Encoder.pkl
