In [1]:
import numpy as np
import pandas as pd
import os
import time
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
import torch
import gc

# je definis les chemins
path_root = r"C:\Users\amisf\Desktop\datascientest_projet"
path_out = os.path.join(path_root, "implementation", "outputs")

# je verifie si le gpu est dispo pour xgboost
# xgboost a besoin de savoir si cuda est la
use_gpu = False
try:
    if torch.cuda.is_available():
        print("gpu detecte on va accelerer xgboost")
        use_gpu = True
    else:
        print("pas de gpu on reste sur cpu")
except:
    print("erreur detection gpu passage cpu")

# je cree le dossier sortie si besoin
if not os.path.exists(path_out):
    os.makedirs(path_out)

gpu detecte on va accelerer xgboost


In [2]:
print("chargement des donnees converties")

# je tente de charger les features extraites par le reseau
# c est ce qui donne les meilleurs resultats en ml classique
try:
    path_x = os.path.join(path_out, 'train_features_resnet50_augmented.npy')
    path_y = os.path.join(path_out, 'train_labels_augmented.npy')
    
    if not os.path.exists(path_x):
        # plan b si le fichier augmente n est pas la je cherche le simple
        path_x = os.path.join(path_out, 'train_features_resnet50.npy')
        path_y = os.path.join(path_out, 'train_labels.npy')

    x_data = np.load(path_x)
    y_data = np.load(path_y)
    
    print("donnees chargees en memoire")
    print(x_data.shape)

except Exception as e:
    print("erreur critique impossible de trouver les fichiers npy")
    print("tu dois avoir extrait les features avant de lancer xgboost")
    raise e

# encodage des cibles
le = LabelEncoder()
y_encoded = le.fit_transform(y_data)

# split train val
# je garde 20% pour verifier si on overfit pas
x_train, x_val, y_train, y_val = train_test_split(
    x_data, y_encoded, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_encoded
)

print("split termine pret a entrainer")

chargement des donnees converties
donnees chargees en memoire
(405000, 2048)
split termine pret a entrainer


In [4]:
print("lancement optimisation xgboost")

# config du modele champion
# je pousse les parametres pour exploiter ta machine
params = {
    'objective': 'multi:softmax',
    'num_class': len(le.classes_),
    'n_estimators': 3000, # on vise haut l arret auto coupera avant
    'max_depth': 8, # profondeur pour capter les nuances
    'learning_rate': 0.05, # vitesse lente pour precision
    'subsample': 0.8, # evite overfit
    'colsample_bytree': 0.8,
    'early_stopping_rounds': 50, # securite anti overfit
    'eval_metric': 'mlogloss'
}

# gestion hardware
if use_gpu:
    params['tree_method'] = 'hist'
    params['device'] = 'cuda'
else:
    params['tree_method'] = 'hist'
    params['device'] = 'cpu'
    params['n_jobs'] = -1

model = xgb.XGBClassifier(**params)

t_start = time.time()
print("demarrage entrainement surveille")

try:
    # je lance l apprentissage
    # le verbose affiche le score regulierement
    model.fit(
        x_train, y_train,
        eval_set=[(x_train, y_train), (x_val, y_val)],
        verbose=10
    )
    print("entrainement gpu termine")

except Exception as e:
    print("echec gpu detection saturation memoire")
    print("bascule automatique sur cpu ram 128go")
    
    # nettoyage avant reprise
    if 'model' in globals(): del model
    torch.cuda.empty_cache()
    gc.collect()
    
    # reconfig cpu forcee
    params['device'] = 'cpu'
    params['n_jobs'] = -1
    model = xgb.XGBClassifier(**params)
    
    model.fit(
        x_train, y_train,
        eval_set=[(x_train, y_train), (x_val, y_val)],
        verbose=10
    )
    print("entrainement cpu termine")

duration = time.time() - t_start
print(f"temps total {duration:.1f} sec")

lancement optimisation xgboost
demarrage entrainement surveille
[0]	validation_0-mlogloss:3.12367	validation_1-mlogloss:3.14457
[10]	validation_0-mlogloss:2.31332	validation_1-mlogloss:2.45704
[20]	validation_0-mlogloss:1.91287	validation_1-mlogloss:2.12983
[30]	validation_0-mlogloss:1.64494	validation_1-mlogloss:1.91686
[40]	validation_0-mlogloss:1.44806	validation_1-mlogloss:1.76370
[50]	validation_0-mlogloss:1.29666	validation_1-mlogloss:1.64825
[60]	validation_0-mlogloss:1.17473	validation_1-mlogloss:1.55718
[70]	validation_0-mlogloss:1.07402	validation_1-mlogloss:1.48375
[80]	validation_0-mlogloss:0.99035	validation_1-mlogloss:1.42363
[90]	validation_0-mlogloss:0.91929	validation_1-mlogloss:1.37334
[100]	validation_0-mlogloss:0.85748	validation_1-mlogloss:1.33024
[110]	validation_0-mlogloss:0.80232	validation_1-mlogloss:1.29228
[120]	validation_0-mlogloss:0.75441	validation_1-mlogloss:1.25961
[130]	validation_0-mlogloss:0.71083	validation_1-mlogloss:1.23014
[140]	validation_0-mlog

In [5]:
print("analyse des performances")

# predictions
preds = model.predict(x_val)
score = f1_score(y_val, preds, average='weighted')

print(f"score f1 final obtenu {score:.4f}")

# rapport detaille
print("rapport par classe")
print(classification_report(y_val, preds))

# sauvegarde intelligente
# je ne sauvegarde que si le modele est valide
save_path = os.path.join(path_out, "best_xgboost_gpu_model.json")
model.save_model(save_path)
print(f"modele sauvegarde sous {save_path}")

# generation csv pour comparaison
results = pd.DataFrame({
    'y_true': le.inverse_transform(y_val),
    'y_pred': le.inverse_transform(preds)
})
csv_path = os.path.join(path_out, "resultats_xgboost_detail.csv")
results.to_csv(csv_path, index=False)
print("fichier csv detaille genere")

analyse des performances


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


score f1 final obtenu 0.8532
rapport par classe
              precision    recall  f1-score   support

           0       0.76      0.79      0.77      3000
           1       0.78      0.78      0.78      3000
           2       0.89      0.89      0.89      3000
           3       0.93      0.98      0.95      3000
           4       0.84      0.86      0.85      3000
           5       0.93      0.93      0.93      3000
           6       0.95      0.96      0.95      3000
           7       0.76      0.62      0.68      3000
           8       0.81      0.77      0.79      3000
           9       0.78      0.85      0.82      3000
          10       0.96      0.99      0.97      3000
          11       0.86      0.87      0.86      3000
          12       0.84      0.80      0.82      3000
          13       0.83      0.73      0.78      3000
          14       0.88      0.89      0.88      3000
          15       0.93      0.99      0.96      3000
          16       0.74      0.75