In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from xgboost import XGBRegressor

: 

### Chargement des données

In [5]:
train_data = pd.read_csv('data/x_train.csv')  # Remplacer par le chemin réel
y_data = pd.read_csv('data/y_train.csv')
test_data = pd.read_csv('data/x_test.csv')  # Optionnel : si disponible

### Data Preparation

In [6]:
# Suppression de la colonne 'ID' (pas utile pour l'entraînement)
X = train_data.drop(columns=['ID'])
y = y_data.drop(columns=['ID'])
# Diviser les données en ensemble d'entraînement et de validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

### Modele

In [None]:
def custom_weighted_rmse(y_true, y_pred):
    weights = np.where(y_true < 0.5, 1, 1.2)
    error_per_class = weights * (y_true - y_pred) ** 2
    mean_error = np.mean(error_per_class)
    return np.sqrt(mean_error)

#### Cross Validation

In [8]:
model = RandomForestRegressor(n_estimators=5, max_depth=7, min_samples_split=0.01, min_samples_leaf=30, random_state=42)

# 4. Validation croisée avec la métrique personnalisée


custom_scorer = make_scorer(custom_weighted_rmse, greater_is_better=False)

# Effectuer une validation croisée à 5 folds
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring=custom_scorer)
print(f"Scores de validation croisée : {cv_scores}")
print(f"Score moyen de validation croisée : {np.mean(cv_scores)}")

Scores de validation croisée : [-0.12234302 -0.1188173  -0.12150708 -0.122199   -0.11883688]
Score moyen de validation croisée : -0.12074065824142181


#### Grid Search

In [9]:
# 5. Optimisation des hyperparamètres avec GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [7, 10, 15],
    'min_samples_split': [0.01, 0.05, 0.1],
    'min_samples_leaf': [30, 50, 70]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=custom_scorer, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Meilleurs hyperparamètres
print(f"Meilleurs hyperparamètres : {grid_search.best_params_}")

# 6. Entraîner le modèle optimisé
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# 7. Évaluation finale sur les données de validation
y_pred = best_model.predict(X_val)
rmse = custom_weighted_rmse(y_val, y_pred)
print(f"Erreur quadratique moyenne pondérée (RMSE) sur les données de validation : {rmse}")

#### Boosting

In [None]:
model = XGBRegressor(n_estimators=100, max_depth=7, learning_rate=0.1, random_state=42)

## Test Pred

In [34]:
# 6. Prédictions finales (sur test_data, si disponible)
test_pred = model.predict(test_data.drop(columns=['ID']))


# Enregistrement des résultats

output = pd.DataFrame(test_pred, columns=[f'c{i}' for i in range(1, 24)])
# Ajouter la colonne "ID" au début
output.insert(0, 'ID', test_data['ID'])
output.to_csv('predictions.csv', index=False)


In [35]:
output

Unnamed: 0,ID,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23
0,202933,0.100391,0.000374,0.155939,0.155939,0.000006,0.000006,0.000374,0.000018,0.000480,...,0.021459,0.0,0.000448,0.015265,0.000417,0.000628,0.000374,0.000006,0.155939,0.038348
1,202934,0.119782,0.000388,0.283572,0.283572,0.000000,0.000000,0.000388,0.000003,0.006045,...,0.117425,0.0,0.002526,0.001025,0.000034,0.009737,0.000388,0.000000,0.283572,0.144069
2,202935,0.001154,0.013621,0.396257,0.396257,0.378463,0.378463,0.013621,0.000000,0.000000,...,0.000000,0.0,0.000000,0.003019,0.001154,0.000000,0.013621,0.378463,0.396257,0.000000
3,202936,0.085573,0.012065,0.205891,0.205891,0.070877,0.070877,0.012065,0.000000,0.000000,...,0.016740,0.0,0.000000,0.003142,0.068833,0.021085,0.012065,0.070877,0.205891,0.013148
4,202937,0.130224,0.080851,0.220101,0.220101,0.008305,0.008305,0.080851,0.000000,0.000000,...,0.052603,0.0,0.000000,0.000000,0.077621,0.000000,0.080851,0.008305,0.220101,0.000721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134668,337601,0.010933,0.154728,0.217278,0.217278,0.040641,0.040641,0.154728,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.010474,0.154728,0.040641,0.217278,0.000502
134669,337602,0.120368,0.006529,0.188660,0.188660,0.000002,0.000002,0.006529,0.000009,0.000561,...,0.026787,0.0,0.000000,0.020682,0.000439,0.000097,0.006529,0.000002,0.188660,0.040421
134670,337603,0.010933,0.154728,0.217278,0.217278,0.040641,0.040641,0.154728,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.010474,0.154728,0.040641,0.217278,0.000502
134671,337604,0.000000,0.000000,0.655606,0.655606,0.654914,0.654914,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000693,0.000000,0.000000,0.000000,0.654914,0.655606,0.000000
