In [None]:
import pandas as pd
import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, make_scorer,r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

import warnings
warnings.filterwarnings("ignore")

### Chargement des données

In [2]:
train_data = pd.read_csv('data/x_train.csv')  
y_data = pd.read_csv('data/y_train.csv')
test_data = pd.read_csv('data/x_test.csv')

In [3]:
def custom_weighted_rmse(y_true, y_pred):
    weights = np.where(y_true < 0.5, 1, 1.2)
    error_per_class = weights * (y_true - y_pred) ** 2
    mean_error = np.mean(error_per_class)
    return np.sqrt(mean_error)

custom_scorer = make_scorer(custom_weighted_rmse, greater_is_better=False)

## Data Preparation

In [4]:
# Suppression de la colonne 'ID' (pas utile pour l'entraînement)
X = train_data.drop(columns=['ID'])
y = y_data.drop(columns=['ID'])

# Diviser les données en ensemble d'entraînement et de validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#### Application d un standard scaler

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)

### Polynomial features

In [8]:
#Now, we extract polynomial features and interactions up to a degree of 2
poly = PolynomialFeatures(degree=2)
poly.fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)
print("X_train.shape: {}".format(X_train.shape))
print("X_train_poly.shape: {}".format(X_train_poly.shape))

X_train.shape: (162346, 13)
X_train_poly.shape: (162346, 105)


## Etude classique 1 seul Model pour prédire 

### Selection des Modeles

On cherche à tester sur un grand nombre de datasets afin de voir si un modele marche mieux ou nom 

In [None]:
models = {
    'XGBoost' : XGBRegressor(),
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'K-Neighbors Regression': KNeighborsRegressor()}

res = list()
for mod in models.values() :
    print(f"Model: {mod.__class__.__name__}")
    # Initialiser le modèle
    model = mod
    # Entraîner le modèle de base
    model.fit(X_train_scaled, y_train)
    # Évaluation du modèle
    y_pred = model.predict(X_val_scaled)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred) 
    custom_rmse = custom_weighted_rmse(y_val, y_pred)
    # Validation croisée
    #cv_scores = cross_val_score(model, X_train, y_train, cv=10, scoring=custom_scorer)
    res.append([mod.__class__.__name__,custom_rmse, mse,r2,])

Model: XGBRegressor
Model: LinearRegression
Model: Ridge
Model: Lasso


  model = cd_fast.enet_coordinate_descent(


Model: ElasticNet


  model = cd_fast.enet_coordinate_descent(


Model: DecisionTreeRegressor
Model: RandomForestRegressor
Model: KNeighborsRegressor


In [23]:
res = pd.DataFrame(res,columns=["model","custom_rmse","mse","r2"]).sort_values(by="custom_rmse",ascending=False)
# res["cv_mean"] = res['cv_score'].apply(lambda x : np.mean(x))
res

Unnamed: 0,model,custom_rmse,mse,r2
3,Lasso,0.217702,0.041588,0.043445
4,ElasticNet,0.217702,0.041588,0.043445
2,Ridge,0.178308,0.028143,0.272862
1,LinearRegression,0.178304,0.028143,0.272863
7,KNeighborsRegressor,0.026284,0.000649,0.97369
0,XGBRegressor,0.022887,0.000487,0.947358
5,DecisionTreeRegressor,0.014776,0.000204,0.993547
6,RandomForestRegressor,0.00953,8.6e-05,0.997001


Le meilleur model semble etre le random Forest Regressor. On va donc essayer d optimiser les parametres

### Optimisation des hyperparamètres - Random Forest

On va commencer par utiliser un modèle de Random Forest simple que l'on va evaluer

In [11]:
# model_rf = RandomForestRegressor(n_estimators=5, max_depth=7, min_samples_split=0.01, min_samples_leaf=30, random_state=42)
model_rf = RandomForestRegressor(n_estimators=5, max_depth=7, min_samples_split=0.01, min_samples_leaf=30, random_state=42)
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_val)
rmse = custom_weighted_rmse(y_val, y_pred)
print(f"Erreur quadratique moyenne pondérée (RMSE) sur les données de validation : {rmse}")

Erreur quadratique moyenne pondérée (RMSE) sur les données de validation : 0.11800015451516697


Debut de l'optimisation des hyperparamètres

In [None]:
# Optimisation des hyperparamètres avec GridSearchCV
# param_grid = {
#     'n_estimators': [5, 10, 20],
#     'max_depth': [7, 10, 15],
#     'min_samples_split': [0.01, 0.05, 0.1],
#     'min_samples_leaf': [30, 50, 70]
# }
param_grid = {
    'n_estimators': [11,12,13],
    'max_depth': [15,20],
    'min_samples_split': [0.01],
    'min_samples_leaf': [30]
}
grid_search = GridSearchCV(estimator=model_rf, param_grid=param_grid, cv=5, scoring=custom_scorer, verbose=2, n_jobs=-1)
grid_search.fit(X_train_poly, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
# Meilleurs hyperparamètres
print(f"Meilleurs hyperparamè tres : {grid_search.best_params_}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [None]:
# Évaluation finale sur les données de validation
y_pred = best_model.predict(X_val_poly)
best_mse = mean_squared_error(y_val, y_pred)
best_r2 = r2_score(y_val, y_pred)
custom_mse = custom_weighted_rmse(y_val, y_pred)
print(f"Best Mean Squared Error: {best_mse}")
print(f"Best R^2 Score: {best_r2}")
print(f"Custom MSE sur les données de validation : {custom_mse}")

In [None]:
# Évaluation finale sur les données de validation
y_pred = best_model.predict(X_val_poly)
best_mse = mean_squared_error(y_val, y_pred)
best_r2 = r2_score(y_val, y_pred)
custom_mse = custom_weighted_rmse(y_val, y_pred)
print(f"Best Mean Squared Error: {best_mse}")
print(f"Best R^2 Score: {best_r2}")
print(f"Custom MSE sur les données de validation : {custom_mse}")

Best Mean Squared Error: 0.007539696090108572
Best R^2 Score: 0.7263825866396201
Custom MSE sur les données de validation : 0.09115404934287714


In [14]:
# Effectuer une validation croisée à 5 folds
best_cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring=custom_scorer)
print(f"Best Cross-Validation Scores: {-best_cv_scores}")
print(f"Mean Best Cross-Validation Score : {-best_cv_scores.mean()}")

Best Cross-Validation Scores: [0.08968177 0.09232792 0.09127526 0.09227364 0.09024462]
Mean Best Cross-Validation Score : 0.09116064294290777


### Application des modeles avec Polynomial Features

In [9]:
models = {
    'XGBoost' : XGBRegressor(),
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(n_estimators=5, max_depth=7, min_samples_split=0.01, min_samples_leaf=30, random_state=42),
    'K-Neighbors Regression': KNeighborsRegressor()}

res = list()
for mod in models.values() :
    print(f"Model: {mod.__class__.__name__}")
    # Initialiser le modèle
    model = mod
    # Entraîner le modèle de base
    model.fit(X_train_poly, y_train)
    # Évaluation du modèle
    y_pred = model.predict(X_val_poly)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred) 
    custom_rmse = custom_weighted_rmse(y_val, y_pred)
    # Validation croisée
    #cv_scores = cross_val_score(model, X_train, y_train, cv=10, scoring=custom_scorer)
    res.append([mod.__class__.__name__,"Polynomial Features",custom_rmse, mse,r2,])

Model: XGBRegressor
Model: LinearRegression
Model: Ridge
Model: Lasso


  model = cd_fast.enet_coordinate_descent(


Model: ElasticNet


  model = cd_fast.enet_coordinate_descent(


Model: DecisionTreeRegressor
Model: RandomForestRegressor
Model: KNeighborsRegressor


In [10]:
res = pd.DataFrame(res,columns=["model","Type","custom_rmse","mse","r2"]).sort_values(by="custom_rmse",ascending=False)
display(res)

Unnamed: 0,model,Type,custom_rmse,mse,r2
3,Lasso,Polynomial Features,0.217702,0.041588,0.043445
4,ElasticNet,Polynomial Features,0.217137,0.04138,0.051383
2,Ridge,Polynomial Features,0.149709,0.020049,0.481192
1,LinearRegression,Polynomial Features,0.149502,0.019998,0.482415
6,RandomForestRegressor,Polynomial Features,0.117838,0.012479,0.611522
0,XGBRegressor,Polynomial Features,0.020608,0.000395,0.948205
7,KNeighborsRegressor,Polynomial Features,0.019814,0.000368,0.98672
5,DecisionTreeRegressor,Polynomial Features,0.017629,0.000286,0.989868


## Prediction de 1 model par Output

### Selection de modèles

In [None]:
# Liste des modèles
models = {
    # 'XGBoost' : XGBRegressor(),
    # 'Linear Regression': LinearRegression(),
    # 'Ridge Regression': Ridge(),
    # 'Lasso Regression': Lasso(),
    # 'ElasticNet': ElasticNet(),
    'Support Vector Regression': SVR(),
    #'Decision Tree Regression': DecisionTreeRegressor(),
    #'Random Forest Regression': RandomForestRegressor(),
    'Gradient Boosting Regression': GradientBoostingRegressor(),
    #'K-Neighbors Regression': KNeighborsRegressor()
}

res = list()

for mod in models.values():
    print(f"Model: {mod.__class__.__name__}")
    
    # Initialiser le modèle MultiOutputRegressor avec le modèle de base
    multi_target_model = MultiOutputRegressor(mod)
    
    # Entraîner le modèle multi-cible
    multi_target_model.fit(X_train, y_train)
    
    # Prédictions pour toutes les cibles
    y_pred = multi_target_model.predict(X_val)
      
    # Calcul des métriques pour chaque colonne cible
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    custom_rmse = custom_weighted_rmse(y_val, y_pred)  # Remarque: tu peux ajuster cette fonction selon tes besoins
    
    # Ajouter les résultats dans la liste
    res.append([mod.__class__.__name__, custom_rmse, mse, r2])

Model: SVR


: 

In [None]:
res = pd.DataFrame(res,columns=["model","custom_rmse","mse","r2"]).sort_values(by="custom_rmse",ascending=False)
# res["cv_mean"] = res['cv_score'].apply(lambda x : np.mean(x))
res.head(5)

## Test Pred

In [15]:
# 6. Prédictions finales (sur test_data, si disponible)
test_pred = best_model.predict(test_data.drop(columns=['ID']))

In [17]:
# Enregistrement des résultats

output = pd.DataFrame(test_pred, columns=[f'c{i}' for i in range(1, 24)])
# Ajouter la colonne "ID" au début
output.insert(0, 'ID', test_data['ID'])
output.to_csv('predictions/best_prediction.csv', index=False)