In [164]:
from load_dataset import load_data
import utils
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
import pandas as pd

## Instancier le jeu de données

In [165]:
# Load the 'simplify' variant of the dataset with one-hot encoding requested.
# The call yields three values: features, target, and a third value bound to
# cat_features (presumably the categorical feature names -- TODO confirm).
dataset_loader = load_data()
X, y, cat_features = dataset_loader.get_data_X_y(data='simplify', OneHot=True)

In [166]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Classification par xgboost (Avec One-Hot Encoding):

In [167]:
# Baseline XGBoost classifier with a binary logistic objective and a fixed
# seed; every other hyperparameter is left at the library default.
xgbClassifier = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=42,
)
xgbClassifier.fit(X_train, y_train)

# Accuracy on the data the model was trained on (an optimistic figure).
train_accuracy = xgbClassifier.score(X_train, y_train)
print(f"Train Accuracy : {train_accuracy * 100:.3f} %")

Train Accuracy : 100.000 %


In [168]:
# Accuracy on the held-out 30% split.
test_accuracy = xgbClassifier.score(X_test, y_test)
print(f"Test Accuracy : {test_accuracy * 100:.3f} %")

Test Accuracy : 90.266 %


In [169]:
# Predict labels for the held-out split, then print the per-class
# precision / recall / F1 report.
preds = xgbClassifier.predict(X_test)
report = classification_report(y_test, preds)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.84      0.87       414
           1       0.90      0.94      0.92       675

    accuracy                           0.90      1089
   macro avg       0.90      0.89      0.90      1089
weighted avg       0.90      0.90      0.90      1089



In [170]:
# 5-fold cross-validation on the training split only (the test split is not
# touched). No scoring argument is passed, so the estimator's default scorer
# is used.
cross_val = cross_val_score(
    estimator=xgbClassifier,
    X=X_train,
    y=y_train,
    cv=5,
)
cross_val

array([0.90373281, 0.8976378 , 0.90551181, 0.9015748 , 0.88188976])

In [171]:
# Average the per-fold scores into a single summary figure.
mean_cv_score = cross_val.mean()
print(f"Mean Cross-validation score sur le train dataset : {mean_cv_score * 100:.3f}%")

Mean Cross-validation score sur le train dataset : 89.807%


## Feature Importance

In [172]:
# Show the feature importances of the fitted model, labelled with the
# training columns. The helper's result is the cell's displayed output.
feature_names = X_train.columns
utils.features_importance(xgbClassifier, feature_names)

Unnamed: 0,Importance
Unités curriculaires 2e semestre (approuvées),0.191808
Frais de scolarité à jour_0,0.083047
Unités curriculaires 1er semestre (inscrits),0.041927
International_0,0.019702
Unités curriculaires 2e semestre (inscrits),0.018259
...,...
Qualification père_1,0.000000
Qualification mère_12,0.000000
Qualification mère_11,0.000000
Qualification mère_6,0.000000


## Classification par xgboost (Sans One-Hot Encoding):

Cette variante s'appuie sur le paramètre `enable_categorical=True` de XGBoost au lieu du One-Hot Encoding. Attention : SHAP n'est pas compatible avec cette méthode.

In [173]:
# Reload the 'simplify' variant, this time with OneHot=False.
# This rebinds X, y and cat_features from the one-hot section above.
dataset_loader = load_data()
X, y, cat_features = dataset_loader.get_data_X_y(data='simplify', OneHot=False)

In [174]:
# Same 70/30 split and seed as the one-hot experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [175]:
# Same baseline as before, with XGBoost's categorical support enabled
# (enable_categorical=True) and the histogram tree method.
xgbClassifier = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=42,
    enable_categorical=True,
    tree_method="hist",
)
xgbClassifier.fit(X_train, y_train)

# Accuracy on the data the model was trained on (an optimistic figure).
train_accuracy = xgbClassifier.score(X_train, y_train)
print(f"Train Accuracy : {train_accuracy * 100:.3f} %")

Train Accuracy : 100.000 %


In [176]:
# Accuracy on the held-out 30% split.
test_accuracy = xgbClassifier.score(X_test, y_test)
print(f"Test Accuracy : {test_accuracy * 100:.3f} %")

Test Accuracy : 89.532 %


In [177]:
# Predict labels for the held-out split, then print the per-class
# precision / recall / F1 report.
preds = xgbClassifier.predict(X_test)
report = classification_report(y_test, preds)
print(report)

              precision    recall  f1-score   support

           0       0.89      0.83      0.86       414
           1       0.90      0.93      0.92       675

    accuracy                           0.90      1089
   macro avg       0.89      0.88      0.89      1089
weighted avg       0.89      0.90      0.89      1089



In [178]:
# 5-fold cross-validation on the training split only (the test split is not
# touched). No scoring argument is passed, so the estimator's default scorer
# is used.
cross_val = cross_val_score(
    estimator=xgbClassifier,
    X=X_train,
    y=y_train,
    cv=5,
)
cross_val

array([0.90176817, 0.88976378, 0.90944882, 0.9015748 , 0.88779528])

In [179]:
# Average the per-fold scores into a single summary figure.
mean_cv_score = cross_val.mean()
print(f"Mean Cross-validation score sur le train dataset : {mean_cv_score * 100:.3f}%")

Mean Cross-validation score sur le train dataset : 89.807%


## Feature Importance

In [180]:
# Show the feature importances of the fitted model, labelled with the
# training columns. The helper's result is the cell's displayed output.
feature_names = X_train.columns
utils.features_importance(xgbClassifier, feature_names)

Unnamed: 0,Importance
Unités curriculaires 2e semestre (approuvées),0.338912
Frais de scolarité à jour,0.119559
Unités curriculaires 1er semestre (inscrits),0.060418
International,0.039473
Cours,0.033568
Qualification antérieure,0.032198
Unités curriculaires 1er semestre (créditées),0.027767
Bourse,0.023678
Unités curriculaires 2e semestre (inscrits),0.022832
Unités curriculaires 1er semestre (sans évaluations),0.020452


## Optimisation des Hyperparamètres

In [181]:
# Switch back to the one-hot encoded data for the hyperparameter search.
# X and y were rebound to the non-encoded variant in the previous section.
dataset_loader = load_data()
X, y, cat_features = dataset_loader.get_data_X_y(data='simplify', OneHot=True)

# Same 70/30 split and seed as the earlier experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [182]:
# Hyperparameter search for the XGBoost classifier.
#
# Exhaustive grid over objective, booster, number of boosting rounds and
# learning rate: 2 * 2 * 3 * 3 = 36 combinations, each evaluated with 3-fold
# cross-validation on the training split and ranked by F1 score.

# Base estimator: silent, fixed seed; the grid supplies the tuned parameters.
xgb_clf = xgb.XGBClassifier(verbosity=0, random_state=42)

param_grid = {
    'objective': ["binary:logistic", "binary:hinge"],
    'booster': ["gblinear", "gbtree"],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1],
}

# refit is left at its default, so the best configuration is refit on the
# whole training split and exposed as grid_search.best_estimator_.
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=3, scoring='f1')
grid_search.fit(X_train, y_train)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print("Meilleurs hyperparamètres:")
# One-row table with one column per tuned hyperparameter.
df = pd.DataFrame([best_parameters])
print(df)
print()
print("Meilleur score:", best_score)

Meilleurs hyperparamèetres:
  booster  learning_rate  n_estimators        objective
0  gbtree            0.1            50  binary:logistic

Meilleur score: 0.9205512115483762


In [183]:
# Keep a handle on the estimator that GridSearchCV selected as the best one.
model = grid_search.best_estimator_

In [184]:
# GridSearchCV was built with the default refit=True, so best_estimator_ has
# already been refit on the full X_train with the best parameters. Fitting it
# again here would only repeat that work, so the model is scored as-is.
train_accuracy = model.score(X_train, y_train)
print(f"Train Accuracy : {train_accuracy * 100:.3f} %")

Train Accuracy : 95.789 %


In [185]:
# Accuracy of the tuned model on the held-out 30% split.
test_accuracy = model.score(X_test, y_test)
print(f"Test Accuracy : {test_accuracy * 100:.3f} %")

Test Accuracy : 90.542 %
