In [1]:
from data.load_dataset import load_data
import utils.utils
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
import pandas as pd

## Instancier le jeu de données

In [2]:
# Load the 'simplify' variant of the dataset with OneHot=True (categorical
# columns expanded into indicator columns).
# NOTE(review): cat_features presumably lists the categorical columns — confirm in data.load_dataset.
X, y, cat_features = load_data().get_data_X_y(
    data="simplify",
    OneHot=True,
)

In [3]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Classification par xgboost (Avec One-Hot Encoding):

In [4]:
# Baseline binary classifier: library defaults apart from the objective and the seed.
xgbClassifier = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=42,
)
xgbClassifier.fit(X_train, y_train)

# Accuracy on the data the model was fitted on (expected to be optimistic).
train_accuracy = xgbClassifier.score(X_train, y_train)
print(f"Train Accuracy : {train_accuracy * 100:.3f} %")

Train Accuracy : 100.000 %


In [5]:
# Accuracy on the held-out 30% split.
test_accuracy = xgbClassifier.score(X_test, y_test)
print(f"Test Accuracy : {test_accuracy * 100:.3f} %")

Test Accuracy : 90.358 %


In [6]:
# Per-class precision / recall / F1 on the held-out test split.
preds = xgbClassifier.predict(X_test)
report_text = classification_report(y_test, preds)
print(report_text)

              precision    recall  f1-score   support

           0       0.90      0.84      0.87       414
           1       0.90      0.94      0.92       675

    accuracy                           0.90      1089
   macro avg       0.90      0.89      0.90      1089
weighted avg       0.90      0.90      0.90      1089



In [7]:
# 5-fold cross-validation restricted to the training split, so the test
# split stays untouched.
cross_val = cross_val_score(
    xgbClassifier,
    X_train,
    y_train,
    cv=5,
)
cross_val

array([0.90569745, 0.90354331, 0.90944882, 0.90551181, 0.88976378])

In [8]:
# Average the per-fold scores into a single figure.
mean_cv_score = cross_val.mean()
print(f"Mean Cross-validation score sur le train dataset : {mean_cv_score * 100:.3f}%")

Mean Cross-validation score sur le train dataset : 90.279%


## Feature Importance

In [9]:
# Project helper that reports feature importances of the fitted model for
# the one-hot encoded columns.
# NOTE(review): plot=False / log=False presumably disable the chart and the
# console logging of the helper — confirm in utils.
utils.features_importance(
    xgbClassifier,
    X_train.columns,
    plot=False,
    log=False,
)

Unnamed: 0,Importance
Unités curriculaires 2e semestre (approuvées),0.190303
Frais de scolarité à jour_0,0.071445
Unités curriculaires 1er semestre (inscrits),0.044223
Mode d'application_5,0.024666
Cours_8,0.020539
...,...
Qualification père_1,0.000000
Qualification mère_12,0.000000
Qualification mère_11,0.000000
Qualification mère_6,0.000000


## Classification par xgboost (Sans One-Hot Encoding):

Cette approche utilise le paramètre `enable_categorical=True` de XGBoost. Attention : SHAP n'est pas compatible avec cette méthode.

In [10]:
# Reload the same 'simplify' dataset, this time with OneHot=False.
# This rebinds X, y and cat_features from the one-hot run.
X, y, cat_features = load_data().get_data_X_y(
    data="simplify",
    OneHot=False,
)

In [11]:
# Same 70/30 split and seed as the one-hot run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
# Same baseline as before, but with XGBoost's own categorical handling
# (enable_categorical=True) combined with the histogram tree method.
xgbClassifier = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=42,
    enable_categorical=True,
    tree_method="hist",
)
xgbClassifier.fit(X_train, y_train)

# Accuracy on the data the model was fitted on (expected to be optimistic).
train_accuracy = xgbClassifier.score(X_train, y_train)
print(f"Train Accuracy : {train_accuracy * 100:.3f} %")

Train Accuracy : 100.000 %


In [13]:
# Accuracy on the held-out 30% split.
test_accuracy = xgbClassifier.score(X_test, y_test)
print(f"Test Accuracy : {test_accuracy * 100:.3f} %")

Test Accuracy : 90.450 %


In [14]:
# Per-class precision / recall / F1 on the held-out test split.
preds = xgbClassifier.predict(X_test)
report_text = classification_report(y_test, preds)
print(report_text)

              precision    recall  f1-score   support

           0       0.90      0.84      0.87       414
           1       0.90      0.95      0.92       675

    accuracy                           0.90      1089
   macro avg       0.90      0.89      0.90      1089
weighted avg       0.90      0.90      0.90      1089



In [15]:
# 5-fold cross-validation restricted to the training split, so the test
# split stays untouched.
cross_val = cross_val_score(
    xgbClassifier,
    X_train,
    y_train,
    cv=5,
)
cross_val

array([0.90569745, 0.8976378 , 0.91732283, 0.8996063 , 0.89370079])

In [16]:
# Average the per-fold scores into a single figure.
mean_cv_score = cross_val.mean()
print(f"Mean Cross-validation score sur le train dataset : {mean_cv_score * 100:.3f}%")

Mean Cross-validation score sur le train dataset : 90.279%


## Feature Importance

In [17]:
# Project helper that reports feature importances of the fitted model,
# called here with its default options.
# NOTE(review): the defaults of the helper are not visible here — confirm in utils.
utils.features_importance(
    xgbClassifier,
    X_train.columns,
)

Unnamed: 0,Importance
Unités curriculaires 2e semestre (approuvées),0.346266
Frais de scolarité à jour,0.128336
Unités curriculaires 1er semestre (inscrits),0.047519
International,0.040277
Unités curriculaires 1er semestre (créditées),0.035234
Cours,0.035156
Qualification antérieure,0.025263
Unités curriculaires 1er semestre (sans évaluations),0.024786
Unités curriculaires 1er semestre (approuvées),0.021825
Bourse,0.02163


## Optimisation des Hyperparamètres

In [18]:
# Reload the data with OneHot=True and redo the 70/30 split with the same
# seed for the hyperparameter search: this rebinds X, y and the split
# variables last assigned by the OneHot=False run.
X, y, cat_features = load_data().get_data_X_y(
    data="simplify",
    OneHot=True,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [19]:
# Base estimator for the search; GridSearchCV overrides the hyperparameters
# listed in param_grid for every candidate it evaluates.
xgb_clf = xgb.XGBClassifier(verbosity=0, random_state=42)

# 2 objectives x 2 boosters x 3 boosting-round counts x 3 learning rates
# = 36 candidate configurations.
param_grid = {
    'objective': ["binary:logistic", "binary:hinge"],
    'booster': ["gblinear", "gbtree"],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1],
}

# Exhaustive search scored with F1 using 3-fold cross-validation, run on the
# training split only so the test split remains unseen.
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=3, scoring='f1')
grid_search.fit(X_train, y_train)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

# Label typo fixed ("hyperparamèetres" -> "hyperparamètres").
print("Meilleurs hyperparamètres:")
# A single-record list yields the same one-row frame as a dict of
# one-element lists, without the intermediate dict.
df = pd.DataFrame([best_parameters])
print(df)
print()
print("Meilleur score:", best_score)

Meilleurs hyperparamèetres:
  booster  learning_rate  n_estimators        objective
0  gbtree            0.1            50  binary:logistic

Meilleur score: 0.9194023002199753


In [20]:
# Estimator built with the best parameter combination found by the search.
# GridSearchCV was created without a refit argument, so (refit=True being the
# scikit-learn default) this estimator has already been fitted on X_train.
model = grid_search.best_estimator_

In [21]:
# No model.fit() here: GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the whole training split with the winning
# hyperparameters, so fitting again would only repeat that work.
train_accuracy = model.score(X_train, y_train)
print(f"Train Accuracy : {train_accuracy * 100:.3f} %")

Train Accuracy : 95.907 %


In [22]:
# Accuracy of the tuned model on the held-out 30% split.
test_accuracy = model.score(X_test, y_test)
print(f"Test Accuracy : {test_accuracy * 100:.3f} %")

Test Accuracy : 90.174 %
