In [1]:
import pandas as pd
from load_dataset import load_data
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

from IPython import display



In [11]:
#dataset original
loader = load_data()
df_original, cat_feat_original = loader.get_original_dataframe()
print(cat_feat_original)
df_original


['État civil', "Mode d'application", 'Cours', 'Présence jour/soir', 'Qualification antérieure', 'Nationalité', 'Qualification mère', 'Qualification père', 'Occupation mère', 'Occupation père', 'Déplacé', 'Besoins éducatifs spéciaux', 'Dettes', 'Frais de scolarité à jour', 'Sexe', 'Bourse', 'International']


Unnamed: 0,État civil,Mode d'application,Ordre d'application,Cours,Présence jour/soir,Qualification antérieure,Nationalité,Qualification mère,Qualification père,Occupation mère,...,Unités curriculaires 2e semestre (créditées),Unités curriculaires 2e semestre (inscrits),Unités curriculaires 2e semestre (évaluations),Unités curriculaires 2e semestre (approuvées),Unités curriculaires 2e semestre (note),Unités curriculaires 2e semestre (sans évaluations),Taux de chômage,Taux d'inflation,PIB,Cible
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,0
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,1
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,0
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,1
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3625,1,1,6,15,1,1,1,1,1,6,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,1
3626,1,1,2,15,1,1,19,1,1,10,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,0
3627,1,1,1,12,1,1,1,22,27,10,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,0
3628,1,1,1,9,1,1,1,22,27,8,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,1


In [12]:
#dataset simplifié
df_simplified, cat_feat_simplified = loader.get_simplify_dataframe()
print(cat_feat_simplified)
df_simplified


['État civil', "Mode d'application", 'Cours', 'Présence jour/soir', 'Qualification antérieure', 'Qualification mère', 'Qualification père', 'Occupation mère', 'Occupation père', 'Déplacé', 'Besoins éducatifs spéciaux', 'Dettes', 'Frais de scolarité à jour', 'Sexe', 'Bourse', 'International']


Unnamed: 0,État civil,Mode d'application,Ordre d'application,Cours,Présence jour/soir,Qualification antérieure,Qualification mère,Qualification père,Occupation mère,Occupation père,...,Unités curriculaires 2e semestre (créditées),Unités curriculaires 2e semestre (inscrits),Unités curriculaires 2e semestre (évaluations),Unités curriculaires 2e semestre (approuvées),Unités curriculaires 2e semestre (note),Unités curriculaires 2e semestre (sans évaluations),Taux de chômage,Taux d'inflation,PIB,Cible
0,1,2,5,2,1,7,7,10,6,10,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,0
1,1,10,1,10,1,7,8,4,4,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,1
2,1,1,5,4,1,7,7,10,10,10,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,0
3,1,2,2,14,1,7,7,10,6,4,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,1
4,2,11,1,9,0,7,7,10,10,10,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3625,1,1,6,14,1,7,8,8,6,5,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,1
3626,1,1,2,14,1,7,8,8,10,10,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,0
3627,1,1,1,11,1,7,7,10,10,10,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,0
3628,1,1,1,8,1,7,7,10,8,5,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,1


# DecisionTree - RandomForest - NaiveBayes - LogisiticRegression
## dataset: 
### -original + OneHot=False
### -original + OneHot=True
### -simplified + OneHot=False
### -simplified + OneHot=True

In [2]:


#optimisation des hp
def optimize_hyperparameters(clf, param_grid, X_train, y_train):
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='f1')   #on utilise f1 et cv 5
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

#calcul des metriques
def calculate_metrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),    # sans average='binary'
        "Rappel (recall)": recall_score(y_test, y_pred),   # sans average='binary'
        "Precision": precision_score(y_test, y_pred),    # sans average='binary'
        "ROC-AUC": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    }
    return metrics


def format_best_params(best_params):
    formatted_params = []
    for param, value in best_params.items():
        formatted_param = f"{param}={value}"
        formatted_params.append(formatted_param)
    return formatted_params





classifiers = {
    "Arbre de decision": DecisionTreeClassifier,
    "Foret aleatoire": RandomForestClassifier,
    "Classificateur naif bayesien": GaussianNB,
    "Régression logistique": LogisticRegression
}

#les plages des hyperparametres de chaque classifieur a tester avec recherche par grille (GridSearchCV)
param_grids = {
    "Arbre de decision": {
        'max_depth': [6, 8, 10, 12, 15],
        'min_samples_split': [50, 100, 190, 200, 210, 225],
        'min_samples_leaf': [15, 20, 25, 30, 40, 60, 100], 
        'criterion': ['gini', 'entropy']},

    "Foret aleatoire": {
        'n_estimators': [100, 300, 600, 1000],
        'max_depth': [7, 10, 15, 20, 25], 
        'max_features':  [None, 'sqrt', 0.2, 0.3, 0.4], 
        'criterion': ['gini', 'entropy']},

    "Classificateur naif bayesien": {
        'var_smoothing': [1e-8, 1e-7, 1e-6, 1e-5],
        'priors': [None, [0.5, 0.5], [0.7, 0.3], [0.3, 0.7]]},

    "Régression logistique": {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
        'max_iter': [10000, 20000]}    
}




best_params_dict = {}
metrics_dict = {}
type_de_dataset = ['simplify', 'original']

for type_data in type_de_dataset:

    for oneHot in [True, False]:

        X, y, cat_features = load_data().get_data_X_y(data=type_data, OneHot=oneHot)
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)


        for name, clf_class in classifiers.items():

            if name == "Classificateur naif bayesien":
                clf = clf_class()  # pas de (random_state=42) pour naif bayes
            else:
                clf = clf_class(random_state=42)


            
            best_model, best_params = optimize_hyperparameters(clf, param_grids[name], X_train, y_train)


            #reentrainer les modele avec les best hp
            if name == "Classificateur naif bayesien":
                clf_optimized = best_model  #on peut pas mettre parametre les best hp pour reentrainer CNB, best_model est deja avec les best hp
            else:
                clf_optimized = clf_class(**best_params, random_state=42)

            clf_optimized.fit(X_train, y_train)


            
            metrics = calculate_metrics(clf_optimized, X_test, y_test)

            
            formatted_best_params = format_best_params(best_params)
            best_params_dict[f"{name} ({type_data}, OneHot={oneHot})"] = formatted_best_params
            metrics_dict[f"{name} ({type_data}, OneHot={oneHot})"] = metrics


df_best_params = pd.DataFrame.from_dict(best_params_dict, orient='index')
df_best_params.columns = [f"hp{i+1}" for i in range(df_best_params.shape[1])]
df_best_params_sorted = df_best_params.sort_index()
print("Les meilleurs hyperparametres par classifieur:")
display.display(df_best_params_sorted)

df_metrics = pd.DataFrame.from_dict(metrics_dict, orient='index')
df_metrics['Type de Données'] = [name.split(' ')[-1].split(',')[0] for name in df_metrics.index]
df_metrics['OneHot'] = [name.split('=')[-1] for name in df_metrics.index]
df_metrics_sorted = df_metrics.sort_values(by=['Type de Données', 'OneHot'])
df_metrics_sorted = df_metrics_sorted.drop(columns=['Type de Données', 'OneHot'])
print("\nLes metriques par Classifieur:")
display.display(df_metrics_sorted)



Les meilleurs hyperparametres par classifieur:


Unnamed: 0,hp1,hp2,hp3,hp4
"Arbre de decision (original, OneHot=False)",criterion=entropy,max_depth=6,min_samples_leaf=15,min_samples_split=50
"Arbre de decision (original, OneHot=True)",criterion=gini,max_depth=6,min_samples_leaf=15,min_samples_split=50
"Arbre de decision (simplify, OneHot=False)",criterion=gini,max_depth=6,min_samples_leaf=15,min_samples_split=50
"Arbre de decision (simplify, OneHot=True)",criterion=gini,max_depth=6,min_samples_leaf=15,min_samples_split=50
"Classificateur naif bayesien (original, OneHot=False)","priors=[0.3, 0.7]",var_smoothing=1e-05,,
"Classificateur naif bayesien (original, OneHot=True)","priors=[0.3, 0.7]",var_smoothing=1e-05,,
"Classificateur naif bayesien (simplify, OneHot=False)","priors=[0.3, 0.7]",var_smoothing=1e-05,,
"Classificateur naif bayesien (simplify, OneHot=True)",priors=None,var_smoothing=1e-05,,
"Foret aleatoire (original, OneHot=False)",criterion=gini,max_depth=15,max_features=0.2,n_estimators=600
"Foret aleatoire (original, OneHot=True)",criterion=entropy,max_depth=25,max_features=0.2,n_estimators=1000



Les metriques par Classifieur:


Unnamed: 0,Accuracy,F1 Score,Rappel (recall),Precision,ROC-AUC
"Arbre de decision (simplify, OneHot=False)",0.895317,0.918221,0.948148,0.890125,0.928726
"Foret aleatoire (simplify, OneHot=False)",0.909091,0.929537,0.967407,0.894521,0.952013
"Classificateur naif bayesien (simplify, OneHot=False)",0.842057,0.876437,0.903704,0.850767,0.890862
"Régression logistique (simplify, OneHot=False)",0.914601,0.932852,0.957037,0.909859,0.954811
"Arbre de decision (original, OneHot=False)",0.899908,0.921752,0.951111,0.89415,0.924772
"Foret aleatoire (original, OneHot=False)",0.905418,0.926376,0.96,0.895028,0.951934
"Classificateur naif bayesien (original, OneHot=False)",0.844812,0.880057,0.918519,0.844687,0.893163
"Régression logistique (original, OneHot=False)",0.917355,0.934877,0.957037,0.91372,0.956894
"Arbre de decision (simplify, OneHot=True)",0.89899,0.921316,0.954074,0.890733,0.928007
"Foret aleatoire (simplify, OneHot=True)",0.908173,0.928469,0.961481,0.897649,0.952755
