In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import utils
from load_dataset import load_data

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides scikit-learn convergence
# warnings, so fits that stop at their max_iter cap go unnoticed — consider
# narrowing the filter to the specific categories that are known to be noise.
warnings.filterwarnings("ignore")

## Instancier le jeu de données

In [2]:
# Build the feature matrix, target and categorical-feature list through the
# project loader. Options requested: the 'simplify' data variant, OneHot=True
# and Scaler='MinMax' (presumably one-hot encoding and min-max scaling —
# confirm in load_dataset).
# NOTE(review): if the scaler is fitted here, i.e. before the train/test split,
# test-set statistics leak into training — verify inside the loader.
X, y, cat_features = load_data().get_data_X_y(
    data='simplify',
    OneHot=True,
    Scaler='MinMax',
)

In [3]:
# Hold out 30 % of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class proportions must match between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [4]:
# Baseline logistic regression: C=10, lbfgs solver, and an iteration cap of
# 1000 set explicitly. fit() returns the estimator itself, so construction and
# training are chained into a single expression.
logistic_regression = LogisticRegression(
    C=10,
    max_iter=1000,
    solver='lbfgs',
    random_state=42,
).fit(X_train, y_train)

# Accuracy measured on the same rows the model was trained on.
print(f"Train Accuracy : {logistic_regression.score(X_train, y_train) * 100:.3f} %")

Train Accuracy : 92.326 %


In [5]:
# Accuracy of the baseline model on the held-out test split, as a percentage.
print(f"Test Accuracy : {logistic_regression.score(X_test, y_test) * 100:.3f} %")

Test Accuracy : 91.093 %


In [6]:
# Predict on the test set and print the per-class precision / recall / F1 report.
y_pred = logistic_regression.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.85      0.88       414
           1       0.91      0.95      0.93       675

    accuracy                           0.91      1089
   macro avg       0.91      0.90      0.90      1089
weighted avg       0.91      0.91      0.91      1089



## Feature Importance

In [7]:
# Tabulate per-feature importance for the fitted baseline model via the project
# helper, labelled with the training column names.
# NOTE(review): with _coef=True the helper appears to report signed model
# coefficients (the resulting table contains negative values) rather than
# normalised importances; plot=False presumably skips the chart — confirm in utils.
utils.features_importance(logistic_regression, X_train.columns, plot=False, _coef=True)

Unnamed: 0,Importance
Unités curriculaires 2e semestre (approuvées),14.602029
Unités curriculaires 1er semestre (approuvées),10.100678
Unités curriculaires 2e semestre (sans évaluations),4.056225
Unités curriculaires 2e semestre (note),2.731695
Cours_2,2.594284
...,...
Unités curriculaires 2e semestre (créditées),-2.305121
Unités curriculaires 1er semestre (créditées),-2.695534
Unités curriculaires 2e semestre (évaluations),-3.198157
Unités curriculaires 1er semestre (inscrits),-3.430284


## Optimisation des hyperparamètres

In [8]:
# Hyperparameter search for logistic regression.
#
# The fresh, unfitted estimator is passed straight to GridSearchCV instead of
# being bound to `logistic_regression`: rebinding that name would replace the
# fitted baseline model with an unfitted one, so re-running any earlier cell
# that scores or predicts with it would fail.
#
# NOTE(review): max_iter is an iteration cap rather than a true hyperparameter;
# because warnings are silenced at the top of the notebook, fits that hit the
# cap without converging are not reported.
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 1000, 10000]
}

# Exhaustive 5-fold cross-validated search, ranked by F1. refit is left at its
# default (True), so best_estimator_ is retrained on all of X_train afterwards.
grid_search = GridSearchCV(LogisticRegression(random_state=42), parameters, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# One row per parameter combination, with the cross-validation statistics.
results = pd.DataFrame(grid_search.cv_results_)

# Comparative table: each combination with the mean and std of its fold scores.
print("Comparison of different parameter combinations:")
print(results[['param_C', 'param_solver', 'param_max_iter', 'mean_test_score', 'std_test_score']])

# Best combination and its mean cross-validated F1 score (not accuracy).
best_params = grid_search.best_params_
best_f1 = grid_search.best_score_
print(f"\nBest parameters: {best_params}")
print(f"F1 with best parameters: {best_f1}")

Comparison of different parameter combinations:
    param_C param_solver param_max_iter  mean_test_score  std_test_score
0     0.001    newton-cg            100         0.801427        0.007634
1     0.001        lbfgs            100         0.801427        0.007634
2     0.001    liblinear            100         0.800365        0.006827
3     0.001          sag            100         0.801427        0.007634
4     0.001         saga            100         0.801427        0.007634
..      ...          ...            ...              ...             ...
100    1000    newton-cg          10000         0.923779        0.009437
101    1000        lbfgs          10000         0.923779        0.009437
102    1000    liblinear          10000         0.923779        0.009437
103    1000          sag          10000         0.923135        0.009540
104    1000         saga          10000         0.924170        0.009018

[105 rows x 5 columns]

Best parameters: {'C': 100, 'max_iter': 100, 'solve

In [9]:
# GridSearchCV was run with its default refit=True, so best_estimator_ has
# already been retrained on the whole of X_train with the winning parameters.
# Calling fit() on it again would only repeat that work, so it is used as is.
model = grid_search.best_estimator_

# Accuracy of the tuned model on the rows it was trained on.
print(f"Train Accuracy : {model.score(X_train, y_train) * 100:.3f} %")

Train Accuracy : 92.523 %


In [10]:
# Accuracy of the tuned model on the held-out test split, as a percentage.
print(f"Test Accuracy : {model.score(X_test, y_test) * 100:.3f} %")

Test Accuracy : 90.909 %


In [11]:
# Predict on the test set with the tuned model and print the per-class
# precision / recall / F1 report (y_pred from the baseline model is overwritten).
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.85      0.88       414
           1       0.91      0.95      0.93       675

    accuracy                           0.91      1089
   macro avg       0.91      0.90      0.90      1089
weighted avg       0.91      0.91      0.91      1089

