### Définir les variables X (features) et y (target).

- Chargement des données

In [15]:
import pandas as pd
import os
import numpy as np

# The dataset location is built from the current working directory, so the
# notebook has to be run from its own folder for this relative path to resolve.
dataset_path = os.getcwd() + '/../data/processed/dataset-category.csv'
df_category = pd.read_csv(dataset_path)

- Le choix de X et y

In [16]:
# Target: the cluster label of each row.
y = df_category["Cluster"]

# Features: every remaining column except the target itself and "risk_category".
# NOTE(review): "risk_category" is presumably derived from the cluster label,
# which would explain why it is excluded from the features; confirm.
X = df_category.drop(columns=["Cluster", "risk_category"])


### Diviser les données en train/test avec train_test_split.

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y keeps the Cluster proportions
# identical in both parts, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


- Standardiser les valeurs

In [18]:
from sklearn.preprocessing import StandardScaler

# The scaling statistics are learned from the training rows only and then
# reused unchanged for the test rows, so the test set does not influence them.
scaler = StandardScaler().fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

### Traiter le déséquilibre des classes (RandomOverSampler, UnderSampler).

Calcul de la proportion de chaque cluster : le paramètre **normalize=True** divise le nombre de lignes appartenant à chaque cluster par le nombre total de lignes du jeu de données, ce qui donne la proportion de chaque cluster (valeur entre 0 et 1, à multiplier par 100 pour obtenir un pourcentage).

In [29]:
# Class balance of the target: relative frequencies first, then raw counts.
class_shares = y.value_counts(normalize=True)
class_counts = y.value_counts()

print(class_shares, '\n')

print(class_counts)

Cluster
1    0.547043
0    0.452957
Name: proportion, dtype: float64 

Cluster
1    407
0    337
Name: count, dtype: int64


On remarque que les proportions des classes sont proches (environ 55 % / 45 %) : le jeu de données n'est donc pas significativement déséquilibré. Par conséquent, un traitement tel que RandomOverSampler, SMOTE ou RandomUnderSampler n'est pas nécessaire ici et pourrait même favoriser inutilement le surapprentissage (overfitting) du modèle.

### Entraîner les modèles

- Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: only the seed is set, everything else is left at the
# library defaults. Trained on the unscaled training features.
random_forest_model = RandomForestClassifier(random_state=42)

# Kept as the last expression so the notebook displays the fitted estimator.
random_forest_model.fit(X=X_train, y=y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


- SVM

In [31]:
from sklearn.svm import SVC

# Baseline SVM: only the seed is set, everything else is left at the library
# defaults. Trained on the standardized training features.
svm_model = SVC(random_state=42)

# Kept as the last expression so the notebook displays the fitted estimator.
svm_model.fit(X=X_train_std, y=y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


- Gradient Boosting

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient boosting: only the seed is set, everything else is left at
# the library defaults. Trained on the unscaled training features.
gb_model = GradientBoostingClassifier(random_state=42)

# Kept as the last expression so the notebook displays the fitted estimator.
gb_model.fit(X=X_train, y=y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


- Decision Tree

In [33]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree: only the seed is set, everything else is left at the
# library defaults. Trained on the unscaled training features.
decision_tree_model = DecisionTreeClassifier(random_state=42)

# Kept as the last expression so the notebook displays the fitted estimator.
decision_tree_model.fit(X=X_train, y=y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


- Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression: only the seed is set, everything else is left
# at the library defaults. Trained on the standardized training features.
logistic_model = LogisticRegression(random_state=42)

# Kept as the last expression so the notebook displays the fitted estimator.
logistic_model.fit(X=X_train_std, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100


- XGB

In [35]:
from xgboost import XGBClassifier

# Baseline XGBoost classifier: only the seed is set, everything else is left at
# the library defaults. Trained on the unscaled training features.
xgb_model = XGBClassifier(random_state=42)

# Kept as the last expression so the notebook displays the fitted estimator.
xgb_model.fit(X=X_train, y=y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score, accuracy_score

# Already-fitted tree-based models. They were trained on the unscaled features,
# so they are evaluated on the unscaled test set.
models = {
    "Random Forest": random_forest_model,
    "XGB": xgb_model,
    "Decision Tree": decision_tree_model,
    "Gradient Boosting": gb_model,
}

# For each model, print the confusion matrix followed by accuracy, F1, recall
# and precision on the held-out test rows.
for name, model in models.items():

    print("\n---", name, "\n")

    y_pred = model.predict(X_test)

    print(confusion_matrix(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1-score:", f1_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))


--- Random Forest 

[[60  7]
 [ 4 78]]
Accuracy: 0.9261744966442953
F1-score: 0.9341317365269461
Recall: 0.9512195121951219
Precision: 0.9176470588235294

--- XGB 

[[63  4]
 [ 6 76]]
Accuracy: 0.9328859060402684
F1-score: 0.9382716049382716
Recall: 0.926829268292683
Precision: 0.95

--- Decision Tree 

[[56 11]
 [ 8 74]]
Accuracy: 0.87248322147651
F1-score: 0.8862275449101796
Recall: 0.9024390243902439
Precision: 0.8705882352941177

--- Gradient Boosting 

[[61  6]
 [ 3 79]]
Accuracy: 0.9395973154362416
F1-score: 0.9461077844311377
Recall: 0.9634146341463414
Precision: 0.9294117647058824


In [44]:
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score, accuracy_score

# Already-fitted models that were trained on standardized features, so they are
# evaluated on the standardized test set.
models = {
    "SVM": svm_model,
    "LogisticRegression": logistic_model
}

# For each model, print the confusion matrix followed by accuracy, F1, recall
# and precision on the held-out test rows.
for name, model in models.items():

    print("\n---", name, "\n")

    y_pred = model.predict(X_test_std)

    print(confusion_matrix(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1-score:", f1_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))


--- SVM 

[[61  6]
 [ 1 81]]
Accuracy: 0.9530201342281879
F1-score: 0.9585798816568047
Recall: 0.9878048780487805
Precision: 0.9310344827586207

--- LogisticRegression 

[[62  5]
 [ 0 82]]
Accuracy: 0.9664429530201343
F1-score: 0.9704142011834319
Recall: 1.0
Precision: 0.9425287356321839


### Appliquer la validation croisée

In [67]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# 10-fold cross-validation of every candidate model with its default settings.
#
# The tree-based models are cross-validated on the raw features. Logistic
# Regression and SVM are wrapped in a pipeline with a StandardScaler so that
# the scaler is re-fitted on the training part of each fold. Scaling the whole
# of X once before cross-validating would let the validation rows influence
# the scaling statistics (data leakage) and make the scores optimistic.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGB": XGBClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": make_pipeline(StandardScaler(), SVC(random_state=42)),
    "Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression(random_state=42)),
}

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=10)
    print(name)
    print('\tScores:', [float(round(x, 3)) for x in scores])
    print('\tScores Mean:', scores.mean())
    print('\tScores Std:', scores.std())
    print()

# X_std (all rows standardized together) is not used by the cross-validation
# above, but the SVM / Logistic Regression grid-search cells further down still
# read it. It is built with a throwaway scaler so that `scaler`, which was
# fitted on X_train only, is not overwritten.
X_std = StandardScaler().fit_transform(X)

Random Forest
	Scores: [0.933, 0.96, 0.987, 0.933, 0.986, 0.905, 0.973, 0.946, 0.919, 0.932]
	Scores Mean: 0.9475495495495494
	Scores Std: 0.026605902281230023

XGB
	Scores: [0.96, 0.947, 0.973, 0.92, 0.959, 0.932, 0.959, 0.959, 0.919, 0.919]
	Scores Mean: 0.944864864864865
	Scores Std: 0.01949612828068319

Decision Tree
	Scores: [0.88, 0.893, 0.88, 0.88, 0.932, 0.905, 0.878, 0.892, 0.919, 0.865]
	Scores Mean: 0.8925225225225224
	Scores Std: 0.01972824874115763

Gradient Boosting
	Scores: [0.96, 0.96, 0.973, 0.907, 0.932, 0.932, 0.946, 0.932, 0.919, 0.946]
	Scores Mean: 0.9408108108108107
	Scores Std: 0.019242070686683863

SVM
	Scores: [0.987, 0.973, 1.0, 0.96, 0.973, 0.959, 0.973, 0.973, 0.959, 1.0]
	Scores Mean: 0.9757837837837838
	Scores Std: 0.014502031012864817

Logistic Regression
	Scores: [1.0, 1.0, 1.0, 0.987, 0.986, 0.973, 0.986, 0.986, 0.973, 1.0]
	Scores Mean: 0.9892072072072073
	Scores Std: 0.010107915400196071



### Optimiser les hyperparamètres avec GridSearchCV ou RandomizedSearchCV.

- Grid Search - Random Forest

In [76]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters for the random forest (3*3*3*3*2 = 162 combinations).
rf_param_grid = dict(
    n_estimators=[300, 400, 500],
    max_depth=[None, 2, 5],
    min_samples_split=[3, 4, 5],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Exhaustive 5-fold search scored on accuracy, with n_jobs=-1 for parallelism.
# NOTE(review): the search is fitted on the full X, y, which includes the rows
# held out in X_test. Test accuracies later reported for the selected
# parameters are therefore optimistic; consider fitting on X_train, y_train.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best score: 0.9529566479230909


- Grid Search - XGB

In [80]:
# Candidate hyper-parameters for XGBoost (3*4*4*3*3*3 = 1296 combinations).
xgb_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 2, 3, 5],
    learning_rate=[0.1, 0.2, 0.3, 0.4],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.4, 0.6, 0.8],
    gamma=[0, 0.1, 0.3],
)

# Exhaustive 5-fold search scored on accuracy, with n_jobs=-1 for parallelism.
# NOTE(review): the search is fitted on the full X, y, which includes the rows
# held out in X_test. Test accuracies later reported for the selected
# parameters are therefore optimistic; consider fitting on X_train, y_train.
grid_search = GridSearchCV(
    XGBClassifier(random_state=42),
    xgb_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'colsample_bytree': 0.6, 'gamma': 0.3, 'learning_rate': 0.3, 'max_depth': 2, 'n_estimators': 200, 'subsample': 0.8}
Best score: 0.9731180845274805


- Grid Search - Gradient Boosting

In [112]:
# Candidate hyper-parameters for gradient boosting (2*3*3*3*3*2 = 324 combinations).
gb_param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[2, 3, 5],
    subsample=[0.4, 0.6, 0.8],
    min_samples_split=[12, 14, 16],
    min_samples_leaf=[2, 3],
)

# Exhaustive 5-fold search scored on accuracy, with n_jobs=-1 for parallelism.
# NOTE(review): the search is fitted on the full X, y, which includes the rows
# held out in X_test. Test accuracies later reported for the selected
# parameters are therefore optimistic; consider fitting on X_train, y_train.
grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gb_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'learning_rate': 0.2, 'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 14, 'n_estimators': 200, 'subsample': 0.8}
Best score: 0.9690730999455832


- Grid Search - Decision Tree

In [87]:
# Candidate hyper-parameters for the decision tree (4*2*3*3 = 72 combinations).
dt_param_grid = dict(
    max_depth=[None, 2, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2, 3],
    max_features=[None, 'sqrt', 'log2'],
)

# Exhaustive 5-fold search scored on accuracy, with n_jobs=-1 for parallelism.
# NOTE(review): the search is fitted on the full X, y, which includes the rows
# held out in X_test. Test accuracies later reported for the selected
# parameters are therefore optimistic; consider fitting on X_train, y_train.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best score: 0.9005350988572465


- Grid Search - SVM

In [118]:
# Candidate hyper-parameters for the SVM (5*4*2*3 = 120 combinations).
svm_param_grid = dict(
    C=[1, 2, 3, 4, 5],
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    gamma=['scale', 'auto'],
    degree=[2, 3, 4],
)

# Exhaustive 5-fold search scored on accuracy, with n_jobs=-1 for parallelism.
# NOTE(review): X_std was standardized over all rows before this search, and
# those rows include the ones held out in X_test. Both the fold scores and the
# test accuracies later reported for the selected parameters are therefore
# optimistic; consider searching over a scaler+SVC pipeline on X_train only.
grid_search = GridSearchCV(
    SVC(random_state=42),
    svm_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_std, y)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'C': 3, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
Best score: 0.9986486486486486


- Grid Search - Logistic Regression

In [107]:
# Candidate hyper-parameters for logistic regression (3*1*3*4 = 36 combinations).
lr_param_grid = dict(
    solver=['lbfgs', 'liblinear', 'saga'],
    penalty=['l2'],
    C=[9, 10, 11],
    max_iter=[10, 20, 30, 50],
)

# Exhaustive 5-fold search scored on accuracy, with n_jobs=-1 for parallelism.
# NOTE(review): X_std was standardized over all rows before this search, and
# those rows include the ones held out in X_test. Both the fold scores and the
# test accuracies later reported for the selected parameters are therefore
# optimistic; consider searching over a scaler+model pipeline on X_train only.
grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    lr_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_std, y)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'C': 10, 'max_iter': 20, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score: 0.9986577181208054


### Comparer les performances et sélectionner le meilleur modèle.

On va entraîner les modèles une nouvelle fois en utilisant les paramètres obtenus avec GridSearch.

In [120]:
# Tree-based models rebuilt with the hyper-parameters printed by the grid
# searches (values copied by hand from those outputs).
models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        max_features='sqrt',
        min_samples_leaf=1,
        min_samples_split=5,
        random_state=42,
    ),
    "XGB": XGBClassifier(
        n_estimators=200,
        max_depth=2,
        learning_rate=0.3,
        subsample=0.8,
        colsample_bytree=0.6,
        gamma=0.3,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=200,
        max_depth=2,
        learning_rate=0.2,
        subsample=0.8,
        min_samples_leaf=2,
        min_samples_split=14,
        random_state=42,
    ),
    "Decision Tree": DecisionTreeClassifier(
        max_depth=None,
        max_features=None,
        min_samples_leaf=2,
        min_samples_split=2,
        random_state=42,
    ),
}

# Test accuracy per model name; the next cell adds the scaled models to it.
result = {}

for name, model in models.items():
    # Train on the unscaled training rows, then score on the unscaled test rows.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    result[name] = acc

    print(f"Accuracy - {name}:", acc)

Accuracy - Random Forest: 0.9060402684563759
Accuracy - XGB: 0.9530201342281879
Accuracy - Gradient Boosting: 0.9530201342281879
Accuracy - Decision Tree: 0.8590604026845637


In [121]:
# Scale-sensitive models rebuilt with the hyper-parameters printed by the grid
# searches (values copied by hand from those outputs).
models = {
    "SVM": SVC(
        C=3,
        kernel='linear',
        degree=2,
        gamma='scale',
        random_state=42,
    ),
    "Logistic Regression": LogisticRegression(
        C=10,
        penalty='l2',
        solver='lbfgs',
        max_iter=20,
        random_state=42,
    ),
}

for name, model in models.items():
    # Train on the standardized training rows, score on the standardized test rows.
    y_pred = model.fit(X_train_std, y_train).predict(X_test_std)

    acc = accuracy_score(y_test, y_pred)
    # Appended to the `result` dict created by the previous cell.
    result[name] = acc

    print(f"Accuracy - {name}:", acc)

Accuracy - SVM: 0.9798657718120806
Accuracy - Logistic Regression: 0.9865771812080537


- Affichage de l'accuracy de chaque modèle

In [None]:
# One line per model: the name padded to 20 characters, then its test accuracy.
for model_name, accuracy in result.items():
    print(f"{model_name :20s} - {accuracy}")

Random Forest        - 0.9060402684563759
XGB                  - 0.9530201342281879
Gradient Boosting    - 0.9530201342281879
Decision Tree        - 0.8590604026845637
SVM                  - 0.9798657718120806
Logistic Regression  - 0.9865771812080537


- Sélection du modèle

In [None]:
# Pick the model name with the highest test accuracy (the first one wins on a
# tie) and keep it together with its score as a (name, accuracy) pair.
best_name = max(result, key=result.get)
max_accuracy = (best_name, result[best_name])

print("Le modèle le plus performant est", max_accuracy[0])

Logistic Regression


- Sauvegarder le modèle

In [129]:
from joblib import dump

# Save the estimator that was actually selected, looked up by the name stored
# in `max_accuracy`, instead of whatever object the last training loop left in
# the `model` loop variable.
best_model_name = max_accuracy[0]
if best_model_name not in models:
    # `models` only holds the estimators of the most recent training cell
    # (SVM and Logistic Regression); the tree-based ones were overwritten.
    raise KeyError(
        f"'{best_model_name}' is not in the current `models` dict; "
        "refit it before saving."
    )

# NOTE(review): SVM and Logistic Regression were trained on standardized
# features, so whoever loads this file must apply the same scaling before
# calling predict. The scaler itself is not saved here.
dump(models[best_model_name], os.getcwd() + '/../models/model.pkl')

['c:\\Users\\anass\\Desktop\\Analyse-et-Prediction-du-Risque-de-Diabete\\notebooks/../models/model.pkl']