## Division des données en ensembles d’entraînement et de test

In [7]:
import time
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (accuracy_score,classification_report,confusion_matrix,roc_auc_score)

# Every run started below is grouped under this MLflow experiment.
mlflow.set_experiment("Diabetes_Cluster_Classification")

# Load the dataset from the processed-data folder (path is relative to the
# notebook's working directory).
file = r"./data/processed/Clustered_Data.csv"
content = pd.read_csv(file)

# "Cluster" is the target; every remaining column is used as a feature.
y = content["Cluster"]
X = content.drop(columns="Cluster")

# Hold out 20% of the rows for testing; stratifying on the target keeps the
# class proportions identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=41,
)

# Resample the training split only, so the test split keeps its original
# class distribution and contains no resampled rows.
over = RandomOverSampler(random_state=42)
X_train_res, y_train_res = over.fit_resample(X_train, y_train)

# Candidate classifiers, keyed by the name used for the MLflow run.
# Every estimator below has a stochastic component (bootstrap / feature
# sub-sampling, split tie-breaking, the internal cross-validation used by
# SVC(probability=True), liblinear's coordinate shuffling), so each one gets a
# fixed random_state. Without it, "Restart & Run All" yields different metrics
# and possibly a different "best" model on every execution.
models = {
    "RandomForestClassifier": RandomForestClassifier(
        n_estimators=200, max_depth=None, min_samples_split=2, random_state=42
    ),
    "GradientBoostingClassifier": GradientBoostingClassifier(
        learning_rate=0.1, max_depth=3, n_estimators=300, random_state=42
    ),
    "SVC": SVC(
        # probability=True is required so that predict_proba is available.
        C=10, gamma="scale", kernel="linear", probability=True, random_state=42
    ),
    "DecisionTreeClassifier": DecisionTreeClassifier(
        max_depth=10, min_samples_leaf=4, min_samples_split=2, random_state=42
    ),
    "LogisticRegression": LogisticRegression(
        # The L1 penalty needs a solver that supports it; liblinear does.
        C=10, penalty="l1", solver="liblinear", max_iter=1000, random_state=42
    )
}

# Train every candidate model, log its parameters, metrics and fitted artifact
# to MLflow (one run per model), and remember the run with the highest test
# accuracy so that it can be registered afterwards.
#
# NOTE(review): the "best" model is chosen on the same test split that is used
# to report its metrics, so the reported score of the winner is optimistic.
# Consider selecting on a separate validation split or via cross-validation.

# Labels come from the full target so the confusion matrix always has the same
# shape, even if a class happens to be absent from y_test or y_pred.
class_labels = np.unique(y)
is_binary = len(class_labels) == 2

# Loop-invariant: the resampled training set is the same for every model.
class_ratios = {
    "class_0_ratio": float(np.mean(y_train_res == 0)),
    "class_1_ratio": float(np.mean(y_train_res == 1)),
}

# Start below any reachable accuracy so that a run is always selected, even if
# every model scores exactly 0.
best_accuracy = float("-inf")
best_run_id = None
best_model_name = None

for name, model in models.items():

    with mlflow.start_run(run_name=name) as run:

        mlflow.log_params(
            {
                "model_type": name,
                "dataset_size": len(X),
                "train_size": len(X_train_res),
                "test_size": len(X_test),
            }
        )
        mlflow.log_params(model.get_params())

        # perf_counter is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted.
        start_train = time.perf_counter()
        model.fit(X_train_res, y_train_res)
        train_time = time.perf_counter() - start_train

        start_pred = time.perf_counter()
        y_pred = model.predict(X_test)
        inference_time = time.perf_counter() - start_pred

        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        weighted = report["weighted avg"]

        metrics = {
            **class_ratios,
            "training_time_sec": train_time,
            "inference_time_sec": inference_time,
            "accuracy": float(acc),
            "precision_weighted": weighted["precision"],
            "recall_weighted": weighted["recall"],
            "f1_weighted": weighted["f1-score"],
        }

        # TN/FP/FN/TP and the positive-class ROC AUC are only defined for a
        # two-class target; skip them otherwise instead of crashing on the
        # 4-way unpack or on the [:, 1] column selection.
        if is_binary:
            tn, fp, fn, tp = confusion_matrix(
                y_test, y_pred, labels=class_labels
            ).ravel()
            metrics.update(
                {
                    "true_positive": int(tp),
                    "true_negative": int(tn),
                    "false_positive": int(fp),
                    "false_negative": int(fn),
                }
            )

            if hasattr(model, "predict_proba"):
                # Column 1 is the probability of the greater class label,
                # which roc_auc_score treats as the positive class.
                y_proba = model.predict_proba(X_test)[:, 1]
                auc = roc_auc_score(y_test, y_proba)
                metrics["roc_auc"] = float(auc)

        mlflow.log_metrics(metrics)

        # The artifact path "model" is what the registration step refers to
        # through the URI runs:/<run_id>/model.
        mlflow.sklearn.log_model(model, "model")

        if acc > best_accuracy:
            best_accuracy = acc
            best_run_id = run.info.run_id
            best_model_name = name

print("Tous les modèles sont correctement trackés dans MLflow")




Tous les modèles sont correctement trackés dans MLflow




In [8]:
from mlflow.tracking import MlflowClient

# Register the model logged by the best run and promote it to "Production".
model_registry_name = "DiabetesClusterClassifier"

# Fail early with a clear message instead of registering the invalid URI
# "runs:/None/model" when the training cell did not select any run
# (not executed, interrupted, or no model was better than the initial score).
if best_run_id is None:
    raise RuntimeError(
        "No best run available: execute the training cell before registering a model."
    )

result = mlflow.register_model(
    model_uri=f"runs:/{best_run_id}/model",
    name=model_registry_name
)

client = MlflowClient()
# Each execution of this cell creates a new model version. Archiving the
# versions that are currently in Production keeps exactly one version in that
# stage instead of accumulating one per re-run.
# NOTE(review): model stages are deprecated in recent MLflow releases in
# favour of model version aliases; consider migrating when upgrading.
client.transition_model_version_stage(
    name=model_registry_name,
    version=result.version,
    stage="Production",
    archive_existing_versions=True
)

print("Best model registered and promoted to Production successfully")

Registered model 'DiabetesClusterClassifier' already exists. Creating a new version of this model...
2026/01/28 14:55:58 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: DiabetesClusterClassifier, version 3


Best model registered and promoted to Production successfully


Created version '3' of model 'DiabetesClusterClassifier'.
