## Division des données en ensembles d’entraînement et de test

In [None]:
import time
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import sys
from pathlib import Path

sys.path.append(str(Path('../tests/data_validation').resolve()))
sys.path.append(str(Path('../tests/model_validation').resolve()))

from data_validator import DataValidator
from model_validator import ModelValidator

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (accuracy_score,classification_report,confusion_matrix,roc_auc_score)

# Select the MLflow experiment that groups every run logged below
# (MLflow creates it on first use if it does not exist yet).
mlflow.set_experiment("Diabetes_Cluster_Classification1")

# Load the clustered dataset; the relative path is resolved against the
# current working directory.
file = r"../data/processed/Clustered_Data.csv"
content = pd.read_csv(file)

# Run the project's data validator first. A ValueError is reported and then
# re-raised, so nothing below runs on data that failed validation.
print(" Validating data...")
data_validator = DataValidator()
try:
    validated_content = data_validator.validate_data(content)
    print("Data validation passed!")
except ValueError as e:
    print(f"Data validation failed: {e}")
    raise

# "Cluster" is the target column; every other column is used as a feature.
# NOTE(review): `validated_content` is never read afterwards -- features and
# target come from the raw `content` frame. If validate_data returns a cleaned
# copy, that copy is currently ignored. TODO confirm intent.
X = content.drop("Cluster", axis=1)
y = content["Cluster"]

# Second gate: check the feature matrix with the project's model validator,
# using the same report-and-re-raise handling as above.
print("Validating model input...")
model_validator = ModelValidator()
try:
    model_validator.validate_model_input(X)
    print("Model input validation passed!")
except ValueError as e:
    print(f"Model input validation failed: {e}")
    raise

# Hold out 20% of the rows for testing; stratify=y keeps the class proportions
# of the target the same in both splits, and the fixed seed makes the split
# repeatable.
# NOTE(review): the split is seeded with 41 while the oversampler below uses
# 42 -- harmless, but presumably unintentional.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=41, stratify=y
)

# Fit the scaler on the training split only, then apply the fitted scaler to
# the test split so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Resample the (scaled) training split only; the test split is left untouched
# so evaluation reflects the original class distribution.
over = RandomOverSampler(random_state=42)
X_train_res, y_train_res = over.fit_resample(X_train_scaled, y_train)

# Candidate classifiers, keyed by the name that is later used as the MLflow
# run name and logged as the "model_type" parameter.
#
# Every estimator is given a fixed random_state so that re-running the
# notebook reproduces the same fitted models and therefore the same logged
# metrics, in line with the seeded train/test split and oversampler. Without
# it, the tree ensembles, the decision tree, SVC's probability calibration
# (probability=True) and the liblinear solver all draw from an unseeded RNG.
models = {
    "RandomForestClassifier": RandomForestClassifier(
        n_estimators=200, max_depth=None, min_samples_split=2, random_state=42
    ),
    "GradientBoostingClassifier": GradientBoostingClassifier(
        learning_rate=0.1, max_depth=3, n_estimators=300, random_state=42
    ),
    "SVC": SVC(
        # probability=True is required for predict_proba (used for ROC AUC).
        C=10, gamma="scale", kernel="linear", probability=True, random_state=42
    ),
    "DecisionTreeClassifier": DecisionTreeClassifier(
        max_depth=10, min_samples_leaf=4, min_samples_split=2, random_state=42
    ),
    "LogisticRegression": LogisticRegression(
        # The L1 penalty needs a solver that supports it; liblinear does.
        C=10, penalty="l1", solver="liblinear", max_iter=1000, random_state=42
    ),
}

# Bookkeeping for the best run, compared on test-set accuracy. These three
# names are read by the model-registration cell further down.
# NOTE(review): the winner is chosen on the same test split whose metrics are
# reported, so the best score is optimistically biased; a separate validation
# split would avoid that.
best_accuracy = 0
best_run_id = None
best_model_name = None

# Train, evaluate and log every candidate model in its own MLflow run.
for name, model in models.items():

    with mlflow.start_run(run_name=name) as run:

        # Run-level context: which model, and how much data it saw.
        mlflow.log_param("model_type", name)
        mlflow.log_param("dataset_size", len(X))
        mlflow.log_param("train_size", len(X_train_res))  # size after oversampling
        mlflow.log_param("test_size", len(X_test))

        # Full hyperparameter set of the estimator, defaults included.
        mlflow.log_params(model.get_params())

        # Share of labels 0 and 1 in the resampled training target.
        mlflow.log_metric("class_0_ratio", np.mean(y_train_res == 0))
        mlflow.log_metric("class_1_ratio", np.mean(y_train_res == 1))

        # perf_counter() is monotonic and high-resolution, so the measured
        # durations cannot be skewed by system clock adjustments.
        start_train = time.perf_counter()
        model.fit(X_train_res, y_train_res)
        train_time = time.perf_counter() - start_train
        mlflow.log_metric("training_time_sec", train_time)

        start_pred = time.perf_counter()
        y_pred = model.predict(X_test_scaled)
        inference_time = time.perf_counter() - start_pred
        mlflow.log_metric("inference_time_sec", inference_time)

        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)

        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision_weighted", report["weighted avg"]["precision"])
        mlflow.log_metric("recall_weighted", report["weighted avg"]["recall"])
        mlflow.log_metric("f1_weighted", report["weighted avg"]["f1-score"])

        # Unpacking the flattened matrix into four values only works for a
        # 2x2 confusion matrix, i.e. a binary target.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        mlflow.log_metric("true_positive", tp)
        mlflow.log_metric("true_negative", tn)
        mlflow.log_metric("false_positive", fp)
        mlflow.log_metric("false_negative", fn)

        # ROC AUC needs scores rather than hard labels; column 1 holds the
        # probability of the second class (binary assumption again).
        if hasattr(model, "predict_proba"):
            y_proba = model.predict_proba(X_test_scaled)[:, 1]
            auc = roc_auc_score(y_test, y_proba)
            mlflow.log_metric("roc_auc", auc)

        # Store the fitted scaler next to the model so the same preprocessing
        # can be retrieved from the run.
        mlflow.sklearn.log_model(model, "model")
        mlflow.sklearn.log_model(scaler, "scaler")

        # Strict ">" means ties keep the earlier model in `models` order.
        if acc > best_accuracy:
            best_accuracy = acc
            best_run_id = run.info.run_id
            best_model_name = name

print("Tous les modèles sont correctement trackés dans MLflow")

2026/02/01 21:52:02 INFO mlflow.tracking.fluent: Experiment with name 'Diabetes_Cluster_Classification1' does not exist. Creating a new experiment.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

2026/02/01 21:52:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForestClassifier at: http://mlflow:5000/#/experiments/540716983857295251/runs/006e9622d9864828bf23ed32588cd864.
2026/02/01 21:52:13 INFO mlflow.tracking._tracking_service

Tous les modèles sont correctement trackés dans MLflow


In [2]:
from mlflow.tracking import MlflowClient

# Register the best run's model in the MLflow Model Registry and promote the
# new version to the "Production" stage.
model_registry_name = "DiabetesClusterClassifier1"

# best_run_id is initialised to None by the training cell and only set once a
# run scores above 0 accuracy. Fail early with a clear message instead of
# sending the invalid URI "runs:/None/model" to the registry.
if best_run_id is None:
    raise RuntimeError(
        "No best run was selected; run the training cell successfully "
        "before registering a model."
    )

# "model" is the artifact path the training cell logged the estimator under.
model_uri = f"runs:/{best_run_id}/model"
print(model_uri)
result = mlflow.register_model(
    model_uri=model_uri,
    name=model_registry_name
)

client = MlflowClient()
# NOTE(review): model stages are deprecated in recent MLflow releases in
# favour of aliases (this call emits a warning in the recorded output). It is
# kept so consumers loading "models:/<name>/Production" keep working.
# Versions already in Production are not archived by this call.
client.transition_model_version_stage(
    name=model_registry_name,
    version=result.version,
    stage="Production"
)


print("Best model registered and promoted to Production successfully")

runs:/ea5c1055a88f474eaa95993ca200b296/model


Successfully registered model 'DiabetesClusterClassifier1'.
2026/02/01 21:53:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DiabetesClusterClassifier1, version 1
Created version '1' of model 'DiabetesClusterClassifier1'.
  client.transition_model_version_stage(


Best model registered and promoted to Production successfully
