In [1]:
import mlflow
mlflow.end_run()

StatementMeta(, 770dfae2-c506-42ee-b09b-5b55cfd11d78, 3, Finished, Available, Finished)

In [2]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.functions import col
import mlflow
import mlflow.spark
import pandas as pd

# Configuration
MODEL_NAME = "Online_Shopper_Predictor"
EXPERIMENT_NAME = "Shopping_Intention_Analysis"
mlflow.set_experiment(EXPERIMENT_NAME)

StatementMeta(, 770dfae2-c506-42ee-b09b-5b55cfd11d78, 4, Finished, Available, Finished)

<Experiment: artifact_location='sds://onelakecentralus.pbidedicated.windows.net/5ca0301c-08f2-4ed5-9f8f-8bdc1e9da703/ea2e5dec-8bc5-4e6a-a5ce-3f1bc8a9c8a9', creation_time=1767140599847, experiment_id='ea2e5dec-8bc5-4e6a-a5ce-3f1bc8a9c8a9', last_update_time=1767140599847, lifecycle_stage='active', name='Shopping_Intention_Analysis', tags={}>

In [3]:
df = spark.read.table("gold.features_shopping")
# Définition de la liste complète des features (colonnes numériques uniquement)
feature_cols = [
    'Administrative', 'Informational', 'ProductRelated', 
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 
    'Month_num', 'Weekend_label', 'TotalPages', 
    'TotalDuration', 'IsReturningVisitor',
    'pages_per_minute', 'is_peak_shopping_season', 'high_intent_session'
]

StatementMeta(, 770dfae2-c506-42ee-b09b-5b55cfd11d78, 5, Finished, Available, Finished)

In [11]:
# Assemblage des features dans un vecteur
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data_ml_complete = assembler.transform(df)

StatementMeta(, 770dfae2-c506-42ee-b09b-5b55cfd11d78, 13, Finished, Available, Finished)

In [12]:
# Division Train/Test (80% entraînement, 20% test)
train_full, test_full = data_ml_complete.randomSplit([0.8, 0.2], seed=42)

StatementMeta(, 770dfae2-c506-42ee-b09b-5b55cfd11d78, 14, Finished, Available, Finished)

In [15]:
# Sauvegarde des données exactes utilisées pour l'entraînement
train_full.write.mode("overwrite").format("delta").saveAsTable("gold.model_training_reference")
print("Table 'gold.model_training_reference' créée avec TOUTES les colonnes.")

StatementMeta(, 770dfae2-c506-42ee-b09b-5b55cfd11d78, 17, Finished, Available, Finished)

Table 'gold.model_training_reference' créée avec TOUTES les colonnes.


In [47]:
# Configuration et Entraînement avec MLflow
# Remarque : Fabric gère automatiquement la connexion à l'expérience active
mlflow.set_experiment("Shopping_Intention_Analysis")

with mlflow.start_run(run_name="RF_Final_Model"):
    
# Paramètres du modèle
    num_trees = 150
    max_depth = 12
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=num_trees, maxDepth=max_depth)

StatementMeta(, 2a03b64c-748c-4dcf-8802-d30ae516b03a, 49, Finished, Available, Finished)

In [48]:
# Entraînement
model = rf.fit(train_data)

StatementMeta(, 2a03b64c-748c-4dcf-8802-d30ae516b03a, 50, Finished, Available, Finished)

In [49]:
# Prédictions
predictions = model.transform(test_data)

StatementMeta(, 2a03b64c-748c-4dcf-8802-d30ae516b03a, 51, Finished, Available, Finished)

In [50]:
with mlflow.start_run(run_name="RF_Final_Enriched"):
    
# Évaluation AUC (Binaire) 
    evaluator_auc = BinaryClassificationEvaluator(metricName="areaUnderROC")
    auc = evaluator_auc.evaluate(predictions)
    
    # Évaluation Multi-métriques (Précision, Rappel, F1, Accuracy) 
    
    multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    
    accuracy = multi_evaluator.setMetricName("accuracy").evaluate(predictions)
    precision = multi_evaluator.setMetricName("weightedPrecision").evaluate(predictions)
    recall = multi_evaluator.setMetricName("weightedRecall").evaluate(predictions)
    f1 = multi_evaluator.setMetricName("f1").evaluate(predictions)   

StatementMeta(, 2a03b64c-748c-4dcf-8802-d30ae516b03a, 52, Finished, Available, Finished)

In [51]:
# LOG DES PARAMÈTRES
mlflow.log_param("numTrees", num_trees)
mlflow.log_param("maxDepth", max_depth)
    
# LOG DES MÉTRIQUES
mlflow.log_metric("auc", auc)
mlflow.log_metric("accuracy", accuracy)
mlflow.log_metric("precision", precision)
mlflow.log_metric("recall", recall)
mlflow.log_metric("f1_score", f1)

StatementMeta(, 2a03b64c-748c-4dcf-8802-d30ae516b03a, 53, Finished, Available, Finished)

In [52]:
# SAUVEGARDE DU MODÈLE 
# Cela enregistre le modèle dans l'artefact de l'expérience MLflow
mlflow.spark.log_model(model, "random_forest_shopping_model")
    
print("Métriques enregistrées et modèle sauvegardé dans MLflow.")
print(f"Accuracy: {accuracy:.2%} | AUC: {auc:.4f} | F1: {f1_score:.4f}")

StatementMeta(, 2a03b64c-748c-4dcf-8802-d30ae516b03a, 54, Finished, Available, Finished)



Métriques enregistrées et modèle sauvegardé dans MLflow.
Accuracy: 89.37% | AUC: 0.9277 | F1: 0.8895


In [53]:
# Importance des variables 
importance = model.featureImportances.toArray()
feat_importance = pd.DataFrame({'Feature': feature_cols, 'Importance': importance}).sort_values(by='Importance', ascending=False)
display(feat_importance)

StatementMeta(, 2a03b64c-748c-4dcf-8802-d30ae516b03a, 55, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a2fb9e1d-5e14-4f74-8b40-a1d0b2771790)