## Construction d'un pipeline MLlib

In [1]:
from pyspark.sql import functions as F
from modules.spark import spark

In [2]:
df = spark.read.csv("../data/processed/data-balanced", header=True, inferSchema=True)


### Assembler le pipeline MLlib.

In [3]:
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

indexers = [
    StringIndexer(inputCol="Type", outputCol="Type_index"),
    StringIndexer(inputCol="ShippingMode", outputCol="ShippingMode_index"),
    StringIndexer(inputCol="CategoryName", outputCol="CategoryName_index"),
    StringIndexer(inputCol="CustomerSegment", outputCol="CustomerSegment_index"),
    StringIndexer(inputCol="OrderRegion", outputCol="OrderRegion_index"),
    StringIndexer(inputCol="ShippingMonthName", outputCol="ShippingMonthName_index"),
]
encoder = OneHotEncoder(
    inputCols=['Type_index', 'ShippingMode_index', 'CategoryName_index', 'CustomerSegment_index', 'OrderRegion_index', 'ShippingMonthName_index'],
    outputCols=['Type_vec', 'ShippingMode_vec', 'CategoryName_vec', 'CustomerSegment_vec', 'OrderRegion_vec', 'ShippingMonthName_vec']
)

assembler = VectorAssembler(
    inputCols=['OrderItemTotal', 'Type_vec', 'ShippingMode_vec', 'CategoryName_vec', 'CustomerSegment_vec', 'OrderRegion_vec', 'ShippingMonthName_vec'],
    outputCol='features'
)



- Division de données

In [4]:
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

### Entrainement des modéles

- Random Forest

In [5]:

rf = RandomForestClassifier(labelCol="LateDeliveryRisk", featuresCol="features")

rf_pipeline = Pipeline(stages=indexers + [encoder, assembler, rf])

rf_model = rf_pipeline.fit(train_df)
rf_predictions = rf_model.transform(test_df)

- Logistic Regression

In [6]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="LateDeliveryRisk", featuresCol="features")

lr_pipeline = Pipeline(stages=indexers + [encoder, assembler, lr])

lr_model = lr_pipeline.fit(train_df)
lr_predictions = lr_model.transform(test_df)


- GBT

In [7]:
from pyspark.ml.classification import GBTClassifier

gb = GBTClassifier(labelCol="LateDeliveryRisk", featuresCol="features")

gb_pipeline = Pipeline(stages=indexers + [encoder, assembler, gb])

gb_model = gb_pipeline.fit(train_df)
gb_predictions = gb_model.transform(test_df)

### Evaluation de performance

- Random Forest

In [8]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# évaluer les prédictions
evaluator = BinaryClassificationEvaluator(
    labelCol="LateDeliveryRisk",     # vraie étiquette
    rawPredictionCol="rawPrediction",  # par défaut
    metricName="areaUnderROC"  # ou "areaUnderPR"
)

auc = evaluator.evaluate(rf_predictions)
print(f"AUC = {auc:.3f}")

AUC = 0.743


In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# accuracy
acc_eval = MulticlassClassificationEvaluator(
    labelCol="LateDeliveryRisk", predictionCol="prediction", metricName="accuracy"
)
accuracy = acc_eval.evaluate(rf_predictions)

# F1-score
f1_eval = MulticlassClassificationEvaluator(
    labelCol="LateDeliveryRisk", predictionCol="prediction", metricName="f1"
)
f1 = f1_eval.evaluate(rf_predictions)

print(f"Accuracy = {accuracy:.3f}")
print(f"F1-score = {f1:.3f}")

Accuracy = 0.712
F1-score = 0.707


- Logistic Regression

In [10]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# évaluer les prédictions
evaluator = BinaryClassificationEvaluator(
    labelCol="LateDeliveryRisk",     # vraie étiquette
    rawPredictionCol="rawPrediction",  # par défaut
    metricName="areaUnderROC"  # ou "areaUnderPR"
)

auc = evaluator.evaluate(lr_predictions)
print(f"AUC = {auc:.3f}")

AUC = 0.742


In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# accuracy
acc_eval = MulticlassClassificationEvaluator(
    labelCol="LateDeliveryRisk", predictionCol="prediction", metricName="accuracy"
)
accuracy = acc_eval.evaluate(lr_predictions)

# F1-score
f1_eval = MulticlassClassificationEvaluator(
    labelCol="LateDeliveryRisk", predictionCol="prediction", metricName="f1"
)
f1 = f1_eval.evaluate(lr_predictions)

print(f"Accuracy = {accuracy:.3f}")
print(f"F1-score = {f1:.3f}")

Accuracy = 0.724
F1-score = 0.715


- GBT

In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# évaluer les prédictions
evaluator = BinaryClassificationEvaluator(
    labelCol="LateDeliveryRisk",     # vraie étiquette
    rawPredictionCol="rawPrediction",  # par défaut
    metricName="areaUnderROC"  # ou "areaUnderPR"
)

auc = evaluator.evaluate(gb_predictions)
print(f"AUC = {auc:.3f}")

AUC = 0.749


In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# accuracy
acc_eval = MulticlassClassificationEvaluator(
    labelCol="LateDeliveryRisk", predictionCol="prediction", metricName="accuracy"
)
accuracy = acc_eval.evaluate(gb_predictions)

# F1-score
f1_eval = MulticlassClassificationEvaluator(
    labelCol="LateDeliveryRisk", predictionCol="prediction", metricName="f1"
)
f1 = f1_eval.evaluate(gb_predictions)

print(f"Accuracy = {accuracy:.3f}")
print(f"F1-score = {f1:.3f}")

Accuracy = 0.724
F1-score = 0.715


Alors, on constate que le GBTClassifier est le modèle dont les valeurs d’AUC, d’Accuracy et de F1-score sont les meilleures.

- Sauvegardez le modèle (pipeline)

In [14]:
gb_model.write().overwrite().save('../models/model_1')