## Construction d'un pipeline MLlib

Les colonnes sélectionnées pour ce pipeline sont :
- LateDeliveryRisk
- Type
- ShippingMode
- CategoryName
- CustomerSegment
- OrderItemTotal
- OrderRegion
- ShippingMonthName

In [2]:
from pyspark.sql import functions as F
from modules.spark import spark

In [3]:
# Read the CSV dataset: the first row supplies the column names and Spark
# infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/processed/data-balanced")
)


### Assembler le pipeline MLlib.

In [4]:
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# Categorical columns, listed in the order their encoded vectors enter the
# assembled feature vector.
categorical_cols = [
    "Type",
    "ShippingMode",
    "CategoryName",
    "CustomerSegment",
    "OrderRegion",
    "ShippingMonthName",
]
index_cols = [f"{name}_index" for name in categorical_cols]
vector_cols = [f"{name}_vec" for name in categorical_cols]

# One StringIndexer per categorical column: <col> -> <col>_index.
indexers = [
    StringIndexer(inputCol=name, outputCol=indexed)
    for name, indexed in zip(categorical_cols, index_cols)
]

# A single encoder turns every <col>_index into a one-hot <col>_vec.
encoder = OneHotEncoder(inputCols=index_cols, outputCols=vector_cols)

# The numeric column comes first, followed by the one-hot vectors.
assembler = VectorAssembler(
    inputCols=["OrderItemTotal", *vector_cols],
    outputCol="features",
)

# Standardise "features" (centred, scaled to unit standard deviation) into
# a separate "scaled_features" column; "features" itself is left as is.
scaler = StandardScaler(
    withStd=True,
    withMean=True,
    outputCol="scaled_features",
    inputCol="features",
)




- Division des données

In [5]:
# 80 % / 20 % train/test split; the explicit seed keeps the split repeatable.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=42)

### Entraînement des modèles

- Random Forest

In [5]:

# Baseline random forest with default hyper-parameters.
# The seed is set explicitly: a random forest draws random row and feature
# samples, so without it the metrics change from one kernel session to the next.
rf = RandomForestClassifier(
    labelCol="LateDeliveryRisk",
    featuresCol="features",
    seed=42,
)

# Pipeline order: index -> one-hot -> assemble -> scale -> classify.
# NOTE(review): the scaler stage writes "scaled_features", but this estimator
# reads "features", so the scaled column is computed and left unused here.
rf_pipeline = Pipeline(stages=indexers + [encoder, assembler, scaler, rf])

# Fit on the training split only, then score the held-out split.
rf_model = rf_pipeline.fit(train_df)
rf_predictions = rf_model.transform(test_df)

- Logistic Regression

In [6]:
from pyspark.ml.classification import LogisticRegression

# Baseline logistic regression with default hyper-parameters.
# NOTE(review): the scaler stage writes "scaled_features", but this estimator
# reads "features", so the scaled column is computed and left unused here.
lr = LogisticRegression(featuresCol="features", labelCol="LateDeliveryRisk")

# Same preprocessing stages as the other models, with the classifier last.
lr_stages = [*indexers, encoder, assembler, scaler, lr]
lr_pipeline = Pipeline(stages=lr_stages)

# Fit on the training split only, then score the held-out split.
lr_model = lr_pipeline.fit(train_df)
lr_predictions = lr_model.transform(test_df)


- GBT

In [7]:
from pyspark.ml.classification import GBTClassifier

# Baseline gradient-boosted trees with default hyper-parameters.
# NOTE(review): the scaler stage writes "scaled_features", but this estimator
# reads "features", so the scaled column is computed and left unused here.
gb = GBTClassifier(featuresCol="features", labelCol="LateDeliveryRisk")

# Same preprocessing stages as the other models, with the classifier last.
gb_stages = [*indexers, encoder, assembler, scaler, gb]
gb_pipeline = Pipeline(stages=gb_stages)

# Fit on the training split only, then score the held-out split.
gb_model = gb_pipeline.fit(train_df)
gb_predictions = gb_model.transform(test_df)

### Évaluation des performances

- Random Forest

In [8]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve for the random-forest predictions.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",         # "areaUnderPR" is the alternative
    rawPredictionCol="rawPrediction",  # column read for the raw scores
    labelCol="LateDeliveryRisk",       # ground-truth label
)

auc = evaluator.evaluate(rf_predictions)
print(f"AUC = {auc:.3f}")

AUC = 0.739


In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy and F1 of the random-forest predictions; both evaluators read the
# same label and prediction columns.
metric_kwargs = dict(labelCol="LateDeliveryRisk", predictionCol="prediction")

acc_eval = MulticlassClassificationEvaluator(metricName="accuracy", **metric_kwargs)
f1_eval = MulticlassClassificationEvaluator(metricName="f1", **metric_kwargs)

accuracy = acc_eval.evaluate(rf_predictions)
f1 = f1_eval.evaluate(rf_predictions)

print(f"Accuracy = {accuracy:.3f}")
print(f"F1-score = {f1:.3f}")

Accuracy = 0.712
F1-score = 0.707


- Logistic Regression

In [10]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve for the logistic-regression predictions.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",         # "areaUnderPR" is the alternative
    rawPredictionCol="rawPrediction",  # column read for the raw scores
    labelCol="LateDeliveryRisk",       # ground-truth label
)

auc = evaluator.evaluate(lr_predictions)
print(f"AUC = {auc:.3f}")

AUC = 0.742


In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy and F1 of the logistic-regression predictions; both evaluators
# read the same label and prediction columns.
metric_kwargs = dict(labelCol="LateDeliveryRisk", predictionCol="prediction")

acc_eval = MulticlassClassificationEvaluator(metricName="accuracy", **metric_kwargs)
f1_eval = MulticlassClassificationEvaluator(metricName="f1", **metric_kwargs)

accuracy = acc_eval.evaluate(lr_predictions)
f1 = f1_eval.evaluate(lr_predictions)

print(f"Accuracy = {accuracy:.3f}")
print(f"F1-score = {f1:.3f}")

Accuracy = 0.724
F1-score = 0.715


- GBT

In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve for the gradient-boosted-trees predictions.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",         # "areaUnderPR" is the alternative
    rawPredictionCol="rawPrediction",  # column read for the raw scores
    labelCol="LateDeliveryRisk",       # ground-truth label
)

auc = evaluator.evaluate(gb_predictions)
print(f"AUC = {auc:.3f}")

AUC = 0.748


In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy and F1 of the gradient-boosted-trees predictions; both evaluators
# read the same label and prediction columns.
metric_kwargs = dict(labelCol="LateDeliveryRisk", predictionCol="prediction")

acc_eval = MulticlassClassificationEvaluator(metricName="accuracy", **metric_kwargs)
f1_eval = MulticlassClassificationEvaluator(metricName="f1", **metric_kwargs)

accuracy = acc_eval.evaluate(gb_predictions)
f1 = f1_eval.evaluate(gb_predictions)

print(f"Accuracy = {accuracy:.3f}")
print(f"F1-score = {f1:.3f}")

Accuracy = 0.724
F1-score = 0.715


#### Cross Validation

In [7]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

- Logistic Regression

In [15]:
# from pyspark.ml.classification import LogisticRegression

# lr = LogisticRegression(labelCol="LateDeliveryRisk", featuresCol="scaled_features")

# lr_pipeline = Pipeline(stages=indexers + [encoder, assembler, scaler, lr])

# paramGrid = (
#     ParamGridBuilder()
#     .addGrid(lr.regParam, [0, 0.001, 0.005])
#     .addGrid(lr.elasticNetParam, [0, 0.01, 0.05])
#     .addGrid(lr.maxIter, [15, 25, 35])
#     .build()
# )

# evaluator = BinaryClassificationEvaluator(
#     labelCol="LateDeliveryRisk", metricName="areaUnderROC"
# )
# acc_eval = MulticlassClassificationEvaluator(
#     labelCol="LateDeliveryRisk", predictionCol="prediction", metricName="accuracy"
# )

# cv = CrossValidator(
#     estimator=lr_pipeline,
#     estimatorParamMaps=paramGrid,
#     evaluator=evaluator,
#     numFolds=5,       # 5-fold CV
#     parallelism=2     # run folds in parallel
# )

# cvModel = cv.fit(train_df)

# preds = cvModel.transform(test_df)
# auc = evaluator.evaluate(preds)
# accuracy = acc_eval.evaluate(preds)
# print(f"AUC on test set: {auc:.3f}")
# print(f"Accuracy on test set: {accuracy:.3f}")

# lr_bestPipelineModel = cvModel.bestModel
# bestGBModel = lr_bestPipelineModel.stages[-1]
# print("Best regParam:", bestGBModel.getRegParam())
# print("Best elasticNetParam:", bestGBModel.getElasticNetParam())
# print("Best maxIter:", bestGBModel.getMaxIter())


# Best regParam: 0.005
# Best elasticNetParam: 0.0
# Best maxIter: 25

- Random Forest

In [16]:
# from pyspark.ml.classification import RandomForestClassifier

# rf = RandomForestClassifier(labelCol="LateDeliveryRisk", featuresCol="scaled_features")

# rf_pipeline = Pipeline(stages=indexers + [encoder, assembler, scaler, rf])

# paramGrid = (
#     ParamGridBuilder()
#     .addGrid(rf.numTrees, [500, 600, 700])
#     .addGrid(rf.maxDepth, [9, 12, 15])
#     .build()
# )

# evaluator = BinaryClassificationEvaluator(
#     labelCol="LateDeliveryRisk", metricName="areaUnderROC"
# )
# acc_eval = MulticlassClassificationEvaluator(
#     labelCol="LateDeliveryRisk", predictionCol="prediction", metricName="accuracy"
# )

# cv = CrossValidator(
#     estimator=rf_pipeline,
#     estimatorParamMaps=paramGrid,
#     evaluator=evaluator,
#     numFolds=5,       # 5-fold CV
#     parallelism=2     # run folds in parallel
# )

# cvModel = cv.fit(train_df)

# preds = cvModel.transform(test_df)
# auc = evaluator.evaluate(preds)
# accuracy = acc_eval.evaluate(preds)
# print(f"AUC on test set: {auc:.3f}")
# print(f"Accuracy on test set: {accuracy:.3f}")

# rf_bestPipelineModel = cvModel.bestModel
# bestGBModel = rf_bestPipelineModel.stages[-1]
# print("Best numTrees:", bestGBModel.getNumTrees)
# print("Best maxDepth:", bestGBModel.getMaxDepth())


- GBT

In [None]:
from pyspark.ml.classification import GBTClassifier

# Hyper-parameter search for the gradient-boosted trees, this time trained on
# the standardised "scaled_features" column.
# The seed is fixed because the grid uses subsamplingRate < 1, which makes
# each tree train on a random sample of the rows.
gb = GBTClassifier(
    labelCol="LateDeliveryRisk",
    featuresCol="scaled_features",
    seed=42,
)

gb_pipeline = Pipeline(stages=indexers + [encoder, assembler, scaler, gb])

# 3 x 3 x 3 x 2 = 54 candidate combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gb.maxIter, [20, 50, 100])
    .addGrid(gb.maxDepth, [5, 15, 25])
    .addGrid(gb.minInstancesPerNode, [3, 5, 7])
    .addGrid(gb.subsamplingRate, [0.4, 0.8])
    .build()
)

# AUC drives model selection; accuracy is only reported on the test split.
evaluator = BinaryClassificationEvaluator(
    labelCol="LateDeliveryRisk", metricName="areaUnderROC"
)
acc_eval = MulticlassClassificationEvaluator(
    labelCol="LateDeliveryRisk", predictionCol="prediction", metricName="accuracy"
)

cv = CrossValidator(
    estimator=gb_pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=2,       # 2-fold CV
    parallelism=1,    # fit one candidate model at a time
    seed=42,          # repeatable fold assignment
)

# The search only ever sees the training split.
cvModel = cv.fit(train_df)

# Score the selected model on the held-out split.
preds = cvModel.transform(test_df)
auc = evaluator.evaluate(preds)
accuracy = acc_eval.evaluate(preds)
print(f"AUC on test set: {auc:.3f}")
print(f"Accuracy on test set: {accuracy:.3f}")

# The classifier is the last stage of the best fitted pipeline.
gb_bestPipelineModel = cvModel.bestModel
bestGBModel = gb_bestPipelineModel.stages[-1]
print("Best maxIter:", bestGBModel.getMaxIter())
print("Best maxDepth:", bestGBModel.getMaxDepth())
print("Best minInstancesPerNode:", bestGBModel.getMinInstancesPerNode())
print("Best subSamplingRate:", bestGBModel.getSubsamplingRate())

On constate que le GBTClassifier obtient la meilleure AUC (0.748) et qu'il égale la régression logistique en Accuracy (0.724) et en F1-score (0.715) ; c'est donc le modèle retenu.

- Sauvegardez le modèle (pipeline)

In [14]:
# Persist the fitted GBT pipeline, replacing any model already at that path.
# NOTE(review): this saves gb_model (the pipeline fitted with default
# hyper-parameters), not the cross-validated cvModel.bestModel.
model_writer = gb_model.write().overwrite()
model_writer.save('../models/model_1')