In [1]:
from pyspark.sql import functions as F
from modules.spark import spark

In [3]:
# Load the cleaned dataset produced upstream; the first row is the header and
# column types are inferred by Spark rather than declared explicitly.
df = spark.read.options(header=True, inferSchema=True).csv('../data/processed/data-cleaned')

### Gérer les valeurs manquantes.

In [8]:
# Columns inspected for missing values, in the order they are reported.
null_check_cols = [
    "Type",
    "ProductPrice",
    "OrderItemQuantity",
    "ShippingMode",
    "LateDeliveryRisk",
    "CategoryName",
    "CustomerSegment",
    "OrderRegion",
    "ShippingMonthName",
]

# Count the nulls of every column in ONE aggregation job instead of launching
# a separate filter().count() action (and a full re-scan of the CSV) per column.
# F.when(...) without otherwise() yields NULL for non-matching rows, and
# F.count() ignores NULLs, so each aggregate is exactly the number of null rows.
null_counts = df.agg(
    *[
        F.count(F.when(F.col(col_name).isNull(), 1)).alias(col_name)
        for col_name in null_check_cols
    ]
).first().asDict()

# Keep the original per-column variables so any cell that reads them still works.
(
    type_nulls,
    price_nulls,
    quantity_nulls,
    mode_nulls,
    risk_nulls,
    category_nulls,
    segment_nulls,
    region_nulls,
    month_nulls,
) = (null_counts[col_name] for col_name in null_check_cols)

# Same "<column> <count>" report as before, one line per column.
for col_name in null_check_cols:
    print(col_name, null_counts[col_name])

Type 0
ProductPrice 0
OrderItemQuantity 0
ShippingMode 0
LateDeliveryRisk 0
CategoryName 0
CustomerSegment 0
OrderRegion 0
ShippingMonthName 0


### Encoder les variables catégorielles.

- Encodage avec StringIndexer

In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

# Categorical (string) columns to encode; each one gets a numeric
# "<column>_index" companion column.
categorical_cols = [
    "Type",
    "ShippingMode",
    "CategoryName",
    "CustomerSegment",
    "OrderRegion",
    "ShippingMonthName",
]

# One StringIndexer stage per categorical column, in the order listed above.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index")
    for name in categorical_cols
]

# Fit all indexers together on the full frame, then apply them.
pipeline = Pipeline(stages=indexers)
indexer_model = pipeline.fit(df)
df_indexed = indexer_model.transform(df)

# Preview the frame with the new *_index columns appended.
df_indexed.show()

+--------+--------------+------------+----------------+--------------+---------------+-----------------+--------------+-----------------+----------+----------+------------------+------------------+---------------------+-----------------+-----------------------+
|    Type|  ShippingMode|ProductPrice|LateDeliveryRisk|  CategoryName|CustomerSegment|OrderItemQuantity|   OrderRegion|ShippingMonthName|is_outlier|Type_index|ShippingMode_index|CategoryName_index|CustomerSegment_index|OrderRegion_index|ShippingMonthName_index|
+--------+--------------+------------+----------------+--------------+---------------+-----------------+--------------+-----------------+----------+----------+------------------+------------------+---------------------+-----------------+-----------------------+
|   DEBIT|Standard Class|      327.75|               0|Sporting Goods|       Consumer|                1|Southeast Asia|         February|         0|       0.0|               0.0|              32.0|                 

- Encodage One-Hot

In [16]:
from pyspark.ml.feature import OneHotEncoder

# Base names of the indexed categorical columns: "<base>_index" is read and
# "<base>_vec" is written for each of them.
one_hot_bases = [
    "Type",
    "ShippingMode",
    "CategoryName",
    "CustomerSegment",
    "OrderRegion",
    "ShippingMonthName",
]

encoder = OneHotEncoder(
    inputCols=[f"{base}_index" for base in one_hot_bases],
    outputCols=[f"{base}_vec" for base in one_hot_bases],
)

# NOTE: `pipeline` is rebound here; from this cell on it refers to the
# one-hot pipeline, not the StringIndexer pipeline built earlier.
pipeline = Pipeline(stages=[encoder])
encoder_model = pipeline.fit(df_indexed)
df_encoded = encoder_model.transform(df_indexed)

# Show full (untruncated) cell contents so the encoded vectors are readable.
df_encoded.show(truncate=False)

+--------+--------------+------------+----------------+--------------+---------------+-----------------+--------------+-----------------+----------+----------+------------------+------------------+---------------------+-----------------+-----------------------+-------------+----------------+----------------+-------------------+---------------+---------------------+
|Type    |ShippingMode  |ProductPrice|LateDeliveryRisk|CategoryName  |CustomerSegment|OrderItemQuantity|OrderRegion   |ShippingMonthName|is_outlier|Type_index|ShippingMode_index|CategoryName_index|CustomerSegment_index|OrderRegion_index|ShippingMonthName_index|Type_vec     |ShippingMode_vec|CategoryName_vec|CustomerSegment_vec|OrderRegion_vec|ShippingMonthName_vec|
+--------+--------------+------------+----------------+--------------+---------------+-----------------+--------------+-----------------+----------+----------+------------------+------------------+---------------------+-----------------+-----------------------+---

### Normaliser/scaler les features numériques.

In [18]:
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Inputs of the feature vector: the two numeric columns first, followed by the
# one-hot encoded categorical vectors (order matters for the vector layout).
numeric_inputs = ["ProductPrice", "OrderItemQuantity"]
one_hot_inputs = [
    "Type_vec",
    "ShippingMode_vec",
    "CategoryName_vec",
    "CustomerSegment_vec",
    "OrderRegion_vec",
    "ShippingMonthName_vec",
]

# Concatenate every input into a single "features" vector column.
assembler = VectorAssembler(
    inputCols=numeric_inputs + one_hot_inputs,
    outputCol="features",
)
df_assembled = assembler.transform(df_encoded)

# Standardize the assembled vector (centering and unit-variance scaling).
# NOTE(review): the scaler is applied to the whole vector, one-hot columns
# included, and withMean=True is documented to produce dense output --
# confirm this is intended rather than scaling only the numeric columns.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaled_features",
)

# The scaling statistics are learned from, and applied to, the same frame.
scaler_model = scaler.fit(df_assembled)
df_scaled = scaler_model.transform(df_assembled)

### Équilibrer les classes.

In [19]:
# Row count per target class, to check how imbalanced the data is
# before any resampling.
risk_distribution = df_scaled.groupBy("LateDeliveryRisk").count()
risk_distribution.show()

+----------------+-----+
|LateDeliveryRisk|count|
+----------------+-----+
|               1|97843|
|               0|80628|
+----------------+-----+



In [23]:
# Fixed seed so the random under-sampling gives the same rows on every run.
SAMPLING_SEED = 42

# Split the scaled frame by target class.
class_0 = df_scaled.filter(F.col("LateDeliveryRisk") == 0)
class_1 = df_scaled.filter(F.col("LateDeliveryRisk") == 1)

count_0 = class_0.count()
count_1 = class_1.count()
if count_0 == 0 or count_1 == 0:
    # Balancing is meaningless (and the ratio undefined) with a missing class.
    raise ValueError(
        f"Cannot balance classes: class 0 has {count_0} rows, class 1 has {count_1} rows"
    )

# Fraction of the majority class to keep so it roughly matches the minority
# class. Always <= 1, which sampling without replacement requires.
ratio = min(count_0, count_1) / max(count_0, count_1)

# Under-sample whichever class is larger; the other one is kept untouched.
# sample() is probabilistic, so the resulting sizes are close but not
# guaranteed to be exactly equal.
if count_1 >= count_0:
    balanced_class_0 = class_0
    new_class_1 = class_1.sample(withReplacement=False, fraction=ratio, seed=SAMPLING_SEED)
else:
    balanced_class_0 = class_0.sample(withReplacement=False, fraction=ratio, seed=SAMPLING_SEED)
    new_class_1 = class_1

# Class 0 rows first, then class 1 rows, as before.
df_equilibred = balanced_class_0.union(new_class_1)


In [24]:
# Verify the balance with a single groupBy job, and without rebinding
# `new_class_1` (which holds the sampled DataFrame) to an integer.
balanced_counts = {
    row["LateDeliveryRisk"]: row["count"]
    for row in df_equilibred.groupBy("LateDeliveryRisk").count().collect()
}

# Same output as before: class 0 count, then class 1 count (0 if a class is absent).
print(balanced_counts.get(0, 0))
print(balanced_counts.get(1, 0))

80628
80844


In [31]:
# List every column of the balanced frame (original, *_index, *_vec and the
# assembled/scaled vectors) to decide which ones to export.
df_equilibred.columns

['Type',
 'ShippingMode',
 'ProductPrice',
 'LateDeliveryRisk',
 'CategoryName',
 'CustomerSegment',
 'OrderItemQuantity',
 'OrderRegion',
 'ShippingMonthName',
 'is_outlier',
 'Type_index',
 'ShippingMode_index',
 'CategoryName_index',
 'CustomerSegment_index',
 'OrderRegion_index',
 'ShippingMonthName_index',
 'Type_vec',
 'ShippingMode_vec',
 'CategoryName_vec',
 'CustomerSegment_vec',
 'OrderRegion_vec',
 'ShippingMonthName_vec',
 'features',
 'scaled_features']

- Sauvegarder le résultat

In [32]:
# Columns written to disk: the original columns plus their *_index encodings.
# The vector columns (*_vec, features, scaled_features) are left out --
# presumably because the CSV format cannot store vector-typed columns.
select_cols = [
    'Type',
    'ShippingMode',
    'ProductPrice',
    'LateDeliveryRisk',
    'CategoryName',
    'CustomerSegment',
    'OrderItemQuantity',
    'OrderRegion',
    'ShippingMonthName',
    'is_outlier',
    'Type_index',
    'ShippingMode_index',
    'CategoryName_index',
    'CustomerSegment_index',
    'OrderRegion_index',
    'ShippingMonthName_index',
]

# mode("overwrite") keeps the notebook re-runnable: the default save mode
# raises an error as soon as the output directory already exists.
(
    df_equilibred
    .select(select_cols)
    .write
    .mode("overwrite")
    .csv('../data/processed/data-balanced', header=True)
)