## Nettoyage de données

In [39]:
from pyspark.sql import functions as F
from modules.spark import spark

In [43]:
# Load the cleaned dataset: the first row of each file is treated as the header
# and column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../data/processed/data-cleaned')
)

In [3]:
# List the column names of the loaded DataFrame.
df.columns

['Type',
 'ShippingMode',
 'LateDeliveryRisk',
 'CategoryName',
 'CustomerSegment',
 'OrderItemTotal',
 'OrderRegion',
 'ShippingMonthName']

### Gérer les valeurs manquantes.

Déjà fait.

### Équilibrer les classes.

In [46]:
# Total number of rows in the loaded DataFrame, before any class balancing.
df.count()

170912

In [47]:
# Display the number of rows for each value of the LateDeliveryRisk label.
df.groupBy('LateDeliveryRisk').count().show()

+----------------+-----+
|LateDeliveryRisk|count|
+----------------+-----+
|               1|97946|
|               0|72966|
+----------------+-----+



In [48]:
# Balance the two LateDeliveryRisk classes by randomly undersampling class 1
# down to roughly the size of class 0.

# Fixed seed so the random undersampling selects the same rows on every run.
SAMPLING_SEED = 42

class_0 = df.filter(F.col("LateDeliveryRisk") == 0)
class_1 = df.filter(F.col("LateDeliveryRisk") == 1)

n_class_0 = class_0.count()
n_class_1 = class_1.count()

# Fraction of class 1 rows to keep. This assumes class 1 is the larger class
# (ratio <= 1): a fraction above 1 is not valid when sampling without replacement.
ratio = n_class_0 / n_class_1

# sample() does not guarantee an exact row count, so the sampled class 1 is only
# approximately the same size as class 0.
new_class_1 = class_1.sample(withReplacement=False, fraction=ratio, seed=SAMPLING_SEED)

# Balanced dataset: all class 0 rows followed by the sampled class 1 rows.
df_equilibred = class_0.union(new_class_1)


In [52]:
# Check the class balance after undersampling. The counts get their own names so
# that `new_class_1`, the sampled DataFrame built in the balancing cell, is not
# rebound to an integer.
n_balanced_class_0 = df_equilibred.filter(F.col("LateDeliveryRisk") == 0).count()
n_balanced_class_1 = df_equilibred.filter(F.col("LateDeliveryRisk") == 1).count()

print(n_balanced_class_0)
print(n_balanced_class_1)

72966
73179


In [51]:
# List the column names of the balanced DataFrame.
df_equilibred.columns

['Type',
 'ShippingMode',
 'LateDeliveryRisk',
 'CategoryName',
 'CustomerSegment',
 'OrderItemTotal',
 'OrderRegion',
 'ShippingMonthName']

In [None]:
# Persist the balanced dataset as CSV with a header row.
# mode="overwrite" replaces any previous export, so re-running the notebook does
# not fail because the output directory already exists.
df_equilibred.write.csv('../data/processed/data-balanced', header=True, mode="overwrite")

### Encoder les variables catégorielles.

- Encodage avec StringIndexer

In [55]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Categorical source column -> name of the index column created for it.
index_col_by_source = {
    "Type": "Type_index",
    "ShippingMode": "ShippingMode_index",
    "CategoryName": "CategoryName_index",
    "CustomerSegment": "CustomerSegment_index",
    "OrderRegion": "OrderRegion_index",
    "ShippingMonthName": "ShippingMonthName_index",
}

# One StringIndexer stage per categorical column, in the order listed above.
indexers = [
    StringIndexer(inputCol=source_col, outputCol=index_col)
    for source_col, index_col in index_col_by_source.items()
]

pipeline = Pipeline(stages=indexers)

# Fit the indexers on the balanced data, then add the *_index columns to it.
indexer_model = pipeline.fit(df_equilibred)
df_indexed = indexer_model.transform(df_equilibred)
df_indexed.show()

+-------+--------------+----------------+---------------+---------------+--------------+---------------+-----------------+----------+------------------+------------------+---------------------+-----------------+-----------------------+
|   Type|  ShippingMode|LateDeliveryRisk|   CategoryName|CustomerSegment|OrderItemTotal|    OrderRegion|ShippingMonthName|Type_index|ShippingMode_index|CategoryName_index|CustomerSegment_index|OrderRegion_index|ShippingMonthName_index|
+-------+--------------+----------------+---------------+---------------+--------------+---------------+-----------------+----------+------------------+------------------+---------------------+-----------------+-----------------------+
|PAYMENT|Standard Class|               0|  Shop By Sport|       Consumer|   179.9600067|Central America|              May|       2.0|               0.0|               8.0|                  0.0|              0.0|                    4.0|
|PAYMENT|Standard Class|               0|Women's Apparel

In [56]:
# Persist the indexed dataset as CSV with a header row.
# mode="overwrite" replaces any previous export, so re-running the notebook does
# not fail because the output directory already exists.
df_indexed.write.csv('../data/processed/data-indexed', header=True, mode="overwrite")

- Encodage One-Hot

In [57]:
from pyspark.ml.feature import OneHotEncoder

# (index column, one-hot vector column) pairs: each *_index column is encoded
# into the *_vec column listed next to it.
index_to_vec = [
    ("Type_index", "Type_vec"),
    ("ShippingMode_index", "ShippingMode_vec"),
    ("CategoryName_index", "CategoryName_vec"),
    ("CustomerSegment_index", "CustomerSegment_vec"),
    ("OrderRegion_index", "OrderRegion_vec"),
    ("ShippingMonthName_index", "ShippingMonthName_vec"),
]
index_cols = [pair[0] for pair in index_to_vec]
vec_cols = [pair[1] for pair in index_to_vec]

encoder = OneHotEncoder(inputCols=index_cols, outputCols=vec_cols)

pipeline = Pipeline(stages=[encoder])

# Fit the encoder on the indexed data, then add the *_vec columns to it.
encoder_model = pipeline.fit(df_indexed)
df_encoded = encoder_model.transform(df_indexed)
df_encoded.show(truncate=False)

+-------+--------------+----------------+---------------+---------------+--------------+---------------+-----------------+----------+------------------+------------------+---------------------+-----------------+-----------------------+-------------+----------------+----------------+-------------------+---------------+---------------------+
|Type   |ShippingMode  |LateDeliveryRisk|CategoryName   |CustomerSegment|OrderItemTotal|OrderRegion    |ShippingMonthName|Type_index|ShippingMode_index|CategoryName_index|CustomerSegment_index|OrderRegion_index|ShippingMonthName_index|Type_vec     |ShippingMode_vec|CategoryName_vec|CustomerSegment_vec|OrderRegion_vec|ShippingMonthName_vec|
+-------+--------------+----------------+---------------+---------------+--------------+---------------+-----------------+----------+------------------+------------------+---------------------+-----------------+-----------------------+-------------+----------------+----------------+-------------------+-------------

### Normaliser/scaler les features numériques.

In [59]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Columns packed into the single "features" vector: the numeric order total
# followed by the six one-hot encoded vectors.
feature_inputs = [
    "OrderItemTotal",
    "Type_vec",
    "ShippingMode_vec",
    "CategoryName_vec",
    "CustomerSegment_vec",
    "OrderRegion_vec",
    "ShippingMonthName_vec",
]

assembler = VectorAssembler(inputCols=feature_inputs, outputCol="features")
df_assembled = assembler.transform(df_encoded)

# Standardise every component of "features" (centre on the mean, scale to unit
# standard deviation) into "scaled_features".
# NOTE(review): this also standardises the one-hot vectors, not only
# OrderItemTotal, and withMean=True yields dense output vectors — confirm this
# is intended.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaled_features",
)

scaler_model = scaler.fit(df_assembled)
df_scaled = scaler_model.transform(df_assembled)