## Chargement et préparation des données.

### Initialiser Spark

In [1]:
from modules.spark import spark

In [2]:
from pyspark.sql import functions as F

- Chargement de données

In [3]:
df = spark.read.csv('../data/raw/DataCoSupplyChainDataset.csv', header=True, inferSchema=True)

### Effectuer un nettoyage initial.

In [98]:
df.printSchema()

root
 |-- Type: string (nullable = true)
 |-- Days for shipping (real): integer (nullable = true)
 |-- Days for shipment (scheduled): integer (nullable = true)
 |-- Benefit per order: double (nullable = true)
 |-- Sales per customer: double (nullable = true)
 |-- Delivery Status: string (nullable = true)
 |-- Late_delivery_risk: integer (nullable = true)
 |-- Category Id: integer (nullable = true)
 |-- Category Name: string (nullable = true)
 |-- Customer City: string (nullable = true)
 |-- Customer Country: string (nullable = true)
 |-- Customer Email: string (nullable = true)
 |-- Customer Fname: string (nullable = true)
 |-- Customer Id: integer (nullable = true)
 |-- Customer Lname: string (nullable = true)
 |-- Customer Password: string (nullable = true)
 |-- Customer Segment: string (nullable = true)
 |-- Customer State: string (nullable = true)
 |-- Customer Street: string (nullable = true)
 |-- Customer Zipcode: integer (nullable = true)
 |-- Department Id: integer (nullable = 

- Les dimensions de dataframes

In [99]:
print("Colonnes :", len(df.columns))
print("Lignes   :", df.count())

Colonnes : 53
Lignes   : 180519


- Afficher les 10 premiere lignes

In [100]:
df.show(10)

+--------+------------------------+-----------------------------+-----------------+------------------+-----------------+------------------+-----------+--------------+-------------+----------------+--------------+--------------+-----------+--------------+-----------------+----------------+--------------+--------------------+----------------+-------------+---------------+-----------+------------+------------+----------+-------------+-----------------+-----------------------+--------+----------------------+-------------------+------------------------+-------------+------------------------+-----------------------+-------------------+------+----------------+----------------------+--------------+---------------+---------------+-------------+---------------+-------------------+-------------------+--------------------+------------+-------------+--------------+--------------------------+--------------+
|    Type|Days for shipping (real)|Days for shipment (scheduled)|Benefit per order|Sales per 

- Delete Canceled Deliveries

In [101]:
df = df.filter(F.col('Delivery Status') != 'Shipping canceled')

- Extraction du nom de mois a partir de la colonne "order date (DateOrders)"

In [102]:
df = df.withColumn(
    "order date (DateOrders)",
    F.to_timestamp("order date (DateOrders)", "M/d/yyyy H:mm")
).withColumn("Order_Month_Name", F.date_format("order date (DateOrders)", "MMMM"))

- Extraction du nom de mois a partir de la colonne "shipping date (DateOrders)"

In [103]:
df = df.withColumn(
    "shipping date (DateOrders)",
    F.to_timestamp("shipping date (DateOrders)", "M/d/yyyy H:mm")
).withColumn("Shipping_Month_Name", F.date_format("shipping date (DateOrders)", "MMMM"))

- Transformer en pandas dataframe

In [104]:
pdf = df.toPandas()

pdf.isnull().sum()

Type                                  0
Days for shipping (real)              0
Days for shipment (scheduled)         0
Benefit per order                     0
Sales per customer                    0
Delivery Status                       0
Late_delivery_risk                    0
Category Id                           0
Category Name                         0
Customer City                         0
Customer Country                      0
Customer Email                        0
Customer Fname                        0
Customer Id                           0
Customer Lname                        8
Customer Password                     0
Customer Segment                      0
Customer State                        0
Customer Street                       0
Customer Zipcode                      3
Department Id                         0
Department Name                       0
Latitude                              0
Longitude                             0
Market                                0


In [105]:
# Columns that can't be used in predictions
pdf = pdf.drop("Days for shipping (real)", axis=1)
pdf = pdf.drop("Delivery Status", axis=1)
pdf = pdf.drop("Order Status", axis=1)

# Dates
pdf = pdf.drop("shipping date (DateOrders)", axis=1)
pdf = pdf.drop("order date (DateOrders)", axis=1)

# Empty Columns
pdf = pdf.drop("Product Description", axis=1)       # All are null
pdf = pdf.drop("Product Status", axis=1)            # All are zeros

# Columns with many values
pdf = pdf.drop("Order Zipcode", axis=1)
pdf = pdf.drop("Customer City", axis=1)
pdf = pdf.drop("Product Name", axis=1)
pdf = pdf.drop("Customer Street", axis=1)
pdf = pdf.drop("Customer Country", axis=1)
pdf = pdf.drop("Order State", axis=1)
pdf = pdf.drop("Order City", axis=1)
pdf = pdf.drop("Customer Zipcode", axis=1)
pdf = pdf.drop("Customer State", axis=1)
pdf = pdf.drop("Order Country", axis=1)

# Columns with no useful information
pdf = pdf.drop("Product Image", axis=1)
pdf = pdf.drop("Customer Fname", axis=1)
pdf = pdf.drop("Customer Lname", axis=1)
pdf = pdf.drop("Customer Email", axis=1)
pdf = pdf.drop("Customer Password", axis=1)

# Ids

pdf = pdf.drop("Customer Id", axis=1)
pdf = pdf.drop("Category Id", axis=1)
pdf = pdf.drop("Department Id", axis=1)
pdf = pdf.drop("Order Customer Id", axis=1)
pdf = pdf.drop("Order Id", axis=1)
pdf = pdf.drop("Order Item Cardprod Id", axis=1)
pdf = pdf.drop("Order Item Id", axis=1)
pdf = pdf.drop("Product Card Id", axis=1)
pdf = pdf.drop("Product Category Id", axis=1)




In [107]:
len(pdf.columns)

24

In [108]:
pdf["Type"].count()

np.int64(172765)

In [109]:
pdf = pdf[~pdf.isnull().any(axis=1)]

In [110]:
pdf.isnull().sum()

Type                             0
Days for shipment (scheduled)    0
Benefit per order                0
Sales per customer               0
Late_delivery_risk               0
Category Name                    0
Customer Segment                 0
Department Name                  0
Latitude                         0
Longitude                        0
Market                           0
Order Item Discount              0
Order Item Discount Rate         0
Order Item Product Price         0
Order Item Profit Ratio          0
Order Item Quantity              0
Sales                            0
Order Item Total                 0
Order Profit Per Order           0
Order Region                     0
Product Price                    0
Shipping Mode                    0
Order_Month_Name                 0
Shipping_Month_Name              0
dtype: int64

- Sauvegarder pour l'utiliser ultérieurement 

In [111]:
pdf.to_csv('../data/processed/data-initial-cleaning.csv', index=False)

- Detection de l'importance des colonnes utilisant RandomForest

In [112]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# --- Example Data ---

X = pdf.drop('Late_delivery_risk', axis=1)
y = pdf['Late_delivery_risk']

# --- Identify numeric and categorical columns ---
numeric_cols = X.select_dtypes(include=['int32', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(exclude=['int32', 'float64']).columns.tolist()

# --- Preprocessing ---

numeric_transformer = SimpleImputer(strategy='median')

categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# --- Pipeline with RandomForest ---
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1))
])

model.fit(X, y)


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [113]:
# --- Feature importance ---
feature_names = X.columns
importances = model.named_steps['classifier'].feature_importances_

print(len(feature_names))
print(len(importances))

import pandas as pd
fi = pd.DataFrame({'Feature': feature_names, 'Importance': importances})


23
23


In [114]:
fi.sort_values("Importance", ascending=False)

Unnamed: 0,Feature,Importance
20,Shipping Mode,0.123333
0,Type,0.10794
3,Sales per customer,0.084842
4,Category Name,0.079081
1,Days for shipment (scheduled),0.064247
12,Order Item Product Price,0.063995
8,Longitude,0.052806
5,Customer Segment,0.043443
2,Benefit per order,0.042878
11,Order Item Discount Rate,0.042802


- Supprimer les colonnes non pertinent

On garde:
- Type
- Shipping Mode
- Late_delivery_risk
- Category Name
- Customer Segment
- Order Item Total
- Order Region
- Shipping_Month_Name

In [33]:
keep_cols = [
    'Type',
    'Shipping Mode',
    'Late_delivery_risk',
    'Category Name',
    'Customer Segment',
    'Order Item Total',
    'Order Region',
    'Shipping_Month_Name',
]

In [36]:
df_cleaned = df.select(keep_cols)

- Transformation de colonnes

In [37]:
dict_rename = {
    "Shipping Mode": "ShippingMode",
    "Late_delivery_risk": "LateDeliveryRisk",
    "Category Name": "CategoryName",
    "Customer Segment": "CustomerSegment",
    "Order Item Total": "OrderItemTotal",
    "Order Region": "OrderRegion",
    "Shipping_Month_Name": "ShippingMonthName",
}

for col, new_col in dict_rename.items():
    df_cleaned = df_cleaned.withColumn(new_col, F.col(col)).drop(col)

- Sauvegarder le modéle

In [38]:
df_cleaned.toPandas().to_csv("../data/processed/data-with-relevant-columns.csv", index=False)