Import des bibliothèques pour le jeu de données DataCo Smart Supply Chain

In [108]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib
import json

Cleaning data

In [109]:
# --- Reproducibility ---------------------------------------------------------
# Every synthetic column below is drawn from NumPy's global random generator.
# Seeding it once makes a fresh "Restart & Run All" regenerate the same
# dataset; later draws made in this cell from the same global generator
# become repeatable as well.
SEED = 42
np.random.seed(SEED)

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# --- Load --------------------------------------------------------------------
df = pd.read_csv('DataCoSupplyChainDataset.csv', encoding='latin-1')

# The payment type of every row is overwritten with a single value.
df['Type'] = 'CASH'
print(df['Type'].value_counts())

# --- Drop the columns that are not kept in the cleaned dataset ----------------
# One drop call instead of many scattered ones; none of these columns is read
# by the transformations below, so dropping them up front changes nothing else.
colonnes_inutiles = [
    'Days for shipment (scheduled)',
    'Category Id',
    'Customer Country', 'Customer Email', 'Customer Fname', 'Customer Lname',
    'Customer Password', 'Customer State', 'Customer Street', 'Customer Zipcode',
    'Department Id', 'Department Name',
    'Latitude', 'Longitude',
    'Market', 'Order Country', 'Order Customer Id',
    'Order Item Cardprod Id', 'Order Item Discount', 'Order Item Discount Rate',
    'Order Item Id', 'Order Item Product Price', 'Order Item Profit Ratio',
    'Sales', 'Order Profit Per Order', 'Order Region', 'Order State',
    'Order Zipcode', 'Product Card Id', 'Product Category Id',
    'Product Description', 'Product Image', 'Product Name', 'Product Price',
    'Product Status',
    'Order Item Total', 'Customer Segment',
]
df = df.drop(columns=colonnes_inutiles)

# --- Synthetic price and margin ----------------------------------------------
# Random integer price in [2000, 9999], stored under a new column name.
df['Sales per customer'] = np.random.randint(2000, 10000, size=len(df))
df = df.rename(columns={'Sales per customer': 'Prix_Total_Course_DZD'})

# Benefit per trip = price * random margin in [0.50, 0.80), truncated to int.
pourcentage_marge = np.random.uniform(0.50, 0.80, size=len(df))
df['Benefit per order'] = (df['Prix_Total_Course_DZD'] * pourcentage_marge).astype(int)

# --- Synthetic product category and customer city ----------------------------
liste_produits = [
    'Pomme de terre', 'Tomate', 'Oignon', 'Ail',
    'Carotte', 'Courgette', 'Poivron', 'Piment',
    'Datte (Deglet Nour)', 'Orange', 'Mandarine', 'Citron',
    'Pastèque', 'Melon', 'Raisin', 'Pomme',
    'Banane', 'Fraise', 'Abricot', 'Olive'
]
df['Category Name'] = np.random.choice(liste_produits, size=len(df))

villes_algerie = [
    'Alger', 'Oran', 'Constantine', 'Setif', 'Annaba',
    'Blida', 'Batna', 'Chlef', 'Tlemcen', 'Sidi Bel Abbes',
    'Biskra', 'El Oued', 'Tizi Ouzou', 'Bejaia', 'Mostaganem',
    'Djelfa', 'Skikda', 'Ghardaia', 'Ouargla', 'Tiaret',
    'Medea', 'Bechar', 'Mila', 'Mascara', 'Boumerdes',
    'Tipaza', 'Ain Defla', 'Relizane', 'Bordj Bou Arreridj'
]
df['Customer City'] = np.random.choice(villes_algerie, size=len(df))

# --- Order date: keep day and month, force the year to 2025 ------------------
df['order date (DateOrders)'] = pd.to_datetime(df['order date (DateOrders)']).dt.strftime('%d/%m/2025')

# Weight = quantity * 400 kg + a random extra of 0 to 49 kg
# (the upper bound of randint is exclusive).
df['Order Item Quantity (KG)'] = (df['Order Item Quantity'] * 400) + np.random.randint(0, 50, size=len(df))
n = len(df)

# Random time of day: hour in 05..20, minute in 00..59.
heures = np.random.randint(5, 21, size=n)
minutes = np.random.randint(0, 60, size=n)

heures_str = [f"{h:02d}:{m:02d}" for h, m in zip(heures, minutes)]

df['order date (DateOrders)'] = df['order date (DateOrders)'].astype(str) + " " + heures_str

# 2025 is not a leap year: a 29 February carried over from the source data
# would not parse, so it is moved to 28 February (plain-text replacement).
df['order date (DateOrders)'] = df['order date (DateOrders)'].astype(str).str.replace('29/02/2025', '28/02/2025', regex=False)

temp_dates = pd.to_datetime(df['order date (DateOrders)'], dayfirst=True)

# Shipping timestamp = order timestamp + a random delay of 30 to 179 minutes.
delais_minutes = np.random.randint(30, 180, size=len(df))
temp_shipping = temp_dates + pd.to_timedelta(delais_minutes, unit='m')
df['shipping date (DateOrders)'] = temp_shipping.dt.strftime('%d/%m/%Y %H:%M')

# --- Shipping mode -> vehicle type -------------------------------------------
types_camions = {
    'Standard Class': 'Camion Bâché',
    'First Class': 'Camion Frigorifique',
    'Second Class': 'Benne Céréalière',
    'Same Day': 'Utilitaire Express'
}

df['Shipping Mode'] = df['Shipping Mode'].replace(types_camions)
df = df.rename(columns={'Shipping Mode': 'Type_Vehicule'})

# --- Row filtering -----------------------------------------------------------
# Remove rows whose status is missing, PENDING_PAYMENT or ON_HOLD.
# .copy() gives an independent frame, so the column assignments below never
# write into a slice of the unfiltered data (no SettingWithCopyWarning).
statuts_exclus = ['PENDING_PAYMENT', 'ON_HOLD']
df = (
    df[~df['Order Status'].isin(statuts_exclus)]
    .dropna(subset=['Order Status'])
    .copy()
)

# --- Synthetic trip duration -------------------------------------------------
# Random duration of 10 to 599 minutes, expressed in hours with 2 decimals.
minutes_total = np.random.randint(10, 600, size=len(df))

df['Days for shipping (real)'] = np.round(minutes_total / 60, 2)
df = df.rename(columns={'Days for shipping (real)': 'Duree_Trajet_Heures'})

# Parsed order timestamp and its month number.
df['date_temp'] = pd.to_datetime(df['order date (DateOrders)'], dayfirst=True)
df['Mois'] = df['date_temp'].dt.month

# Arbitrary "average duration" of 4 h.
# NOTE(review): nothing in the visible notebook reads this value — confirm it is still needed.
duree_standard = 4.0


# Risk rule: a trip is flagged when it is long (> 6 h), OR takes place in
# summer (June-August), OR is hit by a random road incident (about 1 in 5).

def calculer_risque(row, rng=None):
    """Return the binary late-delivery risk label (1 = at risk, 0 = not) for one trip.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide ``row['Duree_Trajet_Heures']`` and ``row['Mois']``.
    rng : optional
        Any object exposing ``random()`` returning a float in [0, 1), such as
        ``numpy.random.default_rng(seed)``. When omitted, the draw comes from
        NumPy's global generator (``np.random.rand()``), exactly as before.

    Returns
    -------
    int
        1 if at least one of the three rules below fires, otherwise 0.
    """
    # Rule 1: very long trip (strictly more than 6 hours).
    trajet_long = row['Duree_Trajet_Heures'] > 6.0

    # Rule 2: seasonality — June to August counts as the hot season.
    # Only the month is checked here; neither the product category nor the
    # vehicle type is taken into account.
    periode_estivale = row['Mois'] in [6, 7, 8]

    # Rule 3: random road hazard, fires when the draw exceeds 0.8.
    # The draw is always performed so that the number of values consumed from
    # the generator per row does not depend on the other rules.
    tirage = np.random.rand() if rng is None else rng.random()
    alea_route = tirage > 0.8

    # A single triggered rule is enough to classify the trip as at risk.
    return 1 if (trajet_long or periode_estivale or alea_route) else 0


# Evaluate the risk rule once per row, then store the resulting labels in the
# 'Late_delivery_risk' column.
risque_par_ligne = df.apply(calculer_risque, axis=1)
df['Late_delivery_risk'] = risque_par_ligne

# Final expression of the cell: display the resulting frame.
df








Type
CASH    180519
Name: count, dtype: int64


Unnamed: 0,Type,Duree_Trajet_Heures,Benefit per order,Prix_Total_Course_DZD,Delivery Status,Late_delivery_risk,Category Name,Customer City,Customer Id,Order City,order date (DateOrders),Order Id,Order Item Quantity,Order Status,shipping date (DateOrders),Type_Vehicule,Order Item Quantity (KG),date_temp,Mois
0,CASH,4.25,1989,3734,Advance shipping,0,Tomate,Relizane,20755,Bekasi,31/01/2025 07:22,77202,1,COMPLETE,31/01/2025 07:53,Camion Bâché,443,2025-01-31 07:22:00,1
1,CASH,3.92,7209,9820,Late delivery,0,Piment,Mila,19492,Bikaner,13/01/2025 13:13,75939,1,PENDING,13/01/2025 15:40,Camion Bâché,426,2025-01-13 13:13:00,1
2,CASH,5.68,5032,9519,Shipping on time,0,Mandarine,Alger,19491,Bikaner,13/01/2025 18:11,75938,1,CLOSED,13/01/2025 18:55,Camion Bâché,445,2025-01-13 18:11:00,1
3,CASH,2.43,2912,4913,Advance shipping,1,Olive,Blida,19490,Townsville,13/01/2025 05:58,75937,1,COMPLETE,13/01/2025 08:42,Camion Bâché,417,2025-01-13 05:58:00,1
5,CASH,0.73,2591,4937,Shipping canceled,0,Courgette,Alger,19488,Toowoomba,13/01/2025 06:53,75935,1,CANCELED,13/01/2025 07:58,Camion Bâché,425,2025-01-13 06:53:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180508,CASH,8.53,5648,7794,Late delivery,1,Tomate,Skikda,8387,Guangshui,16/01/2025 20:26,26053,1,COMPLETE,16/01/2025 21:56,Benne Céréalière,448,2025-01-16 20:26:00,1
180512,CASH,0.62,1591,2085,Late delivery,0,Abricot,Bordj Bou Arreridj,7396,Chengdu,16/01/2025 19:26,26050,1,COMPLETE,16/01/2025 20:51,Camion Bâché,412,2025-01-16 19:26:00,1
180514,CASH,8.80,4840,6481,Shipping on time,1,Melon,Blida,1005,Shanghái,16/01/2025 16:39,26043,1,CLOSED,16/01/2025 17:12,Camion Bâché,414,2025-01-16 16:39:00,1
180515,CASH,1.10,4879,9342,Late delivery,0,Oignon,Chlef,9141,Hirakata,16/01/2025 10:32,26037,1,COMPLETE,16/01/2025 13:00,Benne Céréalière,406,2025-01-16 10:32:00,1


Enregistrer le jeu de données nettoyé dans un fichier

In [110]:
# Write the current frame to CSV without the index column; the next cell
# reads this same file back.
fichier_nettoye = 'Agri_VTC_Cleaned.csv'
df.to_csv(fichier_nettoye, index=False)

Import des bibliothèques d'algorithmes

In [111]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib  

# Reload the cleaned dataset from disk so that training uses what was persisted.
df = pd.read_csv('Agri_VTC_Cleaned.csv')

# Columns used as model inputs.
# NOTE(review): 'Order City' is used here, whereas the cleaning step replaced
# 'Customer City' (not 'Order City') with Algerian cities — confirm which
# column is the intended feature.
features_cols = [
    'Type_Vehicule',
    'Category Name',
    'Order City',
    'Order Item Quantity (KG)',
    'Duree_Trajet_Heures',
    'Prix_Total_Course_DZD',
    'Mois'
]

# X = the inputs (what the carrier provides); y = the label to predict.
X = df[features_cols].copy()
y = df['Late_delivery_risk']

# Each text column gets its own encoder (string -> integer code). The fitted
# encoders are kept in separate variables because they are saved to disk
# alongside the model.
le_vehicule = LabelEncoder()
X['Type_Vehicule'] = le_vehicule.fit_transform(X['Type_Vehicule'])

le_category = LabelEncoder()
X['Category Name'] = le_category.fit_transform(X['Category Name'])

le_city = LabelEncoder()
X['Order City'] = le_city.fit_transform(X['Order City'])

# 80 % train / 20 % test, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Mean accuracy on the held-out test set. It is printed explicitly: a bare
# `score` expression that is not the last statement of a cell is never shown.
score = model.score(X_test, y_test)
print(f"Test accuracy: {score:.4f}")
print(X_train.shape)

(104706, 7)


Enregistrer les encodeurs et le modèle dans des fichiers

In [112]:
# Save the trained model and the fitted encoders, one file per object.
artefacts_intermediaires = (
    (model, 'agri_vtc_model.pkl'),
    (le_vehicule, 'encoder_vehicule.pkl'),
    (le_category, 'encoder_category.pkl'),
)
for objet, chemin in artefacts_intermediaires:
    joblib.dump(objet, chemin)

# Kept as the cell's final expression so the notebook still echoes the value
# returned by this last joblib.dump call.
joblib.dump(le_city, 'encoder_city.pkl')

['encoder_city.pkl']