In [None]:
# ====================================================
# Phase 1: Data Preparation
# Project: Predictive Analytics for Supply Chain Optimization
# ====================================================

# --- 1. Import Required Libraries ---
import pandas as pd
import numpy as np


In [15]:

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
# --- 2. Load Dataset ---
# Single definition of the input location so both read attempts stay in sync.
# NOTE(review): this is a machine-specific absolute path; point it at a
# project-relative data directory before sharing the notebook.
DATA_PATH = r"C:\Users\BOB\Documents\Data Analysis\.data\dynamic_supply_chain_logistics_dataset.csv"

try:
    df = pd.read_csv(DATA_PATH, encoding='utf-8')
except UnicodeDecodeError:
    # The file is not valid UTF-8: retry with latin-1, which accepts any byte.
    df = pd.read_csv(DATA_PATH, encoding='latin-1')
print("Dataset loaded successfully!")
# Report the dimensions right after loading (matches the recorded cell output).
print("Shape:", df.shape)

# Show list of columns
print(df.columns)

Dataset loaded successfully!
Shape: (32065, 26)
Index(['timestamp', 'vehicle_gps_latitude', 'vehicle_gps_longitude',
       'fuel_consumption_rate', 'eta_variation_hours',
       'traffic_congestion_level', 'warehouse_inventory_level',
       'loading_unloading_time', 'handling_equipment_availability',
       'order_fulfillment_status', 'weather_condition_severity',
       'port_congestion_level', 'shipping_costs', 'supplier_reliability_score',
       'lead_time_days', 'historical_demand', 'iot_temperature',
       'cargo_condition_status', 'route_risk_level', 'customs_clearance_time',
       'driver_behavior_score', 'fatigue_monitoring_score',
       'disruption_likelihood_score', 'delay_probability',
       'risk_classification', 'delivery_time_deviation'],
      dtype='object')


In [5]:
# --- 3. Basic Data Overview ---
# Each entry pairs a heading with the value printed next to it; the loop below
# emits them in order: dimensions, column names, per-column NaN counts, dtypes.
overview_sections = [
    ("Shape of dataset:", df.shape),
    ("\nColumns:\n", df.columns.tolist()),
    ("\nMissing values summary:\n", df.isna().sum()),
    ("\nData types:\n", df.dtypes),
]
for heading, value in overview_sections:
    print(heading, value)

Shape of dataset: (32065, 26)

Columns:
 ['timestamp', 'vehicle_gps_latitude', 'vehicle_gps_longitude', 'fuel_consumption_rate', 'eta_variation_hours', 'traffic_congestion_level', 'warehouse_inventory_level', 'loading_unloading_time', 'handling_equipment_availability', 'order_fulfillment_status', 'weather_condition_severity', 'port_congestion_level', 'shipping_costs', 'supplier_reliability_score', 'lead_time_days', 'historical_demand', 'iot_temperature', 'cargo_condition_status', 'route_risk_level', 'customs_clearance_time', 'driver_behavior_score', 'fatigue_monitoring_score', 'disruption_likelihood_score', 'delay_probability', 'risk_classification', 'delivery_time_deviation']

Missing values summary:
 timestamp                          0
vehicle_gps_latitude               0
vehicle_gps_longitude              0
fuel_consumption_rate              0
eta_variation_hours                0
traffic_congestion_level           0
warehouse_inventory_level          0
loading_unloading_time       

In [7]:
# --- 4. Handle Missing Values with NumPy ---
# Split the column names by dtype: numeric columns vs. everything else.
num_cols = df.select_dtypes(include=np.number).columns
cat_cols = df.select_dtypes(exclude=np.number).columns

# Numeric columns -> fill NaN with the column median (np.nanmedian ignores NaN)
for name in num_cols:
    values = df[name]
    df[name] = values.fillna(np.nanmedian(values))

In [16]:
# Categorical columns -> replace NaN with the most frequent value (mode)
for col in cat_cols:
    # Compute the mode once per column instead of twice. The result is empty
    # when the column has no non-null values, in which case fall back to 'Unknown'.
    modes = df[col].mode()
    mode_val = modes.iloc[0] if not modes.empty else 'Unknown'
    df[col] = df[col].fillna(mode_val)

print("\n Missing values handled with NumPy (median/mode).")


 Missing values handled with NumPy (median/mode).


In [10]:
# --- 5. Feature Engineering ---

## 5.1 Create derived features
# Names of the columns this cell adds; used for the summary print at the end.
new_feature_names = ('vehicle_location', 'eta_variation_minutes', 'traffic_category', 'severe_weather_flag')

# Combine the GPS coordinates into a single "lat,lon" string column
df['vehicle_location'] = df['vehicle_gps_latitude'].astype(str) + ',' + df['vehicle_gps_longitude'].astype(str)

# Express the ETA variation in minutes instead of hours
df['eta_variation_minutes'] = df['eta_variation_hours'] * 60

# Bucket the traffic level into three bands: [0, 3], (3, 6], (6, 10].
# include_lowest=True closes the first interval on the left; without it a level
# of exactly 0 falls outside every bin and becomes NaN.
# Values below 0 or above 10 still map to NaN.
df['traffic_category'] = pd.cut(
    df['traffic_congestion_level'],
    bins=[0, 3, 6, 10],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True
)

# Flag rows whose weather severity is strictly above the threshold of 7
df['severe_weather_flag'] = np.where(df['weather_condition_severity'] > 7, 1, 0)

# Report the added columns in the order they appear in the frame
print("\n Feature engineering complete. New columns added:",
      [c for c in df.columns if c in new_feature_names])


 Feature engineering complete. New columns added: ['vehicle_location', 'eta_variation_minutes', 'traffic_category', 'severe_weather_flag']


In [17]:
# --- 6. Encode Categorical Features ---
# Keep one fitted encoder per column (keyed by column name). Refitting a single
# shared encoder would overwrite its classes_ on every iteration, so only the
# last column's mapping could be inverted afterwards.
label_enc = LabelEncoder()  # keeps `label_enc` bound even when cat_cols is empty
label_encoders = {}
# NOTE(review): cat_cols was computed before feature engineering, so the later
# 'vehicle_location' and 'traffic_category' columns are not encoded here, and
# it appears to include 'timestamp' (read without date parsing) - confirm that
# label-encoding the timestamp is intended.
for col in cat_cols:
    label_enc = LabelEncoder()
    # Cast to str so mixed-type values are encoded uniformly
    df[col] = label_enc.fit_transform(df[col].astype(str))
    label_encoders[col] = label_enc

print("\n Categorical encoding complete.")


 Categorical encoding complete.


In [18]:
# --- 7. Remove Duplicates ---
# Remember the shape before de-duplication so the drop count can be reported.
initial_shape = df.shape
df.drop_duplicates(inplace=True)
rows_removed = initial_shape[0] - len(df)
print(f"\n Removed {rows_removed} duplicate rows.")


 Removed 0 duplicate rows.


In [19]:
# --- 8. Scale Numerical Features (optional) ---
# Standardise every column listed in num_cols, overwriting the columns in df.
# NOTE(review): num_cols was built before feature engineering, so the derived
# numeric columns (e.g. 'eta_variation_minutes') are left unscaled - confirm.
# NOTE(review): the scaler is fitted on the full frame, before any train/test
# split - confirm this is acceptable for the downstream model.
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])
print("\n Numerical features scaled.")


 Numerical features scaled.


In [20]:
# --- 9. Final Dataset Summary ---
# Print the final dimensions, then render the first rows with rich display.
final_shape = df.shape
print(f"\nFinal dataset shape: {final_shape}")
print("\nSample data:")
preview = df.head()
display(preview)


Final dataset shape: (32065, 30)

Sample data:


Unnamed: 0,timestamp,vehicle_gps_latitude,vehicle_gps_longitude,fuel_consumption_rate,eta_variation_hours,traffic_congestion_level,warehouse_inventory_level,loading_unloading_time,handling_equipment_availability,order_fulfillment_status,...,driver_behavior_score,fatigue_monitoring_score,disruption_likelihood_score,delay_probability,risk_classification,delivery_time_deviation,vehicle_location,eta_variation_minutes,traffic_category,severe_weather_flag
0,0,0.339989,0.754353,-0.674161,0.925652,0.265032,2.122389,1.711338,0.547999,0.464108,...,-1.311714,1.092311,-1.065633,0.573834,2,0.945913,"40.375568475194925,-77.0143177425258",299.880553,Medium,0
1,1,-0.652775,-1.549905,-0.682367,-0.839108,-0.962488,0.30128,-0.811548,0.975992,-1.169177,...,-0.837676,0.918766,0.634459,-0.477334,0,0.720945,"33.50781833905297,-117.03690240506452",59.095765,Low,0
2,2,-1.156863,0.854824,-0.684878,0.914507,1.074824,1.648395,1.240888,1.559445,-1.29604,...,-0.661708,-0.597613,0.698391,0.321239,0,-0.936538,"30.020639789030906,-75.2692240366282",298.359906,High,0
3,3,-0.198671,1.147225,0.048729,0.088828,-1.400409,-0.923457,-1.133386,-0.902613,0.610836,...,-0.38262,0.880761,0.679363,-2.074207,0,0.992623,"36.649222512758975,-70.19052930366378",185.703813,Low,0
4,4,-1.159662,1.157492,-0.706151,0.142044,0.853161,1.903516,0.855257,-0.867148,-1.582681,...,-1.359394,-0.98949,0.389631,0.899959,0,0.61926,"30.00127928439569,-70.01219476429307",192.964604,High,0


In [21]:
# --- 10. Save Cleaned Dataset ---
# Written relative to the current working directory, without the index column.
output_csv = 'cleaned_logistics_data.csv'
df.to_csv(output_csv, index=False)
# The status message previously carried a mis-decoded emoji; it is restored here.
print(f"\n💾 Cleaned dataset saved as '{output_csv}'.")



💾 Cleaned dataset saved as 'cleaned_logistics_data.csv'.
