In [None]:
# ====================================================
# Phase 1: Data Preparation
# Project: Predictive Analytics for Supply Chain Optimization
# ====================================================

# --- 1. Import Required Libraries ---
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, classification_report, confusion_matrix
)


In [None]:
# --- 2. Load Dataset ---
# Read the logistics CSV as UTF-8 first; if decoding fails, retry the very
# same file as Latin-1.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
csv_path = r"C:\Users\BOB\Documents\Data Analysis\.data\dynamic_supply_chain_logistics_dataset.csv"
try:
    df = pd.read_csv(csv_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(csv_path, encoding='latin-1')
print("Dataset loaded successfully!")

# Column index of the freshly loaded frame
print(df.columns)

In [None]:
# --- 3. Basic Data Overview ---
# Print, in order: frame dimensions, column names, per-column null counts
# and per-column dtypes.
overview_sections = (
    ("Shape of dataset:", df.shape),
    ("\nColumns:\n", df.columns.tolist()),
    ("\nMissing values summary:\n", df.isna().sum()),
    ("\nData types:\n", df.dtypes),
)
for heading, payload in overview_sections:
    print(heading, payload)

In [None]:
# --- 4. Handle Missing Values with NumPy ---
# Partition the columns by dtype: numeric vs. everything else. Both indexes
# are reused by later cells.
num_cols = df.select_dtypes(include=np.number).columns
cat_cols = df.select_dtypes(exclude=np.number).columns

# Numeric columns -> fill NaN with the column's NaN-ignoring median
for num_col in num_cols:
    df[num_col] = df[num_col].fillna(np.nanmedian(df[num_col]))

In [None]:
# Categorical columns -> fill NaN with the most frequent value (mode);
# fall back to 'Unknown' when the column has no mode at all.
for cat_col in cat_cols:
    modes = df[cat_col].mode()
    fill_value = 'Unknown' if modes.empty else modes[0]
    df[cat_col] = df[cat_col].fillna(fill_value)

print("\n Missing values handled with NumPy (median/mode).")

In [None]:
# --- 5. Feature Engineering ---

## 5.1 Create derived features
# Combine GPS latitude/longitude into a single "lat,lon" string column
df['vehicle_location'] = df['vehicle_gps_latitude'].astype(str) + ',' + df['vehicle_gps_longitude'].astype(str)

# ETA variation converted from hours to minutes
df['eta_variation_minutes'] = df['eta_variation_hours'] * 60

# Bucket traffic congestion into Low [0, 3], Medium (3, 6], High (6, 10].
# include_lowest=True closes the first bin on the left so that a level of
# exactly 0 is labelled 'Low' instead of falling outside every bin and
# becoming NaN. Values outside [0, 10] still map to NaN.
df['traffic_category'] = pd.cut(
    df['traffic_congestion_level'],
    bins=[0, 3, 6, 10],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True
)

# Binary flag: 1 where weather severity is strictly above 7, otherwise 0
df['severe_weather_flag'] = np.where(df['weather_condition_severity'] > 7, 1, 0)

# Report every column whose name contains one of the new feature names
new_feature_names = ('vehicle_location', 'eta_variation_minutes', 'traffic_category', 'severe_weather_flag')
print("\n Feature engineering complete. New columns added:",
      [c for c in df.columns if any(key in c for key in new_feature_names)])

In [None]:
# --- 6. Encode Categorical Features ---
# Integer-encode every non-numeric column collected in cat_cols, except
# 'timestamp'. cat_cols holds *all* non-numeric columns, so a timestamp that
# was read as text would otherwise be replaced by arbitrary integer codes, and
# the later pd.to_datetime() call in the EDA phase would interpret those codes
# as epoch offsets instead of real dates.
label_enc = LabelEncoder()
raw_time_cols = {'timestamp'}
for col in cat_cols:
    if col in raw_time_cols:
        continue  # keep the raw timestamp text for datetime parsing later
    # fit_transform refits the shared encoder, so each column gets its own mapping
    df[col] = label_enc.fit_transform(df[col].astype(str))

print("\n Categorical encoding complete.")

In [None]:
# --- 7. Remove Duplicates ---
# Drop fully duplicated rows in place and report how many disappeared.
rows_before = len(df)
df.drop_duplicates(inplace=True)
duplicates_removed = rows_before - len(df)
print(f"\n Removed {duplicates_removed} duplicate rows.")

In [None]:
# --- 8. Scale Numerical Features ---
# Standardise the columns captured in num_cols (the numeric columns present
# when step 4 ran; features engineered afterwards are not included).
# NOTE(review): num_cols covers every originally numeric column, so any
# modeling target among them is standardised too, and the scaler is fit on
# the full dataset before any train/test split — confirm this is intended.
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])
print("\n Numerical features scaled.")

In [None]:
# --- 9. Final Dataset Summary ---
# Report the final dimensions, then render the first rows with rich display.
print("\nFinal dataset shape:", df.shape)
print("\nSample data:")
preview = df.head()
display(preview)

In [None]:
# --- 10. Save Cleaned Dataset ---
# Persist the cleaned frame to the working directory without the index column.
output_csv = 'cleaned_logistics_data.csv'
df.to_csv(output_csv, index=False)
print("\n Cleaned dataset saved as 'cleaned_logistics_data.csv'.")


In [None]:
# ====================================================
# Phase 2: Exploratory Data Analysis (EDA)
# ====================================================

# --- 1. Import Required Libraries ---

import seaborn as sns

# Display settings: seaborn whitegrid theme in notebook context, and no cap
# on the number of DataFrame columns shown.
sns.set(style="whitegrid", context="notebook")
pd.set_option('display.max_columns', None)

# Phase 1 must already have run: df is expected in the kernel namespace
print(" DataFrame available with shape:", df.shape)

In [None]:
# ====================================================
# 2. CORRELATION ANALYSIS
# ====================================================

# Pairwise correlation over the numeric columns only
numeric_cols = df.select_dtypes(include=np.number).columns
corr_matrix = df[numeric_cols].corr()

# Rank the other features by absolute correlation with the delivery deviation.
# The target itself is dropped first: its self-correlation is always 1.0 and
# would otherwise occupy the top slot of the "top 10".
target = 'delivery_time_deviation'
if target in corr_matrix.columns:
    print("\n Top 10 correlations with delivery_time_deviation:")
    target_corr = corr_matrix[target].drop(labels=target)
    print(target_corr.abs().sort_values(ascending=False).head(10))

# Heatmap of the full correlation matrix, colour scale centred on zero
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0)
plt.title("Correlation Heatmap of Numeric Features", fontsize=14)
plt.show()

In [None]:
# ====================================================
# 3. DISTRIBUTION ANALYSIS AND ANOMALY DETECTION
# ====================================================

# Candidate variables to inspect; only those present in df are kept
candidate_vars = (
    'delivery_time_deviation',
    'fuel_consumption_rate',
    'traffic_congestion_level',
    'shipping_costs',
    'lead_time_days',
)
key_vars = [name for name in candidate_vars if name in df.columns]

# One histogram (30 bins, with KDE overlay) per retained variable
for var_name in key_vars:
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.histplot(df[var_name], bins=30, kde=True, ax=ax)
    ax.set_title(f"Distribution of {var_name}")
    ax.set_xlabel(var_name)
    ax.set_ylabel("Frequency")
    plt.show()

In [None]:
# Detect anomalies using IQR method for each key numeric column
def detect_outliers_iqr(series):
    """Count the values of ``series`` lying outside the 1.5 * IQR fences.

    The quartiles are computed on the non-null values only. Null entries are
    never counted as outliers because both comparisons evaluate to False for
    them.

    Parameters
    ----------
    series : pandas.Series
        Numeric series to inspect.

    Returns
    -------
    Integer count of values strictly below ``Q1 - 1.5 * IQR`` or strictly
    above ``Q3 + 1.5 * IQR``.
    """
    observed = series.dropna()
    first_quartile = np.percentile(observed, 25)
    third_quartile = np.percentile(observed, 75)
    fence_width = 1.5 * (third_quartile - first_quartile)
    too_low = series.lt(first_quartile - fence_width)
    too_high = series.gt(third_quartile + fence_width)
    return (too_low | too_high).sum()

# Outlier count for every key variable, keyed by column name
outlier_summary = {}
for var_name in key_vars:
    outlier_summary[var_name] = detect_outliers_iqr(df[var_name])
print("\nüîé Outlier count per key variable:")
print(pd.Series(outlier_summary))

In [None]:
# ====================================================
# 4. SEASONAL & REGIONAL DELAY TRENDS
# ====================================================

# Parse 'timestamp' into datetimes unless it already has a datetime dtype;
# unparseable entries become NaT.
# NOTE(review): if 'timestamp' holds integer codes at this point, to_datetime
# will read them as epoch offsets — verify what earlier cells did to it.
if not np.issubdtype(df['timestamp'].dtype, np.datetime64):
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# Calendar features derived from the timestamp
timestamp_parts = df['timestamp'].dt
df['year'] = timestamp_parts.year
df['month'] = timestamp_parts.month
df['weekday'] = timestamp_parts.day_name()

# --- 4.1 Seasonal trends (monthly average delay)
if 'delivery_time_deviation' in df.columns:
    monthly_delay = df.groupby('month')['delivery_time_deviation'].mean()
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.lineplot(x=monthly_delay.index, y=monthly_delay.values, marker='o', ax=ax)
    ax.set_title(" Average Delivery Time Deviation by Month")
    ax.set_xlabel("Month")
    ax.set_ylabel("Avg Delivery Deviation")
    plt.show()

In [None]:
# --- 4.2 Regional trends (by city, when that column exists)
if 'Customer City' not in df.columns:
    print("\n No 'Customer City' column found for regional analysis. Skipping.")
else:
    # Mean deviation per city, keeping the ten highest averages
    city_means = df.groupby('Customer City')['delivery_time_deviation'].mean()
    regional_delay = city_means.sort_values(ascending=False).head(10)
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=regional_delay.values, y=regional_delay.index, palette='coolwarm', ax=ax)
    ax.set_title(" Top 10 Cities by Average Delivery Deviation")
    ax.set_xlabel("Average Delay")
    ax.set_ylabel("City")
    plt.show()

In [None]:
# --- 4.3 Weekday delay trends
if 'delivery_time_deviation' in df.columns:
    weekday_delay = df.groupby('weekday')['delivery_time_deviation'].mean()

    # groupby sorts its keys, which puts day names in alphabetical order
    # (Friday, Monday, ...). Re-order to calendar order so the chart reads
    # Monday -> Sunday. Skipped when the index is not made of day names.
    calendar_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    if len(weekday_delay) and set(weekday_delay.index).issubset(calendar_order):
        days_present = [day for day in calendar_order if day in weekday_delay.index]
        weekday_delay = weekday_delay.reindex(days_present)

    plt.figure(figsize=(8, 4))
    sns.barplot(x=weekday_delay.index, y=weekday_delay.values, palette='crest')
    plt.title(" Average Delivery Deviation by Weekday")
    plt.xlabel("Weekday")
    plt.ylabel("Avg Delivery Deviation")
    plt.xticks(rotation=45)
    plt.show()

print("\n Phase 2: EDA completed successfully.")


In [None]:
# ====================================================
# Phase 3: Machine Learning Modeling
# ====================================================



# df is expected to exist already (loaded and cleaned by the earlier phases)
print(" DataFrame ready with shape:", df.shape)


# ====================================================
# 2. FEATURE SELECTION & ENCODING
# ====================================================

# Remove non-numeric / irrelevant columns that are actually present
exclude_cols = ['timestamp', 'Customer City', 'Customer Email', 'Customer Fname', 'Customer Lname']
cols_to_drop = [name for name in exclude_cols if name in df.columns]
df = df.drop(columns=cols_to_drop, errors='ignore')

# Integer-encode every object-dtype column, with a fresh encoder per column
object_cols = df.select_dtypes(include='object').columns
for obj_col in object_cols:
    as_text = df[obj_col].astype(str)
    df[obj_col] = LabelEncoder().fit_transform(as_text)

# Any NaN left in a numeric column is replaced by that column's median
numeric_medians = df.median(numeric_only=True)
df = df.fillna(numeric_medians)



In [None]:

# Remove identifier columns that are still present (numeric and categorical
# feature columns are kept).
exclude_cols = [
    'timestamp', 'Customer City', 'Customer Email',
    'Customer Fname', 'Customer Lname'
]
still_present = [name for name in exclude_cols if name in df.columns]
df = df.drop(columns=still_present, errors='ignore')

# Split the remaining columns into object-dtype and numeric groups
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(include=[np.number]).columns

print("Categorical columns:", list(cat_cols))
print("Numeric columns:", list(num_cols))

# Integer-encode the object columns; the single encoder is refit per column
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

for cat_col in cat_cols:
    text_values = df[cat_col].astype(str)
    df[cat_col] = encoder.fit_transform(text_values)

# Fill NaN: median for the numeric group, mode for the (now encoded) categorical group
for num_col in num_cols:
    column_median = df[num_col].median()
    df[num_col] = df[num_col].fillna(column_median)
for cat_col in cat_cols:
    most_common = df[cat_col].mode()[0]
    df[cat_col] = df[cat_col].fillna(most_common)

print("All categorical features encoded successfully. Ready for modeling.")


In [None]:
# ====================================================
# DATA CLEANUP BEFORE MODELING - FINAL SAFE VERSION
# ====================================================


# Build the cleaned frame on a copy; df is only rebound at the very end
drop_cols = [
    'timestamp', 'Customer City', 'Customer Email',
    'Customer Fname', 'Customer Lname'
]
data = df.copy()
data = data.drop(columns=[name for name in drop_cols if name in data.columns], errors='ignore')

# --- 1. Booleans become 0/1 integers
bool_cols = data.select_dtypes(include=['bool']).columns
if len(bool_cols) > 0:
    data[bool_cols] = data[bool_cols].astype(int)

# --- 2. Object / category columns become integer category codes
cat_cols = data.select_dtypes(include=['object', 'category']).columns
if len(cat_cols) > 0:
    print("Encoding categorical columns:", list(cat_cols))
    for cat_col in cat_cols:
        as_category = data[cat_col].astype(str).astype('category')
        data[cat_col] = as_category.cat.codes

# --- 3. Infinities are treated as missing, then missing values get the column median
data = data.replace([np.inf, -np.inf], np.nan)
column_medians = data.median(numeric_only=True)
data = data.fillna(column_medians)

# --- 4. Force every column to a numeric dtype; anything unparseable ends up as 0
data = data.apply(pd.to_numeric, errors='coerce')
data = data.fillna(0)

# --- 5. Report whether any non-numeric column survived
non_numeric = data.select_dtypes(exclude=[np.number]).columns.tolist()
if not non_numeric:
    print(" All columns successfully converted to numeric.")
else:
    print(" Still non-numeric columns:", non_numeric)

# Hand the cleaned frame back under the name the modeling cells expect
df = data


In [None]:
# --- Phase 3: Model Training with Progress Bar ---
# Trains two regressors on the numeric features of df to predict
# delivery_time_deviation and reports MAE / RMSE / R2 on a 20% hold-out split.

#!pip install tqdm --quiet
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report
from tqdm.notebook import tqdm  # progress bar for Jupyter

# --- Example target columns ---
target_reg = "delivery_time_deviation"
target_cls = "risk_classification"

# --- Numeric features only, WITHOUT the regression target ---
# Leaving the target column inside X lets the models read the answer directly
# (target leakage) and produces meaningless, near-perfect scores.
X = df.select_dtypes(include=[np.number]).drop(columns=[target_reg], errors='ignore')
y_reg = df[target_reg]
y_cls = df[target_cls].astype(str)  # ensure string for classification

# --- Train/test split (80/20, fixed seed) ---
X_train, X_test, y_train, y_test = train_test_split(X, y_reg, test_size=0.2, random_state=42)

# --- Scale features: fit on the training split only, then apply to both ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Models ---
models_reg = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(
        n_estimators=50, max_depth=10, n_jobs=-1, random_state=42
    )
}

# --- Results dict: model name -> metric name -> value ---
results_reg = {}

# --- Model training with progress bar ---
print(" Training regression models...")
for name in tqdm(models_reg.keys(), desc="Training Progress", leave=False):
    model = models_reg[name]
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)

    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    results_reg[name] = {"MAE": mae, "RMSE": rmse, "R¬≤": r2}
    print(f"\n {name} done.")
    print(f"MAE: {mae:.3f} | RMSE: {rmse:.3f} | R¬≤: {r2:.3f}")

# --- Display summary: one row per model, one column per metric ---
results_reg_df = pd.DataFrame(results_reg).T
display(results_reg_df.style.background_gradient(cmap="Blues").format("{:.3f}"))


In [None]:
# Notebook-wide registry of fitted models: one independent sub-dict per task
# type, filled in by the modeling cells.
trained_models = dict(regression={}, classification={})

In [None]:
# ====================================================
# Phase 3: Machine Learning Modeling (Optimized)
# ====================================================

from tqdm.notebook import tqdm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report

# --- 1. Preprocessing: ensure numeric ---
# Work on a copy (df_ml) with identifier columns removed, so df stays as it is
drop_cols = ['timestamp', 'Customer City', 'Customer Email', 'Customer Fname', 'Customer Lname']
df_ml = df.copy()
identifier_cols_present = [name for name in drop_cols if name in df_ml.columns]
df_ml = df_ml.drop(columns=identifier_cols_present, errors='ignore')

# Booleans -> 0/1 integers
bool_cols = df_ml.select_dtypes(include='bool').columns
df_ml[bool_cols] = df_ml[bool_cols].astype(int)

# Object / category columns -> integer labels, with a fresh encoder per column
cat_cols = df_ml.select_dtypes(include=['object', 'category']).columns
for cat_col in cat_cols:
    text_values = df_ml[cat_col].astype(str)
    df_ml[cat_col] = LabelEncoder().fit_transform(text_values)

# Remaining NaNs in numeric columns -> column median
median_fill = df_ml.median(numeric_only=True)
df_ml = df_ml.fillna(median_fill)

print(" Preprocessing complete. Data ready for modeling.")

# --- 2. Regression: delivery_time_deviation ---
# Fits each regressor on (at most) a 5000-row sample of df_ml, reports
# MAE / RMSE / R2 on a 20% hold-out split, and records every fitted model in
# trained_models['regression'] so later cells can look it up the same way
# they look up the classifiers.
if 'delivery_time_deviation' in df_ml.columns:
    print("\n Regression: delivery_time_deviation")

    X_reg = df_ml.drop(columns=['delivery_time_deviation'])
    y_reg = df_ml['delivery_time_deviation']

    # --- Optional: sample for speed ---
    sample_size = 5000
    if len(X_reg) > sample_size:
        X_reg = X_reg.sample(sample_size, random_state=42)
        y_reg = y_reg.loc[X_reg.index]  # keep the target aligned with the sampled rows

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

    # Scale: fit on the training split only, then apply to both splits
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Models
    models_reg = {
        "Linear Regression": LinearRegression(),
        "Random Forest Regressor": RandomForestRegressor(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
    }

    results_reg = {}
    for name in tqdm(models_reg.keys(), desc="Regression Models", leave=False):
        model = models_reg[name]
        start = pd.Timestamp.now()
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_test_scaled)
        elapsed = (pd.Timestamp.now() - start).total_seconds()

        # Register the fitted model; the key mirrors the classification scheme
        trained_models['regression'][f"delivery_time_deviation_{name}"] = model

        mae = mean_absolute_error(y_test, preds)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        r2 = r2_score(y_test, preds)

        results_reg[name] = {"MAE": mae, "RMSE": rmse, "R¬≤": r2}
        print(f"\n {name} done in {elapsed:.2f} sec | MAE: {mae:.3f} | RMSE: {rmse:.3f} | R¬≤: {r2:.3f}")

        # Feature importance, only for models that expose it (the random forest)
        if hasattr(model, 'feature_importances_'):
            feat_imp = pd.Series(model.feature_importances_, index=X_reg.columns)
            feat_imp.nlargest(10).plot(kind='barh', figsize=(8,4), title=f"Top 10 Feature Importances ({name})")
            plt.show()

# --- 3. Classification: risk_classification & delay_probability ---
def run_classification(target_col):
    """Train and score two classifiers that predict ``target_col`` from ``df_ml``.

    Every other column of the notebook-global ``df_ml`` is used as a feature.
    At most 5000 rows are sampled, the target is label-encoded from its string
    form, and an 80/20 train/test split is scaled with a scaler fit on the
    training part only. Each fitted model is stored in
    ``trained_models['classification']`` under ``"<target_col>_<model name>"``
    and its test accuracy is printed.

    Parameters
    ----------
    target_col : str
        Name of the column in ``df_ml`` to predict. If the column is absent
        a message is printed and nothing is trained.

    Returns
    -------
    None
    """
    print(f"\n Classification: {target_col}")
    if target_col not in df_ml.columns:
        print(f" Column '{target_col}' not found. Skipping.")
        return

    X = df_ml.drop(columns=[target_col])
    y = df_ml[target_col]

    # Optional sampling
    sample_size = 5000
    if len(X) > sample_size:
        X = X.sample(sample_size, random_state=42)
        y = y.loc[X.index]  # keep the target aligned with the sampled rows

    # Encode target
    # NOTE(review): every distinct string value becomes its own class; if the
    # target is continuous this yields a very large number of classes — confirm.
    y = LabelEncoder().fit_transform(y.astype(str))

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale: fit on the training split only, then apply to both splits
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Models
    models_cls = {
        "Logistic Regression": LogisticRegression(max_iter=500),
        "Random Forest Classifier": RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
    }

    for name in tqdm(models_cls.keys(), desc=f"{target_col} Models", leave=False):
        model = models_cls[name]
        start = pd.Timestamp.now()
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_test_scaled)
        elapsed = (pd.Timestamp.now() - start).total_seconds()

        # --- Save trained model globally ---
        # Keyed by the function's own argument, not by a module-level variable
        # that merely happens to hold the same value in the calling loop.
        trained_models['classification'][f"{target_col}_{name}"] = model

        acc = accuracy_score(y_test, preds)
        print(f"\n {name} done in {elapsed:.2f} sec | Accuracy: {acc:.3f}")

# Run classification tasks: one run_classification() call per target column.
# The loop variable is bound at module level under the name `target`.
for target in ['risk_classification', 'delay_probability']:
    run_classification(target)

print("\n Phase 3: Machine Learning Modeling completed successfully.")


In [None]:
# ====================================================
# Phase 4: Insights & Recommendations
# ====================================================


print(" Phase 4: Interpreting model outputs and generating recommendations")

# --- 1. Regression insights: delivery_time_deviation ---
if 'delivery_time_deviation' in df_ml.columns:
    print("\nüîπ Regression Insights: Top drivers of delivery time deviation")

    # Look up the random forest left in models_reg by the Phase 3 regression cell
    rf_model = models_reg.get("Random Forest Regressor")

    if rf_model is not None:
        # Importances labelled with the regression feature columns, ten largest first
        importances = pd.Series(rf_model.feature_importances_, index=X_reg.columns)
        top_features = importances.sort_values(ascending=False).head(10)
        print("\nTop 10 features driving delivery time deviation:")
        display(top_features)

        # Horizontal bar chart of the same ten features
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.barplot(x=top_features.values, y=top_features.index, palette="viridis", ax=ax)
        ax.set_title("Top 10 Feature Importances ‚Äî Delivery Time Deviation")
        ax.set_xlabel("Importance")
        ax.set_ylabel("Feature")
        plt.show()

        # One templated recommendation line per top feature
        print("\n Recommendations based on top features:")
        for feature_name in top_features.index:
            print(f"- Monitor and optimize {feature_name} to reduce delivery time deviations.")


In [None]:
# --- Classification Insights ---
# For each classification target, fetch its stored random forest and report
# the ten most important features, with a chart and templated recommendations.
for target in ['risk_classification', 'delay_probability']:
    print(f"\nüîπ Classification Insights: {target}")

    # Models are registered under "<target>_<model name>"
    registry_key = f"{target}_Random Forest Classifier"
    rf_model_cls = trained_models['classification'].get(registry_key)

    if rf_model_cls is None:
        print(f" No trained model found for {target}, skipping.")
        continue

    # Rebuild the feature column list: numeric columns of df_ml minus the target
    # NOTE(review): assumes this matches the columns the model was fit on — confirm.
    numeric_frame = df_ml.select_dtypes(include=[np.number])
    X_train_cls = numeric_frame.drop(columns=[target], errors='ignore')

    # Importances labelled with those columns, ten largest first
    feat_imp_cls = pd.Series(rf_model_cls.feature_importances_, index=X_train_cls.columns)
    top_features_cls = feat_imp_cls.sort_values(ascending=False).head(10)

    print("\nTop 10 features driving classification outcome:")
    display(top_features_cls)

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(x=top_features_cls.values, y=top_features_cls.index, palette="magma", ax=ax)
    ax.set_title(f"Top 10 Feature Importances ‚Äî {target}")
    ax.set_xlabel("Importance")
    ax.set_ylabel("Feature")
    plt.show()

    # One templated recommendation line per top feature
    print("\n Recommendations based on top features:")
    for feature_name in top_features_cls.index:
        print(f"- Investigate and improve {feature_name} to reduce {target.replace('_',' ')} risks.")
