# ML: Delivery Time Prediction

Predicts truck dwell times at stores and distribution centers using LightGBM regression.

## Business Value
- Optimize receiving dock scheduling
- Reduce driver wait times and costs
- Improve labor planning for receiving staff
- Better ETA estimates for inventory availability

## Data Flow
```
Silver (fact_truck_moves, dimensions) --> ML Model --> Gold (gold_dwell_predictions)
```

## Model Details
- **Algorithm**: LightGBM Regressor with quantile regression
- **Target**: Dwell time (minutes) = departed_ts - arrived_ts
- **Features**: Location type and size, arrival hour, day of week, weekend flag, truck capacity, historical dwell averages (per location and per arrival hour)
- **Metrics**: MAE < 15 minutes, MAPE < 25%
- **Output**: Point predictions + 80% prediction intervals (10th–90th percentile quantile models)

## Usage
Schedule this notebook to run **daily** via Fabric pipeline to retrain model and generate predictions.

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
import os
import pandas as pd
import numpy as np
from datetime import datetime, timezone

# ML imports
try:
    import lightgbm as lgb
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
except ImportError:
    print("Installing required packages...")
    !pip install lightgbm scikit-learn
    import lightgbm as lgb
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [None]:
# =============================================================================
# PARAMETERS
# =============================================================================

def get_env(var_name, default=None):
    """Look up environment variable ``var_name``.

    Returns the variable's value when it is set, otherwise ``default``
    (which is ``None`` unless the caller supplies one).
    """
    try:
        return os.environ[var_name]
    except KeyError:
        return default

# Acceptance thresholds the trained model is checked against.
TARGET_MAE = 15.0   # mean absolute error, in minutes (must be below this)
TARGET_MAPE = 0.25  # mean absolute percentage error, as a fraction (must be below this)

# Hold-out split settings.
TEST_SIZE = 0.2     # fraction of rows reserved for the test set
RANDOM_STATE = 42   # seed shared by the split and the LightGBM models

# Source and destination databases; either can be overridden through an
# environment variable of the same name.
GOLD_DB = get_env("GOLD_DB", default="au")
SILVER_DB = get_env("SILVER_DB", default="ag")

print(f"Configuration: SILVER_DB={SILVER_DB}, GOLD_DB={GOLD_DB}")
print(f"Model Targets: MAE < {TARGET_MAE} min, MAPE < {TARGET_MAPE*100}%")

In [None]:
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def ensure_database(name):
    """Create database ``name`` if it is missing, then print a ready message.

    ``name`` is interpolated into the SQL statement as-is.
    """
    create_statement = f"CREATE DATABASE IF NOT EXISTS {name}"
    spark.sql(create_statement)
    print(f"Database '{name}' ready.")

def read_silver(table_name):
    """Return table ``table_name`` from the ``SILVER_DB`` database as a DataFrame."""
    qualified_name = f"{SILVER_DB}.{table_name}"
    return spark.table(qualified_name)

def save_gold(df, table_name):
    """Overwrite Delta table ``GOLD_DB.table_name`` with ``df`` and print its row count.

    The printed count comes from ``df.count()`` on the input DataFrame after
    the write; the table itself is not read back.
    """
    full_name = f"{GOLD_DB}.{table_name}"
    delta_writer = df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(full_name)
    row_count = df.count()
    print(f"  {full_name}: {row_count} rows")

# Create the Gold database up front (no-op if it already exists).
ensure_database(GOLD_DB)

---
## Part 1: Data Preparation

Calculate dwell times from truck arrivals and departures.

In [None]:
print("=" * 60, "PREPARING DWELL TIME DATA", "=" * 60, sep="\n")

# fact_truck_moves holds both ARRIVED and DEPARTED events.
# ETA is populated on ARRIVED events, ETD is populated on DEPARTED events.
df_truck_moves = read_silver("fact_truck_moves")

total_moves = df_truck_moves.count()
print(f"Total truck move records: {total_moves}")

# Event counts per status, as a quick sanity check of the input.
status_counts = df_truck_moves.groupBy("status").count()
status_counts.show()

In [None]:
# Build one row per shipment that has both an arrival and a departure.
# ARRIVED events supply the arrival time (`eta`); DEPARTED events supply the
# departure time (`etd`).
# NOTE(review): the inner join on shipment_id presumably relies on at most one
# ARRIVED and one DEPARTED event per shipment — confirm against the Silver data.
arrival_events = df_truck_moves.filter(F.col("status") == "ARRIVED")
departure_events = df_truck_moves.filter(F.col("status") == "DEPARTED")

df_arrived = arrival_events.select(
    "shipment_id",
    "truck_id",
    "store_id",
    "dc_id",
    F.col("eta").alias("arrived_ts"),
    F.col("event_ts").alias("arrived_event_ts"),
)

df_departed = departure_events.select(
    "shipment_id",
    F.col("etd").alias("departed_ts"),
    F.col("event_ts").alias("departed_event_ts"),
)

# Dwell = departure minus arrival, measured in seconds and converted to minutes.
elapsed_seconds = F.unix_timestamp("departed_ts") - F.unix_timestamp("arrived_ts")

df_dwell = (
    df_arrived
    .join(df_departed, on="shipment_id", how="inner")
    .withColumn("dwell_minutes", elapsed_seconds / 60)
    # Keep only strictly positive dwell times ...
    .filter(F.col("dwell_minutes") > 0)
    # ... that are shorter than 8 hours (480 minutes).
    .filter(F.col("dwell_minutes") < 480)
)

complete_shipments = df_dwell.count()
print(f"\nShipments with complete dwell times: {complete_shipments}")

# Summary statistics of the target.
df_dwell.agg(
    F.min("dwell_minutes").alias("min_dwell"),
    F.avg("dwell_minutes").alias("avg_dwell"),
    F.max("dwell_minutes").alias("max_dwell"),
    F.stddev("dwell_minutes").alias("stddev_dwell"),
).show()

---
## Part 2: Feature Engineering

Create temporal, location, and historical features.

In [None]:
print("=" * 60, "FEATURE ENGINEERING", "=" * 60, sep="\n")

# Temporal features, all derived from the arrival timestamp.
arrival_dow = F.dayofweek("arrived_ts")  # 1=Sunday, 7=Saturday
df_features = df_dwell.select(
    "*",
    F.hour("arrived_ts").alias("arrival_hour"),
    arrival_dow.alias("arrival_day_of_week"),
    F.when(arrival_dow.isin([1, 7]), 1).otherwise(0).alias("is_weekend"),
    F.to_date("arrived_ts").alias("arrival_date"),
)

# A row counts as a STORE delivery whenever store_id is set, otherwise as a DC one.
has_store = F.col("store_id").isNotNull()

df_features = (
    df_features
    .withColumn("location_type", F.when(has_store, "STORE").otherwise("DC"))
    # location_id is store_id for STORE rows and dc_id for DC rows.
    .withColumn(
        "location_id",
        F.when(has_store, F.col("store_id")).otherwise(F.col("dc_id")),
    )
)

feature_rows = df_features.count()
print(f"Features created: {feature_rows} records")

In [None]:
# Enrich the shipment rows with truck and location attributes from the
# dimension tables. All joins are left joins, so shipments without a matching
# dimension row are kept with null attributes.
print("\nEnriching with dimension data...")

# Truck attributes.
df_trucks = read_silver("dim_trucks").select(
    F.col("truck_id"),
    F.col("capacity_cubic_feet").alias("truck_capacity"),
    F.col("fuel_type").alias("truck_fuel_type"),
)

df_features = df_features.join(df_trucks, on="truck_id", how="left")

# Store / DC floor area. The two sources get DISTINCT column names: giving both
# the name `location_size` would leave two identically named columns after the
# second join and make any later reference to `location_size` ambiguous.
df_stores = read_silver("dim_stores").select(
    F.col("store_id"),
    F.col("square_feet").alias("store_size"),
)

df_dcs = read_silver("dim_distribution_centers").select(
    F.col("dc_id"),
    F.col("square_feet").alias("dc_size"),
)

df_features = (
    df_features
    .join(df_stores, on="store_id", how="left")
    .join(df_dcs, on="dc_id", how="left")
    # Prefer the store's size; fall back to the DC's size when there is none.
    .withColumn("location_size", F.coalesce(F.col("store_size"), F.col("dc_size")))
    # The per-source helper columns are no longer needed.
    .drop("store_size", "dc_size")
)

print("Dimension enrichment complete.")

In [None]:
# Historical dwell-time features.
#
# Each feature is computed over an expanding window of shipments that arrived
# BEFORE the current one. Averaging over the whole dataset would fold every
# row's own target (and the targets of rows that later land in the test set)
# into its features, which leaks the label and inflates the evaluation metrics.
print("\nCalculating historical averages...")

# All earlier arrivals at the same location (current row excluded).
location_history = (
    Window
    .partitionBy("location_id", "location_type")
    .orderBy("arrived_ts")
    .rowsBetween(Window.unboundedPreceding, -1)
)

# All earlier arrivals in the same hour of day (current row excluded).
hour_history = (
    Window
    .partitionBy("arrival_hour")
    .orderBy("arrived_ts")
    .rowsBetween(Window.unboundedPreceding, -1)
)

df_features = (
    df_features
    # Null for the first shipment seen at a location (no history yet).
    .withColumn("location_avg_dwell", F.avg("dwell_minutes").over(location_history))
    # Number of earlier shipments at the location; 0 for the first one.
    .withColumn("location_shipment_count", F.count("dwell_minutes").over(location_history))
    # Null for the first shipment seen in a given arrival hour.
    .withColumn("hour_avg_dwell", F.avg("dwell_minutes").over(hour_history))
)

print("Historical features added.")
print(f"\nFinal feature set: {df_features.count()} records")

---
## Part 3: Model Training

Train LightGBM models for point prediction and quantile regression.

In [None]:
print("=" * 60, "MODEL TRAINING", "=" * 60, sep="\n")

# Model inputs, in the order they are handed to LightGBM.
feature_cols = [
    "arrival_hour",
    "arrival_day_of_week",
    "is_weekend",
    "location_type",
    "truck_capacity",
    "location_size",
    "location_avg_dwell",
    "location_shipment_count",
    "hour_avg_dwell",
]

target_col = "dwell_minutes"

# Identifier columns are carried along so predictions can be written back per shipment.
id_cols = ["shipment_id", "arrived_ts", "departed_ts"]

# Rows with a null in any selected column are dropped.
df_model = df_features.select(*id_cols, *feature_cols, target_col).dropna()

print(f"Records for modeling: {df_model.count()}")

# Collect to the driver as pandas for training.
pdf = df_model.toPandas()

# Encode the only categorical feature as an integer code.
location_type_codes = {'STORE': 0, 'DC': 1}
pdf['location_type'] = pdf['location_type'].map(location_type_codes)

# Feature matrix and target vector.
X = pdf[feature_cols]
y = pdf[target_col]

print(f"\nFeature shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Random hold-out split controlled by TEST_SIZE / RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

print(f"\nTraining set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

In [None]:
# Point-prediction model: LightGBM regression, with MAE as the validation metric.
print("\nTraining point prediction model...")

params = dict(
    objective='regression',
    metric='mae',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    random_state=RANDOM_STATE,
)

# These datasets are also used by the quantile models trained afterwards.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# NOTE(review): early stopping watches the same test split that is used for the
# reported metrics, so those metrics may be slightly optimistic — consider a
# separate validation split.
stop_when_stalled = lgb.early_stopping(stopping_rounds=10)

model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[test_data],
    callbacks=[stop_when_stalled],
)

print("Point prediction model trained.")

In [None]:
# Quantile models for the interval bounds (10th and 90th percentiles).
print("\nTraining quantile models for confidence intervals...")


def _train_quantile_model(alpha):
    """Train a LightGBM quantile-regression model for quantile ``alpha``.

    Starts from the point model's ``params`` and overrides the objective, the
    quantile level and the validation metric. The metric override matters:
    with the inherited ``'mae'`` metric, early stopping would pick the
    iteration with the best absolute error rather than the best pinball loss,
    pulling both bounds toward the median.

    Returns:
        Tuple ``(quantile_params, booster)``.
    """
    quantile_params = {
        **params,
        'objective': 'quantile',
        'alpha': alpha,
        'metric': 'quantile',
    }
    booster = lgb.train(
        quantile_params,
        train_data,
        num_boost_round=100,
        valid_sets=[test_data],
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )
    return quantile_params, booster


# Lower bound (10th percentile)
params_lower, model_lower = _train_quantile_model(0.10)

# Upper bound (90th percentile)
params_upper, model_upper = _train_quantile_model(0.90)

print("Quantile models trained.")

---
## Part 4: Model Evaluation

In [None]:
print("=" * 60, "MODEL EVALUATION", "=" * 60, sep="\n")

# Score the hold-out set using the best iteration found by early stopping.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Error metrics on the hold-out set.
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"\nTest Set Performance:")
print(f"  MAE:  {mae:.2f} minutes (Target: < {TARGET_MAE} min)")
print(f"  MAPE: {mape*100:.2f}% (Target: < {TARGET_MAPE*100}%)")
print(f"  RMSE: {rmse:.2f} minutes")

# Compare against the configured targets (strictly below the target passes).
mae_pass = mae < TARGET_MAE
mape_pass = mape < TARGET_MAPE

print(f"\nTarget Achievement:")
for label, passed in (("MAE Target: ", mae_pass), ("MAPE Target:", mape_pass)):
    print(f"  {label} {'✓ PASS' if passed else '✗ FAIL'}")

if mae_pass and mape_pass:
    print("\n✓ Model meets all performance targets!")
else:
    print(
        "\n⚠ Model does not meet all targets. Consider:",
        "  - More training data",
        "  - Additional features",
        "  - Hyperparameter tuning",
        sep="\n",
    )

In [None]:
# Gain-based importance of each feature in the point-prediction model,
# listed from most to least important.
print("\nFeature Importance:")

gain_per_feature = model.feature_importance(importance_type='gain')

feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': gain_per_feature}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

print(feature_importance.to_string(index=False))

---
## Part 5: Generate Predictions

Create predictions for all completed shipments and save to Gold.

In [None]:
print("=" * 60, "GENERATING PREDICTIONS", "=" * 60, sep="\n")

# Score every modeled row (training and test rows alike).
X_all = pdf[feature_cols]

# One prediction vector per model, each taken at that model's best iteration.
predictions, lower_bound, upper_bound = (
    booster.predict(X_all, num_iteration=booster.best_iteration)
    for booster in (model, model_lower, model_upper)
)

# Attach the outputs to the pandas frame.
pdf['predicted_dwell_minutes'] = predictions
pdf['lower_bound_minutes'] = lower_bound
pdf['upper_bound_minutes'] = upper_bound
# Error is actual minus predicted.
pdf['prediction_error'] = pdf['dwell_minutes'] - pdf['predicted_dwell_minutes']
# Single UTC timestamp shared by all rows of this run.
pdf['prediction_timestamp'] = datetime.now(timezone.utc)

# Restore the string labels that were replaced by integer codes for training.
location_type_labels = {0: 'STORE', 1: 'DC'}
pdf['location_type'] = pdf['location_type'].map(location_type_labels)

print(f"Generated {len(pdf)} predictions.")
print(f"\nSample predictions:")

preview_cols = [
    'shipment_id', 'arrived_ts', 'dwell_minutes',
    'predicted_dwell_minutes', 'lower_bound_minutes', 'upper_bound_minutes',
]
print(pdf[preview_cols].head(10))

In [None]:
# Write the predictions to the Gold layer as a Spark table.
print("\nSaving predictions to Gold...")

# Columns published to Gold, in output order.
output_cols = [
    # shipment identity and timing
    'shipment_id',
    'arrived_ts',
    'departed_ts',
    # context features
    'location_type',
    'arrival_hour',
    'arrival_day_of_week',
    # actual vs. predicted dwell
    'dwell_minutes',
    'predicted_dwell_minutes',
    'lower_bound_minutes',
    'upper_bound_minutes',
    'prediction_error',
    'prediction_timestamp',
]

# pandas -> Spark, restricted to the published columns.
pdf_output = pdf[output_cols]
df_predictions = spark.createDataFrame(pdf_output)

save_gold(df_predictions, "gold_dwell_predictions")

print("\n✓ Predictions saved to gold_dwell_predictions")

---
## Summary

In [None]:
# Final run report: metrics, output table and the most important features.
print("\n" + "=" * 60, "DELIVERY TIME PREDICTION COMPLETE", "=" * 60, sep="\n")

print(f"\nModel Performance:")
print(f"  MAE:  {mae:.2f} minutes")
print(f"  MAPE: {mape*100:.2f}%")
print(f"  RMSE: {rmse:.2f} minutes")

print(f"\nOutput Table:")
print(f"  {GOLD_DB}.gold_dwell_predictions")
print(f"  Rows: {df_predictions.count()}")

# feature_importance is already sorted by importance, so head(5) is the top five.
print(f"\nTop Features:")
top_features = feature_importance.head(5)
for feature_name, gain in zip(top_features['feature'], top_features['importance']):
    print(f"  {feature_name}: {gain:.0f}")

print(
    "\nNext Steps:",
    "  1. Create logistics planning dashboard",
    "  2. Integrate predictions with dock scheduling system",
    "  3. Monitor model performance over time",
    "  4. Retrain model periodically with new data",
    sep="\n",
)