In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb

# --- Configuration Constants ---
# Seed shared by the synthetic-data generator, the train/test split and the XGBoost classifier.
RANDOM_STATE = 42
# A delivery is labelled late when its delay is strictly greater than this many minutes.
LATE_DELIVERY_THRESHOLD_MINUTES = 10  # Define what constitutes a "Late" delivery
# Default number of synthetic delivery records produced by generate_synthetic_data().
N_SAMPLES = 10000

# --- Haversine Distance Helper (Simulation of GeoPandas/distance calculation) ---
def haversine_distance(lat1, lon1, lat2, lon2):
    """Calculate the great-circle distance between two points on the earth (Haversine formula).

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in decimal degrees.

    Returns
    -------
    float or array-like
        Distance in kilometres. Scalars give a scalar; arrays / pandas Series
        are processed element-wise.
    """
    R = 6371  # Radius of Earth in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding error can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make arcsin/sqrt return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# --- 1. Synthetic Data Generation ---
def generate_synthetic_data(n_samples=N_SAMPLES):
    """Generates synthetic data simulating real-world delivery logs and metadata.

    Parameters
    ----------
    n_samples : int
        Number of delivery records to generate (defaults to N_SAMPLES).

    Returns
    -------
    pandas.DataFrame
        One row per delivery with order time, coordinates, restaurant metadata,
        weather/traffic conditions, the simulated and scheduled delivery times,
        the delay in minutes and the binary ``is_late`` target.

    Notes
    -----
    Seeds NumPy's *global* random generator with RANDOM_STATE, so calling this
    function resets the random stream for any code that runs afterwards.
    The order of the random draws below is significant: changing it changes
    the generated dataset.
    """
    print(f"Generating {n_samples} synthetic delivery records...")
    np.random.seed(RANDOM_STATE)

    # 1. Timestamps & Coordinates
    start_date = pd.to_datetime('2023-01-01')
    minutes_in_year = 365 * 24 * 60
    data = {
        'order_time': start_date + pd.to_timedelta(np.random.randint(0, minutes_in_year, n_samples), unit='min'),
        'rest_lat': 34.0 + np.random.uniform(-0.1, 0.1, n_samples),
        'rest_lon': -118.2 + np.random.uniform(-0.1, 0.1, n_samples),
        'delivery_lat': 34.0 + np.random.uniform(-0.2, 0.2, n_samples),
        'delivery_lon': -118.2 + np.random.uniform(-0.2, 0.2, n_samples),
    }

    # 2. Restaurant Metadata
    # NOTE: restaurant and cuisine are drawn independently, so the pairing is random.
    restaurants = ['Pizza Palace', 'Sushi Spot', 'Burger Barn', 'Vegan Vibes']
    data['restaurant_id'] = np.random.choice(restaurants, n_samples)
    data['cuisine_type'] = np.random.choice(['Italian', 'Japanese', 'American', 'Healthy'], n_samples)
    data['rest_rating'] = np.random.uniform(3.0, 5.0, n_samples).round(1)

    # 3. Real-time Conditions
    # Penalty tables (minutes added to the delivery time); their keys double as the category lists.
    weather_penalties = {'Clear': 0, 'Rain': 5, 'Heavy Rain': 15, 'Snow': 20, 'Fog': 7}
    traffic_penalties = {'Low': 0, 'Moderate': 5, 'High': 12, 'Extreme': 25}
    weather = ['Clear', 'Rain', 'Heavy Rain', 'Snow', 'Fog']
    traffic = ['Low', 'Moderate', 'High', 'Extreme']
    data['weather_condition'] = np.random.choice(weather, n_samples, p=[0.6, 0.2, 0.05, 0.05, 0.1])
    data['traffic_intensity'] = np.random.choice(traffic, n_samples, p=[0.4, 0.35, 0.2, 0.05])
    data['temperature_c'] = np.random.uniform(5, 35, n_samples).round(1)

    df = pd.DataFrame(data)

    # 4. Simulated Delivery Time (Target Generation)
    # Base delivery time (proportional to distance) + noise + penalties
    distance_km = haversine_distance(df['rest_lat'], df['rest_lon'], df['delivery_lat'], df['delivery_lon'])
    base_time = distance_km * 3  # 3 min/km base speed

    # Add penalties based on conditions (vectorised dictionary lookup)
    weather_penalty = df['weather_condition'].map(weather_penalties)
    traffic_penalty = df['traffic_intensity'].map(traffic_penalties)

    # Random restaurant prep time (simulated), floored at 5 minutes
    prep_time = np.random.normal(loc=15, scale=5, size=n_samples).clip(min=5)

    # Final actual delivery time (in minutes)
    noise = np.random.normal(loc=0, scale=8, size=n_samples)
    df['simulated_delivery_time_min'] = base_time + prep_time + weather_penalty + traffic_penalty + noise

    # Calculate scheduled delivery time (assuming 5 minutes buffer on average)
    df['scheduled_delivery_time_min'] = base_time + prep_time + 5

    # Calculate target: Delay
    df['delivery_delay_min'] = df['simulated_delivery_time_min'] - df['scheduled_delivery_time_min']

    # Create the binary target variable (1=Late, 0=On-Time/Early)
    df['is_late'] = (df['delivery_delay_min'] > LATE_DELIVERY_THRESHOLD_MINUTES).astype(int)

    print(f"Data generation complete. Late rate: {df['is_late'].mean():.2f}")
    return df

# --- 2. Feature Engineering and Preprocessing ---
def preprocess_data(df):
    """Performs feature extraction and prepares data for modeling.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw delivery records containing the coordinate columns
        (``rest_lat``, ``rest_lon``, ``delivery_lat``, ``delivery_lon``) and
        ``order_time``. The delay / delivery-time columns are removed if present.

    Returns
    -------
    (pandas.DataFrame, ColumnTransformer)
        A *new* feature DataFrame and an unfitted preprocessor that scales the
        numerical features and one-hot encodes the categorical ones.

    Notes
    -----
    The input frame is not modified: the function works on a copy so the
    calling cell can be re-run without hitting already-dropped columns.
    """
    print("Starting feature engineering...")

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # --- A. Geographical Feature (Haversine Distance) ---
    df['distance_km'] = haversine_distance(df['rest_lat'], df['rest_lon'], df['delivery_lat'], df['delivery_lon'])
    df = df.drop(columns=['rest_lat', 'rest_lon', 'delivery_lat', 'delivery_lon'])

    # --- B. Temporal Features ---
    df['hour_of_day'] = df['order_time'].dt.hour
    df['day_of_week'] = df['order_time'].dt.dayofweek  # Monday=0, Sunday=6
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df = df.drop(columns=['order_time'])

    # --- C. Drop Intermediate/Leakage Features ---
    # We drop the actual delay/delivery time columns to prevent data leakage.
    # errors='ignore' lets the same function run on frames that never had
    # these columns (e.g. records to score at inference time).
    df = df.drop(
        columns=['delivery_delay_min', 'simulated_delivery_time_min', 'scheduled_delivery_time_min'],
        errors='ignore',
    )

    # The 'restaurant_id' is kept as a categorical feature, assuming the model needs to learn specific biases.

    # Define feature types for ColumnTransformer
    numerical_features = ['distance_km', 'rest_rating', 'temperature_c']
    categorical_features = ['restaurant_id', 'cuisine_type', 'weather_condition', 'traffic_intensity', 'day_of_week', 'hour_of_day', 'is_weekend']

    # --- D. Column Transformation Pipeline ---
    # Numerical features are scaled (StandardScaler)
    # Categorical features are one-hot encoded (OneHotEncoder)
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
        ],
        remainder='drop'  # Drop any other columns not specified
    )

    return df, preprocessor

# --- 3. Model Training and Evaluation (XGBoost) ---
def train_and_evaluate(X, y, preprocessor):
    """Trains the XGBoost model and evaluates performance.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the columns the preprocessor selects.
    y : pandas.Series
        Binary target (1 = late, 0 = on-time/early).
    preprocessor : ColumnTransformer
        Unfitted preprocessing step; it is fitted on the training split only,
        as the first stage of the returned pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (preprocessor + XGBoost classifier).

    Side effects: prints the classification report, ROC-AUC score and the
    ten largest feature importances for the held-out 20% test split.
    """
    print("Splitting data and training model...")

    # Split the data (stratified so both splits keep the same late rate)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )

    # Initialize the model (XGBoost Classifier).
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # ignores it and emits a "Parameters ... are not used" warning.
    xgb_model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    # Create a full pipeline (Preprocessing + Model)
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', xgb_model)
    ])

    # Train the model
    full_pipeline.fit(X_train, y_train)

    # Predict on the test set
    y_pred = full_pipeline.predict(X_test)
    y_proba = full_pipeline.predict_proba(X_test)[:, 1]  # probability of the positive (late) class

    print("\n" + "="*50)
    print("XGBoost Model Evaluation")
    print("="*50)

    # Classification Report
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # ROC-AUC Score
    auc_score = roc_auc_score(y_test, y_proba)
    print(f"ROC-AUC Score: {auc_score:.4f}")

    # Feature Importance (Extract feature names after one-hot encoding)
    feature_names = full_pipeline.named_steps['preprocessor'].get_feature_names_out()
    feature_importance = pd.Series(
        full_pipeline.named_steps['classifier'].feature_importances_,
        index=feature_names,
    )

    print("\nTop 10 Feature Importances:")
    print(feature_importance.nlargest(10).to_string())
    print("="*50)

    return full_pipeline

# --- Main Execution ---
if __name__ == "__main__":
    # Step 1: build the synthetic delivery log.
    df_raw = generate_synthetic_data()

    # Separate the binary target from the candidate features.
    y = df_raw['is_late']
    X = df_raw.drop(columns=['is_late'])

    # Step 2: engineer features and obtain the (unfitted) column transformer.
    X_processed, preprocessor = preprocess_data(X)

    # Step 3: fit the pipeline and print its evaluation metrics.
    model = train_and_evaluate(X_processed, y, preprocessor)

    print("\nModel training and evaluation complete.")
    # To persist the fitted pipeline in a real application, joblib is one option:
    # import joblib
    # joblib.dump(model, 'delivery_predictor_model.pkl')

Generating 10000 synthetic delivery records...
Data generation complete. Late rate: 0.27
Starting feature engineering...
Splitting data and training model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Model Evaluation

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.92      0.88      1452
           1       0.74      0.56      0.63       548

    accuracy                           0.82      2000
   macro avg       0.79      0.74      0.76      2000
weighted avg       0.82      0.82      0.82      2000

ROC-AUC Score: 0.8634

Top 10 Feature Importances:
cat__traffic_intensity_Moderate      0.191245
cat__traffic_intensity_Low           0.127203
cat__traffic_intensity_Extreme       0.120158
cat__weather_condition_Snow          0.080130
cat__weather_condition_Clear         0.079677
cat__traffic_intensity_High          0.062292
cat__weather_condition_Heavy Rain    0.042041
cat__weather_condition_Rain          0.034892
cat__weather_condition_Fog           0.028177
cat__hour_of_day_4                   0.007947

Model training and evaluation complete.


In [2]:
!pip3 freeze> requirements.txt


In [38]:
import sys

# List the name of every module currently loaded in this kernel.
loaded_module_names = sys.modules.keys()
print(loaded_module_names)



In [39]:
# Prints the repr of the pipeline's bound `predict` method (which embeds the
# pipeline's own repr); it does not run a prediction.
print(model.predict)

<bound method Pipeline.predict of Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['distance_km', 'rest_rating',
                                                   'temperature_c']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False),
                                                  ['restaurant_id',
                                                   'cuisine_type',
                                                   'weather_condition',
                                                   'traffic_intensity',
                                                   'day_of_week', 'hour_of_day',
                                                   'is_weekend'])])),
                ('classifier',
              

In [32]:
import pickle

In [49]:
# Serialize the fitted pipeline to a binary file in the working directory.
with open('late_delivery_prediction', 'wb') as file:
    pickle.Pickler(file).dump(model)



In [50]:
# Reload the pipeline that was pickled above.
# The result is bound to `loaded_model` rather than `xgb`: `xgb` is the alias
# of the xgboost module, and overwriting it breaks `xgb.XGBClassifier` the
# next time the training code runs.
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
with open('late_delivery_prediction', 'rb') as file:
    loaded_model = pickle.load(file)