In [44]:
import pandas as pd
import numpy as np

In [45]:
# Load the preprocessed ride dataset from its relative path into a DataFrame.
dataset_path = '../data/preprocessed_rapido_dataset.csv'
df = pd.read_csv(dataset_path)

Driver Delay Prediction Model (Binary Classification)
Predict whether a driver is likely to cause delays or incomplete rides based on:

1. Past delay history
2. Traffic exposure
3. Acceptance behavior


In [46]:
# Display the column labels of the loaded frame.
df.columns

Index(['booking_id', 'day_of_week', 'is_weekend', 'hour_of_day', 'city',
       'pickup_location', 'drop_location', 'vehicle_type', 'ride_distance_km',
       'estimated_ride_time_min', 'actual_ride_time_min', 'traffic_level',
       'weather_condition', 'base_fare', 'surge_multiplier', 'booking_value',
       'booking_status', 'incomplete_ride_reason', 'customer_id', 'driver_id',
       'preferred_vehicle_type', 'cancellation_rate', 'avg_customer_rating',
       'customer_cancel_flag', 'driver_age', 'driver_city',
       'driver_experience_years', 'delay_count', 'acceptance_rate',
       'avg_driver_rating', 'avg_pickup_delay_min', 'driver_delay_flag',
       'total_requests', 'completed_rides', 'cancelled_rides',
       'avg_wait_time_min', 'avg_surge_multiplier', 'demand_level',
       'peak_hour_flag', 'fare_per_km', 'fare_per_min', 'long_distance_flag',
       'city_pair', 'driver_reliability_score', 'customer_loyalty_score',
       'booking_datetime'],
      dtype='object')

In [47]:
# Feature Selection
# Nine predictor columns followed by the target ('driver_delay_flag').
# The order is kept stable because downstream passthrough columns follow it.
# NOTE(review): 'delay_count' and 'avg_pickup_delay_min' are selected next to
# the 'driver_delay_flag' target. If the flag was derived from either column
# this is target leakage -- confirm how the flag was built before trusting scores.
selected_features = [
    'driver_age', 'driver_experience_years', 'vehicle_type',
    'acceptance_rate', 'delay_count', 'avg_driver_rating',
    'avg_pickup_delay_min', 'traffic_level', 'weather_condition',
    'driver_delay_flag',  # target
]

# Narrow the full frame down to just the modelling columns.
selected_df = df[selected_features]


In [48]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

In [49]:
# Split the modelling frame into predictors (X) and the target (y).
target_column = 'driver_delay_flag'
y = selected_df[target_column]
X = selected_df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; stratify on y and fix the seed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [50]:
# Column groups, one per preprocessing treatment.

# Numeric columns.
numerical_features = [
    'driver_age', 'driver_experience_years', 'acceptance_rate',
    'delay_count', 'avg_driver_rating', 'avg_pickup_delay_min',
]

# Categorical columns.
categorical_features = ['vehicle_type', 'weather_condition']

# Column handled as ordinal.
ordinal_features = ['traffic_level']

In [51]:
def _categorical_transformers():
    """Return fresh (name, transformer, columns) entries for the non-numeric columns."""
    # NOTE(review): OrdinalEncoder() is given no explicit `categories`, so the
    # integer codes follow the encoder's automatic category ordering rather than
    # a chosen level order -- confirm that matches the intended traffic_level ranking.
    return [
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ord', OrdinalEncoder(), ordinal_features),
    ]


# Preprocessing with scaling: standardise the numeric columns, then encode the rest.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        *_categorical_transformers(),
    ],
    remainder='passthrough',
)

# Preprocessing without scaling: only the encoders are listed, so the numeric
# columns reach the model unchanged through remainder='passthrough'.
dt_rf_preprocessor = ColumnTransformer(
    transformers=_categorical_transformers(),
    remainder='passthrough',
)


In [52]:
# Logistic Regression Pipeline
# Scaled/encoded features feed a class-weight-balanced logistic regression.
logistic_classifier = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight='balanced',
)
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', logistic_classifier),
])

print("Logistic Regression:")
logistic_pipeline.fit(X_train, y_train)
y_pred_logistic = logistic_pipeline.predict(X_test)

# Report on the held-out split: per-class table, confusion matrix, then headline scores.
print(classification_report(y_test, y_pred_logistic))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logistic))
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Recall:", recall_score),
    ("Precision:", precision_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_logistic))



Logistic Regression:
              precision    recall  f1-score   support

           0       1.00      0.84      0.92     17491
           1       0.48      0.99      0.65      2509

    accuracy                           0.86     20000
   macro avg       0.74      0.92      0.78     20000
weighted avg       0.93      0.86      0.88     20000

Confusion Matrix:
 [[14778  2713]
 [   20  2489]]
Accuracy: 0.86335
Recall: 0.9920286966919091
Precision: 0.4784698193002691
F1 Score: 0.645571261833744


In [53]:
# Decision Tree Pipeline
# Uses the preprocessor without a scaling step, with a class-weight-balanced tree.
# NOTE(review): the recorded run of this cell scores 1.0 on every held-out
# metric; that is worth checking for a predictor that directly encodes the target.
tree_classifier = DecisionTreeClassifier(random_state=42, class_weight='balanced')
dt_pipeline = Pipeline(steps=[
    ('preprocessor', dt_rf_preprocessor),
    ('classifier', tree_classifier),
])

print("Decision Tree:")
dt_pipeline.fit(X_train, y_train)
y_pred_dt = dt_pipeline.predict(X_test)

# Report on the held-out split: per-class table, confusion matrix, then headline scores.
print(classification_report(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Recall:", recall_score),
    ("Precision:", precision_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_dt))


Decision Tree:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     17491
           1       1.00      1.00      1.00      2509

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Confusion Matrix:
 [[17491     0]
 [    0  2509]]
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1 Score: 1.0


In [54]:
# RandomForest Pipeline
# Uses the preprocessor without a scaling step, with a class-weight-balanced forest.
forest_classifier = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_pipeline = Pipeline(steps=[
    ('preprocessor', dt_rf_preprocessor),
    ('classifier', forest_classifier),
])

print("Random Forest:")
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

# Report on the held-out split: per-class table, confusion matrix, then headline scores.
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Recall:", recall_score),
    ("Precision:", precision_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_rf))


Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     17491
           1       1.00      0.99      0.99      2509

    accuracy                           1.00     20000
   macro avg       1.00      0.99      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Confusion Matrix:
 [[17491     0]
 [   32  2477]]
Accuracy: 0.9984
Recall: 0.9872459147070546
Precision: 1.0
F1 Score: 0.9935820296831127
