In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the preprocessed Rapido dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv("../data/preprocessed_rapido_dataset.csv")

In [3]:
# Show the column names of the loaded frame.
df.columns

Index(['booking_id', 'day_of_week', 'is_weekend', 'hour_of_day', 'city',
       'pickup_location', 'drop_location', 'vehicle_type', 'ride_distance_km',
       'estimated_ride_time_min', 'actual_ride_time_min', 'traffic_level',
       'weather_condition', 'base_fare', 'surge_multiplier', 'booking_value',
       'booking_status', 'incomplete_ride_reason', 'customer_id', 'driver_id',
       'preferred_vehicle_type', 'cancellation_rate', 'avg_customer_rating',
       'customer_cancel_flag', 'driver_age', 'driver_city',
       'driver_experience_years', 'delay_count', 'acceptance_rate',
       'avg_driver_rating', 'avg_pickup_delay_min', 'driver_delay_flag',
       'total_requests', 'completed_rides', 'cancelled_rides',
       'avg_wait_time_min', 'avg_surge_multiplier', 'demand_level',
       'peak_hour_flag', 'fare_per_km', 'fare_per_min', 'long_distance_flag',
       'city_pair', 'driver_reliability_score', 'customer_loyalty_score',
       'booking_datetime'],
      dtype='object')

Customer Cancellation Risk Model (Binary Classification)
Predict the probability that a customer will cancel a booking using:

1. Historical cancellation rate
2. Past ratings
3. Peak-time behavior
4. Pricing sensitivity


In [4]:
# Peek at five random rows. The seed is fixed (same value used for the
# train/test split and the estimators) so that a fresh
# Restart & Run All shows the same rows every time.
df.sample(5, random_state=42)

Unnamed: 0,booking_id,day_of_week,is_weekend,hour_of_day,city,pickup_location,drop_location,vehicle_type,ride_distance_km,estimated_ride_time_min,...,avg_surge_multiplier,demand_level,peak_hour_flag,fare_per_km,fare_per_min,long_distance_flag,city_pair,driver_reliability_score,customer_loyalty_score,booking_datetime
9840,B_009841,Wednesday,0,13,Bangalore,Loc_5,Loc_39,Cab,4.19,18.87,...,1.57,Medium,0,55.2,12.26,0,Loc_5_Loc_39,85.93,68.45,2025-11-12 13:23:00
67356,B_067357,Friday,0,18,Bangalore,Loc_25,Loc_49,Cab,24.81,163.74,...,1.88,Low,1,36.98,5.6,1,Loc_25_Loc_49,88.18,85.47,2025-04-04 18:17:00
10251,B_010252,Monday,0,5,Delhi,Loc_47,Loc_20,Auto,10.9,32.7,...,1.3,Low,0,19.72,6.57,0,Loc_47_Loc_20,91.93,87.57,2025-06-23 05:13:00
17806,B_017807,Friday,0,3,Delhi,Loc_6,Loc_31,Auto,21.94,144.83,...,1.44,Low,0,22.81,3.46,1,Loc_6_Loc_31,85.78,70.53,2025-09-19 03:25:00
75073,B_075074,Monday,0,7,Bangalore,Loc_17,Loc_6,Cab,10.89,71.85,...,1.5,Low,1,37.09,5.62,0,Loc_17_Loc_6,82.03,86.31,2025-09-22 07:23:00


In [6]:
# Feature engineering
#
# Boolean column that is True only for bookings that were made in a peak
# hour AND whose booking_status is 'Cancelled'.
# NOTE(review): this flag is built from the booking's own status, so it
# carries outcome information -- confirm it is appropriate as a predictor.
is_peak_hour = df['peak_hour_flag'].eq(1)
was_cancelled = df['booking_status'].eq('Cancelled')
df['peak_hour_cancellation_flag'] = is_peak_hour & was_cancelled


In [7]:
# Re-list the columns to confirm 'peak_hour_cancellation_flag' was added.
df.columns

Index(['booking_id', 'day_of_week', 'is_weekend', 'hour_of_day', 'city',
       'pickup_location', 'drop_location', 'vehicle_type', 'ride_distance_km',
       'estimated_ride_time_min', 'actual_ride_time_min', 'traffic_level',
       'weather_condition', 'base_fare', 'surge_multiplier', 'booking_value',
       'booking_status', 'incomplete_ride_reason', 'customer_id', 'driver_id',
       'preferred_vehicle_type', 'cancellation_rate', 'avg_customer_rating',
       'customer_cancel_flag', 'driver_age', 'driver_city',
       'driver_experience_years', 'delay_count', 'acceptance_rate',
       'avg_driver_rating', 'avg_pickup_delay_min', 'driver_delay_flag',
       'total_requests', 'completed_rides', 'cancelled_rides',
       'avg_wait_time_min', 'avg_surge_multiplier', 'demand_level',
       'peak_hour_flag', 'fare_per_km', 'fare_per_min', 'long_distance_flag',
       'city_pair', 'driver_reliability_score', 'customer_loyalty_score',
       'booking_datetime', 'peak_hour_cancellation_flag'],
      dtype='object')

In [18]:
# Feature Selection
#
# Columns kept for modelling. The list order is preserved because it fixes
# the column order of the reduced frame. 'customer_cancel_flag' is the label
# and is separated from the predictors in the train/test split cell.
# NOTE(review): 'peak_hour_cancellation_flag' was computed from
# booking_status in the feature-engineering cell -- confirm it does not leak
# the outcome into the predictors.
selected_features = [
    # timing
    'hour_of_day', 'peak_hour_flag', 'is_weekend',
    # pricing
    'surge_multiplier',
    'peak_hour_cancellation_flag',
    'fare_per_km', 'fare_per_min',
    # ride context
    'traffic_level', 'weather_condition',
    # customer history
    'cancellation_rate', 'avg_customer_rating',
    # label
    'customer_cancel_flag',
]

# Narrow the frame to the chosen columns (this rebinds `df`; the full frame
# is no longer reachable without re-running the load cell).
df = df.loc[:, selected_features]


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

In [20]:
# Split the modelling frame into the label (y) and the predictors (X).
y = df['customer_cancel_flag']
X = df.drop(columns=['customer_cancel_flag'])

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [21]:
# Continuous inputs. They are standardised for the linear model and passed
# through unchanged for the tree-based models.
# NOTE(review): every model in this notebook scores 1.00 on the test split.
# If 'customer_cancel_flag' is a deterministic function of
# 'cancellation_rate' (e.g. a threshold), that column leaks the label too --
# TODO verify against the preprocessing step that created the flag.
numeric_features = [
    'hour_of_day',
    'surge_multiplier',
    'fare_per_km',
    'fare_per_min',
    'cancellation_rate',
    'avg_customer_rating'
]

# Nominal / binary inputs, one-hot encoded.
# 'peak_hour_cancellation_flag' is intentionally NOT listed: it is computed
# as (peak_hour_flag == 1) & (booking_status == 'Cancelled'), i.e. from the
# outcome of the very booking being scored. That information does not exist
# at prediction time, so feeding it to a model is target leakage.
categorical_features = [
    'peak_hour_flag',
    'is_weekend',
    'weather_condition'
]

# Ordered categories, encoded as Low=0 < Medium=1 < High=2.
ordinal_features = ['traffic_level']

# Preprocessing for the linear model: scale numerics, encode categoricals.
# remainder='drop' guarantees that only the columns named above reach the
# estimator, so any extra column still present in X (such as the leaky
# 'peak_hour_cancellation_flag') is discarded instead of silently passed on.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ord', OrdinalEncoder(categories=[['Low', 'Medium', 'High']]), ordinal_features)
    ],
    remainder='drop'
)

# Preprocessing for the tree-based models: same encoders, but the numeric
# columns are forwarded unscaled through an explicit 'passthrough' step
# rather than relying on the remainder, so the feature set is identical to
# the one used by `preprocessor`.
dt_rf_preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ord', OrdinalEncoder(categories=[['Low', 'Medium', 'High']]), ordinal_features)
    ],
    remainder='drop'
)

In [22]:
# Logistic Regression Pipeline
#
# Preprocessing and the classifier are chained in one Pipeline, so the
# preprocessing steps are fitted on the training split only.
logistic_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'classifier',
            LogisticRegression(
                random_state=42,
                class_weight='balanced',
                max_iter=1000,
            ),
        ),
    ]
)

# Pipeline.fit returns the pipeline itself, so `lg` is the fitted pipeline.
lg = logistic_pipeline.fit(X_train, y_train)
y_pred_lg = lg.predict(X_test)

# Full per-class report and confusion matrix first ...
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lg))
print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lg))

# ... then the headline scalar metrics, one per line.
for label, scorer in (
    ("Logistic Regression Accuracy:", accuracy_score),
    ("Logistic Regression Recall:", recall_score),
    ("Logistic Regression Precision:", precision_score),
    ("Logistic Regression F1 Score:", f1_score),
):
    print(label, scorer(y_test, y_pred_lg))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9135
           1       1.00      1.00      1.00     10865

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Logistic Regression Confusion Matrix:
[[ 9135     0]
 [    0 10865]]
Logistic Regression Accuracy: 1.0
Logistic Regression Recall: 1.0
Logistic Regression Precision: 1.0
Logistic Regression F1 Score: 1.0


In [23]:
# Decision Tree Pipeline
#
# Uses the tree-specific preprocessor (dt_rf_preprocessor) chained with a
# seeded, class-weighted decision tree.
dt_pipeline = Pipeline(
    steps=[
        ('preprocessor', dt_rf_preprocessor),
        (
            'classifier',
            DecisionTreeClassifier(
                random_state=42,
                class_weight='balanced',
            ),
        ),
    ]
)

# Pipeline.fit returns the pipeline itself, so `dt` is the fitted pipeline.
dt = dt_pipeline.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Per-class report and confusion matrix on the held-out split.
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))
print("Decision Tree Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))

# Headline scalar metrics, one per line.
for label, scorer in (
    ("Decision Tree Accuracy:", accuracy_score),
    ("Decision Tree Recall:", recall_score),
    ("Decision Tree Precision:", precision_score),
    ("Decision Tree F1 Score:", f1_score),
):
    print(label, scorer(y_test, y_pred_dt))


Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9135
           1       1.00      1.00      1.00     10865

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Decision Tree Confusion Matrix:
[[ 9135     0]
 [    0 10865]]
Decision Tree Accuracy: 1.0
Decision Tree Recall: 1.0
Decision Tree Precision: 1.0
Decision Tree F1 Score: 1.0


In [24]:
# Random Forest Pipeline
#
# Same tree-specific preprocessor as the decision tree, chained with a
# seeded, class-weighted random forest.
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', dt_rf_preprocessor),
        (
            'classifier',
            RandomForestClassifier(
                random_state=42,
                class_weight='balanced',
            ),
        ),
    ]
)

# Pipeline.fit returns the pipeline itself, so `rf` is the fitted pipeline.
rf = rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Per-class report and confusion matrix on the held-out split.
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

# Headline scalar metrics, one per line.
for label, scorer in (
    ("Random Forest Accuracy:", accuracy_score),
    ("Random Forest Recall:", recall_score),
    ("Random Forest Precision:", precision_score),
    ("Random Forest F1 Score:", f1_score),
):
    print(label, scorer(y_test, y_pred_rf))


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9135
           1       1.00      1.00      1.00     10865

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Random Forest Confusion Matrix:
[[ 9135     0]
 [    0 10865]]
Random Forest Accuracy: 1.0
Random Forest Recall: 1.0
Random Forest Precision: 1.0
Random Forest F1 Score: 1.0
