In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [2]:
df = pd.read_csv("Bookings.csv")

# Data Exploration

In [3]:
df.head()

Unnamed: 0,Date,Time,Booking_ID,Booking_Status,Customer_ID,Vehicle_Type,Pickup_Location,Drop_Location,V_TAT,C_TAT,...,Canceled_Rides_by_Driver,Incomplete_Rides,Incomplete_Rides_Reason,Booking_Value,Payment_Method,Ride_Distance,Driver_Ratings,Customer_Rating,Vehicle Images,Unnamed: 20
0,2024-07-26 14:00:00,14:00:00,CNR7153255142,Canceled by Driver,CID713523,Prime Sedan,Tumkur Road,RT Nagar,,,...,Personal & Car related issue,,,444,,0,,,#NAME?,
1,2024-07-25 22:20:00,22:20:00,CNR2940424040,Success,CID225428,Bike,Magadi Road,Varthur,203.0,30.0,...,,No,,158,Cash,13,4.1,4.0,#NAME?,
2,2024-07-30 19:59:00,19:59:00,CNR2982357879,Success,CID270156,Prime SUV,Sahakar Nagar,Varthur,238.0,130.0,...,,No,,386,UPI,40,4.2,4.8,#NAME?,
3,2024-07-22 3:15:00,03:15:00,CNR2395710036,Canceled by Customer,CID581320,eBike,HSR Layout,Vijayanagar,,,...,,,,384,,0,,,#NAME?,
4,2024-07-02 9:02:00,09:02:00,CNR1797421769,Success,CID939555,Mini,Rajajinagar,Chamarajpet,252.0,80.0,...,,No,,822,Credit Card,45,4.0,3.0,#NAME?,


In [4]:
df.shape

(103024, 21)

- V_TAT - Vehicle turnaround time (Time vehicle takes to pickup customer after accepting the ride.)
- C_TAT - Customer turn around time (Time taken by customer to board cab after cab is reached at location.)


In [5]:
df.describe()

Unnamed: 0,V_TAT,C_TAT,Booking_Value,Ride_Distance,Driver_Ratings,Customer_Rating,Unnamed: 20
count,63967.0,63967.0,103024.0,103024.0,63967.0,63967.0,0.0
mean,170.876952,84.873372,548.751883,14.189927,3.997457,3.998313,
std,80.80364,36.0051,536.541221,15.77627,0.576834,0.578957,
min,35.0,25.0,100.0,0.0,3.0,3.0,
25%,98.0,55.0,242.0,0.0,3.5,3.5,
50%,168.0,85.0,386.0,8.0,4.0,4.0,
75%,238.0,115.0,621.0,26.0,4.5,4.5,
max,308.0,145.0,2999.0,49.0,5.0,5.0,


- V_TAT = mean = 50%val (distribution is normal)
- C_TAT = mean = 50%val (distribution is normal)
- Booking_value = Has some outliers.
- Ride_distance = Has some outliers.
- Driver_ratings = Almost driver ratings are same.
- Customer_ratings = Almost customer ratings are same. 
#### Numerical colums (Above metioned has almost no effect on bookings.)

In [6]:
# count missing values in each column
df.isnull().sum().reset_index()
null_percent = (df.isnull().sum() / len(df)) * 100
null_percent

Date                            0.000000
Time                            0.000000
Booking_ID                      0.000000
Booking_Status                  0.000000
Customer_ID                     0.000000
Vehicle_Type                    0.000000
Pickup_Location                 0.000000
Drop_Location                   0.000000
V_TAT                          37.910584
C_TAT                          37.910584
Canceled_Rides_by_Customer     89.809171
Canceled_Rides_by_Driver       82.107082
Incomplete_Rides               37.910584
Incomplete_Rides_Reason        96.189237
Booking_Value                   0.000000
Payment_Method                 37.910584
Ride_Distance                   0.000000
Driver_Ratings                 37.910584
Customer_Rating                37.910584
Vehicle Images                  0.000000
Unnamed: 20                   100.000000
dtype: float64

## DATA PREPROCESSING


### Cleaning Opportunities 
1. Filling null values in V_TAT and C_TAT column 
2. Date column contains Date + Time and datatype is object.
3. Null values in driver rating columns.
4. Customer rating column null values. 
5. Vehicle images and Unnamed column is not needed.
6. Time columns values are in object format.


In [7]:
# Removing vehicle images and unnamed column 
df.drop(columns={"Vehicle Images","Unnamed: 20"}, inplace=True)

1. Total 10499 rides were cancelled by customer
2. Total 18434 rides were cancelled by drivers 
3. Total 10124 rides were not executed due to other reasons. 

In [8]:
df['Date'] = df['Date'].str.split(' ').str[0]

In [9]:
df['Date'] = pd.to_datetime(df['Date'])

In [10]:
# lets replace booking status column with succes = 1 and cancelled by drivers/riders = 0
df['Booking_Status']=df['Booking_Status'].replace({'Success':1,'Canceled by Driver':0,'Canceled by Customer':0})
df

Unnamed: 0,Date,Time,Booking_ID,Booking_Status,Customer_ID,Vehicle_Type,Pickup_Location,Drop_Location,V_TAT,C_TAT,Canceled_Rides_by_Customer,Canceled_Rides_by_Driver,Incomplete_Rides,Incomplete_Rides_Reason,Booking_Value,Payment_Method,Ride_Distance,Driver_Ratings,Customer_Rating
0,2024-07-26,14:00:00,CNR7153255142,0,CID713523,Prime Sedan,Tumkur Road,RT Nagar,,,,Personal & Car related issue,,,444,,0,,
1,2024-07-25,22:20:00,CNR2940424040,1,CID225428,Bike,Magadi Road,Varthur,203.0,30.0,,,No,,158,Cash,13,4.1,4.0
2,2024-07-30,19:59:00,CNR2982357879,1,CID270156,Prime SUV,Sahakar Nagar,Varthur,238.0,130.0,,,No,,386,UPI,40,4.2,4.8
3,2024-07-22,03:15:00,CNR2395710036,0,CID581320,eBike,HSR Layout,Vijayanagar,,,Driver is not moving towards pickup location,,,,384,,0,,
4,2024-07-02,09:02:00,CNR1797421769,1,CID939555,Mini,Rajajinagar,Chamarajpet,252.0,80.0,,,No,,822,Credit Card,45,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103019,2024-07-31,09:06:00,CNR9488489435,1,CID371654,Prime Plus,Richmond Town,Varthur,245.0,35.0,,,No,,111,Cash,41,3.6,3.8
103020,2024-07-31,15:12:00,CNR3151743100,1,CID334158,Auto,Vijayanagar,Richmond Town,84.0,145.0,,,No,,1097,UPI,17,4.3,3.3
103021,2024-07-31,13:59:00,CNR1286151233,1,CID113188,Prime SUV,Bannerghatta Road,JP Nagar,35.0,75.0,,,No,,2201,Cash,37,3.6,3.2
103022,2024-07-31,14:56:00,CNR2027162035,1,CID118301,eBike,Indiranagar,Magadi Road,210.0,140.0,,,No,,267,UPI,47,3.4,3.1


In [11]:
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S')


In [27]:
new_df=df[['Date','Time','Booking_ID','Booking_Status','Customer_ID','Vehicle_Type','Pickup_Location','Drop_Location','V_TAT','C_TAT','Canceled_Rides_by_Customer','Canceled_Rides_by_Driver','Booking_Value','Payment_Method','Ride_Distance','Driver_Ratings','Customer_Rating']]
new_df

Unnamed: 0,Date,Time,Booking_ID,Booking_Status,Customer_ID,Vehicle_Type,Pickup_Location,Drop_Location,V_TAT,C_TAT,Canceled_Rides_by_Customer,Canceled_Rides_by_Driver,Booking_Value,Payment_Method,Ride_Distance,Driver_Ratings,Customer_Rating
0,2024-07-26,1900-01-01 14:00:00,CNR7153255142,0,CID713523,Prime Sedan,Tumkur Road,RT Nagar,,,,Personal & Car related issue,444,,0,,
1,2024-07-25,1900-01-01 22:20:00,CNR2940424040,1,CID225428,Bike,Magadi Road,Varthur,203.0,30.0,,,158,Cash,13,4.1,4.0
2,2024-07-30,1900-01-01 19:59:00,CNR2982357879,1,CID270156,Prime SUV,Sahakar Nagar,Varthur,238.0,130.0,,,386,UPI,40,4.2,4.8
3,2024-07-22,1900-01-01 03:15:00,CNR2395710036,0,CID581320,eBike,HSR Layout,Vijayanagar,,,Driver is not moving towards pickup location,,384,,0,,
4,2024-07-02,1900-01-01 09:02:00,CNR1797421769,1,CID939555,Mini,Rajajinagar,Chamarajpet,252.0,80.0,,,822,Credit Card,45,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103019,2024-07-31,1900-01-01 09:06:00,CNR9488489435,1,CID371654,Prime Plus,Richmond Town,Varthur,245.0,35.0,,,111,Cash,41,3.6,3.8
103020,2024-07-31,1900-01-01 15:12:00,CNR3151743100,1,CID334158,Auto,Vijayanagar,Richmond Town,84.0,145.0,,,1097,UPI,17,4.3,3.3
103021,2024-07-31,1900-01-01 13:59:00,CNR1286151233,1,CID113188,Prime SUV,Bannerghatta Road,JP Nagar,35.0,75.0,,,2201,Cash,37,3.6,3.2
103022,2024-07-31,1900-01-01 14:56:00,CNR2027162035,1,CID118301,eBike,Indiranagar,Magadi Road,210.0,140.0,,,267,UPI,47,3.4,3.1


- cancelled_rides_by_customers = cancelled = 1 , not_cancelled = 0
- cancelled_rides_by_drivers = cancelled = 1 , not_cancelled = 0
- Payment_method = replace NaN values with 0 i.e. no payment done.
- Driver ratings - replae Nan values with 0. (Data is showing almost normal distribution)
- Customer ratings - replace NaN values with 0. (Data is showing almost normal distribution.)


In [14]:
# Cancelled_rides column replacement
new_df['Canceled_Rides_by_Customer'] = new_df['Canceled_Rides_by_Customer'].notna().astype(int)
new_df['Canceled_Rides_by_Driver'] = new_df['Canceled_Rides_by_Driver'].notna().astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Canceled_Rides_by_Customer'] = new_df['Canceled_Rides_by_Customer'].notna().astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Canceled_Rides_by_Driver'] = new_df['Canceled_Rides_by_Driver'].notna().astype(int)


In [15]:
new_df

Unnamed: 0,Date,Time,Booking_ID,Booking_Status,Customer_ID,Vehicle_Type,Pickup_Location,Drop_Location,V_TAT,C_TAT,Canceled_Rides_by_Customer,Canceled_Rides_by_Driver,Booking_Value,Payment_Method,Ride_Distance,Driver_Ratings,Customer_Rating
0,2024-07-26,1900-01-01 14:00:00,CNR7153255142,0,CID713523,Prime Sedan,Tumkur Road,RT Nagar,,,0,1,444,,0,,
1,2024-07-25,1900-01-01 22:20:00,CNR2940424040,1,CID225428,Bike,Magadi Road,Varthur,203.0,30.0,0,0,158,Cash,13,4.1,4.0
2,2024-07-30,1900-01-01 19:59:00,CNR2982357879,1,CID270156,Prime SUV,Sahakar Nagar,Varthur,238.0,130.0,0,0,386,UPI,40,4.2,4.8
3,2024-07-22,1900-01-01 03:15:00,CNR2395710036,0,CID581320,eBike,HSR Layout,Vijayanagar,,,1,0,384,,0,,
4,2024-07-02,1900-01-01 09:02:00,CNR1797421769,1,CID939555,Mini,Rajajinagar,Chamarajpet,252.0,80.0,0,0,822,Credit Card,45,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103019,2024-07-31,1900-01-01 09:06:00,CNR9488489435,1,CID371654,Prime Plus,Richmond Town,Varthur,245.0,35.0,0,0,111,Cash,41,3.6,3.8
103020,2024-07-31,1900-01-01 15:12:00,CNR3151743100,1,CID334158,Auto,Vijayanagar,Richmond Town,84.0,145.0,0,0,1097,UPI,17,4.3,3.3
103021,2024-07-31,1900-01-01 13:59:00,CNR1286151233,1,CID113188,Prime SUV,Bannerghatta Road,JP Nagar,35.0,75.0,0,0,2201,Cash,37,3.6,3.2
103022,2024-07-31,1900-01-01 14:56:00,CNR2027162035,1,CID118301,eBike,Indiranagar,Magadi Road,210.0,140.0,0,0,267,UPI,47,3.4,3.1


In [16]:
# Driver and customer ratings column fill with 0 for NaN
new_df['Driver_Ratings'] = new_df['Driver_Ratings'].fillna(0)
new_df['Customer_Rating'] = new_df['Customer_Rating'].fillna(0)
new_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Driver_Ratings'] = new_df['Driver_Ratings'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Customer_Rating'] = new_df['Customer_Rating'].fillna(0)


Unnamed: 0,Date,Time,Booking_ID,Booking_Status,Customer_ID,Vehicle_Type,Pickup_Location,Drop_Location,V_TAT,C_TAT,Canceled_Rides_by_Customer,Canceled_Rides_by_Driver,Booking_Value,Payment_Method,Ride_Distance,Driver_Ratings,Customer_Rating
0,2024-07-26,1900-01-01 14:00:00,CNR7153255142,0,CID713523,Prime Sedan,Tumkur Road,RT Nagar,,,0,1,444,,0,0.0,0.0
1,2024-07-25,1900-01-01 22:20:00,CNR2940424040,1,CID225428,Bike,Magadi Road,Varthur,203.0,30.0,0,0,158,Cash,13,4.1,4.0
2,2024-07-30,1900-01-01 19:59:00,CNR2982357879,1,CID270156,Prime SUV,Sahakar Nagar,Varthur,238.0,130.0,0,0,386,UPI,40,4.2,4.8
3,2024-07-22,1900-01-01 03:15:00,CNR2395710036,0,CID581320,eBike,HSR Layout,Vijayanagar,,,1,0,384,,0,0.0,0.0
4,2024-07-02,1900-01-01 09:02:00,CNR1797421769,1,CID939555,Mini,Rajajinagar,Chamarajpet,252.0,80.0,0,0,822,Credit Card,45,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103019,2024-07-31,1900-01-01 09:06:00,CNR9488489435,1,CID371654,Prime Plus,Richmond Town,Varthur,245.0,35.0,0,0,111,Cash,41,3.6,3.8
103020,2024-07-31,1900-01-01 15:12:00,CNR3151743100,1,CID334158,Auto,Vijayanagar,Richmond Town,84.0,145.0,0,0,1097,UPI,17,4.3,3.3
103021,2024-07-31,1900-01-01 13:59:00,CNR1286151233,1,CID113188,Prime SUV,Bannerghatta Road,JP Nagar,35.0,75.0,0,0,2201,Cash,37,3.6,3.2
103022,2024-07-31,1900-01-01 14:56:00,CNR2027162035,1,CID118301,eBike,Indiranagar,Magadi Road,210.0,140.0,0,0,267,UPI,47,3.4,3.1


In [28]:
new_df.describe()

Unnamed: 0,Date,Time,V_TAT,C_TAT,Booking_Value,Ride_Distance,Driver_Ratings,Customer_Rating
count,103024,103024,63967.0,63967.0,103024.0,103024.0,63967.0,63967.0
mean,2024-07-15 23:33:01.425687040,1900-01-01 11:58:37.453991424,170.876952,84.873372,548.751883,14.189927,3.997457,3.998313
min,2024-07-01 00:00:00,1900-01-01 00:00:00,35.0,25.0,100.0,0.0,3.0,3.0
25%,2024-07-08 00:00:00,1900-01-01 06:01:00,98.0,55.0,242.0,0.0,3.5,3.5
50%,2024-07-16 00:00:00,1900-01-01 11:58:00,168.0,85.0,386.0,8.0,4.0,4.0
75%,2024-07-24 00:00:00,1900-01-01 17:57:00,238.0,115.0,621.0,26.0,4.5,4.5
max,2024-07-31 00:00:00,1900-01-01 23:59:00,308.0,145.0,2999.0,49.0,5.0,5.0
std,,,80.80364,36.0051,536.541221,15.77627,0.576834,0.578957


# Univariate Analysis

In [31]:
# Seperating numerical and categorical features
numerical_col = new_df[["V_TAT","C_TAT","Canceled_Rides_by_Customer","Canceled_Rides_by_Driver","Booking_Value","Ride_Distance","Driver_Ratings","Customer_Rating"]]
categorical_col = new_df[['Vehicle_Type','Pickup_Location']]

In [32]:
numerical_col.describe()

Unnamed: 0,V_TAT,C_TAT,Booking_Value,Ride_Distance,Driver_Ratings,Customer_Rating
count,63967.0,63967.0,103024.0,103024.0,63967.0,63967.0
mean,170.876952,84.873372,548.751883,14.189927,3.997457,3.998313
std,80.80364,36.0051,536.541221,15.77627,0.576834,0.578957
min,35.0,25.0,100.0,0.0,3.0,3.0
25%,98.0,55.0,242.0,0.0,3.5,3.5
50%,168.0,85.0,386.0,8.0,4.0,4.0
75%,238.0,115.0,621.0,26.0,4.5,4.5
max,308.0,145.0,2999.0,49.0,5.0,5.0
