In [5]:
import pandas as pd

# Cleaning pipeline for the raw booking export: load, normalise text, parse
# dates/times, derive calendar features, fill numeric gaps, drop rows that
# lack key fields, then write the cleaned frame to CSV.

# Input/output locations, kept in one place so they are easy to change.
# NOTE(review): the input is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
RAW_CSV_PATH = r'C:\Users\adith\OneDrive\Desktop\Project\New folder\Uber_dataset.csv'
CLEANED_CSV_PATH = "Uber_Cleaned.csv"

# 1. Load data
df = pd.read_csv(RAW_CSV_PATH)

# 2. Trim whitespace and standardize text
df.columns = df.columns.str.strip()
df['Booking Status'] = df['Booking Status'].str.strip().str.title()

# 3. Convert data types safely
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Parse the clock time into a temporary Series instead of overwriting the
# column. A time-only format yields timestamps with a placeholder 1900-01-01
# date, and '%H:%M' carries no seconds, so writing the parsed values back
# would put that placeholder date and truncated times into the saved CSV.
parsed_time = pd.to_datetime(df['Time'], errors='coerce', format='%H:%M', exact=False)

# Keep the original time text for rows that parsed; blank out the rest so
# unparseable values still end up missing, as errors='coerce' produced before.
df['Time'] = df['Time'].where(parsed_time.notna())

# 4. Create derived columns (hour, day, month)
df['Hour'] = parsed_time.dt.hour
df['Day'] = df['Date'].dt.day_name()
df['Month'] = df['Date'].dt.month_name()

# 5. Fill missing numeric values
# NOTE(review): after this step a missing value cannot be told apart from a
# real 0, and the zeros will lower any mean computed later - confirm intended.
df['Booking Value'] = pd.to_numeric(df['Booking Value'], errors='coerce').fillna(0)
df['Ride Distance'] = pd.to_numeric(df['Ride Distance'], errors='coerce').fillna(0)

# 6. Handle missing critical IDs only
df = df.dropna(subset=['Booking ID', 'Booking Status'])

# 7. Save cleaned data (relative path: lands in the kernel's working directory)
df.to_csv(CLEANED_CSV_PATH, index=False)


In [6]:
# Quick sanity report on the cleaned frame: dimensions first, then the
# frequency of each category in the two columns below, then the number of
# missing values per column.
print(df.shape)
for category_column in ('Booking Status', 'Vehicle Type'):
    category_counts = df[category_column].value_counts()
    print(category_counts)
missing_per_column = df.isna().sum()
print(missing_per_column)


(150000, 24)
Booking Status
Completed                93000
Cancelled By Driver      27000
No Driver Found          10500
Cancelled By Customer    10500
Incomplete                9000
Name: count, dtype: int64
Vehicle Type
Auto             37419
Go Mini          29806
Go Sedan         27141
Bike             22517
Premier Sedan    18111
eBike            10557
Uber XL           4449
Name: count, dtype: int64
Date                                      0
Time                                      0
Booking ID                                0
Booking Status                            0
Customer ID                               0
Vehicle Type                              0
Pickup Location                           0
Drop Location                             0
Avg VTAT                              10500
Avg CTAT                              48000
Cancelled Rides by Customer          139500
Reason for cancelling by Customer    139500
Cancelled Rides by Driver            123000
Driver Cancellation

In [7]:
# Write the current frame to a second CSV at an absolute location, leaving
# the index out of the file.
cleaned_copy_path = r"C:\Users\adith\OneDrive\Desktop\Project\Uber_Cleaned1.csv"
df.to_csv(cleaned_copy_path, index=False)
