In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [0]:
# Bind the uber_data_clean table to a DataFrame handle.
df = spark.table("uber_data_clean")

# Quick load check: row count (this triggers a Spark job) and column count.
print(
    f"✓ Records loaded: {df.count():,}",
    f"✓ Columns: {len(df.columns)}",
    sep="\n",
)


In [0]:
# Headline KPIs over every row of uber_data_clean: total bookings, the
# completed / cancelled / incomplete counts, and each count as a percentage
# of all bookings (rounded to 2 decimals).
# "Cancelled" = customer + driver cancellations; "Incomplete" = no driver
# found + incomplete rides.
# NOTE(review): the Is* columns are summed as numeric flags — presumably 0/1
# with no NULLs; confirm against the cleaning step.
overall_metrics_sql = """
SELECT 
    COUNT(*) as TotalBookings,
    SUM(IsCompleted) as Completed_rides,
    SUM(IsCustomerCancelled + IsDriverCancelled) as Cancelled_rides,
    SUM(IsNoDriverFound + IsIncomplete) as Incomplete_rides,
    ROUND(SUM(IsCompleted) / COUNT(*) * 100, 2) as SuccessRate_Percent,
    ROUND(SUM(IsCustomerCancelled + IsDriverCancelled) / COUNT(*) * 100, 2) as CancellationRate_Percent,
    ROUND(SUM(IsNoDriverFound + IsIncomplete) / COUNT(*) * 100, 2) as IncompleteRate_Percent
FROM uber_data_clean
"""
overall_metrics = spark.sql(overall_metrics_sql)

print("\n✓ OVERALL METRICS:")
overall_metrics.display()


In [0]:
# Per-VehicleType breakdown: booking outcomes by count, cancellation and
# completion rates (% of the group's bookings), booking value and ratings.
# Rows are ordered with the highest cancellation rate first.
# NOTE(review): the AVG(...) columns are computed over every booking in the
# group, not only completed rides — confirm that non-completed rows carry
# NULL (ignored by AVG) rather than 0 for BookingValue and the ratings.
vehicle_analysis_sql = """
SELECT 
    VehicleType,
    COUNT(*) as TotalBookings,
    SUM(IsCompleted) as CompletedBookings,
    SUM(IsCustomerCancelled + IsDriverCancelled) as CancelledBookings,
    SUM(IsCustomerCancelled) as CustomerCancellations,
    SUM(IsDriverCancelled) as DriverCancellations,
    SUM(IsNoDriverFound) as NoDriverFound,
    SUM(IsIncomplete) as IncompleteRides,
    ROUND(SUM(IsCustomerCancelled + IsDriverCancelled) / COUNT(*) * 100, 2) as CancellationRate_Percent,
    ROUND(SUM(IsCompleted) / COUNT(*) * 100, 2) as CompletedRate_Percent,
    ROUND(AVG(BookingValue), 2) as AvgBookingValue,
    ROUND(SUM(BookingValue), 0) as TotalRevenue,
    ROUND(AVG(DriverRating), 2) as AvgDriverRating,
    ROUND(AVG(CustomerRating), 2) as AvgCustomerRating
FROM uber_data_clean
GROUP BY VehicleType
ORDER BY CancellationRate_Percent DESC
"""
vehicle_analysis = spark.sql(vehicle_analysis_sql)

print("\n✓ CANCELLATION BY VEHICLE TYPE:")
vehicle_analysis.display()


In [0]:
# Per-Hour breakdown of bookings, cancellations (split by who cancelled),
# no-driver-found counts, cancellation rate, revenue and mean AvgVTAT.
# Rows whose Hour is NULL are excluded; output is sorted by Hour ascending.
# NOTE(review): AvgVTAT is reported as "AvgWaitTime_Min" — presumably
# minutes; confirm the unit upstream.
hourly_analysis_sql = """
SELECT 
    Hour,
    COUNT(*) as TotalBookings,
    SUM(IsCompleted) as CompletedBookings,
    SUM(IsCustomerCancelled + IsDriverCancelled) as CancelledBookings,
    SUM(IsCustomerCancelled) as CustomerCancellations,
    SUM(IsDriverCancelled) as DriverCancellations,
    SUM(IsNoDriverFound) as NoDriverFound,
    ROUND(SUM(IsCustomerCancelled + IsDriverCancelled) / COUNT(*) * 100, 2) as CancellationRate_Percent,
    ROUND(SUM(BookingValue), 0) as TotalRevenue,
    ROUND(AVG(AvgVTAT), 2) as AvgWaitTime_Min
FROM uber_data_clean
WHERE Hour IS NOT NULL
GROUP BY Hour
ORDER BY Hour
"""
hourly_analysis = spark.sql(hourly_analysis_sql)

print("\n✓ CANCELLATION BY HOUR:")
hourly_analysis.display()

In [0]:
# Day-of-week breakdown: bookings, completions, cancellations and their rate,
# revenue and mean customer rating. DayOfWeekName and IsWeekend are part of
# the grouping key alongside DayOfWeek, and rows are sorted by DayOfWeek.
daily_analysis_sql = """
SELECT 
    DayOfWeek,
    DayOfWeekName,
    IsWeekend,
    COUNT(*) as TotalBookings,
    SUM(IsCompleted) as CompletedBookings,
    SUM(IsCustomerCancelled + IsDriverCancelled) as CancelledBookings,
    ROUND(SUM(IsCustomerCancelled + IsDriverCancelled) / COUNT(*) * 100, 2) as CancellationRate_Percent,
    ROUND(SUM(BookingValue), 0) as TotalRevenue,
    ROUND(AVG(CustomerRating), 2) as AvgCustomerRating
FROM uber_data_clean
GROUP BY DayOfWeek, DayOfWeekName, IsWeekend
ORDER BY DayOfWeek
"""
daily_analysis = spark.sql(daily_analysis_sql)

print("\n✓ CANCELLATION BY DAY OF WEEK:")
daily_analysis.display()


In [0]:
# Breakdown by the TimePeriod column: bookings, completions, cancellations
# and their rate, mean AvgVTAT, and mean / total booking value.
# Periods are listed from the highest cancellation rate to the lowest.
time_period_sql = """
SELECT 
    TimePeriod,
    COUNT(*) as TotalBookings,
    SUM(IsCompleted) as CompletedBookings,
    SUM(IsCustomerCancelled + IsDriverCancelled) as CancelledBookings,
    ROUND(SUM(IsCustomerCancelled + IsDriverCancelled) / COUNT(*) * 100, 2) as CancellationRate_Percent,
    ROUND(AVG(AvgVTAT), 2) as AvgWaitTime_Min,
    ROUND(AVG(BookingValue), 2) as AvgBookingValue,
    ROUND(SUM(BookingValue), 0) as TotalRevenue
FROM uber_data_clean
GROUP BY TimePeriod
ORDER BY CancellationRate_Percent DESC
"""
time_period_analysis = spark.sql(time_period_sql)

print("\n✓ CANCELLATION BY TIME PERIOD (Peak vs Off-Peak):")
time_period_analysis.display()

In [0]:

# Frequency of each customer cancellation reason, most common first.
# Only rows flagged IsCustomerCancelled = 1 are considered, and the
# 'Unknown' reason is filtered out (a NULL reason also fails the != test).
# Percentage is each reason's share of the rows that survive that filter:
# the window SUM(COUNT(*)) OVER() totals the per-reason counts.
customer_reasons_sql = """
SELECT 
    CustomerCancellationReason as CancellationReason,
    COUNT(*) as Count,
    ROUND(COUNT(*) / SUM(COUNT(*)) OVER() * 100, 2) as Percentage
FROM uber_data_clean
WHERE IsCustomerCancelled = 1 AND CustomerCancellationReason != 'Unknown'
GROUP BY CustomerCancellationReason
ORDER BY Count DESC
"""
customer_reasons = spark.sql(customer_reasons_sql)

print("\n✓ TOP CUSTOMER CANCELLATION REASONS:")
customer_reasons.display()

In [0]:
# Frequency of each driver cancellation reason, most common first.
# Only rows flagged IsDriverCancelled = 1 are considered, and the 'Unknown'
# reason is filtered out (a NULL reason also fails the != test).
# Percentage is each reason's share of the rows that survive that filter:
# the window SUM(COUNT(*)) OVER() totals the per-reason counts.
driver_reasons_sql = """
SELECT 
    DriverCancellationReason as CancellationReason,
    COUNT(*) as Count,
    ROUND(COUNT(*) / SUM(COUNT(*)) OVER() * 100, 2) as Percentage
FROM uber_data_clean
WHERE IsDriverCancelled = 1 AND DriverCancellationReason != 'Unknown'
GROUP BY DriverCancellationReason
ORDER BY Count DESC
"""
driver_reasons = spark.sql(driver_reasons_sql)

print("\n✓ TOP DRIVER CANCELLATION REASONS:")
driver_reasons.display()

In [0]:
# Frequency of each incomplete-ride reason, most common first.
# Only rows flagged IsIncomplete = 1 are considered, and the 'Unknown'
# reason is filtered out (a NULL reason also fails the != test).
# Percentage is each reason's share of the rows that survive that filter:
# the window SUM(COUNT(*)) OVER() totals the per-reason counts.
incomplete_reasons_sql = """
SELECT 
    IncompleteRideReason as IncompleteReason,
    COUNT(*) as Count,
    ROUND(COUNT(*) / SUM(COUNT(*)) OVER() * 100, 2) as Percentage
FROM uber_data_clean
WHERE IsIncomplete = 1 AND IncompleteRideReason != 'Unknown'
GROUP BY IncompleteRideReason
ORDER BY Count DESC
"""
incomplete_reasons = spark.sql(incomplete_reasons_sql)

print("\n✓ TOP INCOMPLETE RIDE REASONS:")
incomplete_reasons.display()

In [0]:
# Cancellation behaviour bucketed by AvgVTAT.
#
# Only rows with AvgVTAT > 0 are considered (NULL and non-positive values are
# dropped by the WHERE clause). Bucket upper bounds are inclusive, so a value
# of exactly 2 lands in 'Under 2 min', exactly 5 in '2-5 min', and so on.
#
# The bucket is computed once, as the numeric SortOrder, in the `bucketed`
# CTE; the human-readable label is then derived from SortOrder. The thresholds
# therefore live in a single place instead of being repeated in the SELECT
# list and again in GROUP BY, where the copies could silently drift apart.
# Output columns, their order, and the row ordering (by SortOrder) are the
# same as before.
wait_time_analysis = spark.sql("""
WITH bucketed AS (
    SELECT
        CASE
            WHEN AvgVTAT <= 2 THEN 1
            WHEN AvgVTAT <= 5 THEN 2
            WHEN AvgVTAT <= 10 THEN 3
            WHEN AvgVTAT <= 15 THEN 4
            ELSE 5
        END as SortOrder,
        IsCompleted,
        IsCustomerCancelled,
        IsDriverCancelled,
        BookingValue
    FROM uber_data_clean
    WHERE AvgVTAT > 0
)
SELECT
    CASE SortOrder
        WHEN 1 THEN 'Under 2 min'
        WHEN 2 THEN '2-5 min'
        WHEN 3 THEN '5-10 min'
        WHEN 4 THEN '10-15 min'
        ELSE 'Over 15 min'
    END as WaitTimeBucket,
    SortOrder,
    COUNT(*) as TotalBookings,
    SUM(IsCompleted) as CompletedBookings,
    SUM(IsCustomerCancelled + IsDriverCancelled) as CancelledBookings,
    ROUND(SUM(IsCustomerCancelled + IsDriverCancelled) / COUNT(*) * 100, 2) as CancellationRate_Percent,
    ROUND(AVG(BookingValue), 2) as AvgBookingValue
FROM bucketed
GROUP BY SortOrder
ORDER BY SortOrder
""")

print("\n✓ CANCELLATION BY WAIT TIME:")
wait_time_analysis.display()

In [0]:
# Month-by-month trend: bookings, completions, cancellations and their rate,
# revenue and mean customer rating. MonthName is part of the grouping key
# alongside Month, and rows are sorted by Month.
monthly_trends_sql = """
SELECT 
    Month,
    MonthName,
    COUNT(*) as TotalBookings,
    SUM(IsCompleted) as CompletedBookings,
    SUM(IsCustomerCancelled + IsDriverCancelled) as CancelledBookings,
    ROUND(SUM(IsCustomerCancelled + IsDriverCancelled) / COUNT(*) * 100, 2) as CancellationRate_Percent,
    ROUND(SUM(BookingValue), 0) as TotalRevenue,
    ROUND(AVG(CustomerRating), 2) as AvgCustomerRating
FROM uber_data_clean
GROUP BY Month, MonthName
ORDER BY Month
"""
monthly_trends = spark.sql(monthly_trends_sql)

print("\n✓ MONTHLY TRENDS:")
monthly_trends.display()

In [0]:
# Key-insight summary: one (InsightType, Value) row per headline fact, stitched
# together with UNION ALL. Every Value is a string.
#
# Ranking note: the "highest / lowest cancellation-rate vehicle" and "peak
# cancellation hour" rows pick a single winner with ORDER BY ... LIMIT 1.
# They rank on the *unrounded* rate and break exact ties on the group key
# (VehicleType / Hour ascending). Ranking on the 2-decimal rounded rate, with
# no tie-breaker, lets groups that differ only past the second decimal tie, and
# Spark may then return any of them — the reported winner could change between
# runs.
#
# 'Most Common Cancellation Type' reports 'Customer' only when customer
# cancellations strictly outnumber driver cancellations; an exact tie is
# reported as 'Driver'.
insights = spark.sql("""
SELECT 
    'Overall Success Rate' as InsightType,
    CONCAT(ROUND(SUM(IsCompleted) / COUNT(*) * 100, 2), '%') as Value
FROM uber_data_clean
UNION ALL
SELECT 
    'Overall Cancellation Rate' as InsightType,
    CONCAT(ROUND(SUM(IsCustomerCancelled + IsDriverCancelled) / COUNT(*) * 100, 2), '%') as Value
FROM uber_data_clean
UNION ALL
SELECT 
    'Most Common Cancellation Type' as InsightType,
    (SELECT 
        CASE 
            WHEN SUM(IsCustomerCancelled) > SUM(IsDriverCancelled) THEN 'Customer'
            ELSE 'Driver'
        END
     FROM uber_data_clean) as Value
UNION ALL
SELECT 
    'Highest Cancellation Rate Vehicle' as InsightType,
    (SELECT VehicleType FROM (
        SELECT VehicleType, SUM(IsCustomerCancelled + IsDriverCancelled) / COUNT(*) as rate
        FROM uber_data_clean
        GROUP BY VehicleType
        ORDER BY rate DESC, VehicleType ASC
        LIMIT 1
    )) as Value
UNION ALL
SELECT 
    'Lowest Cancellation Rate Vehicle' as InsightType,
    (SELECT VehicleType FROM (
        SELECT VehicleType, SUM(IsCustomerCancelled + IsDriverCancelled) / COUNT(*) as rate
        FROM uber_data_clean
        GROUP BY VehicleType
        ORDER BY rate ASC, VehicleType ASC
        LIMIT 1
    )) as Value
UNION ALL
SELECT 
    'Peak Cancellation Hour' as InsightType,
    (SELECT CONCAT(Hour, ':00') FROM (
        SELECT Hour, SUM(IsCustomerCancelled + IsDriverCancelled) / COUNT(*) as rate
        FROM uber_data_clean
        WHERE Hour IS NOT NULL
        GROUP BY Hour
        ORDER BY rate DESC, Hour ASC
        LIMIT 1
    )) as Value
UNION ALL
SELECT 
    'Total Bookings' as InsightType,
    CAST(COUNT(*) as STRING) as Value
FROM uber_data_clean
""")

print("\n✓ KEY INSIGHTS:")
insights.display()