In [0]:
SELECT 
    'UberLens has started' as project_status,
    current_timestamp() as start_time;


project_status,start_time
UberLens has started,2025-08-04T04:55:55.464Z


In [0]:
DESCRIBE TABLE uber_data;

col_name,data_type,comment
VendorID,bigint,
tpep_pickup_datetime,timestamp,
tpep_dropoff_datetime,timestamp,
passenger_count,bigint,
trip_distance,double,
pickup_longitude,double,
pickup_latitude,double,
RatecodeID,bigint,
store_and_fwd_flag,string,
dropoff_longitude,double,


In [0]:
SELECT * 
FROM uber_data
LIMIT 10;

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
1,2016-03-01T00:00:00.000Z,2016-03-01T00:07:55.000Z,1,2.5,-73.97674560546875,40.765151977539055,1,N,-74.00426483154298,40.74612808227539,1,9.0,0.5,0.5,2.05,0.0,0.3,12.35
1,2016-03-01T00:00:00.000Z,2016-03-01T00:11:06.000Z,1,2.9,-73.98348236083984,40.767925262451165,1,N,-74.00594329833984,40.7331657409668,1,11.0,0.5,0.5,3.05,0.0,0.3,15.35
2,2016-03-01T00:00:00.000Z,2016-03-01T00:31:06.000Z,2,19.98,-73.78202056884764,40.64480972290039,1,N,-73.97454071044923,40.6757698059082,1,54.5,0.5,0.5,8.0,0.0,0.3,63.8
2,2016-03-01T00:00:00.000Z,2016-03-01T00:00:00.000Z,3,10.78,-73.86341857910156,40.769813537597656,1,N,-73.96965026855469,40.757766723632805,1,31.5,0.0,0.5,3.78,5.54,0.3,41.62
2,2016-03-01T00:00:00.000Z,2016-03-01T00:00:00.000Z,5,30.43,-73.97174072265625,40.79218292236328,3,N,-74.17716979980467,40.69505310058594,1,98.0,0.0,0.0,0.0,15.5,0.3,113.8
2,2016-03-01T00:00:00.000Z,2016-03-01T00:00:00.000Z,5,5.92,-74.01719665527344,40.70538330078125,1,N,-73.97807312011719,40.75578689575195,1,23.5,1.0,0.5,5.06,0.0,0.3,30.36
2,2016-03-01T00:00:00.000Z,2016-03-01T00:00:00.000Z,6,5.72,-73.99458312988281,40.72784805297852,1,N,0.0,0.0,2,23.0,0.5,0.5,0.0,0.0,0.3,24.3
1,2016-03-01T00:00:01.000Z,2016-03-01T00:16:04.000Z,1,6.2,-73.78877258300781,40.64775848388672,1,N,-73.82920837402342,40.712345123291016,3,20.5,0.5,0.5,0.0,0.0,0.3,21.8
1,2016-03-01T00:00:01.000Z,2016-03-01T00:05:00.000Z,1,0.7,-73.95822143554686,40.76464080810546,1,N,-73.9678955078125,40.76290130615234,1,5.5,0.5,0.5,2.0,0.0,0.3,8.8
2,2016-03-01T00:00:01.000Z,2016-03-01T00:24:06.000Z,3,7.18,-73.98577880859375,40.74119186401367,1,N,-73.94635009765625,40.79787826538086,1,23.5,0.5,0.5,3.2,0.0,0.3,28.0


### 3. Data Quality Check

Basic data overview

In [0]:
SELECT 
    COUNT(*) as total_records,
    COUNT(DISTINCT VendorID) as unique_vendors,
    MIN(tpep_pickup_datetime) as earliest_ride,
    MAX(tpep_pickup_datetime) as latest_ride,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_total_amount
FROM uber_data;

total_records,unique_vendors,earliest_ride,latest_ride,avg_distance,avg_total_amount
100000,2,2016-03-01T00:00:00.000Z,2016-03-10T14:27:09.000Z,3.03,16.39


Missing data analysis

In [0]:
SELECT 
    COUNT(*) as total_rows,
    COUNT(*) - COUNT(VendorID) as missing_vendor_id,
    COUNT(*) - COUNT(tpep_pickup_datetime) as missing_pickup_time,
    COUNT(*) - COUNT(passenger_count) as missing_passenger_count,
    COUNT(*) - COUNT(trip_distance) as missing_trip_distance,
    COUNT(*) - COUNT(fare_amount) as missing_fare_amount,
    COUNT(*) - COUNT(total_amount) as missing_total_amount
FROM uber_data;

total_rows,missing_vendor_id,missing_pickup_time,missing_passenger_count,missing_trip_distance,missing_fare_amount,missing_total_amount
100000,0,0,0,0,0,0


Data Quality Flag

In [0]:
SELECT 
    COUNT(*) as total_records,
    SUM(CASE WHEN trip_distance <= 0 THEN 1 ELSE 0 END) as zero_distance_trips,
    SUM(CASE WHEN fare_amount <= 0 THEN 1 ELSE 0 END) as zero_fare_trips,
    SUM(CASE WHEN passenger_count <= 0 OR passenger_count > 8 THEN 1 ELSE 0 END) as invalid_passenger_count,
    SUM(CASE WHEN pickup_longitude = 0 OR pickup_latitude = 0 THEN 1 ELSE 0 END) as missing_coordinates
FROM uber_data;

total_records,zero_distance_trips,zero_fare_trips,invalid_passenger_count,missing_coordinates
100000,584,101,3,925


### 4. First Business Insights

1. Temporal Pattern Analysis 
  
  
Hourly Demand Patterns

In [0]:
SELECT 
    HOUR(tpep_pickup_datetime) as pickup_hour,
    COUNT(*) as trip_count,
    ROUND(AVG(fare_amount), 2) as avg_fare,
    ROUND(AVG(tip_amount), 2) as avg_tip,
    ROUND(AVG(total_amount), 2) as avg_total
FROM uber_data
WHERE tpep_pickup_datetime IS NOT NULL
GROUP BY HOUR(tpep_pickup_datetime)
ORDER BY pickup_hour;

pickup_hour,trip_count,avg_fare,avg_tip,avg_total
0,7079,14.01,1.91,17.43
1,4148,13.91,1.75,17.11
2,2602,12.9,1.49,15.8
3,1860,14.09,1.43,17.06
4,1929,16.07,1.88,19.81
5,3697,14.44,2.09,18.41
6,1905,13.58,2.01,17.04
7,9550,12.01,1.82,14.91
8,11708,12.21,1.86,15.13
9,10710,12.59,1.89,15.63


Day of the week analysis

In [0]:
SELECT 
    DAYOFWEEK(tpep_pickup_datetime) as day_number,
    CASE DAYOFWEEK(tpep_pickup_datetime)
        WHEN 1 THEN 'Sunday'
        WHEN 2 THEN 'Monday'
        WHEN 3 THEN 'Tuesday'
        WHEN 4 THEN 'Wednesday'
        WHEN 5 THEN 'Thursday'
        WHEN 6 THEN 'Friday'
        WHEN 7 THEN 'Saturday'
    END as day_name,
    COUNT(*) as trip_count,
    ROUND(AVG(fare_amount), 2) as avg_fare,
    ROUND(SUM(total_amount), 2) as total_revenue
FROM uber_data
WHERE tpep_pickup_datetime IS NOT NULL
GROUP BY DAYOFWEEK(tpep_pickup_datetime)
ORDER BY day_number;

day_number,day_name,trip_count,avg_fare,total_revenue
3,Tuesday,23220,14.08,405907.3
5,Thursday,76780,13.0,1233164.79


2. Revenue Analysis: Payment method analysis

In [0]:
SELECT 
    payment_type,
    CASE payment_type
        WHEN 1 THEN 'Credit Card'
        WHEN 2 THEN 'Cash'
        WHEN 3 THEN 'No Charge'
        WHEN 4 THEN 'Dispute'
        WHEN 5 THEN 'Unknown'
        WHEN 6 THEN 'Voided Trip'
        ELSE 'Other'
    END as payment_method,
    COUNT(*) as trip_count,
    ROUND(AVG(fare_amount), 2) as avg_fare,
    ROUND(AVG(tip_amount), 2) as avg_tip,
    ROUND(SUM(total_amount), 2) as total_revenue,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage_of_trips
FROM uber_data
GROUP BY payment_type
ORDER BY trip_count DESC;

payment_type,payment_method,trip_count,avg_fare,avg_tip,total_revenue,percentage_of_trips
1,Credit Card,66548,13.93,2.81,1202467.81,66.55
2,Cash,33203,11.93,0.0,434002.97,33.2
3,No Charge,173,9.57,0.04,1838.95,0.17
4,Dispute,76,9.45,-0.02,762.36,0.08


3. Geographic Analysis: Top pickup locations

In [0]:
SELECT 
    ROUND(pickup_latitude, 3) as pickup_lat_rounded,
    ROUND(pickup_longitude, 3) as pickup_lng_rounded,
    COUNT(*) as pickup_count,
    ROUND(AVG(fare_amount), 2) as avg_fare,
    ROUND(AVG(trip_distance), 2) as avg_distance
FROM uber_data
WHERE pickup_latitude IS NOT NULL 
    AND pickup_longitude IS NOT NULL
    AND pickup_latitude BETWEEN 40.4 AND 41.0
    AND pickup_longitude BETWEEN -74.3 AND -73.7
GROUP BY ROUND(pickup_latitude, 3), ROUND(pickup_longitude, 3)
HAVING COUNT(*) >= 10
ORDER BY pickup_count DESC
LIMIT 20;

pickup_lat_rounded,pickup_lng_rounded,pickup_count,avg_fare,avg_distance
40.751,-73.994,700,11.88,2.14
40.645,-73.782,531,44.45,15.6
40.75,-73.991,527,11.18,2.08
40.75,-73.992,520,11.4,2.06
40.756,-73.991,460,9.91,1.79
40.774,-73.871,449,32.47,9.67
40.757,-73.99,426,10.03,2.07
40.774,-73.873,377,33.79,9.68
40.645,-73.777,368,42.62,15.16
40.756,-73.99,367,10.14,1.96


5. A dashboard

In [0]:
CREATE OR REPLACE VIEW hourly_demand AS
SELECT 
    HOUR(tpep_pickup_datetime) as pickup_hour,
    COUNT(*) as trip_count,
    ROUND(AVG(fare_amount), 2) as avg_fare,
    ROUND(SUM(total_amount), 2) as total_revenue
FROM uber_data
WHERE tpep_pickup_datetime IS NOT NULL
GROUP BY HOUR(tpep_pickup_datetime);

CREATE OR REPLACE VIEW daily_patterns AS
SELECT 
    DAYOFWEEK(tpep_pickup_datetime) as day_number,
    CASE DAYOFWEEK(tpep_pickup_datetime)
        WHEN 1 THEN 'Sunday'
        WHEN 2 THEN 'Monday'  
        WHEN 3 THEN 'Tuesday'
        WHEN 4 THEN 'Wednesday'
        WHEN 5 THEN 'Thursday'
        WHEN 6 THEN 'Friday'
        WHEN 7 THEN 'Saturday'
    END as day_name,
    COUNT(*) as trip_count,
    ROUND(AVG(fare_amount), 2) as avg_fare,
    ROUND(SUM(total_amount), 2) as total_revenue
FROM uber_data
WHERE tpep_pickup_datetime IS NOT NULL
GROUP BY DAYOFWEEK(tpep_pickup_datetime);

In [0]:
SELECT *
FROM hourly_demand;



pickup_hour,trip_count,avg_fare,total_revenue
1,4148,13.91,70962.13
4,1929,16.07,38209.59
13,10204,14.0,175717.63
8,11708,12.21,177146.47
6,1905,13.58,32469.97
3,1860,14.09,31724.88
11,9609,13.12,154918.11
0,7079,14.01,123377.43
2,2602,12.9,41108.15
7,9550,12.01,142396.05


In [0]:
SELECT *
FROM daily_patterns;

day_number,day_name,trip_count,avg_fare,total_revenue
3,Tuesday,23220,14.08,405907.3
5,Thursday,76780,13.0,1233164.79
