In [1]:
import numpy as np
import pandas as pd

In [2]:
# Project Title: NYC - Yellow Taxi Demand, Revenue, and Efficiency Analysis
# Dataset Source: NYC Taxi and Limousine Commission (TLC) Yellow Taxi Trip Record Data (NYC.gov)
# Time Range : December 2024 - November 2025 (12 Months)

# Overview:
This notebook provides a comprehensive overview of the aggregated NYC Yellow Taxi datasets prepared from a raw trip-level dataset containing over 30 million records.

Due to the large size of the cleaned raw data, the analysis is conducted using aggregated DataFrames (hourly_summary, daily_summary, monthly_summary, and borough_summary) to ensure efficient memory usage while preserving the analytical integrity of the data.

In [3]:
# Load the pre-aggregated month-level summary from its Parquet file
# (path is relative to the notebook's working directory).
monthly_summary = pd.read_parquet('monthly_summary_ready.parquet')

In [4]:
# Render the monthly summary table to inspect its columns and values.
monthly_summary

Unnamed: 0,month,total_trips,total_revenue,revenue_per_trip,avg_distance,avg_trip_duration_min,revenue_per_mile,revenue_per_min,avg_speed_mph,pct_trips,pct_revenue
0,2024-12,3131288,90856299.57,29.015632,3.192428,18.348315,9.088892,1.581379,10.439415,9.118236,9.172049
1,2025-01,2752626,74921291.61,27.218115,3.050282,14.577733,8.923149,1.867102,12.554551,8.015581,7.563392
2,2025-02,2599121,69921520.9,26.901988,2.981682,14.850492,9.022419,1.811522,12.046802,7.568578,7.058659
3,2025-03,3015063,84478047.48,28.018667,3.185436,15.649081,8.795866,1.790435,12.213251,8.779792,8.528157
4,2025-04,2992143,84752728.14,28.325093,3.163116,16.174112,8.954807,1.751261,11.733996,8.713049,8.555887
5,2025-05,3121083,92388448.11,29.601407,3.321262,17.439091,8.912698,1.697417,11.426957,9.088519,9.326722
6,2025-06,2838991,83846836.23,29.534027,3.352754,16.881824,8.808887,1.749457,11.916083,8.267074,8.464436
7,2025-07,2594458,75527143.65,29.110953,3.344164,16.37907,8.705001,1.777326,12.250382,7.555,7.624553
8,2025-08,2417131,71136705.1,29.430223,3.492977,16.310868,8.425542,1.804332,12.849016,7.038628,7.181333
9,2025-09,2879717,86034769.29,29.87612,3.307509,17.617658,9.032817,1.695805,11.264296,8.385667,8.68531


In [5]:
# (rows, columns) of the monthly aggregate.
monthly_summary.shape

(12, 11)

In [6]:
# Total Rows : 12
# Total Columns : 11

In [7]:
# Load the pre-aggregated hour-of-day summary from its Parquet file
# (path is relative to the notebook's working directory).
hourly_summary = pd.read_parquet('hourly_summary_ready.parquet')

In [8]:
# Render the hourly summary table to inspect its columns and values.
hourly_summary

Unnamed: 0,hour,total_trips,avg_distance,avg_trip_duration_min,avg_speed_mph,total_revenue,revenue_per_trip,revenue_per_mile,revenue_per_minute
0,12 AM,906085,3.953933,14.814035,16.014271,27206378.03,30.026298,7.594033,2.026882
1,01 AM,586182,3.462436,13.637324,15.233647,15967830.54,27.240397,7.867408,1.997489
2,02 AM,372693,3.039199,12.410103,14.693831,9347621.18,25.08129,8.252598,2.021038
3,03 AM,238342,3.200313,12.076259,15.90052,6162852.87,25.857184,8.079579,2.141158
4,04 AM,155964,4.524116,13.667195,19.861206,5149171.03,33.015125,7.297585,2.415647
5,05 AM,176703,5.64043,14.511529,23.321167,6702692.86,37.93197,6.725014,2.61392
6,06 AM,402413,4.346215,14.351367,18.170597,12285669.47,30.530001,7.024503,2.127324
7,07 AM,845335,3.419053,15.010847,13.66633,23063529.87,27.283302,7.979783,1.817572
8,08 AM,1196676,2.938067,15.114823,11.66299,31345463.28,26.193776,8.915309,1.732986
9,09 AM,1419864,2.869164,15.631537,11.012981,37661083.6,26.52443,9.244656,1.696854


In [9]:
# (rows, columns) of the hourly aggregate.
hourly_summary.shape

(24, 9)

In [10]:
# Total Rows : 24
# Total Columns : 9

In [49]:
# Load the pre-aggregated day-level summary from its Parquet file
# (path is relative to the notebook's working directory).
# NOTE(review): this cell's saved execution count is out of sequence with the
# cells around it, so it was re-run after later cells; use
# "Restart Kernel -> Run All" before sharing so all saved outputs agree.
daily_summary = pd.read_parquet('daily_summary_ready.parquet')

In [50]:
# Render the daily summary table (pandas truncates the middle rows in the display).
daily_summary

Unnamed: 0,date,total_trips,total_revenue,revenue_per_trip,avg_distance,avg_trip_duration_min,revenue_per_mile,revenue_per_minute,avg_speed_mph,pct_trips,pct_revenue,day_of_week,day_of_year
0,2024-12-01,83203,2599639.59,31.244542,4.129789,17.503925,7.565651,1.785002,14.156101,0.242335,0.262441,6,336
1,2024-12-02,98793,3113766.80,31.518091,3.819319,19.112268,8.252280,1.649103,11.990159,0.287743,0.314344,0,337
2,2024-12-03,114301,3450449.95,30.187399,3.332373,18.832153,9.058829,1.602971,10.617076,0.332911,0.348333,1,338
3,2024-12-04,117696,3543545.29,30.107610,3.188386,19.301463,9.442899,1.559862,9.911330,0.342799,0.357731,2,339
4,2024-12-05,128789,3919818.75,30.435975,3.212644,20.051796,9.473808,1.517868,9.613036,0.375108,0.395717,3,340
...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,2025-11-26,84161,2401797.51,28.538129,2.948132,17.117767,9.680073,1.667164,10.333584,0.245126,0.242468,2,330
361,2025-11-27,55420,1455818.81,26.268835,3.170484,13.677725,8.285435,1.920556,13.907942,0.161415,0.146969,3,331
362,2025-11-28,71584,2031466.39,28.378777,3.165985,15.204803,8.963649,1.866435,12.493360,0.208494,0.205082,4,332
363,2025-11-29,81290,2439054.06,30.004356,3.571083,16.907393,8.402032,1.774629,12.672857,0.236764,0.246229,5,333


In [51]:
# (rows, columns) of the daily aggregate.
daily_summary.shape

(365, 13)

In [52]:
# Total Rows: 365 (one row per calendar day from 2024-12-01 through 2025-11-30 -> 12 months)
# Total Columns: 13

In [15]:
# Load the pre-aggregated zone-level (borough/zone) summary from its Parquet file
# (path is relative to the notebook's working directory).
borough_summary = pd.read_parquet('borough_summary_ready.parquet')

In [16]:
# Render the zone-level summary table (pandas truncates the middle rows in the display).
borough_summary

Unnamed: 0,borough,zone,total_trips,total_revenue,avg_distance,avg_trip_duration_min,avg_speed_mph,revenue_per_trip,revenue_per_mile,revenue_per_min
0,EWR,Newark Airport,1150,112811.78,3.322617,8.346101,23.886247,98.097200,29.524073,11.753655
1,Queens,Jamaica Bay,58,4084.37,15.051379,39.061207,23.119684,70.420172,4.678652,1.802816
2,Bronx,Allerton/Pelham Gardens,83,2620.61,5.029398,17.543373,17.201016,31.573614,6.277812,1.799746
3,Manhattan,Alphabet City,41564,1014629.74,2.570325,14.114873,10.926026,24.411263,9.497347,1.729471
4,Staten Island,Arden Heights,3,180.68,4.563333,19.766667,13.851602,60.226667,13.197955,3.046880
...,...,...,...,...,...,...,...,...,...,...
255,Bronx,Woodlawn/Wakefield,97,3851.04,6.193814,19.582302,18.977792,39.701443,6.409854,2.027414
256,Queens,Woodside,6919,335871.24,4.434326,18.007665,14.774794,48.543321,10.947171,2.695703
257,Manhattan,World Trade Center,175243,5789869.46,4.062693,20.585504,11.841418,33.039091,8.132314,1.604969
258,Manhattan,Yorkville East,443050,9815077.73,2.274574,12.142443,11.239455,22.153431,9.739596,1.824463


In [17]:
# (rows, columns) of the zone-level aggregate.
borough_summary.shape

(260, 10)

In [18]:
# Total Rows: 260 (Representing 260 taxi zones across the 5 NYC boroughs, plus EWR)
# Total Columns: 10

In [19]:
# Print column dtypes, non-null counts, and memory usage for the monthly aggregate.
monthly_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype    
---  ------                 --------------  -----    
 0   month                  12 non-null     period[M]
 1   total_trips            12 non-null     int64    
 2   total_revenue          12 non-null     float64  
 3   revenue_per_trip       12 non-null     float64  
 4   avg_distance           12 non-null     float64  
 5   avg_trip_duration_min  12 non-null     float64  
 6   revenue_per_mile       12 non-null     float64  
 7   revenue_per_min        12 non-null     float64  
 8   avg_speed_mph          12 non-null     float64  
 9   pct_trips              12 non-null     float64  
 10  pct_revenue            12 non-null     float64  
dtypes: float64(9), int64(1), period[M](1)
memory usage: 1.2 KB


In [20]:
# Preview the first five rows of the monthly aggregate.
monthly_summary.head()

Unnamed: 0,month,total_trips,total_revenue,revenue_per_trip,avg_distance,avg_trip_duration_min,revenue_per_mile,revenue_per_min,avg_speed_mph,pct_trips,pct_revenue
0,2024-12,3131288,90856299.57,29.015632,3.192428,18.348315,9.088892,1.581379,10.439415,9.118236,9.172049
1,2025-01,2752626,74921291.61,27.218115,3.050282,14.577733,8.923149,1.867102,12.554551,8.015581,7.563392
2,2025-02,2599121,69921520.9,26.901988,2.981682,14.850492,9.022419,1.811522,12.046802,7.568578,7.058659
3,2025-03,3015063,84478047.48,28.018667,3.185436,15.649081,8.795866,1.790435,12.213251,8.779792,8.528157
4,2025-04,2992143,84752728.14,28.325093,3.163116,16.174112,8.954807,1.751261,11.733996,8.713049,8.555887


In [21]:
# Print column dtypes, non-null counts, and memory usage for the hourly aggregate.
hourly_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   hour                   24 non-null     object 
 1   total_trips            24 non-null     int64  
 2   avg_distance           24 non-null     float64
 3   avg_trip_duration_min  24 non-null     float64
 4   avg_speed_mph          24 non-null     float64
 5   total_revenue          24 non-null     float64
 6   revenue_per_trip       24 non-null     float64
 7   revenue_per_mile       24 non-null     float64
 8   revenue_per_minute     24 non-null     float64
dtypes: float64(7), int64(1), object(1)
memory usage: 1.8+ KB


In [22]:
# Print column dtypes, non-null counts, and memory usage for the zone-level aggregate.
borough_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   borough                260 non-null    object 
 1   zone                   260 non-null    object 
 2   total_trips            260 non-null    int64  
 3   total_revenue          260 non-null    float64
 4   avg_distance           260 non-null    float64
 5   avg_trip_duration_min  260 non-null    float64
 6   avg_speed_mph          260 non-null    float64
 7   revenue_per_trip       260 non-null    float64
 8   revenue_per_mile       260 non-null    float64
 9   revenue_per_min        260 non-null    float64
dtypes: float64(7), int64(1), object(2)
memory usage: 20.4+ KB


In [23]:
# (rows, columns) of the daily aggregate.
# NOTE(review): the saved output of this cell, (334, 13), disagrees with the
# (365, 13) reported by the other daily_summary.shape cells in this notebook;
# it appears to be stale output from an earlier run — re-execute with
# "Restart Kernel -> Run All" to refresh it.
daily_summary.shape

(334, 13)

# Daily_Summary Documentation:
"""
The daily summary aggregates NYC Yellow Taxi trips by calendar date.
The dataset spans **December 2024 through November 2025**. 
After aggregation, the summary contains **365 unique days**, reflecting every day with recorded trips.
This ensures full alignment with the raw dataset and consistency with monthly summary periods.
All daily metrics are based on actual trips, preserving the integrity of the data for temporal analysis.
"""


# Monthly_Summary Documentation:
In the monthly summary, trips were grouped by calendar month. The dataset spans from December 2024 through November 2025, resulting in 12 monthly aggregation periods. Each row represents one complete calendar month with available trip data.

The monthly summary reflects the true temporal coverage of the dataset and is used to analyze long-term demand trends, revenue distribution, and seasonal efficiency patterns across the year.


# Hourly_Summary Documentation:
In the hourly summary, trips were aggregated by hour of day (0–23) across the entire dataset. This aggregation produces exactly 24 rows, corresponding to each hour in a 24-hour day.

The hourly summary enables identification of peak and off-peak demand periods, revenue efficiency by hour, and operational performance trends without being affected by missing calendar dates.


# Borough_Summary Documentation:
In the borough (zone-level) summary, trips were grouped by pickup location zones present in the dataset. The summary contains 260 rows, representing active taxi zones with valid trip records after data cleaning.

Some TLC zones are excluded due to invalid location identifiers or insufficient data coverage. The resulting summary accurately reflects the geographic distribution of taxi demand, revenue, and efficiency across operational areas.


# Dataset Loading and Shape Validation

In [24]:
# Shape validation: monthly aggregate (rows, columns).
monthly_summary.shape

(12, 11)

In [53]:
# Shape validation: daily aggregate (rows, columns).
daily_summary.shape

(365, 13)

In [54]:
# Shape validation: hourly aggregate (rows, columns).
hourly_summary.shape

(24, 9)

In [55]:
# Shape validation: zone-level aggregate (rows, columns).
borough_summary.shape

(260, 10)

# Shape and Aggregation Documentation

## Monthly_Summary:
### The monthly_summary dataset represents month-level aggregation of NYC Yellow Taxi trips across the full analysis period
### Rows: 12
### Columns: 11
### Each row corresponds to a calendar month between December 2024 and November 2025, 
### containing aggregated demand, revenue, and efficiency metrics.
### This dataset supports long-term trend analysis and month-over-month comparison.




## Daily_Summary:
### The daily_summary dataset aggregates taxi trips at the daily level across the defined time range.
### Rows: 365
### Columns: 13
### Each row represents one day of taxi operations, enabling weekday vs weekend comparisons
### and short-term demand variability analysis.





## Hourly_Summary:
### The hourly_summary dataset captures hour-of-day aggregation across the entire dataset.
### Rows: 24
### Columns: 9
### Each row corresponds to an hour of the day, allowing identification of:
### 1. peak demand hours
### 2. revenue and efficiency patterns by time of day



## Borough_Summary:
### The borough_summary dataset provides geographic aggregation of NYC Yellow Taxi activity.
### Rows: 260
### Columns: 10
### Each row represents an active taxi pickup zone included in the analysis.
### Zones excluded during data cleaning (invalid or low-activity zones) are why there are fewer rows than the full TLC zone count.
### This dataset enables spatial comparison of demand, revenue, and efficiency.