In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_parquet('final.parquet')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34340943 entries, 0 to 34340942
Data columns (total 23 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   vendor_id                      int64  
 1   pickup_datetime                object 
 2   dropoff_datetime               object 
 3   passenger_count                int64  
 4   trip_distance                  float64
 5   rate_code_id                   float64
 6   store_and_fwd_flag             object 
 7   pu_location_id                 int64  
 8   do_location_id                 int64  
 9   payment_type                   object 
 10  fare_amount                    float64
 11  extra                          float64
 12  mta_tax                        float64
 13  tip_amount                     float64
 14  tolls_amount                   float64
 15  improvement_surcharge_applied  float64
 16  total_amount                   float64
 17  congestion_surcharge           float64
 18  

In [5]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

df['tip_percentage'] = (df['tip_amount'] / df['fare_amount']) * 100

distance_bins = [0,2,5,10, np.inf]
distance_lables = ['0-2 Miles (Micro Urban)', 
                  '2-5 Miles (Urban)', '5-10 Miles (Mid-Range)', '10+ Miles (Long-Haul)']
df['distance_segment'] = pd.cut(df['trip_distance'], bins=distance_bins, labels=distance_lables)

In [7]:
borough_tip = df.groupby(['pu_location_id', 'do_location_id']).agg(
    total_tip=('tip_amount', 'sum'),
    tip_percentage = ('tip_percentage', 'mean')
).reset_index()

In [8]:
borough_tip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31228 entries, 0 to 31227
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pu_location_id  31228 non-null  int64  
 1   do_location_id  31228 non-null  int64  
 2   total_tip       31228 non-null  float64
 3   tip_percentage  31208 non-null  float64
dtypes: float64(2), int64(2)
memory usage: 976.0 KB


In [4]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [5]:
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

In [8]:
hourly_summary = df.groupby(df['pickup_datetime'].dt.hour).agg(
    trip_count = ('total_amount', 'count'),
    total_revenue = ('total_amount', 'sum'),
    avg_fare = ('total_amount', 'mean'),
    avg_distance = ('trip_distance', 'mean'),
    avg_trip = ('trip_duration_min', 'mean')
).reset_index().rename(columns={'pickup_datetime':'hour'})
print('The Hourly Summary is getting Printed, please Wait...')
hourly_summary.to_parquet('hourly_summary.parquet', index=False)
# Finalized
daily_summary = df.groupby(df['pickup_datetime'].dt.floor('d')).agg(
    trip_count = ('total_amount', 'count'),
    total_revenue = ('total_amount', 'sum'),
    avg_fare = ('total_amount', 'mean'),
    avg_distance = ('trip_distance', 'mean'),
    avg_trip = ('trip_duration_min', 'mean')
).reset_index().rename(columns={'pickup_datetime':'date'})
print('The Daily Summary is getting Printed, please Wait...')
daily_summary.to_parquet('daily_summary_final.parquet', index=False)
# Finalized
monthly_summary = df.groupby(
    df['pickup_datetime'].dt.to_period('M').rename('month')
).agg(
    trip_count = ('total_amount', 'count'),
    total_revenue = ('total_amount', 'sum'),
    avg_fare = ('total_amount', 'mean'),
    avg_distance = ('trip_distance', 'mean'),
    avg_trip = ('trip_duration_min', 'mean')
).reset_index()

print('The Monthly Summary is getting Printed, please Wait...')
monthly_summary.to_parquet('monthly_summary1.parquet', index=False)



The Hourly Summary is getting Printed, please Wait...
The Daily Summary is getting Printed, please Wait...
The Monthly Summary is getting Printed, please Wait...


In [9]:
borough_summary = df.groupby('pu_location_id').agg(
    total_trips = ('total_amount', 'count'),
    total_revenue = ('total_amount', 'sum'),
    avg_fare = ('total_amount', 'mean'),
    avg_distance = ('trip_distance', 'mean'),
    avg_trip = ('trip_duration_min', 'mean')
).reset_index().rename(columns={'pu_location_id':'borough_id'})
print('The Borough Summary is being generated, please wait...')
borough_summary.to_parquet('borough_summary.parquet', index=False)

The Borough Summary is being generated, please wait...


In [None]:
monthly_summary = df.groupby(df['pickup_datetime'].dt.month).agg(
    trip_count = ('total_amount', 'count'),
    total_revenue = ('total_amount', 'sum'),
    avg_fare = ('total_amount', 'mean'),
    avg_distance = ('trip_distance', 'mean'),
    avg_trip = ('trip_duration_min', 'mean')
).reset_index().rename(columns={'pickup_datetime':'month'})
print('The Monthly Summary is getting Printed, please Wait...')
monthly_summary.to_parquet('monthly_summary1.parquet', index=False)

In [7]:
print(df['pickup_datetime'].dt.month.unique())

[ 4  8 12  2  1  7  6  3  5 11 10  9]


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34340943 entries, 0 to 34340942
Data columns (total 22 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   vendor_id                      int64  
 1   pickup_datetime                object 
 2   dropoff_datetime               object 
 3   passenger_count                int64  
 4   trip_distance                  float64
 5   rate_code_id                   float64
 6   store_and_fwd_flag             object 
 7   pu_location_id                 int64  
 8   do_location_id                 int64  
 9   payment_type                   object 
 10  fare_amount                    float64
 11  extra                          float64
 12  mta_tax                        float64
 13  tip_amount                     float64
 14  tolls_amount                   float64
 15  improvement_surcharge_applied  float64
 16  total_amount                   float64
 17  congestion_surcharge           float64
 18  

In [5]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [6]:
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

In [8]:
print(df['pickup_datetime'].dt.month.value_counts().sort_index())

pickup_datetime
1     2752628
2     2599121
3     3015063
4     2992143
5     3121083
6     2838991
7     2594458
8     2417131
9     2879717
10    3110938
11    2888381
12    3131289
Name: count, dtype: int64


In [9]:
print(df['pickup_datetime'].dt.to_period('M').value_counts().sort_index())

pickup_datetime
2008-12          1
2009-01          2
2024-12    3131288
2025-01    2752626
2025-02    2599121
2025-03    3015063
2025-04    2992143
2025-05    3121083
2025-06    2838991
2025-07    2594458
2025-08    2417131
2025-09    2879717
2025-10    3110938
2025-11    2888381
Freq: M, Name: count, dtype: int64


In [10]:
# Filter rows where year = 2024 and month = 12
december_2024_trips = df[
    (df['pickup_datetime'].dt.year == 2024) &
    (df['pickup_datetime'].dt.month == 12)
]

# Count total trips
total_trips_dec_2024 = december_2024_trips.shape[0]
print("Total trips in December 2024:", total_trips_dec_2024)

Total trips in December 2024: 3131288


In [6]:
df = df[~df['pickup_datetime'].dt.year.isin([2008,2009])]

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34340943 entries, 0 to 34340942
Data columns (total 22 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   vendor_id                      int64  
 1   pickup_datetime                object 
 2   dropoff_datetime               object 
 3   passenger_count                int64  
 4   trip_distance                  float64
 5   rate_code_id                   float64
 6   store_and_fwd_flag             object 
 7   pu_location_id                 int64  
 8   do_location_id                 int64  
 9   payment_type                   object 
 10  fare_amount                    float64
 11  extra                          float64
 12  mta_tax                        float64
 13  tip_amount                     float64
 14  tolls_amount                   float64
 15  improvement_surcharge_applied  float64
 16  total_amount                   float64
 17  congestion_surcharge           float64
 18  

In [4]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [5]:
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

In [10]:
efficiency_by_hour = (
    df.groupby('pickup_hour').agg(
        total_trips = ('pickup_datetime', 'count'),
        total_revenue = ('total_amount', 'sum'),
        avg_fare_per_mile = ('fare_per_mile', 'mean'),
        avg_tip_percentage = ('tip_percentage', 'mean'),
        avg_distance = ('trip_distance', 'mean'),
        avg_duration = ('trip_duration_min', 'mean')
    ).reset_index()
)

KeyError: "Column(s) ['fare_per_mile', 'tip_percentage'] do not exist"

In [8]:
df['pickup_hour'] = df['pickup_datetime'].dt.hour

In [9]:
df.columns

Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'rate_code_id', 'store_and_fwd_flag', 'pu_location_id',
       'do_location_id', 'payment_type', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'improvement_surcharge_applied',
       'total_amount', 'congestion_surcharge', 'airport_fee',
       'cbd_congestion_fee', 'trip_duration_min', 'amount_diff',
       'pickup_hour'],
      dtype='object')

In [11]:
df = df[
    (df['trip_distance'] > 0) &
    (df['trip_duration_min'] > 0) &
    (df['fare_amount'] > 0)
]

In [12]:
df['fare_per_mile'] = df['fare_amount'] / df['trip_distance']
df['fare_per_minute'] = df['fare_amount'] / df['trip_duration_min']
df['tip_pct'] = (df['tip_amount'] / df['fare_amount']) * 100 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fare_per_mile'] = df['fare_amount'] / df['trip_distance']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fare_per_minute'] = df['fare_amount'] / df['trip_duration_min']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tip_pct'] = (df['tip_amount'] / df['fare_amount']) * 100


In [15]:
df[['fare_per_mile', 'fare_per_minute','tip_pct']].head(30)

Unnamed: 0,fare_per_mile,fare_per_minute,tip_pct
0,4.073684,1.470551,30.103359
1,4.509284,1.583851,28.823529
2,4.177449,2.039098,23.761062
3,10.833333,1.505792,37.692308
4,10.232558,0.274428,0.0
5,4.01566,1.836317,29.442897
6,4.004551,2.080788,14.602273
7,8.8,2.933333,42.045455
8,4.211793,2.360877,23.342857
9,8.404255,1.342776,0.0


In [16]:
efficiency_by_hour = (
    df.groupby('pickup_hour').agg(
        total_trips = ('pickup_datetime', 'count'),
        total_revenue = ('total_amount', 'sum'),
        avg_fare_per_mile = ('fare_per_mile', 'mean'),
        avg_tip_pct = ('tip_pct', 'mean'),
        avg_distance = ('trip_distance', 'mean'),
        avg_duration = ('trip_duration_min', 'mean')
    ).reset_index()
)

In [17]:
efficiency_by_hour.to_parquet('efficiency_by_hour.parquet', index=False)