In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. Later cells in this notebook list several bare expressions and rely on
# this setting to show each of their results.
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq

In [3]:
# Period of Citi Bike data handled by this notebook.
year, month = 2024, 1

# Raw monthly files live under ../data/raw, one parquet file per (year, month).
raw_data_dir = Path('..', 'data', 'raw')
raw_data_file_path = raw_data_dir.joinpath(f'citi_bike_rides_{year}_{month:02}.parquet')

# Read the file with pyarrow, then convert the Arrow table to a DataFrame.
table = pq.read_table(raw_data_file_path)
citi_bike_rides = table.to_pandas()
citi_bike_rides.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,5078F3D302000BD2,electric_bike,2024-01-22 18:43:19.012,2024-01-22 18:48:10.708,Frederick Douglass Blvd & W 145 St,7954.12,St Nicholas Ave & W 126 St,7756.1,40.823072,-73.941738,40.811432,-73.951878,member
1,814337105D37302A,electric_bike,2024-01-11 19:19:18.721,2024-01-11 19:47:36.007,W 54 St & 6 Ave,6771.13,E 74 St & 1 Ave,6953.08,40.761822,-73.977036,40.768974,-73.954823,member
2,A33A920E2B10710C,electric_bike,2024-01-30 19:17:41.693,2024-01-30 19:32:49.857,E 11 St & Ave B,5659.11,W 10 St & Washington St,5847.06,40.727592,-73.979751,40.733424,-74.008515,casual
3,A3A5FC0DD7D34D74,electric_bike,2024-01-27 11:27:01.759,2024-01-27 11:38:01.213,W 54 St & 6 Ave,6771.13,E 74 St & 1 Ave,6953.08,40.761779,-73.977144,40.768974,-73.954823,member
4,6F96728ECEFBDAA4,electric_bike,2024-01-16 15:15:41.000,2024-01-16 15:29:26.156,Madison Ave & E 99 St,7443.01,E 74 St & 1 Ave,6953.08,40.789808,-73.952214,40.768974,-73.954823,member


In [4]:
# Build a working frame with an extra `duration` column (end minus start, a
# Timedelta per ride). `assign` returns a new frame, so the raw
# `citi_bike_rides` frame is left without the extra column.
citi_bike_ride_cp = citi_bike_rides.assign(
    duration=citi_bike_rides['ended_at'] - citi_bike_rides['started_at']
)
citi_bike_ride_cp.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
0,5078F3D302000BD2,electric_bike,2024-01-22 18:43:19.012,2024-01-22 18:48:10.708,Frederick Douglass Blvd & W 145 St,7954.12,St Nicholas Ave & W 126 St,7756.1,40.823072,-73.941738,40.811432,-73.951878,member,0 days 00:04:51.696000
1,814337105D37302A,electric_bike,2024-01-11 19:19:18.721,2024-01-11 19:47:36.007,W 54 St & 6 Ave,6771.13,E 74 St & 1 Ave,6953.08,40.761822,-73.977036,40.768974,-73.954823,member,0 days 00:28:17.286000
2,A33A920E2B10710C,electric_bike,2024-01-30 19:17:41.693,2024-01-30 19:32:49.857,E 11 St & Ave B,5659.11,W 10 St & Washington St,5847.06,40.727592,-73.979751,40.733424,-74.008515,casual,0 days 00:15:08.164000
3,A3A5FC0DD7D34D74,electric_bike,2024-01-27 11:27:01.759,2024-01-27 11:38:01.213,W 54 St & 6 Ave,6771.13,E 74 St & 1 Ave,6953.08,40.761779,-73.977144,40.768974,-73.954823,member,0 days 00:10:59.454000
4,6F96728ECEFBDAA4,electric_bike,2024-01-16 15:15:41.000,2024-01-16 15:29:26.156,Madison Ave & E 99 St,7443.01,E 74 St & 1 Ave,6953.08,40.789808,-73.952214,40.768974,-73.954823,member,0 days 00:13:45.156000


In [5]:
# Summary statistics of ride duration (transposing a Series is a no-op, so the
# summary is shown directly).
citi_bike_ride_cp['duration'].describe()

count                      1881977
mean     0 days 00:10:50.949568386
std      0 days 00:20:47.307345380
min         0 days 00:01:00.044000
25%         0 days 00:04:39.642000
50%         0 days 00:07:43.293000
75%         0 days 00:12:56.011000
max         1 days 00:59:06.090000
Name: duration, dtype: object

In [6]:
# Probe both tails of the duration distribution: the minimum, the 1st
# percentile, and the 99.5th / 99.9th percentiles.
citi_bike_ride_cp['duration'].quantile(q=0)
citi_bike_ride_cp['duration'].quantile(q=0.01)
citi_bike_ride_cp['duration'].quantile(q=0.995)
citi_bike_ride_cp['duration'].quantile(q=0.999)

Timedelta('0 days 00:01:00.044000')

Timedelta('0 days 00:01:20.040760')

Timedelta('0 days 00:58:33.912799999')

Timedelta('0 days 02:24:25.731416')

In [7]:
# Keep rides with a strictly positive duration of at most 5 hours. The upper
# bound is a manual cut-off for implausibly long rides.
duration_filter = (citi_bike_ride_cp['duration'] > pd.Timedelta(0)) & (citi_bike_ride_cp['duration'] <= pd.Timedelta(hours=5))

# Number of rides rejected by the duration filter. Series.sum() counts the True
# values in vectorized code; the builtin sum() would iterate the whole mask in
# Python. int() keeps the displayed value a plain integer.
int((~duration_filter).sum())

1013

In [8]:
# Order the rides chronologically by start time (ascending is the default).
sorted_df = citi_bike_ride_cp.sort_values('started_at')

# The ten earliest-starting and the ten latest-starting rides.
top_10 = sorted_df.iloc[:10]
bottom_10 = sorted_df.iloc[-10:]

top_10

bottom_10

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
1101502,BD613F756C39C4D0,classic_bike,2023-12-31 13:50:28.976,2024-01-01 08:41:20.899,N 12 St & Bedford Ave,5450.04,S 4 St & Rodney St,5156.05,40.720798,-73.954847,40.70934,-73.95608,casual,0 days 18:50:51.923000
1071360,4B080070FD50B858,electric_bike,2023-12-31 14:55:38.927,2024-01-01 07:27:09.362,W 37 St & 5 Ave,6398.06,Amsterdam Ave & W 79 St,7311.02,40.75038,-73.98339,40.782939,-73.978652,member,0 days 16:31:30.435000
209249,24F86164747D9B56,classic_bike,2023-12-31 14:57:39.538,2024-01-01 14:13:10.446,Madison St & Montgomery St,5262.09,Allen St & Stanton St,5484.09,40.713126,-73.984844,40.722055,-73.989111,casual,0 days 23:15:30.908000
650852,165867F584717117,classic_bike,2023-12-31 15:57:36.402,2024-01-01 14:03:00.685,Cadman Plaza E & Red Cross Pl,4821.06,St Marks Pl & 4 Ave,4249.1,40.699918,-73.989718,40.681778,-73.97989,member,0 days 22:05:24.283000
836173,7FA2408E227974A4,classic_bike,2023-12-31 17:32:19.418,2024-01-01 07:02:54.951,Central Park S & 6 Ave,6876.04,Central Park West & W 68 St,7079.06,40.765909,-73.976342,40.773407,-73.977825,casual,0 days 13:30:35.533000
1476116,7B5A7F2EBE5619B7,classic_bike,2023-12-31 17:54:00.651,2024-01-01 16:39:31.783,6 Ave & W 34 St,6364.1,Bleecker St & Crosby St,5679.08,40.74964,-73.98805,40.726156,-73.995102,member,0 days 22:45:31.132000
1172955,D577198D5B511ADB,classic_bike,2023-12-31 18:49:42.512,2024-01-01 12:01:59.014,14 St & 7 Ave,3731.11,Underhill Ave & Lincoln Pl,4042.08,40.663779,-73.983968,40.674012,-73.967146,member,0 days 17:12:16.502000
1173049,27DB996CA83D276C,classic_bike,2023-12-31 18:49:42.620,2024-01-01 12:01:59.014,14 St & 7 Ave,3731.11,Underhill Ave & Lincoln Pl,4042.08,40.663779,-73.983968,40.674012,-73.967146,member,0 days 17:12:16.394000
1795192,147EB043317659FB,classic_bike,2023-12-31 19:22:31.619,2024-01-01 12:19:24.003,Metropolitan Ave & 65 Ln,5160.01,E Fordham Rd & Webster Ave,8582.09,40.71225,-73.89111,40.861748,-73.89105,casual,0 days 16:56:52.384000
215313,DAB454B0ECFEC7B7,classic_bike,2023-12-31 19:34:33.299,2024-01-01 02:22:22.488,Broadway & W 133 St,7903.02,Broadway & Moylan Pl,7823.03,40.819034,-73.956156,40.814326,-73.959025,casual,0 days 06:47:49.189000


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
1178573,DBD5F1EDA05272A2,classic_bike,2024-01-31 23:55:15.155,2024-01-31 23:59:37.157,Carmine St & 6 Ave,5763.03,W Broadway & Spring St,5569.06,40.730386,-74.00215,40.724947,-74.001659,member,0 days 00:04:22.002000
581970,CF4D2CE8EF506FDA,electric_bike,2024-01-31 23:55:15.552,2024-01-31 23:56:42.709,E 11 St & Ave B,5659.11,Ave A & E 14 St,5779.11,40.727697,-73.979802,40.730311,-73.980472,member,0 days 00:01:27.157000
980367,7C068CC129D59029,classic_bike,2024-01-31 23:55:33.484,2024-01-31 23:58:13.282,E 13 St & 2 Ave,5820.08,St Marks Pl & 1 Ave,5626.13,40.731539,-73.985302,40.727791,-73.985649,member,0 days 00:02:39.798000
329887,1C08197BC3A120F6,electric_bike,2024-01-31 23:55:43.784,2024-01-31 23:59:29.515,20 Ave & 38 St,7196.04,31 St & Ditmars Blvd,7144.02,40.777268,-73.902671,40.776168,-73.910485,casual,0 days 00:03:45.731000
1306618,E8E94F7C1E0991F8,electric_bike,2024-01-31 23:55:48.332,2024-01-31 23:58:16.455,Eastern Pkwy & Troy Ave,3862.07,Kingston Ave & Carroll St,3831.03,40.669338,-73.936926,40.66688,-73.9425,member,0 days 00:02:28.123000
150786,8372FE21DCD75179,classic_bike,2024-01-31 23:55:59.368,2024-01-31 23:57:27.770,Kent Ave & N 7 St,5489.03,N 11 St & Kent Ave,5489.04,40.720368,-73.961651,40.722482,-73.959219,member,0 days 00:01:28.402000
1178180,FF5497A5A844C851,electric_bike,2024-01-31 23:56:38.776,2024-01-31 23:59:28.329,Sherman Ave & Thayer St,8583.02,W 190 St & Broadway,8474.02,40.863133,-73.926973,40.856487,-73.93297,member,0 days 00:02:49.553000
286497,00DB0ACCAA339429,classic_bike,2024-01-31 23:57:04.310,2024-01-31 23:59:47.214,Adam Clayton Powell Blvd & W 141 St,7893.05,W 147 St & Adam Clayton Powell Blvd,7971.07,40.819241,-73.941057,40.82281,-73.937413,member,0 days 00:02:42.904000
1036304,335E4CF575E8F41A,electric_bike,2024-01-31 23:57:04.650,2024-01-31 23:59:05.024,W 100 St & Broadway,7580.01,West End Ave & W 94 St,7524.09,40.797373,-73.970256,40.794165,-73.974124,member,0 days 00:02:00.374000
309898,6F464FCB160C98DF,classic_bike,2024-01-31 23:58:30.270,2024-01-31 23:59:36.654,Forsyth St & Broome St,5453.05,Delancey St & Eldridge St,5414.07,40.718939,-73.992663,40.719383,-73.991479,member,0 days 00:01:06.384000


In [9]:
# Keep only rides that started inside the month being processed. The bounds are
# derived from the notebook's `year` / `month` parameters instead of being
# hard-coded, so changing the parameters moves the window with them.
month_start = pd.Timestamp(year=year, month=month, day=1)
next_month_start = month_start + pd.DateOffset(months=1)  # exclusive upper bound
filter_date_range = (citi_bike_ride_cp['started_at'] >= month_start) & (citi_bike_ride_cp['started_at'] < next_month_start)

# Number of rides that started outside the month (vectorized count of the
# inverted mask; int() keeps the displayed value a plain integer).
int((~filter_date_range).sum())

364

In [10]:
# A ride is kept only if it passes both the duration and the date-range filter.
final_filter = duration_filter & filter_date_range

# Count the rejected rows with the vectorized Series.sum() rather than the
# builtin sum(), which iterates the mask element by element in Python.
numbers_dropped = int((~final_filter).sum())  # number of rides dropped
numbers_dropped

# Share of rides dropped, in percent of all rides.
numbers_dropped / len(final_filter) * 100

1354

0.0719456188890725

In [11]:
# Keep the rows that pass both filters and only the two columns that are
# written out. Selecting rows and columns in one .loc call from
# `citi_bike_ride_cp` (the copy of the raw frame that the filters were built
# on) means this cell no longer consumes its own output, so it can be re-run
# and still produce the same result.
# NOTE: columns keep their source names; a rename to
# pickup_datetime / pickup_location_id was previously disabled here.
citi_bike_rides = citi_bike_ride_cp.loc[final_filter, ['started_at', 'start_station_id']]
citi_bike_rides.head()

Unnamed: 0,started_at,start_station_id
0,2024-01-22 18:43:19.012,7954.12
1,2024-01-11 19:19:18.721,6771.13
2,2024-01-30 19:17:41.693,5659.11
3,2024-01-27 11:27:01.759,6771.13
4,2024-01-16 15:15:41.000,7443.01


In [12]:
# Write the filtered rides to ../data/processed, named after the same
# `year` / `month` parameters used to load the raw file. They are deliberately
# not re-declared here, so the input and output file names cannot drift apart.
path = Path("..") / "data" / "processed" / f"rides_{year}_{month:02}.parquet"

# Create the output directory on first use; to_parquet does not create it.
path.parent.mkdir(parents=True, exist_ok=True)
citi_bike_rides.to_parquet(path, engine="pyarrow", index=False)