In [6]:
# Import dependencies
import pandas as pd
from pathlib import Path

import re
from datetime import datetime

In [7]:
# Import Oct 2023 dataset
oct_path = Path("resources/JC-202310-citibike-tripdata.csv")
oct_df = pd.read_csv(oct_path)

# Import Nov 2022 dataset
# nov_path = Path("resources/JC-202211-citibike-tripdata.csv")
# nov_df = pd.read_csv(nov_path)

In [8]:
# Check the Oct 2023 dataset columns
oct_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97584 entries, 0 to 97583
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ride_id             97584 non-null  object 
 1   rideable_type       97584 non-null  object 
 2   started_at          97584 non-null  object 
 3   ended_at            97584 non-null  object 
 4   start_station_name  97579 non-null  object 
 5   start_station_id    97579 non-null  object 
 6   end_station_name    97207 non-null  object 
 7   end_station_id      97207 non-null  object 
 8   start_lat           97584 non-null  float64
 9   start_lng           97584 non-null  float64
 10  end_lat             97497 non-null  float64
 11  end_lng             97497 non-null  float64
 12  member_casual       97584 non-null  object 
dtypes: float64(4), object(9)
memory usage: 9.7+ MB


In [11]:
# Check the Nov 2022 dataset columns
# nov_df.info()
csv_paths

[PosixPath('resources/JC-202308-citibike-tripdata.csv'),
 PosixPath('resources/JC-202310-citibike-tripdata.csv'),
 PosixPath('resources/JC-202309-citibike-tripdata.csv')]

In [10]:
# Find all the CSVs in the resources directory
csv_paths = []
for path in Path("resources").iterdir():
    csv_paths.append(path)

# Create a DataFrame from all the CSVs
combined_df = pd.concat((pd.read_csv(filename) for filename in csv_paths))
combined_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,E2E964A161F786AB,classic_bike,2023-08-07 19:37:47,2023-08-07 19:41:14,6 St & Grand St,HB302,Madison St & 10 St,HB503,40.744398,-74.034501,40.749943,-74.035865,member
1,0660F2E48E3BB87F,classic_bike,2023-08-01 13:16:22,2023-08-01 13:26:02,6 St & Grand St,HB302,6 St & Grand St,HB302,40.744398,-74.034501,40.744398,-74.034501,member
2,940FC7C675232897,classic_bike,2023-08-15 17:28:23,2023-08-15 17:50:35,Heights Elevator,JC059,Heights Elevator,JC059,40.748721,-74.04048,40.748716,-74.040443,member
3,E967660CC5CD585B,classic_bike,2023-08-01 12:44:24,2023-08-01 12:49:45,Hoboken Ave at Monmouth St,JC105,Hoboken Terminal - River St & Hudson Pl,HB102,40.735279,-74.04683,40.736068,-74.029127,member
4,D997CB0B855FE2D6,classic_bike,2023-08-08 12:31:16,2023-08-08 12:40:18,Hoboken Ave at Monmouth St,JC105,Hoboken Terminal - River St & Hudson Pl,HB102,40.735208,-74.046964,40.736068,-74.029127,member


In [12]:
# Display the columns to identify null count
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 303844 entries, 0 to 94232
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             303844 non-null  object 
 1   rideable_type       303844 non-null  object 
 2   started_at          303844 non-null  object 
 3   ended_at            303844 non-null  object 
 4   start_station_name  303837 non-null  object 
 5   start_station_id    303837 non-null  object 
 6   end_station_name    302787 non-null  object 
 7   end_station_id      302787 non-null  object 
 8   start_lat           303844 non-null  float64
 9   start_lng           303844 non-null  float64
 10  end_lat             303535 non-null  float64
 11  end_lng             303535 non-null  float64
 12  member_casual       303844 non-null  object 
dtypes: float64(4), object(9)
memory usage: 32.5+ MB


In [13]:
# Drop all rows which have a null value
clean_df = combined_df.dropna(how="any").reset_index(drop=True)

clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302784 entries, 0 to 302783
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             302784 non-null  object 
 1   rideable_type       302784 non-null  object 
 2   started_at          302784 non-null  object 
 3   ended_at            302784 non-null  object 
 4   start_station_name  302784 non-null  object 
 5   start_station_id    302784 non-null  object 
 6   end_station_name    302784 non-null  object 
 7   end_station_id      302784 non-null  object 
 8   start_lat           302784 non-null  float64
 9   start_lng           302784 non-null  float64
 10  end_lat             302784 non-null  float64
 11  end_lng             302784 non-null  float64
 12  member_casual       302784 non-null  object 
dtypes: float64(4), object(9)
memory usage: 30.0+ MB


In [14]:
clean_df = clean_df.astype({
    'started_at': 'datetime64[ns]',
    'ended_at': 'datetime64[ns]'
})

clean_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,E2E964A161F786AB,classic_bike,2023-08-07 19:37:47,2023-08-07 19:41:14,6 St & Grand St,HB302,Madison St & 10 St,HB503,40.744398,-74.034501,40.749943,-74.035865,member
1,0660F2E48E3BB87F,classic_bike,2023-08-01 13:16:22,2023-08-01 13:26:02,6 St & Grand St,HB302,6 St & Grand St,HB302,40.744398,-74.034501,40.744398,-74.034501,member
2,940FC7C675232897,classic_bike,2023-08-15 17:28:23,2023-08-15 17:50:35,Heights Elevator,JC059,Heights Elevator,JC059,40.748721,-74.04048,40.748716,-74.040443,member
3,E967660CC5CD585B,classic_bike,2023-08-01 12:44:24,2023-08-01 12:49:45,Hoboken Ave at Monmouth St,JC105,Hoboken Terminal - River St & Hudson Pl,HB102,40.735279,-74.04683,40.736068,-74.029127,member
4,D997CB0B855FE2D6,classic_bike,2023-08-08 12:31:16,2023-08-08 12:40:18,Hoboken Ave at Monmouth St,JC105,Hoboken Terminal - River St & Hudson Pl,HB102,40.735208,-74.046964,40.736068,-74.029127,member


In [15]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302784 entries, 0 to 302783
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   ride_id             302784 non-null  object        
 1   rideable_type       302784 non-null  object        
 2   started_at          302784 non-null  datetime64[ns]
 3   ended_at            302784 non-null  datetime64[ns]
 4   start_station_name  302784 non-null  object        
 5   start_station_id    302784 non-null  object        
 6   end_station_name    302784 non-null  object        
 7   end_station_id      302784 non-null  object        
 8   start_lat           302784 non-null  float64       
 9   start_lng           302784 non-null  float64       
 10  end_lat             302784 non-null  float64       
 11  end_lng             302784 non-null  float64       
 12  member_casual       302784 non-null  object        
dtypes: datetime64[ns](2), float64

In [16]:
start_stations = clean_df['start_station_name'].value_counts()

In [17]:
start_stations[start_stations.values < 100]

6 Ave & W 33 St                  2
Broadway & Morris St             2
E 48 St & 3 Ave                  2
Brooklyn Bridge Park - Pier 2    2
Mercer St & Spring St            2
                                ..
5 Ave & E 87 St                  1
E 95 St & 3 Ave                  1
77 St & 31 Ave                   1
50 St & Northern Blvd            1
Pioneer St & Richards St         1
Name: start_station_name, Length: 90, dtype: int64

In [19]:
pattern = 'Pioneer St'
for station in start_stations.index:
    if len(re.findall(pattern, station)) > 0:
        print(station)

Pioneer St & Richards St


In [20]:
same_startend = clean_df.loc[(clean_df['start_lat'] == clean_df['end_lat']) & (clean_df['start_lng'] == clean_df['end_lng'])]
same_startend

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
1,0660F2E48E3BB87F,classic_bike,2023-08-01 13:16:22,2023-08-01 13:26:02,6 St & Grand St,HB302,6 St & Grand St,HB302,40.744398,-74.034501,40.744398,-74.034501,member
10,03B9C22973B055BF,electric_bike,2023-08-02 13:23:34,2023-08-02 13:24:39,Jackson Square,JC063,Jackson Square,JC063,40.711130,-74.078900,40.711130,-74.078900,member
11,6A1209C04341249B,electric_bike,2023-08-02 13:25:13,2023-08-02 13:25:36,Jackson Square,JC063,Jackson Square,JC063,40.711130,-74.078900,40.711130,-74.078900,member
13,1DEF9C2A826DF529,classic_bike,2023-08-14 17:43:22,2023-08-14 18:27:11,Hoboken Ave at Monmouth St,JC105,Hoboken Ave at Monmouth St,JC105,40.735208,-74.046964,40.735208,-74.046964,casual
38,AA55978A1EBD9347,classic_bike,2023-08-15 18:44:11,2023-08-15 19:19:42,Pershing Field,JC024,Pershing Field,JC024,40.742677,-74.051789,40.742677,-74.051789,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
300385,5CF50B644B385CAB,classic_bike,2023-09-11 15:36:07,2023-09-11 15:48:19,8 St & Washington St,HB603,8 St & Washington St,HB603,40.745984,-74.028199,40.745984,-74.028199,member
300387,40E039743C20EEBB,classic_bike,2023-09-17 19:43:20,2023-09-17 19:43:47,Montgomery St,JC099,Montgomery St,JC099,40.719420,-74.050990,40.719420,-74.050990,member
300388,61B1143C8303590E,electric_bike,2023-09-14 11:34:02,2023-09-14 11:34:20,8 St & Washington St,HB603,8 St & Washington St,HB603,40.745984,-74.028199,40.745984,-74.028199,member
300389,1A20B9C643367519,electric_bike,2023-09-17 10:23:55,2023-09-17 10:24:34,11 St & Washington St,HB502,11 St & Washington St,HB502,40.749985,-74.027150,40.749985,-74.027150,casual


In [21]:
duration = same_startend['ended_at'] - same_startend['started_at']
duration

1        0 days 00:09:40
10       0 days 00:01:05
11       0 days 00:00:23
13       0 days 00:43:49
38       0 days 00:35:31
               ...      
300385   0 days 00:12:12
300387   0 days 00:00:27
300388   0 days 00:00:18
300389   0 days 00:00:39
300390   0 days 00:15:52
Length: 12174, dtype: timedelta64[ns]

In [22]:
duration.sort_values()

123669   0 days 00:00:00
1896     0 days 00:00:00
93793    0 days 00:00:00
54238    0 days 00:00:00
132272   0 days 00:00:00
               ...      
183600   0 days 17:12:50
168901   0 days 18:01:53
256249   0 days 18:03:06
181357   0 days 18:12:53
233099   0 days 19:00:20
Length: 12174, dtype: timedelta64[ns]

In [23]:
same_startend[same_startend.index == 170217]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual


In [24]:
invalid = same_startend.loc[same_startend['ended_at'] == same_startend['started_at']]
len(invalid)

9

In [25]:
# Less than one minute in duration
same_startend.loc[duration < pd.Timedelta(minutes=1)]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
11,6A1209C04341249B,electric_bike,2023-08-02 13:25:13,2023-08-02 13:25:36,Jackson Square,JC063,Jackson Square,JC063,40.711130,-74.078900,40.711130,-74.078900,member
120,8EF5194B604BF30E,classic_bike,2023-08-09 12:12:14,2023-08-09 12:12:20,Hoboken Terminal - River St & Hudson Pl,HB102,Hoboken Terminal - River St & Hudson Pl,HB102,40.736068,-74.029127,40.736068,-74.029127,casual
121,06A71E513DA709FD,classic_bike,2023-08-27 11:06:34,2023-08-27 11:06:46,Hoboken Terminal - River St & Hudson Pl,HB102,Hoboken Terminal - River St & Hudson Pl,HB102,40.736068,-74.029127,40.736068,-74.029127,member
232,9A5CF8CEFCB5A0F1,electric_bike,2023-08-12 09:20:32,2023-08-12 09:20:40,Leonard Gordon Park,JC080,Leonard Gordon Park,JC080,40.745910,-74.057271,40.745910,-74.057271,member
247,A2542561E767B213,classic_bike,2023-08-11 21:34:32,2023-08-11 21:34:57,Leonard Gordon Park,JC080,Leonard Gordon Park,JC080,40.745910,-74.057271,40.745910,-74.057271,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...
300352,4BD6C2A59CC68E63,electric_bike,2023-09-02 19:45:55,2023-09-02 19:45:58,Montgomery St,JC099,Montgomery St,JC099,40.719420,-74.050990,40.719420,-74.050990,member
300360,0B8B95C55510D757,classic_bike,2023-09-28 19:20:23,2023-09-28 19:20:25,Montgomery St,JC099,Montgomery St,JC099,40.719420,-74.050990,40.719420,-74.050990,member
300387,40E039743C20EEBB,classic_bike,2023-09-17 19:43:20,2023-09-17 19:43:47,Montgomery St,JC099,Montgomery St,JC099,40.719420,-74.050990,40.719420,-74.050990,member
300388,61B1143C8303590E,electric_bike,2023-09-14 11:34:02,2023-09-14 11:34:20,8 St & Washington St,HB603,8 St & Washington St,HB603,40.745984,-74.028199,40.745984,-74.028199,member


In [28]:
# Longer duration rides
same_startend.loc[duration > pd.Timedelta(hours=6)]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
9349,0398EF623FB318D5,electric_bike,2023-08-19 00:22:59,2023-08-19 17:17:57,Hilltop,JC019,Hilltop,JC019,40.731169,-74.057574,40.731169,-74.057574,member
12518,8EA10592E6553A27,classic_bike,2023-08-12 11:32:02,2023-08-12 18:37:41,Liberty Light Rail,JC052,Liberty Light Rail,JC052,40.711242,-74.055701,40.711242,-74.055701,casual
15775,031E2DAE9A78FC5E,electric_bike,2023-08-05 18:16:03,2023-08-06 01:51:45,Newport PATH,JC066,Newport PATH,JC066,40.727224,-74.033759,40.727224,-74.033759,casual
22171,E0F3483DD2284AA5,classic_bike,2023-08-06 00:30:51,2023-08-06 07:54:04,Hoboken Terminal - River St & Hudson Pl,HB102,Hoboken Terminal - River St & Hudson Pl,HB102,40.736068,-74.029127,40.736068,-74.029127,casual
47838,01DDB57DB1733A15,electric_bike,2023-08-12 01:02:56,2023-08-12 10:01:09,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,40.736982,-74.027781,40.736982,-74.027781,casual
51452,2537C8AC1351C711,classic_bike,2023-08-23 11:21:39,2023-08-23 19:35:20,Lincoln Park,JC053,Lincoln Park,JC053,40.724605,-74.078406,40.724605,-74.078406,casual
56418,2F8936E260CDD4F7,classic_bike,2023-08-22 13:30:40,2023-08-22 19:50:38,14 St Ferry - 14 St & Shipyard Ln,HB202,14 St Ferry - 14 St & Shipyard Ln,HB202,40.752961,-74.024353,40.752961,-74.024353,casual
74016,1BF2A767D0FBD31F,electric_bike,2023-08-02 06:23:49,2023-08-02 17:52:58,9 St HBLR - Jackson St & 8 St,HB305,9 St HBLR - Jackson St & 8 St,HB305,40.747907,-74.038412,40.747907,-74.038412,casual
95389,BC59A7220354FFD5,classic_bike,2023-08-10 00:41:54,2023-08-10 08:42:24,Church Sq Park - 5 St & Park Ave,HB601,Church Sq Park - 5 St & Park Ave,HB601,40.742659,-74.032233,40.742659,-74.032233,casual
168901,35174638CAA29E99,classic_bike,2023-10-05 22:30:30,2023-10-06 16:32:23,Bergen Ave & Sip Ave,JC109,Bergen Ave & Sip Ave,JC109,40.731009,-74.064437,40.731009,-74.064437,casual


In [29]:
clean_df['rideable_type'].value_counts()

classic_bike     275903
electric_bike     26881
Name: rideable_type, dtype: int64

In [30]:
clean_df.loc[clean_df['rideable_type'] == "docked_bike"]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
