# Data Cleaning

## Import dependencies and files

In [143]:
# Import dependencies
import pandas as pd
from pathlib import Path

import re
from datetime import datetime
import matplotlib.pyplot as plt

In [144]:
# Load the October 2023 trip data.
# The station ID columns (5 and 7) trigger a dtype warning when pandas infers
# their type, so they are read explicitly as strings.
oct_path = Path("resources/202310-citibike-tripdata.csv")
station_id_dtypes = {'start_station_id': str, 'end_station_id': str}
oct_df = pd.read_csv(oct_path, dtype=station_id_dtypes)

## Check Data Types

In [145]:
# Summarise every column: non-null count and dtype
oct_df.info(show_counts=True)

# CONCLUSION
# Non-null counts differ between columns (station names/IDs and end
# coordinates have gaps), so some rows are incomplete.
# Latitude and longitude columns are already float64.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3823673 entries, 0 to 3823672
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   ride_id             3823673 non-null  object 
 1   rideable_type       3823673 non-null  object 
 2   started_at          3823673 non-null  object 
 3   ended_at            3823673 non-null  object 
 4   start_station_name  3817887 non-null  object 
 5   start_station_id    3817887 non-null  object 
 6   end_station_name    3808655 non-null  object 
 7   end_station_id      3808655 non-null  object 
 8   start_lat           3823673 non-null  float64
 9   start_lng           3823673 non-null  float64
 10  end_lat             3821286 non-null  float64
 11  end_lng             3821286 non-null  float64
 12  member_casual       3823673 non-null  object 
dtypes: float64(4), object(9)
memory usage: 379.2+ MB


In [146]:
# Count the distinct `ride_id` values (a missing value, if present, counts once)
n_ride_ids = oct_df["ride_id"].nunique(dropna=False)
print(n_ride_ids)

# CONCLUSION
# The distinct count equals the row count, so `ride_id` only identifies rows
# and carries no information for the analysis.
# Drop this column.

3823673


In [147]:
# Tally the rides per bike type
oct_df["rideable_type"].value_counts()

# CONCLUSION
# Only two bike types occur: classic_bike and electric_bike.

rideable_type
classic_bike     3621284
electric_bike     202389
Name: count, dtype: int64

In [148]:
# Preview the first ten raw values of each timestamp column
started_preview = oct_df["started_at"].head(10)
ended_preview = oct_df["ended_at"].head(10)
print(f"started_at:\n{started_preview}\n")
print(f"ended_at:\n{ended_preview}")

# CONCLUSION
# Both columns are stored as strings (dtype object); convert them to datetime.

started_at:
0    2023-10-03 02:48:38
1    2023-10-11 16:03:17
2    2023-10-11 19:57:13
3    2023-10-10 20:18:22
4    2023-10-17 16:26:58
5    2023-10-16 02:44:56
6    2023-10-16 08:46:06
7    2023-10-15 21:47:42
8    2023-10-18 11:43:10
9    2023-10-18 14:08:42
Name: started_at, dtype: object

ended_at:
0    2023-10-03 02:48:40
1    2023-10-11 16:45:26
2    2023-10-11 20:20:10
3    2023-10-10 20:18:37
4    2023-10-17 16:34:27
5    2023-10-16 03:06:06
6    2023-10-16 08:52:23
7    2023-10-15 21:56:48
8    2023-10-18 12:44:16
9    2023-10-18 14:09:50
Name: ended_at, dtype: object


In [149]:
# Check the `start_station_name` and `end_station_name` columns
start_stations = oct_df.start_station_name.value_counts()
end_stations = oct_df.end_station_name.value_counts()

# Isolate station names
start_names = set(start_stations.index)
end_names = set(end_stations.index)

# Stations that appear on one side only. A plain `difference` would report
# start-only stations and silently miss end-only ones, so compare both ways.
one_sided_names = start_names.symmetric_difference(end_names)

# Print statements
print(start_stations, "\n")
print(end_stations, "\n")
print(f"start/end only stations: {one_sided_names}")

# CONCLUSION
# An empty set means every station occurs as both a start and an end station.
# NOTE(review): re-run and confirm the set is empty before relying on this.

start_station_name
W 21 St & 6 Ave                      14676
West St & Chambers St                13392
E 41 St & Madison Ave (SE corner)    12052
University Pl & E 14 St              11993
Broadway & W 58 St                   11790
                                     ...  
Warren St                                1
Madison St & 1 St                        1
JC Medical Center                        1
Morris Canal                             1
Columbus Park - Clinton St & 9 St        1
Name: count, Length: 2116, dtype: int64 

end_station_name
W 21 St & 6 Ave                      14682
West St & Chambers St                13450
University Pl & E 14 St              12049
E 41 St & Madison Ave (SE corner)    11918
Broadway & W 58 St                   11623
                                     ...  
Sedgwick Ave & Hall of Fame Tce          1
Madison St & 1 St                        1
Journal Square                           1
Bergen Ave                               1
River St & 1 St    

In [150]:
# Inspect how often each station ID occurs on the start and end side
start_id_counts = oct_df["start_station_id"].value_counts()
end_id_counts = oct_df["end_station_id"].value_counts()
print(start_id_counts, "\n")
print(end_id_counts)

# CONCLUSION
# The IDs do not follow one format (e.g. 6140.05, JC110, "190 Morgan"),
# so drop these columns and use station names instead.

start_station_id
6140.05    14676
5329.03    13392
6432.10    12052
5905.14    11993
6948.10    11790
           ...  
JC110          1
JC072          1
HB103          1
HB402          1
HB501          1
Name: count, Length: 2063, dtype: int64 

end_station_id
6140.05       14682
5329.03       13450
5905.14       12049
6432.10       11918
6948.10       11623
              ...  
JC053             1
JC034             1
JC084             1
190 Morgan        1
HB609             1
Name: count, Length: 2098, dtype: int64


In [151]:
# Tally the rides per rider category
oct_df["member_casual"].value_counts()

member_casual
member    3100635
casual     723038
Name: count, dtype: int64

## Drop Rows and Columns

In [152]:
# Keep only rows with no missing value in any column,
# then renumber the index from zero
complete_rows = oct_df.dropna()
nonulls_df = complete_rows.reset_index(drop=True)
nonulls_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3806758 entries, 0 to 3806757
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   ride_id             3806758 non-null  object 
 1   rideable_type       3806758 non-null  object 
 2   started_at          3806758 non-null  object 
 3   ended_at            3806758 non-null  object 
 4   start_station_name  3806758 non-null  object 
 5   start_station_id    3806758 non-null  object 
 6   end_station_name    3806758 non-null  object 
 7   end_station_id      3806758 non-null  object 
 8   start_lat           3806758 non-null  float64
 9   start_lng           3806758 non-null  float64
 10  end_lat             3806758 non-null  float64
 11  end_lng             3806758 non-null  float64
 12  member_casual       3806758 non-null  object 
dtypes: float64(4), object(9)
memory usage: 377.6+ MB


In [153]:
# Remove the columns the earlier checks marked for dropping
drop_columns = ['ride_id', 'start_station_id', 'end_station_id']

reduced_df = nonulls_df.drop(labels=drop_columns, axis="columns")
reduced_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3806758 entries, 0 to 3806757
Data columns (total 10 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   rideable_type       3806758 non-null  object 
 1   started_at          3806758 non-null  object 
 2   ended_at            3806758 non-null  object 
 3   start_station_name  3806758 non-null  object 
 4   end_station_name    3806758 non-null  object 
 5   start_lat           3806758 non-null  float64
 6   start_lng           3806758 non-null  float64
 7   end_lat             3806758 non-null  float64
 8   end_lng             3806758 non-null  float64
 9   member_casual       3806758 non-null  object 
dtypes: float64(4), object(6)
memory usage: 290.4+ MB


## Convert to datetime

In [155]:
# Cast both timestamp columns from strings to datetime64[ns]
datetime_dtypes = {
    column: 'datetime64[ns]' for column in ('started_at', 'ended_at')
}
clean_df = reduced_df.astype(datetime_dtypes)
clean_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3806758 entries, 0 to 3806757
Data columns (total 10 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   rideable_type       3806758 non-null  object        
 1   started_at          3806758 non-null  datetime64[ns]
 2   ended_at            3806758 non-null  datetime64[ns]
 3   start_station_name  3806758 non-null  object        
 4   end_station_name    3806758 non-null  object        
 5   start_lat           3806758 non-null  float64       
 6   start_lng           3806758 non-null  float64       
 7   end_lat             3806758 non-null  float64       
 8   end_lng             3806758 non-null  float64       
 9   member_casual       3806758 non-null  object        
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 290.4+ MB


In [156]:
# Preview the first five rows of the cleaned DataFrame
clean_df.head(n=5)

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
0,classic_bike,2023-10-03 02:48:38,2023-10-03 02:48:40,Columbus Pl & Atlantic Ave,Columbus Pl & Atlantic Ave,40.677223,-73.922792,40.67717,-73.92285,casual
1,classic_bike,2023-10-11 16:03:17,2023-10-11 16:45:26,Central Park West & W 85 St,Central Park West & W 85 St,40.78476,-73.969862,40.78476,-73.969862,casual
2,classic_bike,2023-10-11 19:57:13,2023-10-11 20:20:10,Hicks St & Montague St,5 St & 6 Ave,40.694974,-73.995936,40.670484,-73.98209,casual
3,classic_bike,2023-10-10 20:18:22,2023-10-10 20:18:37,Atlantic Ave & Furman St,Atlantic Ave & Furman St,40.691669,-74.000139,40.691652,-73.999979,casual
4,classic_bike,2023-10-17 16:26:58,2023-10-17 16:34:27,E 41 St & Madison Ave (SE corner),E 58 St & 3 Ave,40.751845,-73.979585,40.760958,-73.967245,casual


## Check for Invalid Trips

In [157]:
# Trip duration = end timestamp minus start timestamp
duration = clean_df["ended_at"] - clean_df["started_at"]

# Shortest first, so any negative durations surface at the top
duration.sort_values()

2672150   -1 days +23:47:45
2576833   -1 days +23:50:34
2179299   -1 days +23:55:08
3685032   -1 days +23:55:32
2358376   -1 days +23:55:52
                 ...       
3751717     1 days 00:58:19
399235      1 days 00:58:44
2427636     1 days 00:59:12
3725100     1 days 00:59:13
2719744     1 days 00:59:26
Length: 3806758, dtype: timedelta64[ns]

In [158]:
# Keep only trips that ended strictly after they started
# (this removes negative and zero-length durations alike)
has_positive_duration = duration > pd.Timedelta(0)
positive_durations = clean_df.loc[has_positive_duration].copy()

# Show the remaining row count and a preview
print(positive_durations.shape)
positive_durations.head()

(3805477, 10)


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
0,classic_bike,2023-10-03 02:48:38,2023-10-03 02:48:40,Columbus Pl & Atlantic Ave,Columbus Pl & Atlantic Ave,40.677223,-73.922792,40.67717,-73.92285,casual
1,classic_bike,2023-10-11 16:03:17,2023-10-11 16:45:26,Central Park West & W 85 St,Central Park West & W 85 St,40.78476,-73.969862,40.78476,-73.969862,casual
2,classic_bike,2023-10-11 19:57:13,2023-10-11 20:20:10,Hicks St & Montague St,5 St & 6 Ave,40.694974,-73.995936,40.670484,-73.98209,casual
3,classic_bike,2023-10-10 20:18:22,2023-10-10 20:18:37,Atlantic Ave & Furman St,Atlantic Ave & Furman St,40.691669,-74.000139,40.691652,-73.999979,casual
4,classic_bike,2023-10-17 16:26:58,2023-10-17 16:34:27,E 41 St & Madison Ave (SE corner),E 58 St & 3 Ave,40.751845,-73.979585,40.760958,-73.967245,casual


In [176]:
# Recompute durations on the filtered trips and express them in minutes
duration = positive_durations["ended_at"] - positive_durations["started_at"]
one_minute = pd.Timedelta(minutes=1)
duration_in_mins = duration / one_minute
duration_in_mins.describe()

count    3.805477e+06
mean     1.354020e+01
std      2.891120e+01
min      1.666667e-02
25%      5.183333e+00
50%      9.216667e+00
75%      1.606667e+01
max      1.499433e+03
dtype: float64

In [142]:
# Check long duration trips: anything longer than six hours, the same
# cut-off the other long-trip checks in this notebook use.
# (`hours=` had no value, which is a SyntaxError.)
long_duration = positive_durations.loc[duration > pd.Timedelta(hours=6)]
long_duration

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
5839,classic_bike,2023-10-08 10:29:34,2023-10-09 04:30:43,12 Ave & W 40 St,W 22 St & 10 Ave,40.760875,-74.002777,40.746920,-74.004519,casual
5978,classic_bike,2023-10-08 13:26:15,2023-10-09 04:30:43,12 Ave & W 40 St,W 22 St & 10 Ave,40.760875,-74.002777,40.746920,-74.004519,member
5994,classic_bike,2023-10-28 12:43:57,2023-10-28 21:32:12,1 Ave & E 18 St,E 6 St & Ave D,40.733812,-73.980544,40.722281,-73.976687,member
7390,classic_bike,2023-10-01 09:26:08,2023-10-02 08:10:38,12 Ave & W 40 St,W 54 St & 11 Ave,40.760875,-74.002777,40.768333,-73.992573,casual
7533,classic_bike,2023-10-01 11:18:04,2023-10-02 08:10:38,12 Ave & W 40 St,W 54 St & 11 Ave,40.760875,-74.002777,40.768333,-73.992573,casual
...,...,...,...,...,...,...,...,...,...,...
3801122,classic_bike,2023-10-03 11:39:55,2023-10-03 18:15:38,2 Ave & E 29 St,E 58 St & 3 Ave,40.741724,-73.978093,40.760958,-73.967245,member
3801422,classic_bike,2023-10-19 13:13:44,2023-10-20 08:24:02,Central Park West & W 76 St,5 Ave & E 72 St,40.778968,-73.973747,40.772828,-73.966853,casual
3802527,classic_bike,2023-10-19 13:19:47,2023-10-20 08:24:02,W 110 St & Amsterdam Ave,5 Ave & E 72 St,40.802692,-73.962950,40.772828,-73.966853,casual
3803812,classic_bike,2023-10-19 13:33:34,2023-10-20 08:24:02,Central Park North & Adam Clayton Powell Blvd,5 Ave & E 72 St,40.799484,-73.955613,40.772828,-73.966853,casual


In [135]:
# Flag trips whose start and end coordinates are exactly equal
lat_condition = positive_durations.start_lat == positive_durations.end_lat
lng_condition = positive_durations.start_lng == positive_durations.end_lng

# A trip qualifies only when both latitude and longitude match
same_startend = positive_durations.loc[lat_condition & lng_condition]

# Show the row count and a preview
print(same_startend.shape)
same_startend.head()

(96738, 10)


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
1,classic_bike,2023-10-11 16:03:17,2023-10-11 16:45:26,Central Park West & W 85 St,Central Park West & W 85 St,40.78476,-73.969862,40.78476,-73.969862,casual
9,classic_bike,2023-10-18 14:08:42,2023-10-18 14:09:50,W 48 St & Rockefeller Plaza,W 48 St & Rockefeller Plaza,40.757769,-73.979294,40.757769,-73.979294,casual
13,classic_bike,2023-10-27 13:12:05,2023-10-27 13:13:00,5 Ave & E 72 St,5 Ave & E 72 St,40.772828,-73.966853,40.772828,-73.966853,casual
22,classic_bike,2023-10-08 07:31:17,2023-10-08 07:32:59,W 56 St & 6 Ave,W 56 St & 6 Ave,40.763406,-73.977225,40.763406,-73.977225,casual
23,classic_bike,2023-10-08 06:52:23,2023-10-08 07:18:25,E 58 St & 3 Ave,E 58 St & 3 Ave,40.760958,-73.967245,40.760958,-73.967245,member


In [137]:
# Durations of the trips that start and end at the same coordinates
duration = same_startend.ended_at - same_startend.started_at

# Shortest first
duration.sort_values()

3296259   0 days 00:00:01
1951620   0 days 00:00:01
1063902   0 days 00:00:01
2569018   0 days 00:00:01
2657039   0 days 00:00:01
                ...      
2260012   1 days 00:29:07
1709170   1 days 00:30:04
3298396   1 days 00:43:11
3684818   1 days 00:50:20
1175999   1 days 00:50:59
Length: 96738, dtype: timedelta64[ns]

In [138]:
# Drop trips <1 minute in duration (incorrect docking, etc.).
# `>=` keeps trips lasting exactly one minute; a strict `>` would also
# discard them even though they are not shorter than one minute.
at_least_one_minute = duration >= pd.Timedelta(minutes=1)
valid_trips = same_startend.loc[at_least_one_minute].copy()
valid_trips.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
1,classic_bike,2023-10-11 16:03:17,2023-10-11 16:45:26,Central Park West & W 85 St,Central Park West & W 85 St,40.78476,-73.969862,40.78476,-73.969862,casual
9,classic_bike,2023-10-18 14:08:42,2023-10-18 14:09:50,W 48 St & Rockefeller Plaza,W 48 St & Rockefeller Plaza,40.757769,-73.979294,40.757769,-73.979294,casual
22,classic_bike,2023-10-08 07:31:17,2023-10-08 07:32:59,W 56 St & 6 Ave,W 56 St & 6 Ave,40.763406,-73.977225,40.763406,-73.977225,casual
23,classic_bike,2023-10-08 06:52:23,2023-10-08 07:18:25,E 58 St & 3 Ave,E 58 St & 3 Ave,40.760958,-73.967245,40.760958,-73.967245,member
29,classic_bike,2023-10-08 18:20:27,2023-10-08 18:55:46,Convent Ave & W 151 St,Convent Ave & W 151 St,40.82831,-73.942924,40.82831,-73.942924,casual


In [126]:
# Same-station trips in `valid_trips` that lasted more than six hours.
# `duration` is indexed like `same_startend`; `.loc` aligns the mask to
# the rows of `valid_trips`.
six_hours = pd.Timedelta(hours=6)
valid_trips.loc[duration > six_hours]

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
188048,classic_bike,2023-10-08 08:09:41,2023-10-08 14:53:05,E 27 St & 1 Ave,E 27 St & 1 Ave,40.739445,-73.976806,40.739445,-73.976806,member
188668,classic_bike,2023-10-20 20:41:20,2023-10-21 17:24:58,Cliff St & Fulton St,Cliff St & Fulton St,40.708380,-74.004950,40.708380,-74.004950,member
248315,electric_bike,2023-10-22 02:28:42,2023-10-22 09:05:47,N 12 St & Bedford Ave,N 12 St & Bedford Ave,40.720798,-73.954847,40.720798,-73.954847,casual
327650,classic_bike,2023-10-22 15:30:57,2023-10-23 08:24:34,5 Ave & E 72 St,5 Ave & E 72 St,40.772828,-73.966853,40.772828,-73.966853,casual
327877,classic_bike,2023-10-27 15:00:22,2023-10-28 08:02:41,5 Ave & E 72 St,5 Ave & E 72 St,40.772828,-73.966853,40.772828,-73.966853,casual
...,...,...,...,...,...,...,...,...,...,...
3698707,classic_bike,2023-10-08 13:55:38,2023-10-08 20:20:37,10 Ave & W 14 St,10 Ave & W 14 St,40.741982,-74.008316,40.741982,-74.008316,casual
3735506,classic_bike,2023-10-13 13:37:44,2023-10-14 09:50:30,Madison Ave & E 82 St,Madison Ave & E 82 St,40.778131,-73.960694,40.778131,-73.960694,casual
3737724,classic_bike,2023-10-08 16:24:54,2023-10-09 09:52:30,President St & Henry St,President St & Henry St,40.682800,-73.999904,40.682800,-73.999904,casual
3738392,classic_bike,2023-10-07 22:00:27,2023-10-08 15:08:32,West St & Liberty St,West St & Liberty St,40.711444,-74.014847,40.711444,-74.014847,casual


In [16]:
# Departures per start station, counted on the cleaned data.
# NOTE(review): this rebinds `start_stations`, which an earlier cell builds
# from `oct_df`, and the cell's execution count is out of sequence.
start_stations = clean_df.start_station_name.value_counts()

In [17]:
# Start stations with fewer than 100 departures
rarely_used = start_stations < 100
start_stations[rarely_used]

6 Ave & W 33 St                  2
Broadway & Morris St             2
E 48 St & 3 Ave                  2
Brooklyn Bridge Park - Pier 2    2
Mercer St & Spring St            2
                                ..
5 Ave & E 87 St                  1
E 95 St & 3 Ave                  1
77 St & 31 Ave                   1
50 St & Northern Blvd            1
Pioneer St & Richards St         1
Name: start_station_name, Length: 90, dtype: int64

In [19]:
# Print every start station whose name contains the pattern
pattern = 'Pioneer St'
for station in start_stations.index:
    if re.search(pattern, station):
        print(station)

Pioneer St & Richards St


In [94]:
# Trips whose start and end coordinates are exactly equal, taken from `clean_df`.
# NOTE(review): this rebinds `same_startend`, which an earlier cell builds
# from `positive_durations` — confirm which version later cells should use.
same_lat = clean_df['start_lat'] == clean_df['end_lat']
same_lng = clean_df['start_lng'] == clean_df['end_lng']
same_startend = clean_df.loc[same_lat & same_lng]
same_startend

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
1,classic_bike,2023-10-11 16:03:17,2023-10-11 16:45:26,Central Park West & W 85 St,Central Park West & W 85 St,40.784760,-73.969862,40.784760,-73.969862,casual
9,classic_bike,2023-10-18 14:08:42,2023-10-18 14:09:50,W 48 St & Rockefeller Plaza,W 48 St & Rockefeller Plaza,40.757769,-73.979294,40.757769,-73.979294,casual
13,classic_bike,2023-10-27 13:12:05,2023-10-27 13:13:00,5 Ave & E 72 St,5 Ave & E 72 St,40.772828,-73.966853,40.772828,-73.966853,casual
22,classic_bike,2023-10-08 07:31:17,2023-10-08 07:32:59,W 56 St & 6 Ave,W 56 St & 6 Ave,40.763406,-73.977225,40.763406,-73.977225,casual
23,classic_bike,2023-10-08 06:52:23,2023-10-08 07:18:25,E 58 St & 3 Ave,E 58 St & 3 Ave,40.760958,-73.967245,40.760958,-73.967245,member
...,...,...,...,...,...,...,...,...,...,...
3747015,electric_bike,2023-10-23 19:07:02,2023-10-23 19:07:06,E 32 St & Park Ave,E 32 St & Park Ave,40.745712,-73.981948,40.745712,-73.981948,member
3747016,electric_bike,2023-10-23 19:05:29,2023-10-23 19:06:09,E 32 St & Park Ave,E 32 St & Park Ave,40.745712,-73.981948,40.745712,-73.981948,member
3747019,electric_bike,2023-10-14 11:30:49,2023-10-14 12:00:18,W 67 St & Broadway,W 67 St & Broadway,40.774925,-73.982666,40.774925,-73.982666,member
3747021,classic_bike,2023-10-29 16:43:43,2023-10-29 16:44:29,Kent Ave & Grand St,Kent Ave & Grand St,40.716425,-73.965940,40.716425,-73.965940,member


In [21]:
# Durations of the same-coordinate trips.
# NOTE(review): rebinds `duration`, which earlier cells also assign.
duration = same_startend.ended_at - same_startend.started_at
duration

1        0 days 00:09:40
10       0 days 00:01:05
11       0 days 00:00:23
13       0 days 00:43:49
38       0 days 00:35:31
               ...      
300385   0 days 00:12:12
300387   0 days 00:00:27
300388   0 days 00:00:18
300389   0 days 00:00:39
300390   0 days 00:15:52
Length: 12174, dtype: timedelta64[ns]

In [22]:
# Durations ordered from shortest to longest
duration.sort_values(ascending=True)

123669   0 days 00:00:00
1896     0 days 00:00:00
93793    0 days 00:00:00
54238    0 days 00:00:00
132272   0 days 00:00:00
               ...      
183600   0 days 17:12:50
168901   0 days 18:01:53
256249   0 days 18:03:06
181357   0 days 18:12:53
233099   0 days 19:00:20
Length: 12174, dtype: timedelta64[ns]

In [23]:
# Look up one trip by its index label (yields an empty frame if the label is absent)
same_startend.loc[same_startend.index == 170217]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual


In [24]:
# Count same-coordinate trips whose start and end timestamps are identical
zero_length = same_startend.ended_at == same_startend.started_at
invalid = same_startend.loc[zero_length]
len(invalid)

9

In [25]:
# Same-coordinate trips that lasted less than one minute
under_one_minute = duration < pd.Timedelta(minutes=1)
same_startend.loc[under_one_minute]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
11,6A1209C04341249B,electric_bike,2023-08-02 13:25:13,2023-08-02 13:25:36,Jackson Square,JC063,Jackson Square,JC063,40.711130,-74.078900,40.711130,-74.078900,member
120,8EF5194B604BF30E,classic_bike,2023-08-09 12:12:14,2023-08-09 12:12:20,Hoboken Terminal - River St & Hudson Pl,HB102,Hoboken Terminal - River St & Hudson Pl,HB102,40.736068,-74.029127,40.736068,-74.029127,casual
121,06A71E513DA709FD,classic_bike,2023-08-27 11:06:34,2023-08-27 11:06:46,Hoboken Terminal - River St & Hudson Pl,HB102,Hoboken Terminal - River St & Hudson Pl,HB102,40.736068,-74.029127,40.736068,-74.029127,member
232,9A5CF8CEFCB5A0F1,electric_bike,2023-08-12 09:20:32,2023-08-12 09:20:40,Leonard Gordon Park,JC080,Leonard Gordon Park,JC080,40.745910,-74.057271,40.745910,-74.057271,member
247,A2542561E767B213,classic_bike,2023-08-11 21:34:32,2023-08-11 21:34:57,Leonard Gordon Park,JC080,Leonard Gordon Park,JC080,40.745910,-74.057271,40.745910,-74.057271,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...
300352,4BD6C2A59CC68E63,electric_bike,2023-09-02 19:45:55,2023-09-02 19:45:58,Montgomery St,JC099,Montgomery St,JC099,40.719420,-74.050990,40.719420,-74.050990,member
300360,0B8B95C55510D757,classic_bike,2023-09-28 19:20:23,2023-09-28 19:20:25,Montgomery St,JC099,Montgomery St,JC099,40.719420,-74.050990,40.719420,-74.050990,member
300387,40E039743C20EEBB,classic_bike,2023-09-17 19:43:20,2023-09-17 19:43:47,Montgomery St,JC099,Montgomery St,JC099,40.719420,-74.050990,40.719420,-74.050990,member
300388,61B1143C8303590E,electric_bike,2023-09-14 11:34:02,2023-09-14 11:34:20,8 St & Washington St,HB603,8 St & Washington St,HB603,40.745984,-74.028199,40.745984,-74.028199,member


In [28]:
# Same-coordinate trips that lasted more than six hours
over_six_hours = duration > pd.Timedelta(hours=6)
same_startend.loc[over_six_hours]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
9349,0398EF623FB318D5,electric_bike,2023-08-19 00:22:59,2023-08-19 17:17:57,Hilltop,JC019,Hilltop,JC019,40.731169,-74.057574,40.731169,-74.057574,member
12518,8EA10592E6553A27,classic_bike,2023-08-12 11:32:02,2023-08-12 18:37:41,Liberty Light Rail,JC052,Liberty Light Rail,JC052,40.711242,-74.055701,40.711242,-74.055701,casual
15775,031E2DAE9A78FC5E,electric_bike,2023-08-05 18:16:03,2023-08-06 01:51:45,Newport PATH,JC066,Newport PATH,JC066,40.727224,-74.033759,40.727224,-74.033759,casual
22171,E0F3483DD2284AA5,classic_bike,2023-08-06 00:30:51,2023-08-06 07:54:04,Hoboken Terminal - River St & Hudson Pl,HB102,Hoboken Terminal - River St & Hudson Pl,HB102,40.736068,-74.029127,40.736068,-74.029127,casual
47838,01DDB57DB1733A15,electric_bike,2023-08-12 01:02:56,2023-08-12 10:01:09,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,40.736982,-74.027781,40.736982,-74.027781,casual
51452,2537C8AC1351C711,classic_bike,2023-08-23 11:21:39,2023-08-23 19:35:20,Lincoln Park,JC053,Lincoln Park,JC053,40.724605,-74.078406,40.724605,-74.078406,casual
56418,2F8936E260CDD4F7,classic_bike,2023-08-22 13:30:40,2023-08-22 19:50:38,14 St Ferry - 14 St & Shipyard Ln,HB202,14 St Ferry - 14 St & Shipyard Ln,HB202,40.752961,-74.024353,40.752961,-74.024353,casual
74016,1BF2A767D0FBD31F,electric_bike,2023-08-02 06:23:49,2023-08-02 17:52:58,9 St HBLR - Jackson St & 8 St,HB305,9 St HBLR - Jackson St & 8 St,HB305,40.747907,-74.038412,40.747907,-74.038412,casual
95389,BC59A7220354FFD5,classic_bike,2023-08-10 00:41:54,2023-08-10 08:42:24,Church Sq Park - 5 St & Park Ave,HB601,Church Sq Park - 5 St & Park Ave,HB601,40.742659,-74.032233,40.742659,-74.032233,casual
168901,35174638CAA29E99,classic_bike,2023-10-05 22:30:30,2023-10-06 16:32:23,Bergen Ave & Sip Ave,JC109,Bergen Ave & Sip Ave,JC109,40.731009,-74.064437,40.731009,-74.064437,casual


In [29]:
# Rides per bike type in the cleaned data
clean_df.rideable_type.value_counts()

classic_bike     275903
electric_bike     26881
Name: rideable_type, dtype: int64

In [30]:
# Look for any rides recorded with the `docked_bike` type
is_docked = clean_df.rideable_type == "docked_bike"
clean_df.loc[is_docked]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
