# Data Cleaning

## Import dependencies and files

In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path

import re
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
# Import Oct 2023 dataset
oct_path = Path("resources/202310-citibike-tripdata.csv")
oct_df = pd.read_csv(
    oct_path,
    dtype = {
        'start_station_id': str, # dtype warning, column 5
        'end_station_id': str # dtype warning, column 7
    })

## Check data types

In [3]:
# Check the dataset columns
oct_df.info(show_counts=True)

# CONCLUSION
# Inconsistent non-null counts, incomplete data.
# Latitude and longitude data types correct.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3823673 entries, 0 to 3823672
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   ride_id             3823673 non-null  object 
 1   rideable_type       3823673 non-null  object 
 2   started_at          3823673 non-null  object 
 3   ended_at            3823673 non-null  object 
 4   start_station_name  3817887 non-null  object 
 5   start_station_id    3817887 non-null  object 
 6   end_station_name    3808655 non-null  object 
 7   end_station_id      3808655 non-null  object 
 8   start_lat           3823673 non-null  float64
 9   start_lng           3823673 non-null  float64
 10  end_lat             3821286 non-null  float64
 11  end_lng             3821286 non-null  float64
 12  member_casual       3823673 non-null  object 
dtypes: float64(4), object(9)
memory usage: 379.2+ MB


In [4]:
# Check the `ride_id` column
print(len(oct_df.ride_id.unique()))

# CONCLUSION
# Each `ride_id` is unique, irrelevant to the analysis.
# Drop this column.

3823673


In [5]:
# Check the `rideable_type` column
oct_df.rideable_type.value_counts()

# CONCLUSION
# Two types: classic, electric.

rideable_type
classic_bike     3621284
electric_bike     202389
Name: count, dtype: int64

In [6]:
# Check the `started_at` and `ended_at` columns
print(f"started_at:\n{oct_df.started_at[:10]}\n")
print(f"ended_at:\n{oct_df.ended_at[:10]}")

# CONCLUSION
# Convert to datetime datatype.

started_at:
0    2023-10-03 02:48:38
1    2023-10-11 16:03:17
2    2023-10-11 19:57:13
3    2023-10-10 20:18:22
4    2023-10-17 16:26:58
5    2023-10-16 02:44:56
6    2023-10-16 08:46:06
7    2023-10-15 21:47:42
8    2023-10-18 11:43:10
9    2023-10-18 14:08:42
Name: started_at, dtype: object

ended_at:
0    2023-10-03 02:48:40
1    2023-10-11 16:45:26
2    2023-10-11 20:20:10
3    2023-10-10 20:18:37
4    2023-10-17 16:34:27
5    2023-10-16 03:06:06
6    2023-10-16 08:52:23
7    2023-10-15 21:56:48
8    2023-10-18 12:44:16
9    2023-10-18 14:09:50
Name: ended_at, dtype: object


In [7]:
# Check the `start_station_name` and `end_station_name` columns
start_stations = oct_df.start_station_name.value_counts()
end_stations = oct_df.end_station_name.value_counts()

# Isolate station names
start_names = set(start_stations.index)
end_names = set(end_stations.index)

# Print statements
print(start_stations, "\n")
print(end_stations, "\n")
print(f"start/end only stations: {start_names.difference(end_names)}")

# CONCLUSION
# All stations can be a start or end station.

start_station_name
W 21 St & 6 Ave                      14676
West St & Chambers St                13392
E 41 St & Madison Ave (SE corner)    12052
University Pl & E 14 St              11993
Broadway & W 58 St                   11790
                                     ...  
Warren St                                1
Madison St & 1 St                        1
JC Medical Center                        1
Morris Canal                             1
Columbus Park - Clinton St & 9 St        1
Name: count, Length: 2116, dtype: int64 

end_station_name
W 21 St & 6 Ave                      14682
West St & Chambers St                13450
University Pl & E 14 St              12049
E 41 St & Madison Ave (SE corner)    11918
Broadway & W 58 St                   11623
                                     ...  
Sedgwick Ave & Hall of Fame Tce          1
Madison St & 1 St                        1
Journal Square                           1
Bergen Ave                               1
River St & 1 St    

In [8]:
# Check the `start_station_id` and `end_station_id` columns
print(oct_df.start_station_id.value_counts(), "\n")
print(oct_df.end_station_id.value_counts())

# CONCLUSION
# Inconsistent column values, drop these columns and
# use station names instead.

start_station_id
6140.05    14676
5329.03    13392
6432.10    12052
5905.14    11993
6948.10    11790
           ...  
JC110          1
JC072          1
HB103          1
HB402          1
HB501          1
Name: count, Length: 2063, dtype: int64 

end_station_id
6140.05       14682
5329.03       13450
5905.14       12049
6432.10       11918
6948.10       11623
              ...  
JC053             1
JC034             1
JC084             1
190 Morgan        1
HB609             1
Name: count, Length: 2098, dtype: int64


In [9]:
# Check the `member_casual` column
oct_df.member_casual.value_counts()

member_casual
member    3100635
casual     723038
Name: count, dtype: int64

## Drop rows and columns

In [10]:
# Drop rows with null values
nonulls_df = oct_df.dropna(how="any").reset_index(drop=True)
nonulls_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3806758 entries, 0 to 3806757
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   ride_id             3806758 non-null  object 
 1   rideable_type       3806758 non-null  object 
 2   started_at          3806758 non-null  object 
 3   ended_at            3806758 non-null  object 
 4   start_station_name  3806758 non-null  object 
 5   start_station_id    3806758 non-null  object 
 6   end_station_name    3806758 non-null  object 
 7   end_station_id      3806758 non-null  object 
 8   start_lat           3806758 non-null  float64
 9   start_lng           3806758 non-null  float64
 10  end_lat             3806758 non-null  float64
 11  end_lng             3806758 non-null  float64
 12  member_casual       3806758 non-null  object 
dtypes: float64(4), object(9)
memory usage: 377.6+ MB


In [11]:
# Drop columns: `ride_id`, `start_station_id`, `end_station_id`
drop_columns = ['ride_id', 'start_station_id', 'end_station_id']

reduced_df = nonulls_df.drop(columns=drop_columns)
reduced_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3806758 entries, 0 to 3806757
Data columns (total 10 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   rideable_type       3806758 non-null  object 
 1   started_at          3806758 non-null  object 
 2   ended_at            3806758 non-null  object 
 3   start_station_name  3806758 non-null  object 
 4   end_station_name    3806758 non-null  object 
 5   start_lat           3806758 non-null  float64
 6   start_lng           3806758 non-null  float64
 7   end_lat             3806758 non-null  float64
 8   end_lng             3806758 non-null  float64
 9   member_casual       3806758 non-null  object 
dtypes: float64(4), object(6)
memory usage: 290.4+ MB


## Convert to datetime

In [12]:
# Convert the `started_at` and `ended_at` columns to datetime
clean_df = reduced_df.astype({
    'started_at': 'datetime64[ns]',
    'ended_at': 'datetime64[ns]'
})
clean_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3806758 entries, 0 to 3806757
Data columns (total 10 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   rideable_type       3806758 non-null  object        
 1   started_at          3806758 non-null  datetime64[ns]
 2   ended_at            3806758 non-null  datetime64[ns]
 3   start_station_name  3806758 non-null  object        
 4   end_station_name    3806758 non-null  object        
 5   start_lat           3806758 non-null  float64       
 6   start_lng           3806758 non-null  float64       
 7   end_lat             3806758 non-null  float64       
 8   end_lng             3806758 non-null  float64       
 9   member_casual       3806758 non-null  object        
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 290.4+ MB


In [13]:
# Display cleaned DataFrame
clean_df.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
0,classic_bike,2023-10-03 02:48:38,2023-10-03 02:48:40,Columbus Pl & Atlantic Ave,Columbus Pl & Atlantic Ave,40.677223,-73.922792,40.67717,-73.92285,casual
1,classic_bike,2023-10-11 16:03:17,2023-10-11 16:45:26,Central Park West & W 85 St,Central Park West & W 85 St,40.78476,-73.969862,40.78476,-73.969862,casual
2,classic_bike,2023-10-11 19:57:13,2023-10-11 20:20:10,Hicks St & Montague St,5 St & 6 Ave,40.694974,-73.995936,40.670484,-73.98209,casual
3,classic_bike,2023-10-10 20:18:22,2023-10-10 20:18:37,Atlantic Ave & Furman St,Atlantic Ave & Furman St,40.691669,-74.000139,40.691652,-73.999979,casual
4,classic_bike,2023-10-17 16:26:58,2023-10-17 16:34:27,E 41 St & Madison Ave (SE corner),E 58 St & 3 Ave,40.751845,-73.979585,40.760958,-73.967245,casual


## Check for invalid trips

In [14]:
# Calculate the trip duration
duration = clean_df.ended_at - clean_df.started_at

duration.sort_values(ascending=True)

2672150   -1 days +23:47:45
2576833   -1 days +23:50:34
2179299   -1 days +23:55:08
3685032   -1 days +23:55:32
2358376   -1 days +23:55:52
                 ...       
3751717     1 days 00:58:19
399235      1 days 00:58:44
2427636     1 days 00:59:12
3725100     1 days 00:59:13
2719744     1 days 00:59:26
Length: 3806758, dtype: timedelta64[ns]

### Negative Duration

In [15]:
# Drop rows with a negative duration
positive_durations = clean_df.loc[duration > pd.Timedelta(0)].copy()

# Display the shape and head
print(positive_durations.shape)
positive_durations.head()

(3805477, 10)


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
0,classic_bike,2023-10-03 02:48:38,2023-10-03 02:48:40,Columbus Pl & Atlantic Ave,Columbus Pl & Atlantic Ave,40.677223,-73.922792,40.67717,-73.92285,casual
1,classic_bike,2023-10-11 16:03:17,2023-10-11 16:45:26,Central Park West & W 85 St,Central Park West & W 85 St,40.78476,-73.969862,40.78476,-73.969862,casual
2,classic_bike,2023-10-11 19:57:13,2023-10-11 20:20:10,Hicks St & Montague St,5 St & 6 Ave,40.694974,-73.995936,40.670484,-73.98209,casual
3,classic_bike,2023-10-10 20:18:22,2023-10-10 20:18:37,Atlantic Ave & Furman St,Atlantic Ave & Furman St,40.691669,-74.000139,40.691652,-73.999979,casual
4,classic_bike,2023-10-17 16:26:58,2023-10-17 16:34:27,E 41 St & Madison Ave (SE corner),E 58 St & 3 Ave,40.751845,-73.979585,40.760958,-73.967245,casual


In [16]:
# Calculate updated duration values and convert to minutes
duration = positive_durations.ended_at - positive_durations.started_at

# Convert to minutes, as integer
duration_in_mins = (duration.dt.total_seconds() / 60).astype(int)

# Descriptive statistics, as integers
duration_stats = duration_in_mins.describe().apply(lambda x: round(x))
duration_stats

count    3805477
mean          13
std           29
min            0
25%            5
50%            9
75%           16
max         1499
dtype: int64

In [17]:
# Add the duration_in_mins as a new column
positive_durations['duration_in_mins'] = duration_in_mins

positive_durations.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,duration_in_mins
0,classic_bike,2023-10-03 02:48:38,2023-10-03 02:48:40,Columbus Pl & Atlantic Ave,Columbus Pl & Atlantic Ave,40.677223,-73.922792,40.67717,-73.92285,casual,0
1,classic_bike,2023-10-11 16:03:17,2023-10-11 16:45:26,Central Park West & W 85 St,Central Park West & W 85 St,40.78476,-73.969862,40.78476,-73.969862,casual,42
2,classic_bike,2023-10-11 19:57:13,2023-10-11 20:20:10,Hicks St & Montague St,5 St & 6 Ave,40.694974,-73.995936,40.670484,-73.98209,casual,22
3,classic_bike,2023-10-10 20:18:22,2023-10-10 20:18:37,Atlantic Ave & Furman St,Atlantic Ave & Furman St,40.691669,-74.000139,40.691652,-73.999979,casual,0
4,classic_bike,2023-10-17 16:26:58,2023-10-17 16:34:27,E 41 St & Madison Ave (SE corner),E 58 St & 3 Ave,40.751845,-73.979585,40.760958,-73.967245,casual,7


### Short Duration Trips (< 1 minute duration)

In [18]:
# Identify trips < 1 minute (potential user error, incorrect docking, etc.)
lessthan_minute = positive_durations.loc[duration < pd.Timedelta(minutes=1)]
lessthan_minute

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,duration_in_mins
0,classic_bike,2023-10-03 02:48:38,2023-10-03 02:48:40,Columbus Pl & Atlantic Ave,Columbus Pl & Atlantic Ave,40.677223,-73.922792,40.677170,-73.922850,casual,0
3,classic_bike,2023-10-10 20:18:22,2023-10-10 20:18:37,Atlantic Ave & Furman St,Atlantic Ave & Furman St,40.691669,-74.000139,40.691652,-73.999979,casual,0
10,classic_bike,2023-10-27 18:45:35,2023-10-27 18:46:03,W 12 St & Hudson St,W 12 St & Hudson St,40.737479,-74.005524,40.737530,-74.005589,casual,0
11,classic_bike,2023-10-27 17:24:59,2023-10-27 17:25:05,Central Park West & W 85 St,Central Park West & W 85 St,40.784824,-73.969830,40.784760,-73.969862,casual,0
12,classic_bike,2023-10-26 21:49:46,2023-10-26 21:50:06,Canal St & Rutgers St,Canal St & Rutgers St,40.714229,-73.989844,40.714275,-73.989900,casual,0
...,...,...,...,...,...,...,...,...,...,...,...
3797732,classic_bike,2023-10-17 09:21:53,2023-10-17 09:22:50,Harrison St & Hudson St,Duane St & Hudson St,40.718710,-74.009001,40.717030,-74.009250,member,0
3798605,classic_bike,2023-10-06 07:50:17,2023-10-06 07:51:04,Rutgers St & Henry St,Canal St & Rutgers St,40.713322,-73.990097,40.714275,-73.989900,member,0
3801338,classic_bike,2023-10-17 16:26:46,2023-10-17 16:27:31,Rutgers St & Henry St,Canal St & Rutgers St,40.713322,-73.990097,40.714275,-73.989900,member,0
3802024,classic_bike,2023-10-14 13:18:10,2023-10-14 13:19:07,6 Ave & 9 St,5 St & 6 Ave,40.668009,-73.983877,40.670484,-73.982090,member,0


__NOTE__:
- Calculated using raw `started_at` and `ended_at`, rather than the rounded `duration_in_mins`
- This has supposedly been removed prior to dataset publish.

In [19]:
# EXPLORATION: Coordinates vs Station Name
sample_station = "Dock St & Front St"

# Conditions
lat_condition = (lessthan_minute['start_lat'] != lessthan_minute['end_lat'])
lng_condition = (lessthan_minute['start_lng'] != lessthan_minute['end_lng'])
start_stn_condition = (lessthan_minute['start_station_name'] == sample_station)
end_stn_condition = (lessthan_minute['end_station_name'] == sample_station)

# Filter using conditions
same_startend = lessthan_minute.loc[lat_condition & lng_condition & start_stn_condition & end_stn_condition]
same_startend

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,duration_in_mins
1709288,classic_bike,2023-10-11 15:25:49,2023-10-11 15:26:30,Dock St & Front St,Dock St & Front St,40.702976,-73.992544,40.702709,-73.99253,member,0
1711796,classic_bike,2023-10-19 08:43:42,2023-10-19 08:43:53,Dock St & Front St,Dock St & Front St,40.702925,-73.992489,40.702709,-73.99253,member,0
1712882,classic_bike,2023-10-15 00:10:04,2023-10-15 00:10:09,Dock St & Front St,Dock St & Front St,40.702749,-73.992563,40.702709,-73.99253,member,0
1712973,classic_bike,2023-10-18 11:11:33,2023-10-18 11:11:34,Dock St & Front St,Dock St & Front St,40.702749,-73.992549,40.702709,-73.99253,casual,0


__Observation__: Multiple `start_lat` values for the same station.

In [20]:
# Determine trips which occurred at the same location
same_location = lessthan_minute.loc[lessthan_minute['start_station_name'] == lessthan_minute['end_station_name']]
same_location

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,duration_in_mins
0,classic_bike,2023-10-03 02:48:38,2023-10-03 02:48:40,Columbus Pl & Atlantic Ave,Columbus Pl & Atlantic Ave,40.677223,-73.922792,40.677170,-73.922850,casual,0
3,classic_bike,2023-10-10 20:18:22,2023-10-10 20:18:37,Atlantic Ave & Furman St,Atlantic Ave & Furman St,40.691669,-74.000139,40.691652,-73.999979,casual,0
10,classic_bike,2023-10-27 18:45:35,2023-10-27 18:46:03,W 12 St & Hudson St,W 12 St & Hudson St,40.737479,-74.005524,40.737530,-74.005589,casual,0
11,classic_bike,2023-10-27 17:24:59,2023-10-27 17:25:05,Central Park West & W 85 St,Central Park West & W 85 St,40.784824,-73.969830,40.784760,-73.969862,casual,0
12,classic_bike,2023-10-26 21:49:46,2023-10-26 21:50:06,Canal St & Rutgers St,Canal St & Rutgers St,40.714229,-73.989844,40.714275,-73.989900,casual,0
...,...,...,...,...,...,...,...,...,...,...,...
3747015,electric_bike,2023-10-23 19:07:02,2023-10-23 19:07:06,E 32 St & Park Ave,E 32 St & Park Ave,40.745712,-73.981948,40.745712,-73.981948,member,0
3747016,electric_bike,2023-10-23 19:05:29,2023-10-23 19:06:09,E 32 St & Park Ave,E 32 St & Park Ave,40.745712,-73.981948,40.745712,-73.981948,member,0
3747017,classic_bike,2023-10-25 04:48:45,2023-10-25 04:49:13,Franklin Ave & St Marks Ave,Franklin Ave & St Marks Ave,40.675686,-73.956239,40.675832,-73.956168,member,0
3747021,classic_bike,2023-10-29 16:43:43,2023-10-29 16:44:29,Kent Ave & Grand St,Kent Ave & Grand St,40.716425,-73.965940,40.716425,-73.965940,member,0


In [21]:
# Determine the distribution of membership
same_location.member_casual.value_counts()

member_casual
member    64892
casual     9900
Name: count, dtype: int64

Every bike unlock for "casual" trips cost $4.49, if not a day pass.

In [22]:
# Add a `trip_validity` column
positive_durations['trip_validity'] = pd.Series(dtype=str)
positive_durations.loc[same_location.index, 'trip_validity'] = "short"
# positive_durations.loc[~same_location.index, 'trip_validity'] = "valid"

positive_durations.trip_validity.value_counts()

trip_validity
short    74792
Name: count, dtype: int64

### Long Duration Trips

In [23]:
# Calculate the IQR, lower/upper bounds
lower_quartile = duration_stats['25%']
upper_quartile = duration_stats['75%']

# Calculate the IQR
iqr = upper_quartile - lower_quartile

# Calculate the bounds
lower_bounds = lower_quartile - (1.5*iqr)
upper_bounds = upper_quartile + (1.5*iqr)

print(f"Lower bounds: {lower_bounds}")
print(f"Upper bounds: {upper_bounds}")

Lower bounds: -11.5
Upper bounds: 32.5


32.5 minutes is still within the 45-min "member" limit

In [24]:
# Casual trips - exceeded 30 minutes
casual_trips = (positive_durations['member_casual'] == "casual")
exceeded = (duration > pd.Timedelta(minutes=30))

exceeded_casual = positive_durations.loc[exceeded & casual_trips]
casual_stats = exceeded_casual.duration_in_mins.describe()
casual_stats

count    109254.000000
mean         60.685366
std          88.801498
min          30.000000
25%          34.000000
50%          42.000000
75%          59.000000
max        1499.000000
Name: duration_in_mins, dtype: float64

In [25]:
# Calculate the casual exceeded IQR, lower/upper bounds
lower_quartile = casual_stats['25%']
upper_quartile = casual_stats['75%']

# Calculate the IQR
iqr = upper_quartile - lower_quartile

# Calculate the bounds
lower_bounds = lower_quartile - (1.5*iqr)
upper_bounds = upper_quartile + (1.5*iqr)

print(f"Lower bounds: {lower_bounds}")
print(f"Upper bounds: {upper_bounds}")

Lower bounds: -3.5
Upper bounds: 96.5


In [26]:
# Identify rows of outliers
casual_outliers = exceeded_casual.loc[duration > pd.Timedelta(minutes=upper_bounds)]
casual_outliers.sort_values(by="duration_in_mins", ascending=False)

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,duration_in_mins,trip_validity
2719744,classic_bike,2023-10-15 17:25:29,2023-10-16 18:24:55,Vesey St & Church St,Cooper Square & Astor Pl,40.712220,-74.010472,40.729515,-73.990753,casual,1499,
562143,classic_bike,2023-10-13 22:16:26,2023-10-14 23:12:46,E 31 St & 3 Ave,Monroe St & Classon Ave,40.743943,-73.979661,40.684568,-73.958811,casual,1496,
1176362,classic_bike,2023-10-08 13:20:45,2023-10-09 14:11:44,Soissons Landing,Soissons Landing,40.692317,-74.014866,40.692317,-74.014866,casual,1490,
265910,classic_bike,2023-10-02 17:02:43,2023-10-03 17:48:06,47 Ave & 31 St,Steinway St & 28 Ave,40.743000,-73.935610,40.765625,-73.913669,casual,1485,
3299477,classic_bike,2023-10-16 15:45:06,2023-10-17 16:28:17,River Ter & Warren St,River Ter & Warren St,40.717599,-74.015880,40.717599,-74.015880,casual,1483,
...,...,...,...,...,...,...,...,...,...,...,...,...
97932,classic_bike,2023-10-26 12:00:25,2023-10-26 13:37:04,W 36 St & 7 Ave,Broadway & W 38 St,40.752275,-73.989289,40.752973,-73.987349,casual,96,
100325,electric_bike,2023-10-26 17:42:50,2023-10-26 19:19:28,E 63 St & 3 Ave,E 5 St & Cooper Sq,40.763954,-73.964600,40.727690,-73.990993,casual,96,
2247895,classic_bike,2023-10-22 21:38:14,2023-10-22 23:14:51,E 2 St & 2 Ave,Centre St & Chambers St,40.725029,-73.990697,40.712733,-74.004607,casual,96,
2259056,classic_bike,2023-10-31 13:30:09,2023-10-31 15:06:42,Centre St & Chambers St,Centre St & Chambers St,40.712712,-74.004660,40.712733,-74.004607,casual,96,


In [27]:
# Update the `trip_validity` column
positive_durations.loc[casual_outliers.index, 'trip_validity'] = "long"

positive_durations.trip_validity.value_counts()

trip_validity
short    74792
long      8902
Name: count, dtype: int64

In [28]:
# Member trips - exceeded 45 minutes
member_trips = (positive_durations['member_casual'] == "member")
exceeded = (duration > pd.Timedelta(minutes=45))

exceeded_member = positive_durations.loc[exceeded & member_trips]
member_stats = exceeded_member.duration_in_mins.describe()
member_stats

count    43210.000000
mean       105.941912
std        182.022216
min         45.000000
25%         48.000000
50%         55.000000
75%         71.000000
max       1499.000000
Name: duration_in_mins, dtype: float64

In [29]:
# Calculate the casual exceeded IQR, lower/upper bounds
lower_quartile = member_stats['25%']
upper_quartile = member_stats['75%']

# Calculate the IQR
iqr = upper_quartile - lower_quartile

# Calculate the bounds
lower_bounds = lower_quartile - (1.5*iqr)
upper_bounds = upper_quartile + (1.5*iqr)

print(f"Lower bounds: {lower_bounds}")
print(f"Upper bounds: {upper_bounds}")

Lower bounds: 13.5
Upper bounds: 105.5


In [30]:
# Identify rows of outliers
member_outliers = exceeded_member.loc[duration > pd.Timedelta(minutes=upper_bounds)]
member_outliers.sort_values(by="duration_in_mins", ascending=False)

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,duration_in_mins,trip_validity
2427636,classic_bike,2023-10-25 16:49:22,2023-10-26 17:48:34,Riverside Dr & W 82 St,W 70 St & Amsterdam Ave,40.787209,-73.981281,40.777480,-73.982886,member,1499,
3725100,classic_bike,2023-10-27 06:02:45,2023-10-28 07:01:58,W 20 St & 8 Ave,1 Ave & E 68 St,40.743453,-74.000040,40.765005,-73.958185,member,1499,
399235,classic_bike,2023-10-21 01:20:46,2023-10-22 02:19:30,Water St & Fletcher St,Fulton St & William St,40.706411,-74.005597,40.709601,-74.006551,member,1498,
3751717,classic_bike,2023-10-14 09:58:48,2023-10-15 10:57:07,E 3 St & Church Ave,Greenwood Ave & E 4 St,40.643817,-73.977433,40.650739,-73.977739,member,1498,
841605,classic_bike,2023-10-14 08:39:07,2023-10-15 09:37:17,Centre St & Chambers St,Catherine St & Monroe St,40.712733,-74.004607,40.711174,-73.996826,member,1498,
...,...,...,...,...,...,...,...,...,...,...,...,...
2343181,classic_bike,2023-10-09 20:27:36,2023-10-09 22:13:24,St Marks Pl & 2 Ave,1 Ave & E 6 St,40.728419,-73.987140,40.726331,-73.986169,member,105,
1310797,classic_bike,2023-10-11 09:46:55,2023-10-11 11:32:37,Howard St & Lafayette St,Hudson St & Reade St,40.719221,-73.999622,40.716250,-74.009106,member,105,
3296135,classic_bike,2023-10-20 23:36:29,2023-10-21 01:22:09,Lenox Ave & W 130 St,Edgecombe Ave & W 141 St,40.810822,-73.943100,40.820681,-73.945144,member,105,
2234994,classic_bike,2023-10-18 17:57:00,2023-10-18 19:42:55,Grand Ave & Bergen St,Lincoln Rd & Ocean Ave,40.677974,-73.962228,40.660500,-73.962600,member,105,


In [31]:
# Update the `trip_validity` column
positive_durations.loc[member_outliers.index, 'trip_validity'] = "long"

positive_durations.trip_validity.value_counts()

trip_validity
short    74792
long     14585
Name: count, dtype: int64

In [32]:
positive_durations.trip_validity.fillna(value="valid", inplace=True)

# Reset DataFrame index
positive_durations = positive_durations.reset_index(drop=True)

positive_durations.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,duration_in_mins,trip_validity
0,classic_bike,2023-10-03 02:48:38,2023-10-03 02:48:40,Columbus Pl & Atlantic Ave,Columbus Pl & Atlantic Ave,40.677223,-73.922792,40.67717,-73.92285,casual,0,short
1,classic_bike,2023-10-11 16:03:17,2023-10-11 16:45:26,Central Park West & W 85 St,Central Park West & W 85 St,40.78476,-73.969862,40.78476,-73.969862,casual,42,valid
2,classic_bike,2023-10-11 19:57:13,2023-10-11 20:20:10,Hicks St & Montague St,5 St & 6 Ave,40.694974,-73.995936,40.670484,-73.98209,casual,22,valid
3,classic_bike,2023-10-10 20:18:22,2023-10-10 20:18:37,Atlantic Ave & Furman St,Atlantic Ave & Furman St,40.691669,-74.000139,40.691652,-73.999979,casual,0,short
4,classic_bike,2023-10-17 16:26:58,2023-10-17 16:34:27,E 41 St & Madison Ave (SE corner),E 58 St & 3 Ave,40.751845,-73.979585,40.760958,-73.967245,casual,7,valid


In [33]:
positive_durations.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3805477 entries, 0 to 3805476
Data columns (total 12 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   rideable_type       3805477 non-null  object        
 1   started_at          3805477 non-null  datetime64[ns]
 2   ended_at            3805477 non-null  datetime64[ns]
 3   start_station_name  3805477 non-null  object        
 4   end_station_name    3805477 non-null  object        
 5   start_lat           3805477 non-null  float64       
 6   start_lng           3805477 non-null  float64       
 7   end_lat             3805477 non-null  float64       
 8   end_lng             3805477 non-null  float64       
 9   member_casual       3805477 non-null  object        
 10  duration_in_mins    3805477 non-null  int64         
 11  trip_validity       3805477 non-null  object        
dtypes: datetime64[ns](2), float64(4), int64(1), object(5)
memory usage: 34

## Simplify the coordinates

In [34]:
# Isolate the station names and coordinates
stations_df = positive_durations[[
    'start_station_name', 'end_station_name',
    'start_lat', 'start_lng',
    'end_lat', 'end_lng']]
stations_df

Unnamed: 0,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng
0,Columbus Pl & Atlantic Ave,Columbus Pl & Atlantic Ave,40.677223,-73.922792,40.677170,-73.922850
1,Central Park West & W 85 St,Central Park West & W 85 St,40.784760,-73.969862,40.784760,-73.969862
2,Hicks St & Montague St,5 St & 6 Ave,40.694974,-73.995936,40.670484,-73.982090
3,Atlantic Ave & Furman St,Atlantic Ave & Furman St,40.691669,-74.000139,40.691652,-73.999979
4,E 41 St & Madison Ave (SE corner),E 58 St & 3 Ave,40.751845,-73.979585,40.760958,-73.967245
...,...,...,...,...,...,...
3805472,Lewis Ave & Madison St,Myrtle Ave & Lewis Ave,40.686312,-73.935775,40.696820,-73.937569
3805473,E 44 St & Lexington Ave,E 58 St & 3 Ave,40.752643,-73.974996,40.760958,-73.967245
3805474,8 Ave & W 38 St,W 48 St & Rockefeller Plaza,40.754646,-73.991850,40.757769,-73.979294
3805475,8 Ave & W 38 St,W 56 St & 6 Ave,40.754610,-73.991770,40.763406,-73.977225


In [35]:
# Check which column contains all the station names
start = set(stations_df.start_station_name)
end = set(stations_df.end_station_name)

print(f"Number of start stations: {len(start)}")
print(f"Number of end stations: {len(end)}\n")

print(f"If empty, use 'start_station_name': {len(end.difference(start))}")
print(f"If empty, use 'end_station_name': {len(start.difference(end))}")

Number of start stations: 2116
Number of end stations: 2151

If empty, use 'start_station_name': 35
If empty, use 'end_station_name': 0


__Conclusion__: Use `end_station_name` since it contains all the station names

In [36]:
def create_coordinates():
    # Create a DataFrame of the end stations and its count
    station_trips = pd.DataFrame(stations_df.end_station_name.value_counts())
    
    # Initialise the average coordinate columns
    station_trips['average_lat'] = pd.Series(dtype=float)
    station_trips['average_lng'] = pd.Series(dtype=float)

    for idx, station in station_trips.iterrows():
        if (pd.isna(station_trips.loc[idx, 'average_lat']) or pd.isna(station_trips.loc[idx, 'average_lng'])):
            station_trips.loc[idx, 'average_lat'] = positive_durations.loc[positive_durations['end_station_name'] == station.name, 'end_lat'].mean()
            station_trips.loc[idx, 'average_lng'] = positive_durations.loc[positive_durations['end_station_name'] == station.name, 'end_lng'].mean()
        else:
            continue
        print(station.name)

    # Export station_trips as a CSV
    station_trips.to_csv("resources/station_trips.csv")

In [37]:
# Check if station_trips.csv exists
station_trips_path = "resources/station_trips.csv"

for path in Path("resources").iterdir():
    if (str(path) == station_trips_path):
        print("CSV already exists")
        break
    else:
        create_coordinates()

CSV already exists


In [38]:
# Create station_trips by importing the csv
station_trips = pd.read_csv(station_trips_path)
station_trips.head()

Unnamed: 0,end_station_name,count,average_lat,average_lng
0,W 21 St & 6 Ave,14670,40.74174,-73.994156
1,West St & Chambers St,13441,40.717548,-74.013221
2,University Pl & E 14 St,12039,40.734814,-73.992085
3,E 41 St & Madison Ave (SE corner),11914,40.752049,-73.979635
4,Broadway & W 58 St,11609,40.766953,-73.981693


### Merge start stations and its count

In [39]:
# Rename count to end_count
station_trips = station_trips.rename(columns={
    'count': 'end_count',
    'end_station_name': 'station_name'})
station_trips.head()

Unnamed: 0,station_name,end_count,average_lat,average_lng
0,W 21 St & 6 Ave,14670,40.74174,-73.994156
1,West St & Chambers St,13441,40.717548,-74.013221
2,University Pl & E 14 St,12039,40.734814,-73.992085
3,E 41 St & Madison Ave (SE corner),11914,40.752049,-73.979635
4,Broadway & W 58 St,11609,40.766953,-73.981693


In [40]:
# Add the start_count
start_stations = pd.DataFrame(stations_df.start_station_name.value_counts()).reset_index()

# Rename the columns for merging
start_stations = start_stations.rename(columns={
    'count': 'start_count',
    'start_station_name': 'station_name'})

# Display the DataFrame
start_stations.head()

Unnamed: 0,station_name,start_count
0,W 21 St & 6 Ave,14639
1,West St & Chambers St,13363
2,E 41 St & Madison Ave (SE corner),12011
3,University Pl & E 14 St,11957
4,Broadway & W 58 St,11731


In [41]:
# Merge the DataFrames
station_trips = pd.merge(station_trips, start_stations, how="inner", on="station_name")

In [42]:
# Rearrange columns and display
station_trips = station_trips[['station_name', 'start_count', 'end_count', 'average_lat', 'average_lng']]
station_trips.head()

Unnamed: 0,station_name,start_count,end_count,average_lat,average_lng
0,W 21 St & 6 Ave,14639,14670,40.74174,-73.994156
1,West St & Chambers St,13363,13441,40.717548,-74.013221
2,University Pl & E 14 St,11957,12039,40.734814,-73.992085
3,E 41 St & Madison Ave (SE corner),12011,11914,40.752049,-73.979635
4,Broadway & W 58 St,11731,11609,40.766953,-73.981693


In [48]:
# Display the DataFrame distribution
station_trips.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2116 entries, 0 to 2115
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   station_name  2116 non-null   object 
 1   start_count   2116 non-null   int64  
 2   end_count     2116 non-null   int64  
 3   average_lat   2116 non-null   float64
 4   average_lng   2116 non-null   float64
dtypes: float64(2), int64(2), object(1)
memory usage: 82.8+ KB


In [49]:
# Overwrite the existing station_trips.csv
station_trips.to_csv("resources/station_trips.csv")

## Export the remainder of the data

In [45]:
positive_durations.head()

simplified_df = positive_durations[[
    'rideable_type', 'started_at', 'ended_at',
    'start_station_name', 'end_station_name',
    'member_casual', 'duration_in_mins',
    'trip_validity'
]]

simplified_df.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,member_casual,duration_in_mins,trip_validity
0,classic_bike,2023-10-03 02:48:38,2023-10-03 02:48:40,Columbus Pl & Atlantic Ave,Columbus Pl & Atlantic Ave,casual,0,short
1,classic_bike,2023-10-11 16:03:17,2023-10-11 16:45:26,Central Park West & W 85 St,Central Park West & W 85 St,casual,42,valid
2,classic_bike,2023-10-11 19:57:13,2023-10-11 20:20:10,Hicks St & Montague St,5 St & 6 Ave,casual,22,valid
3,classic_bike,2023-10-10 20:18:22,2023-10-10 20:18:37,Atlantic Ave & Furman St,Atlantic Ave & Furman St,casual,0,short
4,classic_bike,2023-10-17 16:26:58,2023-10-17 16:34:27,E 41 St & Madison Ave (SE corner),E 58 St & 3 Ave,casual,7,valid


In [46]:
simplified_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3805477 entries, 0 to 3805476
Data columns (total 8 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   rideable_type       3805477 non-null  object        
 1   started_at          3805477 non-null  datetime64[ns]
 2   ended_at            3805477 non-null  datetime64[ns]
 3   start_station_name  3805477 non-null  object        
 4   end_station_name    3805477 non-null  object        
 5   member_casual       3805477 non-null  object        
 6   duration_in_mins    3805477 non-null  int64         
 7   trip_validity       3805477 non-null  object        
dtypes: datetime64[ns](2), int64(1), object(5)
memory usage: 232.3+ MB


In [47]:
simplified_df.to_csv("resources/simplified_df.csv")