# ETL

In [3]:
import pandas as pd
from pathlib import Path
from datetime import datetime as dt

# Ignore warnings
import warnings
warnings.simplefilter(action='ignore')

# Local modules
from data_exploration import dataset_info

def duration_hours(start_at, end_at):
    #time1 = dt.strptime(start_at, '%Y-%m-%d %H:%M:%S')
    #time2 = dt.strptime(end_at, '%Y-%m-%d %H:%M:%S')
    delta = end_at-start_at
    return delta.days*24 + delta.seconds/3600

In [4]:
# Read the CSV into a DataFrame
prefix = 'JC-202307'
csv = Path(f"Citibike_Data/{prefix}-citibike-tripdata.csv")
trip_df = pd.read_csv(csv)

## Convert dates string to datetime

In [5]:
trip_df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [6]:
trip_df['started_at'] = pd.to_datetime(trip_df['started_at'], format='%Y-%m-%d %H:%M:%S')
trip_df['ended_at'] = pd.to_datetime(trip_df['ended_at'], format='%Y-%m-%d %H:%M:%S')

In [7]:
trip_df.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
dtype: object

## Calculate trip duration

In [8]:
print(len(trip_df))

106608


In [9]:
# Add a new column
trip_df['duration'] = 0 

count = 0

for index, row in trip_df.iterrows():
    count += 1
    print(f"Progress: {100*count/len(trip_df):.2f}%")
    trip_df.loc[index, 'duration'] = duration_hours(trip_df.loc[index,'started_at'], trip_df.loc[index,'ended_at'])

trip_df

Progress: 0.00%
Progress: 0.00%
Progress: 0.00%
Progress: 0.00%
Progress: 0.00%
Progress: 0.01%
Progress: 0.01%
Progress: 0.01%
Progress: 0.01%
Progress: 0.01%
Progress: 0.01%
Progress: 0.01%
Progress: 0.01%
Progress: 0.01%
Progress: 0.01%
Progress: 0.02%
Progress: 0.02%
Progress: 0.02%
Progress: 0.02%
Progress: 0.02%
Progress: 0.02%
Progress: 0.02%
Progress: 0.02%
Progress: 0.02%
Progress: 0.02%
Progress: 0.02%
Progress: 0.03%
Progress: 0.03%
Progress: 0.03%
Progress: 0.03%
Progress: 0.03%
Progress: 0.03%
Progress: 0.03%
Progress: 0.03%
Progress: 0.03%
Progress: 0.03%
Progress: 0.03%
Progress: 0.04%
Progress: 0.04%
Progress: 0.04%
Progress: 0.04%
Progress: 0.04%
Progress: 0.04%
Progress: 0.04%
Progress: 0.04%
Progress: 0.04%
Progress: 0.04%
Progress: 0.05%
Progress: 0.05%
Progress: 0.05%
Progress: 0.05%
Progress: 0.05%
Progress: 0.05%
Progress: 0.05%
Progress: 0.05%
Progress: 0.05%
Progress: 0.05%
Progress: 0.05%
Progress: 0.06%
Progress: 0.06%
Progress: 0.06%
Progress: 0.06%
Progress

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
0,7A68381621C25F78,classic_bike,2023-07-17 17:16:34,2023-07-17 17:20:52,Astor Place,JC077,Communipaw & Berry Lane,JC084,40.719225,-74.071281,40.714358,-74.066611,member,0.071667
1,0F814CA67B2FA120,classic_bike,2023-07-26 19:40:15,2023-07-26 19:44:37,Adams St & 2 St,HB407,9 St HBLR - Jackson St & 8 St,HB305,40.739727,-74.036866,40.747907,-74.038412,member,0.072778
2,775A38967EBF5FB4,electric_bike,2023-07-01 12:12:22,2023-07-01 12:27:45,McGinley Square,JC055,Riverview Park,JC057,40.725340,-74.067622,40.744319,-74.043991,member,0.256389
3,D93B742DCE1C1447,classic_bike,2023-07-20 19:10:18,2023-07-20 19:17:22,Baldwin at Montgomery,JC020,Brunswick St,JC023,40.723455,-74.064359,40.724176,-74.050656,member,0.117778
4,AA7A6863B4B92169,electric_bike,2023-07-07 19:33:59,2023-07-07 19:58:17,Baldwin at Montgomery,JC020,Mama Johnson Field - 4 St & Jackson St,HB404,40.723659,-74.064194,40.743140,-74.040041,casual,0.405000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106603,944F5CD711E7688E,classic_bike,2023-07-25 18:21:52,2023-07-25 18:34:38,Van Vorst Park,JC035,Newport Pkwy,JC008,40.718489,-74.047727,40.728745,-74.032108,member,0.212778
106604,4665B542F79F8C43,electric_bike,2023-07-26 22:22:20,2023-07-26 22:26:20,Van Vorst Park,JC035,Warren St,JC006,40.718489,-74.047727,40.721124,-74.038051,member,0.066667
106605,789D45FDEBC19E83,classic_bike,2023-07-31 07:57:29,2023-07-31 08:03:58,Van Vorst Park,JC035,Newport Pkwy,JC008,40.718422,-74.047691,40.728745,-74.032108,member,0.108056
106606,FFBE463288D36C2A,classic_bike,2023-07-07 17:52:09,2023-07-07 18:00:14,Van Vorst Park,JC035,Warren St,JC006,40.718489,-74.047727,40.721124,-74.038051,member,0.134722


In [10]:
# Print min and max duration
print(f"Min duration: {trip_df['duration'].min()}")
print(f"Max duration: {trip_df['duration'].max()}")

Min duration: -0.0002777777777787094
Max duration: 64.16305555555556


In [11]:
# Find zero and negative durations
non_pos_duration = trip_df.loc[trip_df['duration'] <= 0,:]
print(f"Rides with non-positive durations: {len(non_pos_duration)}")
non_pos_duration

Rides with non-positive durations: 11


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
5505,78F1DCC60211FE71,classic_bike,2023-07-17 21:26:40,2023-07-17 21:26:40,Bergen Ave & Sip Ave,JC109,Bergen Ave & Sip Ave,JC109,40.730993,-74.064476,40.731009,-74.064437,member,0.0
5971,0EA9E8CD00BEBA58,classic_bike,2023-07-03 15:22:44,2023-07-03 15:22:44,Bergen Ave & Sip Ave,JC109,Bergen Ave & Sip Ave,JC109,40.731009,-74.064437,40.731009,-74.064437,member,0.0
12631,3A3BCFF4E4F61FEB,classic_bike,2023-07-15 10:32:32,2023-07-15 10:32:32,Dey St,JC065,Dey St,JC065,40.737757,-74.06701,40.737711,-74.066921,member,0.0
16063,8F4712DCB3171E2D,classic_bike,2023-07-26 19:19:19,2023-07-26 19:19:19,Hoboken Terminal - River St & Hudson Pl,HB102,Hoboken Terminal - River St & Hudson Pl,HB102,40.736068,-74.029127,40.736068,-74.029127,member,0.0
35053,8490496CE9B28D10,classic_bike,2023-07-04 10:47:11,2023-07-04 10:47:11,City Hall,JC003,City Hall,JC003,40.717732,-74.043845,40.717732,-74.043845,member,0.0
44567,60BE0F420FF8A901,classic_bike,2023-07-26 07:01:56,2023-07-26 07:01:56,City Hall,JC003,City Hall,JC003,40.717717,-74.043962,40.717732,-74.043845,member,0.0
46848,369B6CE72EB42F82,classic_bike,2023-07-01 10:11:05,2023-07-01 10:11:05,Grove St PATH,JC115,Grove St PATH,JC115,40.71941,-74.04309,40.71941,-74.04309,member,0.0
59660,6297B17FD4AB75BB,electric_bike,2023-07-17 11:33:54,2023-07-17 11:33:53,Christ Hospital,JC034,Christ Hospital,JC034,40.734786,-74.050444,40.734786,-74.050444,member,-0.000278
62617,F56BA55CCE835193,classic_bike,2023-07-15 10:50:45,2023-07-15 10:50:45,Christ Hospital,JC034,Christ Hospital,JC034,40.73481,-74.050445,40.734786,-74.050444,member,0.0
89874,EDBF7D4A391BEFF9,classic_bike,2023-07-18 14:28:44,2023-07-18 14:28:44,8 St & Washington St,HB603,8 St & Washington St,HB603,40.745984,-74.028199,40.745984,-74.028199,member,0.0


### Cleaning cancelled rides
From looking at the data, it is likely that the rides with zero or negative durations have been cancelled on the spot. In one case, the duration is negative (start at 11:33:54 and end at 11:33:53), which could be caused by a time synchronisation issue.

Because the number of data points is small, we decide to delete these rows from the dataset.

In [12]:
# Remove zero and negative durations from the dataset
cleaned_trip_df = trip_df.loc[trip_df['duration'] > 0,:]

In [13]:
dataset_info(cleaned_trip_df)

Unnamed: 0,columns,dtypes,elements,missing,unique
0,ride_id,object,106597,0,106597
1,rideable_type,object,106597,0,3
2,started_at,datetime64[ns],106597,0,103097
3,ended_at,datetime64[ns],106597,0,103470
4,start_station_name,object,106590,7,103
5,start_station_id,object,106590,7,103
6,end_station_name,object,106205,392,191
7,end_station_id,object,106205,392,191
8,start_lat,float64,106597,0,34352
9,start_lng,float64,106597,0,35005


### Convert duration from hours to minutes
All trips being rather short, a column with duration in minutes in created instead of the one in hours.

In [14]:
# Get statistics on duration
cleaned_trip_df['duration'].describe()

count    106597.000000
mean          0.235021
std           1.026420
min           0.000278
25%           0.072222
50%           0.114444
75%           0.192778
max          64.163056
Name: duration, dtype: float64

In [15]:
cleaned_trip_df['duration'] = cleaned_trip_df['duration']*60

In [16]:
# Get statistics on duration
cleaned_trip_df['duration'].describe()

count    106597.000000
mean         14.101242
std          61.585203
min           0.016667
25%           4.333333
50%           6.866667
75%          11.566667
max        3849.783333
Name: duration, dtype: float64

## Understand the missing geolocation data

In [17]:
missing_geo = cleaned_trip_df.loc[cleaned_trip_df['end_lat'].isna(),:]
missing_geo

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
42867,11567D7191156C2C,electric_bike,2023-07-04 15:49:29,2023-07-05 16:49:09,Hoboken Terminal - Hudson St & Hudson Pl,HB101,,,40.735938,-74.030305,,,member,1499.666667
42871,872D97124E2F121F,classic_bike,2023-07-18 08:09:33,2023-07-19 09:09:24,Madison St & 1 St,HB402,,,40.738790,-74.039300,,,member,1499.850000
42872,BE411A2E73C57132,classic_bike,2023-07-30 11:43:23,2023-07-31 12:43:08,Jersey & 3rd,JC074,,,40.723332,-74.045953,,,member,1499.750000
42888,AD6876AD9C9B880C,classic_bike,2023-07-01 16:33:50,2023-07-02 17:33:42,Jackson Square,JC063,,,40.711130,-74.078900,,,casual,1499.866667
42895,31A7688F36AB2FB1,docked_bike,2023-07-14 19:20:08,2023-07-15 20:20:10,City Hall - Washington St & 1 St,HB105,,,40.737360,-74.030970,,,casual,1500.033333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102153,80C610AB04F7792C,classic_bike,2023-07-08 17:05:40,2023-07-09 18:05:33,Hamilton Park,JC009,,,40.727596,-74.044247,,,member,1499.883333
102154,8870DB3258F9823B,classic_bike,2023-07-17 09:37:07,2023-07-18 10:37:03,Hamilton Park,JC009,,,40.727596,-74.044247,,,member,1499.933333
102158,06DD7C509BF2D8CA,classic_bike,2023-07-29 21:38:12,2023-07-30 22:38:08,Hamilton Park,JC009,,,40.727596,-74.044247,,,member,1499.933333
102161,F4F2526CCDAB7F7C,classic_bike,2023-07-26 08:01:47,2023-07-27 09:01:41,Hamilton Park,JC009,,,40.727596,-74.044247,,,member,1499.900000


In [18]:
print(f"{len(missing_geo)} rides have missing geolocation data for the return point.")

122 rides have missing geolocation data for the return point.


In [19]:
missing_geo['duration'].describe()

count     122.000000
mean     1496.588251
std       277.267697
min        97.016667
25%      1499.816667
50%      1499.900000
75%      1499.916667
max      3849.783333
Name: duration, dtype: float64

### Observation about the missing return location
The Citibike website states the following:

> If you do not return a bike within a 24-hour period, you will be charged a lost or stolen bike fee of $1,200 (plus tax).

A lot of the bikes were rented for duration around 1499 minutes, which is a day + 1 hour. These clients have likely missed the return period and the bikes may have been considered stolen or lost.

(Source: https://help.citibikenyc.com/hc/en-us/articles/360032367371-What-if-I-keep-a-bike-out-too-long-)

In [20]:
missing_geo_lt1499 = missing_geo.loc[missing_geo['duration']<1499,:]
missing_geo_mid = missing_geo.loc[(missing_geo['duration']>=1499) & (missing_geo['duration']<=1500),:]
missing_geo_gt1500 = missing_geo.loc[missing_geo['duration']>1500,:]

print(f"{len(missing_geo_lt1499)} rides with lost bike, duration < 1499 min")
print(f"{len(missing_geo_gt1500)} rides with lost bike, duration > 1500 min")
print(f"{len(missing_geo_mid)} rides with duration 1499-1500 min")

2 rides with lost bike, duration < 1499 min
3 rides with lost bike, duration > 1500 min
117 rides with duration 1499-1500 min


In [21]:
missing_geo_lt1499

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
53729,5EF9D01B0347426D,classic_bike,2023-07-03 13:55:26,2023-07-03 16:28:41,4 St & Grand St,HB301,,,40.742258,-74.035111,,,member,153.25
62150,FFBAAE6C609D2A18,classic_bike,2023-07-11 14:56:04,2023-07-11 16:33:05,Columbus Park - Clinton St & 9 St,HB501,,,40.748161,-74.032453,,,member,97.016667


In [22]:
missing_geo_gt1500

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
42895,31A7688F36AB2FB1,docked_bike,2023-07-14 19:20:08,2023-07-15 20:20:10,City Hall - Washington St & 1 St,HB105,,,40.73736,-74.03097,,,casual,1500.033333
53358,9EC760E5A983BB51,docked_bike,2023-07-05 10:01:59,2023-07-08 02:11:46,Liberty Light Rail,JC052,,,40.711242,-74.055701,,,casual,3849.783333
53730,01293BC0EC81F35F,docked_bike,2023-07-01 18:45:51,2023-07-02 19:45:52,Lafayette Park,JC078,,,40.713464,-74.062859,,,casual,1500.016667


In [23]:
# Save the data with missing geolocation data to a CSV file
filepath = Path(f"Clean_Data/{prefix}-missinggeo.csv")
missing_geo.to_csv(filepath, index=False)

## Keep only the rides with valid return point geolocation
We decide to drop the rides without valid return location from this analysis. It is near impossible to know what happened to these bikes. The duration may not be trusted as actual data and the trip distance cannot be calculated.

In [24]:
cleaned_trip_dropna_df = cleaned_trip_df.loc[cleaned_trip_df['end_lat'].notna(),:]
cleaned_trip_dropna_df.fillna('Unknown', inplace=True)

In [25]:
dataset_info(cleaned_trip_dropna_df)

Unnamed: 0,columns,dtypes,elements,missing,unique
0,ride_id,object,106475,0,106475
1,rideable_type,object,106475,0,3
2,started_at,datetime64[ns],106475,0,102985
3,ended_at,datetime64[ns],106475,0,103356
4,start_station_name,object,106475,0,104
5,start_station_id,object,106475,0,104
6,end_station_name,object,106475,0,192
7,end_station_id,object,106475,0,192
8,start_lat,float64,106475,0,34352
9,start_lng,float64,106475,0,35005


## Calculate trip distance

In [26]:
# Source: https://www.movable-type.co.uk/scripts/latlong.html
from math import sin, cos, acos, pi

def lonlat_to_distance(lonlat1, lonlat2):
# Calculate distance between two points given by their [lon, lat]
    lon1 = pi*lonlat1[0]/180
    lat1 = pi*lonlat1[1]/180
    lon2 = pi*lonlat2[0]/180
    lat2 = pi*lonlat2[1]/180
    earth_radius = 6369 # Earth radius in km.)
    acos_in = sin(lat1)*sin(lat2) + cos(lat1)*cos(lat2)*cos(lon2-lon1)

    # Handle the floating point precision errors
    if (acos_in > 1) & (acos_in < 1.00005):
        acos_in = 1

    return acos(acos_in)*earth_radius

In [27]:
cleaned_trip_dropna_df['distance'] = 0

for index, row in cleaned_trip_dropna_df.iterrows():
    lonlat1 = [cleaned_trip_dropna_df.loc[index, 'start_lng'], cleaned_trip_dropna_df.loc[index, 'start_lat']]
    lonlat2 = [cleaned_trip_dropna_df.loc[index, 'end_lng'], cleaned_trip_dropna_df.loc[index, 'end_lat']]
    cleaned_trip_dropna_df.loc[index, 'distance'] = lonlat_to_distance(lonlat1, lonlat2)

cleaned_trip_dropna_df

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration,distance
0,7A68381621C25F78,classic_bike,2023-07-17 17:16:34,2023-07-17 17:20:52,Astor Place,JC077,Communipaw & Berry Lane,JC084,40.719225,-74.071281,40.714358,-74.066611,member,4.300000,0.668884
1,0F814CA67B2FA120,classic_bike,2023-07-26 19:40:15,2023-07-26 19:44:37,Adams St & 2 St,HB407,9 St HBLR - Jackson St & 8 St,HB305,40.739727,-74.036866,40.747907,-74.038412,member,4.366667,0.918566
2,775A38967EBF5FB4,electric_bike,2023-07-01 12:12:22,2023-07-01 12:27:45,McGinley Square,JC055,Riverview Park,JC057,40.725340,-74.067622,40.744319,-74.043991,member,15.383333,2.900464
3,D93B742DCE1C1447,classic_bike,2023-07-20 19:10:18,2023-07-20 19:17:22,Baldwin at Montgomery,JC020,Brunswick St,JC023,40.723455,-74.064359,40.724176,-74.050656,member,7.066667,1.157139
4,AA7A6863B4B92169,electric_bike,2023-07-07 19:33:59,2023-07-07 19:58:17,Baldwin at Montgomery,JC020,Mama Johnson Field - 4 St & Jackson St,HB404,40.723659,-74.064194,40.743140,-74.040041,casual,24.300000,2.971290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106603,944F5CD711E7688E,classic_bike,2023-07-25 18:21:52,2023-07-25 18:34:38,Van Vorst Park,JC035,Newport Pkwy,JC008,40.718489,-74.047727,40.728745,-74.032108,member,12.766667,1.740958
106604,4665B542F79F8C43,electric_bike,2023-07-26 22:22:20,2023-07-26 22:26:20,Van Vorst Park,JC035,Warren St,JC006,40.718489,-74.047727,40.721124,-74.038051,member,4.000000,0.866182
106605,789D45FDEBC19E83,classic_bike,2023-07-31 07:57:29,2023-07-31 08:03:58,Van Vorst Park,JC035,Newport Pkwy,JC008,40.718422,-74.047691,40.728745,-74.032108,member,6.483333,1.743606
106606,FFBE463288D36C2A,classic_bike,2023-07-07 17:52:09,2023-07-07 18:00:14,Van Vorst Park,JC035,Warren St,JC006,40.718489,-74.047727,40.721124,-74.038051,member,8.083333,0.866182


In [28]:
cleaned_trip_dropna_df['distance'].describe()

count    106475.000000
mean          1.187714
std           0.832446
min           0.000000
25%           0.656801
50%           1.035261
75%           1.559151
max          20.355396
Name: distance, dtype: float64

## Adjust the longitude and latitude of the stations
In the original dataset, some variations exist between the lon/lat for different rides assigned to the same station. This is likely to be due either
- to slightly different locations where the bikes are parked, or
- to noise in the GPS trackers used on the bike

Regardless of the reason, adjusting the geolocation data allows to group the bikes from a similar stations together in a map instead of in a cluster around the station.

In [29]:
# List all start station id
start_station = cleaned_trip_dropna_df['start_station_id'].unique()
print(f"{len(start_station)} start stations")
print(start_station)

for station in start_station:

    # Keep the value the same for any station marked Unknown
    if station != 'Unknown':
        # Calculate the mean lat and lon for the station
        mean_lat = cleaned_trip_dropna_df.loc[cleaned_trip_dropna_df['start_station_id']==station,:]['start_lat'].mean()
        mean_lon = cleaned_trip_dropna_df.loc[cleaned_trip_dropna_df['start_station_id']==station,:]['start_lng'].mean()

        # Replace the original station lon/lat with the calculated mean 
        cleaned_trip_dropna_df.loc[cleaned_trip_dropna_df['start_station_id']==station,'start_lat'] = mean_lat
        cleaned_trip_dropna_df.loc[cleaned_trip_dropna_df['start_station_id']==station,'start_lng'] = mean_lon

104 start stations
['JC077' 'HB407' 'JC055' 'JC020' 'JC074' 'HB402' 'HB101' 'HB408' 'HB105'
 'JC057' 'JC116' 'JC082' 'JC063' 'JC066' 'JC095' 'JC072' 'HB601' 'JC110'
 'JC093' 'JC109' 'JC098' 'JC034' 'HB305' 'JC052' 'HB409' 'JC006' 'JC008'
 'HB404' 'JC094' 'JC103' 'HB602' 'JC097' 'JC023' 'JC003' 'JC013' 'JC038'
 'HB201' 'JC115' 'HB608' 'HB202' '6364.10' 'JC051' 'JC104' 'JC019' 'JC084'
 'JC102' 'HB102' 'HB506' 'JC065' 'JC002' '5288.09' 'JC081' 'HB302' 'JC059'
 'HB103' '5500.07' 'JC014' 'HB401' 'HB505' 'HB507' 'HB301' 'JC075' 'JC078'
 'HB304' 'JC105' 'HB607' 'JC022' '6450.12' '7123.04' 'HB502' 'HB603'
 'JC076' 'JC099' 'JC018' 'Unknown' 'HB501' 'JC053' '6700.14' 'JC027'
 'HB203' 'HB303' '6022.04' '5065.10' '5470.10' '6602.03' '6224.05'
 '5584.04' 'JC080' 'JC032' '7052.01' '5184.08' '5024.10' '6578.01' 'JC107'
 '5593.04' 'JC108' 'HB503' '6889.12' '5847.01' 'JC024' 'JC009' 'JC035'
 '5351.07' '5128.04']


In [32]:
# List all end station id
end_station = cleaned_trip_dropna_df['end_station_id'].unique()
print(f"{len(end_station)} end stations")
print(end_station)

for station in end_station:

    # Keep the value the same for any station marked Unknown
    if station != 'Unknown':
        # Calculate the mean lat and lon for the station
        mean_lat = cleaned_trip_dropna_df.loc[cleaned_trip_dropna_df['end_station_id']==station,:]['end_lat'].mean()
        mean_lon = cleaned_trip_dropna_df.loc[cleaned_trip_dropna_df['end_station_id']==station,:]['end_lng'].mean()

        # Replace the original station lon/lat with the calculated mean 
        cleaned_trip_dropna_df.loc[cleaned_trip_dropna_df['end_station_id']==station,'end_lat'] = mean_lat
        cleaned_trip_dropna_df.loc[cleaned_trip_dropna_df['end_station_id']==station,'end_lng'] = mean_lon

192 end stations
['JC084' 'HB305' 'JC057' 'JC023' 'HB404' 'HB602' 'HB409' 'JC035' 'HB402'
 'JC116' 'JC082' 'HB105' 'JC063' 'JC077' 'JC074' 'JC024' 'JC065' '5971.08'
 '6960.10' '5696.03' 'JC109' 'HB408' 'JC055' 'HB407' 'JC009' '7524.16'
 'JC098' '6230.04' '6551.02' '5195.06' '5175.08' '6765.01' 'HB506' 'HB102'
 'JC094' '6140.05' 'JC093' 'HB601' '5329.03' '6098.02' '5712.12' 'JC019'
 'JC020' '6569.09' '6578.01' 'HB608' '7646.04' 'JC104' '7323.09' '5797.01'
 '6659.03' 'JC002' '5065.10' '6700.01' 'JC110' 'JC072' '6157.04' '6233.05'
 '5509.02' '5114.08' '5532.01' 'JC097' 'JC102' '6364.10' '5351.07' 'JC051'
 '6560.01' 'HB201' 'JC008' 'JC006' '4962.01' '5033.01' '6492.04' '6432.11'
 '6551.11' 'HB202' '5679.08' '8358.03' 'HB505' '5145.02' 'JC115' 'HB101'
 '5914.08' '5001.08' '5730.08' 'JC081' 'JC013' '5303.08' 'JC003' 'JC038'
 '7175.05' '5297.02' 'HB301' '6115.06' 'JC103' '5105.01' '5065.14' 'JC053'
 'JC018' 'HB507' 'HB401' '5561.06' '5329.08' 'JC105' 'JC022' 'HB103'
 '5610.09' 'JC078' 'JC075'

In [34]:
# Save the data with to a CSV file
filepath = Path(f"Clean_Data/{prefix}.csv")
cleaned_trip_dropna_df.to_csv(filepath, index=False)

# Ideas for questions
- Number of trips for each month, over the year
- Variation in distance and duration over the year, and depending on the temperature