## Setup

In [1]:
# imports
import pandas as pd
import glob
import os

# geocoding imports
import geopandas as gpd
from shapely.geometry import Point

In [2]:
# find the csv files
files_list = glob.glob(os.path.join('Resources', '*.csv'))

# combine them
dfs = []

for file in files_list:
    df = pd.read_csv(file)
    dfs.append(df)

citibike_df = pd.concat(dfs, ignore_index=True)

# display df
citibike_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0744109F13385D1D,electric_bike,2024-01-15 15:18:07,2024-01-15 15:32:44,Morris Canal,JC072,Oakland Ave,JC022,40.712297,-74.038185,40.737604,-74.052478,member
1,B1488BFEF9118000,classic_bike,2024-01-13 15:32:50,2024-01-13 15:36:18,JC Medical Center,JC110,Grove St PATH,JC115,40.715391,-74.049692,40.71941,-74.04309,member
2,95A2FE8E51B4C836,classic_bike,2024-01-19 13:11:00,2024-01-19 13:14:44,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member
3,95D9AFF6A1652DC1,classic_bike,2024-01-23 07:03:49,2024-01-23 07:07:11,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member
4,5F7408988A83B1B3,classic_bike,2024-01-01 16:46:10,2024-01-01 16:50:31,Morris Canal,JC072,Harborside,JC104,40.712419,-74.038526,40.719252,-74.034234,member


In [3]:
# get the stations df
station_path = 'Output/stations.csv'
stations_df = pd.read_csv(station_path)

# display
stations_df.head()

Unnamed: 0,short_name,name,region_id,lat,lon
0,7082.08,23 Ave & 31 St,71.0,40.774233,-73.912749
1,3460.06,2 Ave & 37 St,71.0,40.65624,-74.00933
2,8472.06,Valentine Ave & E 183 St,71.0,40.856987,-73.898237
3,5540.06,56 Dr & 61 St,71.0,40.72368,-73.90458
4,8717.07,E 201 St & Briggs Ave,71.0,40.87207,-73.88459


## Data Check
---

#### Update data types

In [4]:
# check data types
citibike_df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [5]:
# check for amount of nulls 
citibike_df.isnull().sum()

ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name     104
start_station_id       104
end_station_name      3061
end_station_id        3338
start_lat                0
start_lng                0
end_lat                353
end_lng                353
member_casual            0
dtype: int64

In [6]:
# update dates to datetime
date_format='mixed'
citibike_df['started_at'] = pd.to_datetime(citibike_df['started_at'], format=date_format)
citibike_df['ended_at'] = pd.to_datetime(citibike_df['ended_at'], format=date_format)

In [7]:
# check for datetime
citibike_df.select_dtypes('datetime64[ns]')

Unnamed: 0,started_at,ended_at
0,2024-01-15 15:18:07.000,2024-01-15 15:32:44.000
1,2024-01-13 15:32:50.000,2024-01-13 15:36:18.000
2,2024-01-19 13:11:00.000,2024-01-19 13:14:44.000
3,2024-01-23 07:03:49.000,2024-01-23 07:07:11.000
4,2024-01-01 16:46:10.000,2024-01-01 16:50:31.000
...,...,...
1052446,2024-12-28 09:45:30.704,2024-12-28 09:48:02.706
1052447,2024-12-12 16:21:50.427,2024-12-12 16:26:34.069
1052448,2024-12-11 19:23:24.109,2024-12-11 19:25:07.612
1052449,2024-12-12 20:48:40.471,2024-12-12 20:52:41.722


#### Check unique values

In [8]:
citibike_df.shape

(1052451, 13)

In [9]:
# check number of unique values per column
citibike_df.nunique()

ride_id               1052427
rideable_type               2
started_at            1043629
ended_at              1044175
start_station_name        205
start_station_id          205
end_station_name          519
end_station_id            520
start_lat              104008
start_lng              110731
end_lat                   541
end_lng                   547
member_casual               2
dtype: int64

##### Look into why Ride Id isn't unique --> counting rides from one month to the next (unnecessary duplicates)

In [10]:
# get df of duplicated ride_ids
dupes_df = citibike_df[citibike_df.duplicated(subset=['ride_id'], keep=False)]

# sort so duplicate groups stay together
dupes_df = dupes_df.sort_values(by='ride_id')

# display
dupes_df

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
420821,09D67B6866E802DA,classic_bike,2024-05-31 23:54:11.503,2024-06-01 00:19:31.421,Willow Ave & 12 St,HB505,Willow Ave & 12 St,HB505,40.751867,-74.030377,40.751867,-74.030377,casual
342531,09D67B6866E802DA,classic_bike,2024-05-31 23:54:11.000,2024-06-01 00:19:31.000,Willow Ave & 12 St,HB505,Willow Ave & 12 St,HB505,40.751867,-74.030377,40.751867,-74.030377,casual
360447,15413FAB9CC9F156,electric_bike,2024-05-31 23:51:10.638,2024-06-01 00:02:52.559,Hilltop,JC019,York St & Marin Blvd,JC097,40.731119,-74.057494,40.716615,-74.042412,casual
283209,15413FAB9CC9F156,electric_bike,2024-05-31 23:51:10.000,2024-06-01 00:02:52.000,Hilltop,JC019,York St & Marin Blvd,JC097,40.731119,-74.057494,40.716615,-74.042412,casual
256909,1AB4E466DCCC4147,classic_bike,2024-05-31 23:38:55.000,2024-06-01 05:20:10.000,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,Hoboken Ave at Monmouth St,JC105,40.736982,-74.027781,40.735208,-74.046964,casual
414293,1AB4E466DCCC4147,classic_bike,2024-05-31 23:38:55.590,2024-06-01 05:20:10.327,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,Hoboken Ave at Monmouth St,JC105,40.736982,-74.027781,40.735208,-74.046964,casual
363069,269CAE1A61663378,classic_bike,2024-05-31 23:55:31.802,2024-06-01 00:00:50.426,Hoboken Terminal - River St & Hudson Pl,HB102,Madison St & 1 St,HB402,40.736068,-74.029127,40.73879,-74.0393,member
273924,269CAE1A61663378,classic_bike,2024-05-31 23:55:31.000,2024-06-01 00:00:50.000,Hoboken Terminal - River St & Hudson Pl,HB102,Madison St & 1 St,HB402,40.736068,-74.029127,40.73879,-74.0393,member
395980,2FBED42B501D1159,electric_bike,2024-05-31 23:57:51.452,2024-06-01 00:08:06.554,Adams St & 12 St,HB610,City Hall - Washington St & 1 St,HB105,40.751833,-74.033343,40.73736,-74.03097,casual
305232,2FBED42B501D1159,electric_bike,2024-05-31 23:57:51.000,2024-06-01 00:08:06.000,Adams St & 12 St,HB610,City Hall - Washington St & 1 St,HB105,40.751833,-74.033343,40.73736,-74.03097,casual


In [11]:
# drop duplicate rows
citibike_df = citibike_df.drop_duplicates(subset=['ride_id'], keep='first')

# show number of unique ride id rows versus count
print(f'Total: {citibike_df['ride_id'].count()}')
print(f'Unique: {citibike_df['ride_id'].nunique()}')

Total: 1052427
Unique: 1052427


## Reverse Geocoding
---

### Get GeoDataFrames

#### CitiBike gdf

In [12]:
# create geometry columsn of point objects from lat/lng
citibike_df['geometry_start'] = gpd.points_from_xy(citibike_df['start_lng'], citibike_df['start_lat'])
citibike_df['geometry_end'] = gpd.points_from_xy(citibike_df['end_lng'], citibike_df['end_lat'])


# construct gdf
citibike_gdf = gpd.GeoDataFrame(
    citibike_df,
    geometry='geometry_start',  #current active geometry
    crs='EPSG:4326'
)

In [13]:
# display
citibike_gdf.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,geometry_start,geometry_end
0,0744109F13385D1D,electric_bike,2024-01-15 15:18:07,2024-01-15 15:32:44,Morris Canal,JC072,Oakland Ave,JC022,40.712297,-74.038185,40.737604,-74.052478,member,POINT (-74.038 40.712),POINT (-74.052 40.738)
1,B1488BFEF9118000,classic_bike,2024-01-13 15:32:50,2024-01-13 15:36:18,JC Medical Center,JC110,Grove St PATH,JC115,40.715391,-74.049692,40.71941,-74.04309,member,POINT (-74.05 40.715),POINT (-74.043 40.719)
2,95A2FE8E51B4C836,classic_bike,2024-01-19 13:11:00,2024-01-19 13:14:44,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member,POINT (-74.039 40.712),POINT (-74.034 40.716)
3,95D9AFF6A1652DC1,classic_bike,2024-01-23 07:03:49,2024-01-23 07:07:11,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member,POINT (-74.039 40.712),POINT (-74.034 40.716)
4,5F7408988A83B1B3,classic_bike,2024-01-01 16:46:10,2024-01-01 16:50:31,Morris Canal,JC072,Harborside,JC104,40.712419,-74.038526,40.719252,-74.034234,member,POINT (-74.039 40.712),POINT (-74.034 40.719)


#### Cities, Counties, States gdf

In [14]:
# find county subdivisions shapefiles
cousub_shp_list = glob.glob(os.path.join('Resources/Shapefiles_Cousub/Cousub_*/*.shp'))

# read in each county subdivision file into list
list_cousub_gdf = []

for folder in cousub_shp_list:
    cousub_gdf = gpd.read_file(folder).to_crs('EPSG:4326')
    list_cousub_gdf.append(cousub_gdf)

# combine gdfs into one gdf
cousubs_gdf = pd.concat(list_cousub_gdf, ignore_index=True)

# display
cousub_gdf.head()

Unnamed: 0,STATEFP,COUNTYFP,COUSUBFP,COUSUBNS,GEOID,GEOIDFQ,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,42,121,27832,1217144,4212127832,0600000US4212127832,Frenchcreek,Frenchcreek township,44,T1,G4040,A,75458260,538510,41.3968254,-79.9437494,"POLYGON ((-80 41.387, -80 41.387, -80 41.387, ..."
1,42,79,67456,1216732,4207967456,0600000US4207967456,Salem,Salem township,44,T1,G4040,A,74442584,2681721,41.1059794,-76.1835786,"POLYGON ((-76.229 41.138, -76.221 41.142, -76...."
2,42,79,78384,1216735,4207978384,0600000US4207978384,Union,Union township,44,T1,G4040,A,51285654,1126850,41.2000871,-76.151562,"POLYGON ((-76.21 41.214, -76.202 41.22, -76.20..."
3,42,79,83136,1215412,4207983136,0600000US4207983136,West Hazleton,West Hazleton borough,21,C5,G4040,F,4088525,0,40.9743238,-76.0254145,"POLYGON ((-76.038 40.978, -76.036 40.978, -76...."
4,42,57,5144,1216475,4205705144,0600000US4205705144,Belfast,Belfast township,44,T1,G4040,A,129879111,0,39.8810245,-78.1405196,"POLYGON ((-78.216 39.855, -78.216 39.856, -78...."


In [15]:
# read in county shapefile
counties_shapefile = 'Resources/Shapefile_Counties'
counties_gdf = gpd.read_file(counties_shapefile).to_crs('EPSG:4326')

# look at geodataframe
counties_gdf.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,GEOIDFQ,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,31,39,835841,31039,0500000US31039,Cuming,Cuming County,6,H1,G4020,,,,A,1477563042,10772508,41.9158651,-96.7885168,"POLYGON ((-96.555 41.829, -96.555 41.828, -96...."
1,53,69,1513275,53069,0500000US53069,Wahkiakum,Wahkiakum County,6,H1,G4020,,,,A,680980773,61564428,46.2946377,-123.4244583,"POLYGON ((-123.73 46.264, -123.73 46.265, -123..."
2,35,11,933054,35011,0500000US35011,De Baca,De Baca County,6,H1,G4020,,,,A,6016818941,29090018,34.3592729,-104.3686961,"POLYGON ((-104.89 34.089, -104.89 34.089, -104..."
3,31,109,835876,31109,0500000US31109,Lancaster,Lancaster County,6,H1,G4020,339.0,30700.0,,A,2169269508,22850511,40.7835474,-96.6886584,"POLYGON ((-96.685 40.523, -96.692 40.523, -96...."
4,31,129,835886,31129,0500000US31129,Nuckolls,Nuckolls County,6,H1,G4020,,,,A,1489645201,1718484,40.1764918,-98.0468422,"POLYGON ((-98.274 40.118, -98.274 40.122, -98...."


#### zipcodes gdf

In [16]:
# read zip code shapefile
zip_shapefile = 'Resources/Shapefile_ZipCode'
zip_gdf = gpd.read_file(zip_shapefile).to_crs('EPSG:4326')

# look at geodataframe
zip_gdf.head()

Unnamed: 0,ZCTA5CE20,GEOID20,GEOIDFQ20,CLASSFP20,MTFCC20,FUNCSTAT20,ALAND20,AWATER20,INTPTLAT20,INTPTLON20,geometry
0,47236,47236,860Z200US47236,B5,G6350,S,1029063,0,39.1517426,-85.7252769,"POLYGON ((-85.734 39.156, -85.728 39.156, -85...."
1,47870,47870,860Z200US47870,B5,G6350,S,8830,0,39.3701518,-87.4735141,"POLYGON ((-87.474 39.37, -87.474 39.37, -87.47..."
2,47851,47851,860Z200US47851,B5,G6350,S,53326,0,39.5735839,-87.2459559,"POLYGON ((-87.248 39.574, -87.247 39.574, -87...."
3,47337,47337,860Z200US47337,B5,G6350,S,303089,0,39.8027537,-85.437285,"POLYGON ((-85.444 39.803, -85.443 39.803, -85...."
4,47435,47435,860Z200US47435,B5,G6350,S,13302,0,39.2657557,-86.2951577,"POLYGON ((-86.296 39.265, -86.296 39.266, -86...."


### Get Zipcodes

In [17]:
# join with zipcode gdf (start)
start_zips_joined_gdf = gpd.sjoin(citibike_gdf, zip_gdf, how='left', predicate='within')

# rename zip column
start_zips_joined_gdf = start_zips_joined_gdf.rename(columns={'ZCTA5CE20': 'start_zip'})

# merge start_zip back into citibike gdf
citibike_gdf['start_zip'] = start_zips_joined_gdf['start_zip']

# display
citibike_gdf.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,geometry_start,geometry_end,start_zip
0,0744109F13385D1D,electric_bike,2024-01-15 15:18:07,2024-01-15 15:32:44,Morris Canal,JC072,Oakland Ave,JC022,40.712297,-74.038185,40.737604,-74.052478,member,POINT (-74.038 40.712),POINT (-74.052 40.738),7302
1,B1488BFEF9118000,classic_bike,2024-01-13 15:32:50,2024-01-13 15:36:18,JC Medical Center,JC110,Grove St PATH,JC115,40.715391,-74.049692,40.71941,-74.04309,member,POINT (-74.05 40.715),POINT (-74.043 40.719),7302
2,95A2FE8E51B4C836,classic_bike,2024-01-19 13:11:00,2024-01-19 13:14:44,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member,POINT (-74.039 40.712),POINT (-74.034 40.716),7302
3,95D9AFF6A1652DC1,classic_bike,2024-01-23 07:03:49,2024-01-23 07:07:11,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member,POINT (-74.039 40.712),POINT (-74.034 40.716),7302
4,5F7408988A83B1B3,classic_bike,2024-01-01 16:46:10,2024-01-01 16:50:31,Morris Canal,JC072,Harborside,JC104,40.712419,-74.038526,40.719252,-74.034234,member,POINT (-74.039 40.712),POINT (-74.034 40.719),7302


In [18]:
# set end geometry to active geometry
ends_gdf = gpd.GeoDataFrame(
    citibike_df.copy(),
    geometry='geometry_end',
    crs='EPSG:4326'
)

# create end zip column
ends_joined_gdf = gpd.sjoin(ends_gdf, zip_gdf, how='left', predicate='within').rename(columns={'ZCTA5CE20': 'end_zip'})

# assign end zip to main gdf
citibike_gdf['end_zip'] = ends_joined_gdf['end_zip']

# display
citibike_gdf.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,geometry_start,geometry_end,start_zip,end_zip
0,0744109F13385D1D,electric_bike,2024-01-15 15:18:07,2024-01-15 15:32:44,Morris Canal,JC072,Oakland Ave,JC022,40.712297,-74.038185,40.737604,-74.052478,member,POINT (-74.038 40.712),POINT (-74.052 40.738),7302,7306
1,B1488BFEF9118000,classic_bike,2024-01-13 15:32:50,2024-01-13 15:36:18,JC Medical Center,JC110,Grove St PATH,JC115,40.715391,-74.049692,40.71941,-74.04309,member,POINT (-74.05 40.715),POINT (-74.043 40.719),7302,7302
2,95A2FE8E51B4C836,classic_bike,2024-01-19 13:11:00,2024-01-19 13:14:44,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member,POINT (-74.039 40.712),POINT (-74.034 40.716),7302,7302
3,95D9AFF6A1652DC1,classic_bike,2024-01-23 07:03:49,2024-01-23 07:07:11,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member,POINT (-74.039 40.712),POINT (-74.034 40.716),7302,7302
4,5F7408988A83B1B3,classic_bike,2024-01-01 16:46:10,2024-01-01 16:50:31,Morris Canal,JC072,Harborside,JC104,40.712419,-74.038526,40.719252,-74.034234,member,POINT (-74.039 40.712),POINT (-74.034 40.719),7302,7311


#### Double checking zips--> missing end zips (right outside of boundaries such as on water)

In [19]:
# look into null values
citibike_gdf.isnull().sum()

ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name     104
start_station_id       104
end_station_name      3057
end_station_id        3334
start_lat                0
start_lng                0
end_lat                352
end_lng                352
member_casual            0
geometry_start           0
geometry_end             0
start_zip                0
end_zip                402
dtype: int64

In [20]:
# see why there are more null end zips than end coordinates
end_zip_nulls_gdf = citibike_gdf[citibike_gdf['end_zip'].isna() & citibike_gdf['end_lat'].notna()].copy()
end_zip_nulls_gdf.isnull().sum()

ride_id                0
rideable_type          0
started_at             0
ended_at               0
start_station_name     0
start_station_id       0
end_station_name      14
end_station_id        14
start_lat              0
start_lng              0
end_lat                0
end_lng                0
member_casual          0
geometry_start         0
geometry_end           0
start_zip              0
end_zip               50
dtype: int64

In [21]:
# list it out
end_zip_nulls_gdf

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,geometry_start,geometry_end,start_zip,end_zip
9518,305A1F7966180481,electric_bike,2024-01-18 19:11:25.000,2024-01-18 20:31:32.000,Harborside,JC104,,,40.71926,-74.034342,40.72,-74.02,member,POINT (-74.034 40.719),POINT (-74.02 40.72),7311,
68307,95A3E7FABA1D387A,electric_bike,2024-02-11 17:27:28.000,2024-02-11 18:31:07.000,Warren St,JC006,,,40.721105,-74.038006,40.72,-74.04,casual,POINT (-74.038 40.721),POINT (-74.04 40.72),7302,
161964,856D6561FC3B2A56,classic_bike,2024-03-15 08:41:23.000,2024-03-15 09:06:52.000,Columbus Drive,JC014,Pier 40 - Hudson River Park,5696.03,40.718355,-74.038914,40.727714,-74.011296,member,POINT (-74.039 40.718),POINT (-74.011 40.728),7302,
247319,4A50280C500E4ACD,electric_bike,2024-04-19 08:54:45.000,2024-04-19 09:12:12.000,Columbus Drive,JC014,Pier 40 - Hudson River Park,5696.03,40.718774,-74.038901,40.727714,-74.011296,member,POINT (-74.039 40.719),POINT (-74.011 40.728),7302,
257454,1EFFA98AFA13BD36,classic_bike,2024-05-07 08:44:02.000,2024-05-07 09:05:51.000,Columbus Drive,JC014,Pier 40 - Hudson River Park,5696.03,40.718355,-74.038914,40.727714,-74.011296,member,POINT (-74.039 40.718),POINT (-74.011 40.728),7302,
322527,4EAF7DAA6C93A6EF,classic_bike,2024-05-14 17:12:14.000,2024-05-14 17:35:38.000,Newport Pkwy,JC008,Pier 40 - Hudson River Park,5696.03,40.728745,-74.032108,40.727714,-74.011296,member,POINT (-74.032 40.729),POINT (-74.011 40.728),7310,
323689,9FA416A13F007938,electric_bike,2024-05-20 07:43:20.000,2024-05-20 09:49:37.000,Marin Light Rail,JC013,,,40.714586,-74.042764,40.71,-74.02,member,POINT (-74.043 40.715),POINT (-74.02 40.71),7302,
371000,A9E249A1081D805B,electric_bike,2024-06-10 10:55:49.839,2024-06-10 14:17:31.422,14 St Ferry - 14 St & Shipyard Ln,HB202,,,40.752824,-74.024394,40.78,-73.99,casual,POINT (-74.024 40.753),POINT (-73.99 40.78),7030,
434190,A96FB7B10570BC41,electric_bike,2024-06-25 08:37:44.768,2024-06-25 08:58:19.500,Columbus Drive,JC014,Pier 40 - Hudson River Park,5696.03,40.718776,-74.038729,40.727714,-74.011296,member,POINT (-74.039 40.719),POINT (-74.011 40.728),7302,
434191,7A862588E1EB101F,electric_bike,2024-06-24 08:33:40.003,2024-06-24 08:58:16.303,Columbus Drive,JC014,Pier 40 - Hudson River Park,5696.03,40.718735,-74.038585,40.727714,-74.011296,member,POINT (-74.039 40.719),POINT (-74.011 40.728),7302,


#### Add missing end zips using nearest approach

In [22]:
# fix gdf setup of end zip nulls with end coordinates
end_zip_nulls_gdf = end_zip_nulls_gdf.iloc[:, :-1].reset_index(drop=True)

# set geometry to end coordinates and crs
end_zip_nulls_gdf = gpd.GeoDataFrame(
    end_zip_nulls_gdf,
    geometry='geometry_end',
    crs='EPSG:4326'
)

# display
end_zip_nulls_gdf.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,geometry_start,geometry_end,start_zip
0,305A1F7966180481,electric_bike,2024-01-18 19:11:25,2024-01-18 20:31:32,Harborside,JC104,,,40.71926,-74.034342,40.72,-74.02,member,POINT (-74.034 40.719),POINT (-74.02 40.72),7311
1,95A3E7FABA1D387A,electric_bike,2024-02-11 17:27:28,2024-02-11 18:31:07,Warren St,JC006,,,40.721105,-74.038006,40.72,-74.04,casual,POINT (-74.038 40.721),POINT (-74.04 40.72),7302
2,856D6561FC3B2A56,classic_bike,2024-03-15 08:41:23,2024-03-15 09:06:52,Columbus Drive,JC014,Pier 40 - Hudson River Park,5696.03,40.718355,-74.038914,40.727714,-74.011296,member,POINT (-74.039 40.718),POINT (-74.011 40.728),7302
3,4A50280C500E4ACD,electric_bike,2024-04-19 08:54:45,2024-04-19 09:12:12,Columbus Drive,JC014,Pier 40 - Hudson River Park,5696.03,40.718774,-74.038901,40.727714,-74.011296,member,POINT (-74.039 40.719),POINT (-74.011 40.728),7302
4,1EFFA98AFA13BD36,classic_bike,2024-05-07 08:44:02,2024-05-07 09:05:51,Columbus Drive,JC014,Pier 40 - Hudson River Park,5696.03,40.718355,-74.038914,40.727714,-74.011296,member,POINT (-74.039 40.718),POINT (-74.011 40.728),7302


In [23]:
# reproject points and zip polygons to local projected crs
ends_proj = end_zip_nulls_gdf.to_crs('EPSG:2263')
zip_proj = zip_gdf.to_crs('EPSG:2263')

In [24]:
# nearest spatial join to get zips
end_zip_nearest_gdf = gpd.sjoin_nearest(
    ends_proj, zip_proj,
    how='left',
    distance_col='dist_to_zip'
)

# rename zip column
end_zip_nearest_gdf = end_zip_nearest_gdf.rename(columns={'ZCTA5CE20': 'nearest_end_zip'})

In [25]:
# looking into zips
end_zip_nearest_gdf[['ride_id', 'end_lat', 'end_lng', 'geometry_end', 'nearest_end_zip', 'dist_to_zip']]

Unnamed: 0,ride_id,end_lat,end_lng,geometry_end,nearest_end_zip,dist_to_zip
0,305A1F7966180481,40.72,-74.02,POINT (9.79e+05 2.02e+05),7311,863.073563
1,95A3E7FABA1D387A,40.72,-74.04,POINT (9.73e+05 2.02e+05),7302,0.0
2,856D6561FC3B2A56,40.727714,-74.011296,POINT (9.81e+05 2.04e+05),10014,103.879678
3,4A50280C500E4ACD,40.727714,-74.011296,POINT (9.81e+05 2.04e+05),10014,103.879678
4,1EFFA98AFA13BD36,40.727714,-74.011296,POINT (9.81e+05 2.04e+05),10014,103.879678
5,4EAF7DAA6C93A6EF,40.727714,-74.011296,POINT (9.81e+05 2.04e+05),10014,103.879678
6,9FA416A13F007938,40.71,-74.02,POINT (9.79e+05 1.98e+05),10280,436.67445
7,A9E249A1081D805B,40.78,-73.99,POINT (9.87e+05 2.23e+05),10069,397.327208
8,A96FB7B10570BC41,40.727714,-74.011296,POINT (9.81e+05 2.04e+05),10014,103.879678
9,7A862588E1EB101F,40.727714,-74.011296,POINT (9.81e+05 2.04e+05),10014,103.879678


In [26]:
# merge end nearest zips column to main gdf
citibike_gdf = citibike_gdf.merge(
    end_zip_nearest_gdf[['ride_id', 'nearest_end_zip']],
    on='ride_id',
    how='left'
)

# fill in missing end_zip values with nearest zips
citibike_gdf['end_zip'] = citibike_gdf['end_zip'].fillna(citibike_gdf['nearest_end_zip'])

# drop nearest zip column
citibike_gdf = citibike_gdf.drop('nearest_end_zip', axis=1)

# display
citibike_gdf.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,geometry_start,geometry_end,start_zip,end_zip
0,0744109F13385D1D,electric_bike,2024-01-15 15:18:07,2024-01-15 15:32:44,Morris Canal,JC072,Oakland Ave,JC022,40.712297,-74.038185,40.737604,-74.052478,member,POINT (-74.038 40.712),POINT (-74.052 40.738),7302,7306
1,B1488BFEF9118000,classic_bike,2024-01-13 15:32:50,2024-01-13 15:36:18,JC Medical Center,JC110,Grove St PATH,JC115,40.715391,-74.049692,40.71941,-74.04309,member,POINT (-74.05 40.715),POINT (-74.043 40.719),7302,7302
2,95A2FE8E51B4C836,classic_bike,2024-01-19 13:11:00,2024-01-19 13:14:44,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member,POINT (-74.039 40.712),POINT (-74.034 40.716),7302,7302
3,95D9AFF6A1652DC1,classic_bike,2024-01-23 07:03:49,2024-01-23 07:07:11,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member,POINT (-74.039 40.712),POINT (-74.034 40.716),7302,7302
4,5F7408988A83B1B3,classic_bike,2024-01-01 16:46:10,2024-01-01 16:50:31,Morris Canal,JC072,Harborside,JC104,40.712419,-74.038526,40.719252,-74.034234,member,POINT (-74.039 40.712),POINT (-74.034 40.719),7302,7311


In [27]:
# double check if amount of nulls in end_zip = end_lat/lng
citibike_gdf.isna().sum()

ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name     104
start_station_id       104
end_station_name      3057
end_station_id        3334
start_lat                0
start_lng                0
end_lat                352
end_lng                352
member_casual            0
geometry_start           0
geometry_end             0
start_zip                0
end_zip                352
dtype: int64

### Get Cities, Counties, States

In [28]:
# join with cities gdf (start)
start_cities_joined_gdf = gpd.sjoin(citibike_gdf, cousubs_gdf, how='left', predicate='within')

# rename zip column
start_cities_joined_gdf = start_cities_joined_gdf.rename(columns={
    'NAME': 'start_city',
    'COUNTYFP': 'start_county',
    'STATEFP': 'start_state'
})

# merge start_zip back into citibike gdf
citibike_gdf[['start_city', 'start_county', 'start_state']] = start_cities_joined_gdf[['start_city', 'start_county', 'start_state']]

# display
citibike_gdf.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,geometry_start,geometry_end,start_zip,end_zip,start_city,start_county,start_state
0,0744109F13385D1D,electric_bike,2024-01-15 15:18:07,2024-01-15 15:32:44,Morris Canal,JC072,Oakland Ave,JC022,40.712297,-74.038185,40.737604,-74.052478,member,POINT (-74.038 40.712),POINT (-74.052 40.738),7302,7306,Jersey City,17,34
1,B1488BFEF9118000,classic_bike,2024-01-13 15:32:50,2024-01-13 15:36:18,JC Medical Center,JC110,Grove St PATH,JC115,40.715391,-74.049692,40.71941,-74.04309,member,POINT (-74.05 40.715),POINT (-74.043 40.719),7302,7302,Jersey City,17,34
2,95A2FE8E51B4C836,classic_bike,2024-01-19 13:11:00,2024-01-19 13:14:44,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member,POINT (-74.039 40.712),POINT (-74.034 40.716),7302,7302,Jersey City,17,34
3,95D9AFF6A1652DC1,classic_bike,2024-01-23 07:03:49,2024-01-23 07:07:11,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member,POINT (-74.039 40.712),POINT (-74.034 40.716),7302,7302,Jersey City,17,34
4,5F7408988A83B1B3,classic_bike,2024-01-01 16:46:10,2024-01-01 16:50:31,Morris Canal,JC072,Harborside,JC104,40.712419,-74.038526,40.719252,-74.034234,member,POINT (-74.039 40.712),POINT (-74.034 40.719),7302,7311,Jersey City,17,34


In [29]:
# set end geometry to active geometry
ends_gdf = gpd.GeoDataFrame(
    citibike_gdf.copy(),
    geometry='geometry_end',
    crs='EPSG:4326'
)

# create end cities column
ends_joined_gdf = gpd.sjoin(ends_gdf, cousubs_gdf, how='left', predicate='within').rename(columns={
    'NAME': 'end_city',
    'COUNTYFP': 'end_county',
    'STATEFP': 'end_state',
})

# assign end zip to main gdf
citibike_gdf[['end_city', 'end_county', 'end_state']] = ends_joined_gdf[['end_city', 'end_county', 'end_state']]

# display
citibike_gdf.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,geometry_start,geometry_end,start_zip,end_zip,start_city,start_county,start_state,end_city,end_county,end_state
0,0744109F13385D1D,electric_bike,2024-01-15 15:18:07,2024-01-15 15:32:44,Morris Canal,JC072,Oakland Ave,JC022,40.712297,-74.038185,...,POINT (-74.038 40.712),POINT (-74.052 40.738),7302,7306,Jersey City,17,34,Jersey City,17,34
1,B1488BFEF9118000,classic_bike,2024-01-13 15:32:50,2024-01-13 15:36:18,JC Medical Center,JC110,Grove St PATH,JC115,40.715391,-74.049692,...,POINT (-74.05 40.715),POINT (-74.043 40.719),7302,7302,Jersey City,17,34,Jersey City,17,34
2,95A2FE8E51B4C836,classic_bike,2024-01-19 13:11:00,2024-01-19 13:14:44,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,...,POINT (-74.039 40.712),POINT (-74.034 40.716),7302,7302,Jersey City,17,34,Jersey City,17,34
3,95D9AFF6A1652DC1,classic_bike,2024-01-23 07:03:49,2024-01-23 07:07:11,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,...,POINT (-74.039 40.712),POINT (-74.034 40.716),7302,7302,Jersey City,17,34,Jersey City,17,34
4,5F7408988A83B1B3,classic_bike,2024-01-01 16:46:10,2024-01-01 16:50:31,Morris Canal,JC072,Harborside,JC104,40.712419,-74.038526,...,POINT (-74.039 40.712),POINT (-74.034 40.719),7302,7311,Jersey City,17,34,Jersey City,17,34


#### Double checking counties--> missing end counties are in NYC (have several counties within it)

In [30]:
# look into nulls
citibike_gdf.isna().sum()

ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name     104
start_station_id       104
end_station_name      3057
end_station_id        3334
start_lat                0
start_lng                0
end_lat                352
end_lng                352
member_casual            0
geometry_start           0
geometry_end             0
start_zip                0
end_zip                352
start_city               0
start_county             0
start_state              0
end_city               352
end_county            3342
end_state              352
dtype: int64

In [31]:
# see what's causing county nulls
null_counties_with_cities = citibike_gdf[citibike_gdf['end_county'].isna() & citibike_gdf['end_city'].notna()]
print(null_counties_with_cities['end_state'].value_counts())
print(null_counties_with_cities['end_city'].value_counts())

end_state
36    2990
Name: count, dtype: int64
end_city
New York    2990
Name: count, dtype: int64


#### Add missing end counties from counties shapefile & update end_city

In [32]:
# fix gdf setup of end zip nulls with end coordinates
end_counties_nulls_gdf = null_counties_with_cities.iloc[:, :-3].reset_index(drop=True)

# set geometry to end coordinates and crs
end_counties_nulls_gdf = gpd.GeoDataFrame(
    end_counties_nulls_gdf,
    geometry='geometry_end',
    crs='EPSG:4326'
)

# create new end cities and counties column
end_counties_nulls_gdf = gpd.sjoin(end_counties_nulls_gdf, counties_gdf, how='left', predicate='within').rename(columns={
    'NAME': 'new_end_location',
    'COUNTYFP': 'new_end_county'
})

# display
end_counties_nulls_gdf[['ride_id', 'end_lat', 'end_lng', 'new_end_location', 'new_end_county']].head()

Unnamed: 0,ride_id,end_lat,end_lng,new_end_location,new_end_county
0,A040B477B036E5A8,40.707065,-74.007319,New York,61
1,479BDD97FE279988,40.717599,-74.01588,New York,61
2,70438D117D00C687,40.717571,-74.005549,New York,61
3,0D69EE4C57ECDE9E,40.750756,-73.978326,New York,61
4,19A542F7832439EC,40.750756,-73.978326,New York,61


In [33]:
# check if null counties
end_counties_nulls_gdf['new_end_county'].isna().sum()

0

In [34]:
# check new recognized areas in nyc
end_counties_nulls_gdf['new_end_location'].value_counts()

new_end_location
New York    2856
Kings        110
Queens        17
Bronx          7
Name: count, dtype: int64

In [35]:
# merge new end counties column to main gdf
citibike_gdf = citibike_gdf.merge(
    end_counties_nulls_gdf[['ride_id', 'new_end_county']],
    on='ride_id',
    how='left'
)

# fill in missing end_county values with new end county
citibike_gdf['end_county'] = citibike_gdf['end_county'].fillna(citibike_gdf['new_end_county'])

# drop new end county column
citibike_gdf = citibike_gdf.drop('new_end_county', axis=1)

# display
citibike_gdf.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,geometry_start,geometry_end,start_zip,end_zip,start_city,start_county,start_state,end_city,end_county,end_state
0,0744109F13385D1D,electric_bike,2024-01-15 15:18:07,2024-01-15 15:32:44,Morris Canal,JC072,Oakland Ave,JC022,40.712297,-74.038185,...,POINT (-74.038 40.712),POINT (-74.052 40.738),7302,7306,Jersey City,17,34,Jersey City,17,34
1,B1488BFEF9118000,classic_bike,2024-01-13 15:32:50,2024-01-13 15:36:18,JC Medical Center,JC110,Grove St PATH,JC115,40.715391,-74.049692,...,POINT (-74.05 40.715),POINT (-74.043 40.719),7302,7302,Jersey City,17,34,Jersey City,17,34
2,95A2FE8E51B4C836,classic_bike,2024-01-19 13:11:00,2024-01-19 13:14:44,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,...,POINT (-74.039 40.712),POINT (-74.034 40.716),7302,7302,Jersey City,17,34,Jersey City,17,34
3,95D9AFF6A1652DC1,classic_bike,2024-01-23 07:03:49,2024-01-23 07:07:11,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,...,POINT (-74.039 40.712),POINT (-74.034 40.716),7302,7302,Jersey City,17,34,Jersey City,17,34
4,5F7408988A83B1B3,classic_bike,2024-01-01 16:46:10,2024-01-01 16:50:31,Morris Canal,JC072,Harborside,JC104,40.712419,-74.038526,...,POINT (-74.039 40.712),POINT (-74.034 40.719),7302,7311,Jersey City,17,34,Jersey City,17,34


In [36]:
# double check null values
citibike_gdf.isna().sum()

ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name     104
start_station_id       104
end_station_name      3057
end_station_id        3334
start_lat                0
start_lng                0
end_lat                352
end_lng                352
member_casual            0
geometry_start           0
geometry_end             0
start_zip                0
end_zip                352
start_city               0
start_county             0
start_state              0
end_city               352
end_county             352
end_state              352
dtype: int64

## Find Station Names/IDs
---

### Look at null stations

In [39]:
# check nulls
citibike_gdf.isna().sum()

ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name     104
start_station_id       104
end_station_name      3057
end_station_id        3334
start_lat                0
start_lng                0
end_lat                352
end_lng                352
member_casual            0
geometry_start           0
geometry_end             0
start_zip                0
end_zip                352
start_city               0
start_county             0
start_state              0
end_city               352
end_county             352
end_state              352
dtype: int64

### GeoDataFame for Stations

In [41]:
stations_gdf = gpd.GeoDataFrame(
    stations_df,
    geometry=gpd.points_from_xy(stations_df['lon'], stations_df['lat']),
    crs='EPSG:4326'
)

# display
stations_gdf.head()

Unnamed: 0,short_name,name,region_id,lat,lon,geometry
0,7082.08,23 Ave & 31 St,71.0,40.774233,-73.912749,POINT (-73.913 40.774)
1,3460.06,2 Ave & 37 St,71.0,40.65624,-74.00933,POINT (-74.009 40.656)
2,8472.06,Valentine Ave & E 183 St,71.0,40.856987,-73.898237,POINT (-73.898 40.857)
3,5540.06,56 Dr & 61 St,71.0,40.72368,-73.90458,POINT (-73.905 40.724)
4,8717.07,E 201 St & Briggs Ave,71.0,40.87207,-73.88459,POINT (-73.885 40.872)


### Null Start Station --> lat/lng values are too rounded to pinpoint station

In [62]:
# look into nulls
start_station_nulls = citibike_gdf[citibike_gdf['start_station_name'].isna()].copy()
start_station_nulls.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,geometry_start,geometry_end,start_zip,end_zip,start_city,start_county,start_state,end_city,end_county,end_state
36634,D64972661F0BC489,electric_bike,2024-01-15 08:43:12,2024-01-15 08:48:39,,,Newport PATH,JC066,40.71,-74.04,...,POINT (-74.04 40.71),POINT (-74.034 40.727),7302,7310,Jersey City,17,34,Jersey City,17,34
62036,4F05F77F14AF5FDD,electric_bike,2024-02-26 10:11:20,2024-02-26 10:13:44,,,Stevens - River Ter & 6 St,HB602,40.74,-74.03,...,POINT (-74.03 40.74),POINT (-74.027 40.743),7030,7030,Hoboken,17,34,Hoboken,17,34
71216,FF2E6D36231516C0,electric_bike,2024-02-26 08:04:57,2024-02-26 08:10:10,,,Marin Light Rail,JC013,40.72,-74.05,...,POINT (-74.05 40.72),POINT (-74.043 40.715),7302,7302,Jersey City,17,34,Jersey City,17,34
73088,8F9C5D5E5B6AFE7C,electric_bike,2024-02-25 16:45:08,2024-02-25 17:04:50,,,Exchange Pl,JC116,40.74,-74.03,...,POINT (-74.03 40.74),POINT (-74.034 40.716),7030,7302,Hoboken,17,34,Jersey City,17,34
74552,8C67DB90D434C1AF,electric_bike,2024-02-27 17:49:39,2024-02-27 18:10:18,,,Baldwin at Montgomery,JC020,40.72,-74.06,...,POINT (-74.06 40.72),POINT (-74.064 40.724),7304,7304,Jersey City,17,34,Jersey City,17,34


In [70]:
# see number of lat/lon values that have only 2 decimal places or less
broad_start_lats = [lat for lat in start_station_nulls['start_lat'] if str(lat)[::-1].find('.') <= 2]
broad_start_lons = [lon for lon in start_station_nulls['start_lng'] if str(lon)[::-1].find('.') <= 2]

print(len(broad_start_lats))
print(len(broad_start_lons))


104
104


In [71]:
# see if rounded lat/lngs are unique in station
rounded_stations = stations_df.copy()
rounded_stations.lat = round(rounded_stations.lat, 2)
rounded_stations.lon = round(rounded_stations.lon, 2)

# groupby lat/lng and count
group_rounded = rounded_stations.groupby(['lat', 'lon']).size().reset_index(name='count')
dupe_cnts = group_rounded[group_rounded['count']>1]
dupe_cnts

Unnamed: 0,lat,lon,count
1,40.64,-74.03,3
2,40.64,-74.02,9
3,40.64,-74.01,10
6,40.64,-73.98,4
7,40.64,-73.97,6
...,...,...,...
282,40.88,-73.91,6
283,40.88,-73.90,8
284,40.88,-73.89,9
285,40.88,-73.88,2


In [None]:
# check if any are unique--> no
dupe_cnts[dupe_cnts['count']==1]

Unnamed: 0,lat,lon,count


### Null End Station ID but with Station Name

In [84]:
# look into null end station ids with names
null_end_station_ids = citibike_gdf[citibike_gdf['end_station_id'].isna() & citibike_gdf['end_station_name'].notna()].copy()
null_end_station_ids

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,geometry_start,geometry_end,start_zip,end_zip,start_city,start_county,start_state,end_city,end_county,end_state
106702,9A9DFAC166C798C4,electric_bike,2024-03-29 19:03:15.000,2024-03-29 19:06:56.000,Pershing Field,JC024,Riverview Park,,40.742406,-74.052011,...,POINT (-74.052 40.742),POINT (-74.04 40.74),07307,07030,Jersey City,017,34,Hoboken,017,34
107337,D0F18FBDC5E69B00,electric_bike,2024-03-28 21:43:02.000,2024-03-28 21:45:50.000,City Hall - Washington St & 1 St,HB105,Madison St & 1 St,,40.737225,-74.031022,...,POINT (-74.031 40.737),POINT (-74.04 40.74),07030,07030,Hoboken,017,34,Hoboken,017,34
107723,8ABA108361A7BD96,electric_bike,2024-03-29 13:29:59.000,2024-03-29 13:41:42.000,Manila & 1st,JC082,Riverview Park,,40.721709,-74.042933,...,POINT (-74.043 40.722),POINT (-74.04 40.74),07302,07030,Jersey City,017,34,Hoboken,017,34
107726,859FEA1C4E825877,electric_bike,2024-03-29 13:06:10.000,2024-03-29 13:22:48.000,City Hall - Washington St & 1 St,HB105,Riverview Park,,40.737251,-74.030991,...,POINT (-74.031 40.737),POINT (-74.04 40.74),07030,07030,Hoboken,017,34,Hoboken,017,34
111112,A7456E42304BF4AE,electric_bike,2024-03-28 16:57:17.000,2024-03-28 17:01:14.000,Hoboken Terminal - River St & Hudson Pl,HB102,Madison St & 1 St,,40.735961,-74.029133,...,POINT (-74.029 40.736),POINT (-74.04 40.74),07030,07030,Hoboken,017,34,Hoboken,017,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045635,6D2DAB1D9BD76A9C,electric_bike,2024-12-12 13:32:26.462,2024-12-12 13:35:22.806,City Hall - Washington St & 1 St,HB105,Madison St & 1 St,,40.737360,-74.030970,...,POINT (-74.031 40.737),POINT (-74.04 40.74),07030,07030,Hoboken,017,34,Hoboken,017,34
1045641,E0719548E8A96127,electric_bike,2024-12-12 14:11:50.661,2024-12-12 14:14:26.323,City Hall - Washington St & 1 St,HB105,Madison St & 1 St,,40.737360,-74.030970,...,POINT (-74.031 40.737),POINT (-74.04 40.74),07030,07030,Hoboken,017,34,Hoboken,017,34
1046286,464D005D5A83D756,electric_bike,2024-12-12 15:02:15.139,2024-12-12 15:10:41.903,5 Corners Library,JC018,Christ Hospital,,40.734961,-74.059503,...,POINT (-74.06 40.735),POINT (-74.05 40.73),07306,07302,Jersey City,017,34,Jersey City,017,34
1046621,F3E3235AED147C12,electric_bike,2024-12-12 15:13:29.020,2024-12-12 15:15:03.341,Manila & 1st,JC082,Jersey & 3rd,,40.721651,-74.042884,...,POINT (-74.043 40.722),POINT (-74.05 40.72),07302,07302,Jersey City,017,34,Jersey City,017,34


In [95]:
# lower name values before merging
null_end_station_ids['end_station_name'] = null_end_station_ids['end_station_name'].str.lower()
nlower_stations_df = stations_df.copy()
nlower_stations_df['name'] = nlower_stations_df['name'].str.lower()

# check if station names are unique in stations df--> YES
print(nlower_stations_df.nunique())


# merge on station name
not_null_end_station_ids = pd.merge(
    null_end_station_ids,
    nlower_stations_df,
    left_on='end_station_name',
    right_on='name',
    how='left'
)

# display
not_null_end_station_ids[['ride_id', 'end_station_name', 'name', 'end_station_id', 'short_name', 'end_lat', 'end_lng', 'lat', 'lon']]

short_name    2231
name          2231
region_id        3
lat           2218
lon           2214
dtype: int64


Unnamed: 0,ride_id,end_station_name,name,end_station_id,short_name,end_lat,end_lng,lat,lon
0,9A9DFAC166C798C4,riverview park,riverview park,,JC057,40.74,-74.04,40.744319,-74.043991
1,D0F18FBDC5E69B00,madison st & 1 st,madison st & 1 st,,HB402,40.74,-74.04,40.738790,-74.039300
2,8ABA108361A7BD96,riverview park,riverview park,,JC057,40.74,-74.04,40.744319,-74.043991
3,859FEA1C4E825877,riverview park,riverview park,,JC057,40.74,-74.04,40.744319,-74.043991
4,A7456E42304BF4AE,madison st & 1 st,madison st & 1 st,,HB402,40.74,-74.04,40.738790,-74.039300
...,...,...,...,...,...,...,...,...,...
272,6D2DAB1D9BD76A9C,madison st & 1 st,madison st & 1 st,,HB402,40.74,-74.04,40.738790,-74.039300
273,E0719548E8A96127,madison st & 1 st,madison st & 1 st,,HB402,40.74,-74.04,40.738790,-74.039300
274,464D005D5A83D756,christ hospital,christ hospital,,JC034,40.73,-74.05,40.734786,-74.050444
275,F3E3235AED147C12,jersey & 3rd,jersey & 3rd,,JC074,40.72,-74.05,40.723332,-74.045953


In [97]:
# check amount if any null ids now
not_null_end_station_ids['short_name'].count()

277

In [None]:
# merge end ids into main gdf
citibike_gdf = citibike_gdf.merge(
    not_null_end_station_ids[['ride_id', 'short_name']],
    on='ride_id',
    how='left'
)

# fill in missing end_station_ids
citibike_gdf['end_station_id'] = citibike_gdf['end_station_id'].fillna(citibike_gdf['short_name'])

# drop short name col
citibike_gdf = citibike_gdf.drop('short_name', axis=1)

# check nulls now (should be: end_station_name = end_station_id)
citibike_gdf.isna().sum()

ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name     104
start_station_id       104
end_station_name      3057
end_station_id        3057
start_lat                0
start_lng                0
end_lat                352
end_lng                352
member_casual            0
geometry_start           0
geometry_end             0
start_zip                0
end_zip                352
start_city               0
start_county             0
start_state              0
end_city               352
end_county             352
end_state              352
dtype: int64

### Null End Station Name/ID

In [None]:
# look into nulls (end_station_name)
end_station_nulls = citibike_gdf[citibike_gdf['end_station_name'].isna() & citibike_gdf['end_lat'].notna()].copy()
end_station_nulls[['ride_id', 'end_station_id', 'end_station_name', 'end_lat', 'end_lng']]

Unnamed: 0,ride_id,end_station_id,end_station_name,end_lat,end_lng
106,C18374AB46D402E2,,,40.73,-74.05
110,DF52F569EAD7B3C7,,,40.72,-74.05
111,5005A0EDC82C9470,,,40.71,-74.09
113,3D7F554CA1AF23EF,,,40.71,-74.06
114,9BECE19B5DBC34FE,,,40.72,-74.07
...,...,...,...,...,...
1025893,34FBB991D156EFBC,,,40.74,-74.03
1025899,663574E54A39EEF9,,,40.72,-74.04
1025909,FEB7C71275AB0E65,,,40.74,-74.03
1025910,36FADFF6DBDD32C9,,,40.70,-74.08


In [81]:
# see number of lat/lon values that have only 2 decimal places or less
broad_end_lats = [lat for lat in end_station_nulls['start_lat'] if str(lat)[::-1].find('.') <= 2]
broad_end_lons = [lon for lon in end_station_nulls['start_lng'] if str(lon)[::-1].find('.') <= 2]

print(len(broad_end_lats))
print(len(broad_end_lons))

46
46


### Get station ID's based on lat/lng

In [45]:
# save null start station names/ids
start_station_nulls = citibike_gdf[citibike_gdf['start_station_name'].isna()].copy()

# set up gdf
start_station_nulls_gdf = gpd.GeoDataFrame(
    start_station_nulls,
    geometry='geometry_start',
    crs='EPSG:4326'
)

# reproject points to local projected crs (feet)
starts_proj = start_station_nulls_gdf .to_crs('EPSG:2263')
stations_proj = stations_gdf.to_crs('EPSG:2263')

# spatial join to find nearest station
start_station_nulls_gdf = gpd.sjoin_nearest(
    starts_proj,
    stations_proj,
    how='left',
    distance_col='dist_to_station'
)

# rename columns
start_station_nulls_gdf = start_station_nulls_gdf.rename(columns={
    'short_name': 'new_start_station_id',
    'name': 'new_start_station_name'
})

# display
start_station_nulls_gdf[['new_start_station_id', 'new_start_station_name','lat', 'lon', 'start_lat', 'start_lng', 'dist_to_station']]

Unnamed: 0,new_start_station_id,new_start_station_name,lat,lon,start_lat,start_lng,dist_to_station
36634,JC072,Morris Canal,40.712419,-74.038526,40.71,-74.04,971.444107
62036,HB611,4 St & River St,40.740814,-74.027406,40.74,-74.03,777.540570
71216,JC099,Montgomery St,40.719420,-74.050990,40.72,-74.05,346.356912
73088,HB611,4 St & River St,40.740814,-74.027406,40.74,-74.03,777.540570
74552,JC020,Baldwin at Montgomery,40.723659,-74.064194,40.72,-74.06,1768.814755
...,...,...,...,...,...,...,...
250908,JC014,Columbus Drive,40.718355,-74.038914,40.72,-74.04,670.563128
327020,JC072,Morris Canal,40.712419,-74.038526,40.71,-74.04,971.444107
432538,JC002,Paulus Hook,40.714145,-74.033552,40.71,-74.03,1802.947865
799902,JC094,Glenwood Ave,40.727551,-74.071061,40.73,-74.07,939.272537
