In [1]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from rasterstats import zonal_stats

In [2]:
# Locations of the three inputs, given relative to the notebook's working directory
district_shp = "nycdwi_25a/nycdwi.shp"  # polygons spatially joined to the stations
bike_data_csv = "processed_data_newyork_10042025.csv"  # bike-count table (station name, longitude, latitude, ...)
raster_path = "Land_Cover/Land_Cover/NYC_2017_LiDAR_LandCover.img"  # categorical land-cover raster sampled per station

In [3]:
# Land-cover class names for the 2017 raster, keyed by integer pixel code.
# The list is ordered so that position n (counting from 1) is the label of code n.
class_labels_2017 = dict(
    enumerate(
        [
            "Tree Canopy",
            "Grass/Shrubs",
            "Bare Soil",
            "Water",
            "Buildings",
            "Roads",
            "Other Impervious",
            "Railroads",
        ],
        start=1,
    )
)

In [4]:
# Read the bike-count table and keep one row per distinct
# (name, longitude, latitude) combination.
bike_df = pd.read_csv(bike_data_csv)
unique_stations = bike_df[['name', 'longitude', 'latitude']].drop_duplicates()

# One point per station row, built in (longitude, latitude) = (x, y) order.
geometry = list(map(Point, unique_stations.longitude, unique_stations.latitude))

# Coordinates are declared as EPSG:4326 and then reprojected to EPSG:2263
# (original note: chosen to match the NYC raster CRS).
station_gdf = gpd.GeoDataFrame(
    unique_stations, geometry=geometry, crs="EPSG:4326"
).to_crs("EPSG:2263")

In [5]:
# Read the district polygons and put them in the same CRS as the stations.
cd_gdf = gpd.read_file(district_shp)
cd_gdf = cd_gdf.to_crs("EPSG:2263")

# Left spatial join: every station row is kept and picks up the attributes of
# the polygon it lies within; the polygon's "BoroCD" column is exposed as
# "community_district".
station_with_cd = gpd.sjoin(
    station_gdf,
    cd_gdf,
    how="left",
    predicate="within",
).rename(columns={"BoroCD": "community_district"})

In [6]:
# Create 100 ft buffer around each station.
# The buffers go into a separate GeoDataFrame instead of overwriting
# station_with_cd's geometry: overwriting made this cell non-idempotent
# (every re-run buffered the previous buffers by another 100 units).
# station_with_cd therefore keeps its point geometry.
# NOTE(review): the "ft" unit assumes EPSG:2263 is a foot-based CRS - confirm.
station_buffers = station_with_cd.copy()
station_buffers["geometry"] = station_with_cd.geometry.buffer(100)

# Extract raster stats: with categorical=True each entry of `stats` is a dict
# mapping a raster pixel value to the number of pixels counted in that
# station's buffer; all_touched=True counts every pixel the buffer touches.
stats = zonal_stats(
    station_buffers,
    raster_path,
    stats=None,
    categorical=True,
    all_touched=True,
    nodata=None
)

In [7]:
# Convert each station's per-class pixel counts into percentage shares.
# `stats` is positionally aligned with the rows of station_with_cd, so the
# station attributes are pulled out once as arrays and walked in lockstep
# (numpy arrays keep the original scalar types of the values).
station_names = station_with_cd["name"].to_numpy()
station_districts = station_with_cd["community_district"].to_numpy()

lc_df = []
for station_name, district, pixel_counts in zip(station_names, station_districts, stats):
    pixel_total = sum(pixel_counts.values())
    record = {}
    for class_code, n_pixels in pixel_counts.items():
        # Codes missing from the label table fall back to "Class_<code>".
        label = class_labels_2017.get(class_code, f"Class_{class_code}")
        record[label] = (n_pixels / pixel_total) * 100
    record["name"] = station_name
    record["community_district"] = district
    lc_df.append(record)

# One row per station; a class that never occurs in a station's buffer is NaN.
landcover_df = pd.DataFrame(lc_df)

In [8]:
# Add lat/lon back to results.
# landcover_df holds exactly one row per row of station_with_cd, in the same
# order, so the coordinates are attached by position. Merging on "name"
# instead would multiply rows whenever a station name occurs more than once
# (e.g. the same name with slightly different coordinates, or a point that
# matched several district polygons in the spatial join).
station_coords = station_with_cd[['name', 'longitude', 'latitude']].reset_index(drop=True)
assert len(station_coords) == len(landcover_df), "land-cover rows are not aligned with stations"
final_df = pd.concat([station_coords, landcover_df.drop(columns="name")], axis=1)

In [9]:
# Let's check the dataset: column names, dtypes and non-null counts.
# A land-cover column with fewer non-null entries than rows means that class
# was not found in some stations' buffers.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                19 non-null     object 
 1   longitude           19 non-null     float64
 2   latitude            19 non-null     float64
 3   Tree Canopy         16 non-null     float64
 4   Grass/Shrubs        19 non-null     float64
 5   Roads               18 non-null     float64
 6   Other Impervious    19 non-null     float64
 7   community_district  19 non-null     int32  
 8   Buildings           11 non-null     float64
 9   Railroads           2 non-null      float64
 10  Bare Soil           2 non-null      float64
 11  Water               2 non-null      float64
dtypes: float64(10), int32(1), object(1)
memory usage: 1.8+ KB


In [10]:
# Preview the first 19 station rows; NaN marks a land-cover class that was
# not found in that station's buffer.
final_df.iloc[:19]

Unnamed: 0,name,longitude,latitude,Tree Canopy,Grass/Shrubs,Roads,Other Impervious,community_district,Buildings,Railroads,Bare Soil,Water
0,111th St at 50th Ave,-73.8525,40.74563,26.826757,6.629865,37.936988,28.60639,481,,,,
1,8th Ave at 50th St.,-73.98612,40.762348,1.783522,0.856122,55.142674,21.418106,104,20.799576,,,
2,Amsterdam Ave at 86th St.,-73.97505,40.7877,1.706836,0.94965,62.307832,24.484979,107,10.550702,,,
3,Brooklyn Bridge Bicycle Path (Roadway),-74.004464,40.712656,14.005877,2.145001,29.213367,45.889407,101,8.746346,,,
4,Brooklyn Bridge Bike Path,-74.00099,40.709274,0.209082,0.37223,71.953653,27.465034,101,,,,
5,Columbus Ave at 86th St.,-73.97505,40.7877,1.706836,0.94965,62.307832,24.484979,107,10.550702,,,
6,Ed Koch Queensboro Bridge Shared Path,-73.94045,40.75101,6.75315,1.190335,46.362074,23.748089,402,17.809087,4.137265,,
7,Emmons Ave Bikes,-73.93099,40.5841,17.324537,46.809117,24.734291,9.404749,315,,,1.727306,
8,Fountain Ave,-73.862951,40.655606,10.174713,21.709276,52.607236,7.736964,305,,,2.162928,5.608883
9,High Bridge Bikes,-73.93207,40.84219,68.906119,17.666672,,13.427209,112,,,,


In [11]:
# Here, missing means 0% coverage for that class.
# Only the land-cover percentage columns are zero-filled: a blanket fillna(0)
# would also turn a missing community_district (a station that fell outside
# every district polygon) or a missing coordinate into a bogus 0.
non_class_cols = {"name", "longitude", "latitude", "community_district"}
class_cols = [col for col in final_df.columns if col not in non_class_cols]

# Rebind instead of mutating in place so the cell can be re-run safely.
final_df = final_df.fillna({col: 0 for col in class_cols})

In [12]:
# Preview the first 19 station rows again, now that missing land-cover
# percentages have been replaced by 0.
final_df.iloc[:19]

Unnamed: 0,name,longitude,latitude,Tree Canopy,Grass/Shrubs,Roads,Other Impervious,community_district,Buildings,Railroads,Bare Soil,Water
0,111th St at 50th Ave,-73.8525,40.74563,26.826757,6.629865,37.936988,28.60639,481,0.0,0.0,0.0,0.0
1,8th Ave at 50th St.,-73.98612,40.762348,1.783522,0.856122,55.142674,21.418106,104,20.799576,0.0,0.0,0.0
2,Amsterdam Ave at 86th St.,-73.97505,40.7877,1.706836,0.94965,62.307832,24.484979,107,10.550702,0.0,0.0,0.0
3,Brooklyn Bridge Bicycle Path (Roadway),-74.004464,40.712656,14.005877,2.145001,29.213367,45.889407,101,8.746346,0.0,0.0,0.0
4,Brooklyn Bridge Bike Path,-74.00099,40.709274,0.209082,0.37223,71.953653,27.465034,101,0.0,0.0,0.0,0.0
5,Columbus Ave at 86th St.,-73.97505,40.7877,1.706836,0.94965,62.307832,24.484979,107,10.550702,0.0,0.0,0.0
6,Ed Koch Queensboro Bridge Shared Path,-73.94045,40.75101,6.75315,1.190335,46.362074,23.748089,402,17.809087,4.137265,0.0,0.0
7,Emmons Ave Bikes,-73.93099,40.5841,17.324537,46.809117,24.734291,9.404749,315,0.0,0.0,1.727306,0.0
8,Fountain Ave,-73.862951,40.655606,10.174713,21.709276,52.607236,7.736964,305,0.0,0.0,2.162928,5.608883
9,High Bridge Bikes,-73.93207,40.84219,68.906119,17.666672,0.0,13.427209,112,0.0,0.0,0.0,0.0


In [13]:
# Drop community district and the coordinates from the land cover DataFrame:
# bike_df already carries latitude/longitude, so keeping them here would
# produce suffixed duplicate columns in the merge below.
# errors="ignore" keeps the cell re-runnable once the columns are gone.
final_df = final_df.drop(
    columns=["community_district", "latitude", "longitude"],
    errors="ignore",
)

# Merge with the cycling dataset: every bike record picks up the land-cover
# shares of its station. validate="many_to_one" raises if a station name
# occurs more than once in final_df, instead of silently duplicating records.
final_bike_df = bike_df.merge(final_df, on="name", how="left", validate="many_to_one")

# And, save the file
final_bike_df.to_csv("processed_data_newyork_15042025.csv", index=False)

In [14]:
# Summarize the merged table (row count, columns, dtypes, non-null counts)
# as a check on the left merge with the land-cover columns.
final_bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36461 entries, 0 to 36460
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   name                       36461 non-null  object 
 1   date                       36461 non-null  object 
 2   latitude                   36461 non-null  float64
 3   longitude                  36461 non-null  float64
 4   counts                     36461 non-null  int64  
 5   year                       36461 non-null  int64  
 6   distance_to_center_km      36461 non-null  float64
 7   maxspeed_near_station      36461 non-null  float64
 8   shops_within_0km           36461 non-null  int64  
 9   shops_within_1km           36461 non-null  int64  
 10  shops_within_2km           36461 non-null  int64  
 11  shops_within_5km           36461 non-null  int64  
 12  hotels_within_0km          36461 non-null  int64  
 13  hotels_within_1km          36461 non-null  int