In [1]:
import os
import osmium
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from math import radians, cos, sin, asin, sqrt

## Load the Daily New York Data

In [2]:
# Let's load the cycling data
df = pd.read_csv("daily_cycling_data_newyork_07042025.csv")

In [3]:
# Let's quickly check the dataset
df.info()

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36461 entries, 0 to 36460
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       36461 non-null  object 
 1   date       36461 non-null  object 
 2   latitude   36461 non-null  float64
 3   longitude  36461 non-null  float64
 4   counts     36461 non-null  int64  
 5   year       36461 non-null  int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 1.7+ MB


Unnamed: 0,name,date,latitude,longitude,counts,year
0,111th St at 50th Ave,2022-05-09,40.74563,-73.8525,120,2022
1,111th St at 50th Ave,2022-05-10,40.74563,-73.8525,165,2022
2,111th St at 50th Ave,2022-05-11,40.74563,-73.8525,197,2022
3,111th St at 50th Ave,2022-05-12,40.74563,-73.8525,206,2022
4,111th St at 50th Ave,2022-05-13,40.74563,-73.8525,178,2022


## Unique Stations and Unique Year for the Bicycle Dataset

In [4]:
# And make sure that the timestamp is in the required format
df['date'] = pd.to_datetime(df['date'])

# Extract unique station-year combinations
unique_station_year = df[['name', 'latitude', 'longitude', 'year']].drop_duplicates()

# Save the sheet for future use
unique_station_year.to_csv("unique_station_year_new_york.csv", index=False)

## Max Speed Near the Counting Station

In [5]:
# Let's define the folder containing OSM files
osm_folder = "NY_osm_datasets"

# And, load the station data
station_df = pd.read_csv("unique_station_year_new_york.csv")

# And, ensure the year column is treated as an integer
station_df['year'] = station_df['year'].astype(int)

In [6]:
# Map each year to its corresponding OSM file for New York datasets
year_to_osm = {year: f"new-york-{str(year)[-2:]}0101.osm.pbf" for year in range(2015, 2025)}

In [7]:
# Now, let's define the maxspeed search radius (in meters)
MAXSPEED_RADIUS = 100  

# And also define the haversine function to calculate distances between two lat/lon points
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in meters between two lat/lon points."""
    R = 6371000  # Radius of Earth in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return R * c

In [8]:
# Let's also define a class to extract roads with maxspeed values from OSM
class MaxSpeedHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.roads = []

    def way(self, w):
        if 'maxspeed' in w.tags and w.nodes:
            try:
                maxspeed = int(w.tags['maxspeed'].split()[0])  # Extract numeric value
            except ValueError:
                return  # Skip invalid values
            
            # Ensure node location is valid before accessing
            if w.nodes[0].location.valid():
                self.roads.append({
                    'lat': w.nodes[0].location.lat,
                    'lon': w.nodes[0].location.lon,
                    'maxspeed': maxspeed
                })

def extract_maxspeed(osm_file):
    """Extract road segments with maxspeed information from a given OSM file."""
    handler = MaxSpeedHandler()

    # Enable location lookup
    handler.apply_file(osm_file, locations=True)  

    return pd.DataFrame(handler.roads)

In [9]:
# List to store maxspeed results
maxspeed_results = []

# Iterate over unique years in the station data
for year in station_df['year'].unique():
    osm_file_path = os.path.join(osm_folder, year_to_osm[year])
    
    if not os.path.exists(osm_file_path):
        print(f"Warning: OSM file for year {year} not found: {osm_file_path}")
        continue
    
    print(f"Processing maxspeed data for year {year} using {osm_file_path}...")

    # Extract maxspeed information
    maxspeed_df = extract_maxspeed(osm_file_path)

    # Iterate over stations for this year
    for index, station in station_df[station_df['year'] == year].iterrows():
        lat_s, lon_s, station_name = station['latitude'], station['longitude'], station['name']

        # Compute distances to all road segments with maxspeed
        distances = maxspeed_df.apply(lambda road: haversine(lat_s, lon_s, road['lat'], road['lon']), axis=1)

        # Get maxspeed values within 500m
        valid_maxspeeds = maxspeed_df.loc[distances <= MAXSPEED_RADIUS, 'maxspeed']

        # Get the maximum maxspeed found near the station (or NaN if none found)
        max_maxspeed = valid_maxspeeds.max() if not valid_maxspeeds.empty else None

        # Store results
        maxspeed_results.append({
            'name': station_name,
            'year': year,
            'maxspeed_near_station': max_maxspeed
        })
        

Processing maxspeed data for year 2022 using NY_osm_datasets\new-york-220101.osm.pbf...
Processing maxspeed data for year 2023 using NY_osm_datasets\new-york-230101.osm.pbf...
Processing maxspeed data for year 2024 using NY_osm_datasets\new-york-240101.osm.pbf...
Processing maxspeed data for year 2020 using NY_osm_datasets\new-york-200101.osm.pbf...
Processing maxspeed data for year 2021 using NY_osm_datasets\new-york-210101.osm.pbf...
Processing maxspeed data for year 2015 using NY_osm_datasets\new-york-150101.osm.pbf...
Processing maxspeed data for year 2016 using NY_osm_datasets\new-york-160101.osm.pbf...
Processing maxspeed data for year 2017 using NY_osm_datasets\new-york-170101.osm.pbf...
Processing maxspeed data for year 2018 using NY_osm_datasets\new-york-180101.osm.pbf...
Processing maxspeed data for year 2019 using NY_osm_datasets\new-york-190101.osm.pbf...


In [10]:
# Convert results to DataFrame
maxspeed_df = pd.DataFrame(maxspeed_results)

# Merge maxspeed info into the original station dataset
station_df = station_df.merge(maxspeed_df, on=['name', 'year'], how='left')

# Save updated dataset
station_df.to_csv("stations_with_maxspeed.csv", index=False)

# Display first few rows
print(station_df.head())

                   name   latitude  longitude  year  maxspeed_near_station
0  111th St at 50th Ave  40.745630  -73.85250  2022                   25.0
1  111th St at 50th Ave  40.745630  -73.85250  2023                   25.0
2  111th St at 50th Ave  40.745630  -73.85250  2024                   25.0
3   8th Ave at 50th St.  40.762348  -73.98612  2020                   25.0
4   8th Ave at 50th St.  40.762348  -73.98612  2021                   25.0


### Taking care of the Missing Values

In [12]:
# Let's load the stations with maxspeed data
stations_df = pd.read_csv('stations_with_maxspeed.csv')

# Sort the dataframe by station 'name' and 'year' for consistency
stations_df = stations_df.sort_values(by=['name', 'year'])

In [13]:
# Let's define the stations and their respective maxspeed values in km/h
manual_maxspeeds = {
    "Ocean Pkwy at Avenue J": 40,  
    "Prospect Park West": 20,       
    "Staten Island Ferry": 10,
    "High Bridge Bikes": 25
}

# And, check if the station has a missing maxspeed value and fill it with the provided value
for station, maxspeed in manual_maxspeeds.items():
    stations_df.loc[stations_df['name'] == station, 'maxspeed_near_station'] = maxspeed

# Now, let's apply median to the remaining missing values for other stations
stations_df['maxspeed_near_station'] = stations_df.groupby('name')['maxspeed_near_station'].transform(
    lambda group: group.fillna(group.median())  # Fill remaining missing values with median for each station
)

In [14]:
# And save the final clean dataset
stations_df.to_csv('stations_with_maxspeed_03042025.csv', index=False)

## Bicycle Lane Type Near the Counting Stations

In [15]:
# Let's define the folder containing OSM files
osm_folder = "NY_osm_datasets"

# And, load the station data
station_df = pd.read_csv("unique_station_year_new_york.csv")

# And, ensure the year column is treated as an integer
station_df['year'] = station_df['year'].astype(int)

In [16]:
# Map each year to its corresponding OSM file for New York datasets
year_to_osm = {year: f"new-york-{str(year)[-2:]}0101.osm.pbf" for year in range(2015, 2025)}

In [17]:
# Let's define the search radius (in meters)
CYCLEWAY_RADIUS = 100 

# Let's define the haversine function to calculate distances between two lat/lon points
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in meters between two lat/lon points."""
    R = 6371000  # Radius of Earth in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return R * c


In [18]:
# Let's also define a class to extract bicycle lane type with maxspeed values from OSM
class CyclewayHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.cycleways = []

    def way(self, w):
        # Extract cycleway-related ways
        if any(tag in w.tags for tag in ['cycleway', 'cycleway:left', 'cycleway:right', 'cycleway:both']):
            # Ensure node has valid location before accessing
            if w.nodes[0].location.valid():
                self.cycleways.append({
                    'lat': w.nodes[0].location.lat,
                    'lon': w.nodes[0].location.lon,
                    'bicycle_lane_type': w.tags.get('cycleway', 'unknown')  # Default to 'unknown' if missing
                })

def extract_cycleways(osm_file):
    """Extract cycleway-related data from a given OSM file with location resolution."""
    handler = CyclewayHandler()
    
    # Enable location lookup to resolve node positions
    handler.apply_file(osm_file, locations=True)

    return pd.DataFrame(handler.cycleways)


In [19]:
# List to store cycleway results
cycleway_results = []

# Iterate over unique years in the station data
for year in station_df['year'].unique():
    osm_file_path = os.path.join(osm_folder, year_to_osm[year])
    
    if not os.path.exists(osm_file_path):
        print(f"Warning: OSM file for year {year} not found: {osm_file_path}")
        continue
    
    print(f"Processing bicycle lane data for year {year} using {osm_file_path}...")

    # Extract cycleway information
    cycleway_df = extract_cycleways(osm_file_path)

    # Iterate over stations for this year
    for index, station in station_df[station_df['year'] == year].iterrows():
        lat_s, lon_s, station_name = station['latitude'], station['longitude'], station['name']

        # Compute distances to all cycleways
        distances = cycleway_df.apply(lambda way: haversine(lat_s, lon_s, way['lat'], way['lon']), axis=1)

        # Get cycleway types within 500m
        valid_cycleways = cycleway_df.loc[distances <= CYCLEWAY_RADIUS, 'bicycle_lane_type']

        # Select the most common bicycle lane type near the station
        most_common_lane = valid_cycleways.mode()[0] if not valid_cycleways.empty else "none"

        # Store results
        cycleway_results.append({
            'name': station_name,
            'year': year,
            'bicycle_lane_type': most_common_lane
        })


Processing bicycle lane data for year 2022 using NY_osm_datasets\new-york-220101.osm.pbf...
Processing bicycle lane data for year 2023 using NY_osm_datasets\new-york-230101.osm.pbf...
Processing bicycle lane data for year 2024 using NY_osm_datasets\new-york-240101.osm.pbf...
Processing bicycle lane data for year 2020 using NY_osm_datasets\new-york-200101.osm.pbf...
Processing bicycle lane data for year 2021 using NY_osm_datasets\new-york-210101.osm.pbf...
Processing bicycle lane data for year 2015 using NY_osm_datasets\new-york-150101.osm.pbf...
Processing bicycle lane data for year 2016 using NY_osm_datasets\new-york-160101.osm.pbf...
Processing bicycle lane data for year 2017 using NY_osm_datasets\new-york-170101.osm.pbf...
Processing bicycle lane data for year 2018 using NY_osm_datasets\new-york-180101.osm.pbf...
Processing bicycle lane data for year 2019 using NY_osm_datasets\new-york-190101.osm.pbf...


In [20]:
# Convert results to DataFrame
cycleway_df = pd.DataFrame(cycleway_results)

# Merge bicycle lane info into the original station dataset
station_df = station_df.merge(cycleway_df, on=['name', 'year'], how='left')

# Save updated dataset
station_df.to_csv("stations_with_bicycle_lane_type.csv", index=False)

# Display first few rows
print(station_df.head())

                   name   latitude  longitude  year bicycle_lane_type
0  111th St at 50th Ave  40.745630  -73.85250  2022           unknown
1  111th St at 50th Ave  40.745630  -73.85250  2023           unknown
2  111th St at 50th Ave  40.745630  -73.85250  2024           unknown
3   8th Ave at 50th St.  40.762348  -73.98612  2020              none
4   8th Ave at 50th St.  40.762348  -73.98612  2021              none


## Number of Shops within a radius of 0.5, 1, 2 & 5km

In [2]:
# Let's define the folder containing OSM files
osm_folder = "NY_osm_datasets"

# And, load the station data
station_df = pd.read_csv("unique_station_year_new_york.csv")

# And, ensure the year column is treated as an integer
station_df['year'] = station_df['year'].astype(int)

In [3]:
# Map each year to its corresponding OSM file for New York datasets
year_to_osm = {year: f"new-york-{str(year)[-2:]}0101.osm.pbf" for year in range(2015, 2025)}

In [4]:
# Let's define the haversine function to calculate distances between two lat/lon points
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in meters between two lat/lon points."""
    R = 6371000  # Radius of Earth in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return R * c


In [5]:
# Let's also define a class to extract number of shops from OSM
class ShopHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.shops = []

    def node(self, n):
        if 'shop' in n.tags:
            self.shops.append({'lat': n.location.lat, 'lon': n.location.lon})

def extract_shops(osm_file):
    """Extract shop locations from a given OSM file."""
    handler = ShopHandler()
    handler.apply_file(osm_file)
    return pd.DataFrame(handler.shops)


In [6]:
# Define the search radii (in meters)
radii = [500, 1000, 2000, 5000]

# Create an empty list to store results
results = []

# Iterate over unique years in the station data
for year in station_df['year'].unique():
    osm_file_path = os.path.join(osm_folder, year_to_osm[year])
    
    if not os.path.exists(osm_file_path):
        print(f"Warning: OSM file for year {year} not found: {osm_file_path}")
        continue
    
    print(f"Processing year {year} using {osm_file_path}...")
    
    # Extract shop locations for the given year
    shop_df = extract_shops(osm_file_path)
    
    # Iterate over stations for the given year
    for index, station in station_df[station_df['year'] == year].iterrows():
        lat_s, lon_s, station_name = station['latitude'], station['longitude'], station['name']
        
        # Compute distances to all shops
        distances = shop_df.apply(lambda shop: haversine(lat_s, lon_s, shop['lat'], shop['lon']), axis=1)
        
        # Count shops within each radius
        shop_counts = {f'shops_within_{r//1000}km': (distances <= r).sum() for r in radii}
        
        # Store results
        results.append({
            'name': station_name,
            'year': year,
            **shop_counts
        })


Processing year 2022 using NY_osm_datasets\new-york-220101.osm.pbf...
Processing year 2023 using NY_osm_datasets\new-york-230101.osm.pbf...
Processing year 2024 using NY_osm_datasets\new-york-240101.osm.pbf...
Processing year 2020 using NY_osm_datasets\new-york-200101.osm.pbf...
Processing year 2021 using NY_osm_datasets\new-york-210101.osm.pbf...
Processing year 2015 using NY_osm_datasets\new-york-150101.osm.pbf...
Processing year 2016 using NY_osm_datasets\new-york-160101.osm.pbf...
Processing year 2017 using NY_osm_datasets\new-york-170101.osm.pbf...
Processing year 2018 using NY_osm_datasets\new-york-180101.osm.pbf...
Processing year 2019 using NY_osm_datasets\new-york-190101.osm.pbf...


In [7]:
# Convert results to DataFrame
result_df = pd.DataFrame(results)

# Merge bicycle lane info into the original station dataset
station_df = station_df.merge(result_df, on=['name', 'year'], how='left')

# Save updated dataset
station_df.to_csv("stations_with_shops.csv", index=False)

# Display the first few rows
print(result_df.head())

                                     name  year  shops_within_0km  \
0                    111th St at 50th Ave  2022                 8   
1                     8th Ave at 50th St.  2022                86   
2               Amsterdam Ave at 86th St.  2022               100   
3  Brooklyn Bridge Bicycle Path (Roadway)  2022                66   
4               Brooklyn Bridge Bike Path  2022                40   

   shops_within_1km  shops_within_2km  shops_within_5km  
0                16               175              1130  
1               403              1187              3994  
2               191               477              2541  
3               485              1157              4475  
4               348              1135              4706  


## Number of Hotels within a radius of 0.5, 1, 2 & 5km

In [8]:
# Let's define the folder containing OSM files
osm_folder = "NY_osm_datasets"

# And, load the station data
station_df = pd.read_csv("unique_station_year_new_york.csv")

# And, ensure the year column is treated as an integer
station_df['year'] = station_df['year'].astype(int)

In [9]:
# Map each year to its corresponding OSM file for New York datasets
year_to_osm = {year: f"new-york-{str(year)[-2:]}0101.osm.pbf" for year in range(2015, 2025)}

In [10]:
# Let's define the search radii in meters
radii = [500, 1000, 2000, 5000]  # 0.5km, 1km, 2km, 5km

# And, let's define the haversine function to calculate distances
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in meters between two lat/lon points."""
    R = 6371000  # Radius of Earth in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return R * c


In [11]:
# Now, let's define a class to extract hotel locations from OSM
class HotelHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.hotels = []

    def node(self, n):
        if 'tourism' in n.tags and n.tags['tourism'] == 'hotel':
            self.hotels.append({
                'lat': n.location.lat,
                'lon': n.location.lon
            })

def extract_hotels(osm_file):
    """Extract hotel locations from a given OSM file."""
    handler = HotelHandler()
    
    # Enable location lookup
    handler.apply_file(osm_file, locations=True)

    return pd.DataFrame(handler.hotels)

In [12]:
# List to store hotel count results
hotel_results = []

# Iterate over unique years
for year in station_df['year'].unique():
    osm_file_path = os.path.join(osm_folder, year_to_osm[year])
    
    if not os.path.exists(osm_file_path):
        print(f"Warning: OSM file for year {year} not found: {osm_file_path}")
        continue
    
    print(f"Processing hotel data for year {year} using {osm_file_path}...")

    # Extract hotel locations
    hotel_df = extract_hotels(osm_file_path)

    # Iterate over stations for this year
    for index, station in station_df[station_df['year'] == year].iterrows():
        lat_s, lon_s, station_name = station['latitude'], station['longitude'], station['name']

        # Compute distances to all hotels
        distances = hotel_df.apply(lambda hotel: haversine(lat_s, lon_s, hotel['lat'], hotel['lon']), axis=1)

        # Count hotels within each radius
        hotel_counts = {f'hotels_within_{r//1000}km': (distances <= r).sum() for r in radii}

        # Store results
        hotel_results.append({
            'name': station_name,
            'year': year,
            **hotel_counts
        })


Processing hotel data for year 2022 using NY_osm_datasets\new-york-220101.osm.pbf...
Processing hotel data for year 2023 using NY_osm_datasets\new-york-230101.osm.pbf...
Processing hotel data for year 2024 using NY_osm_datasets\new-york-240101.osm.pbf...
Processing hotel data for year 2020 using NY_osm_datasets\new-york-200101.osm.pbf...
Processing hotel data for year 2021 using NY_osm_datasets\new-york-210101.osm.pbf...
Processing hotel data for year 2015 using NY_osm_datasets\new-york-150101.osm.pbf...
Processing hotel data for year 2016 using NY_osm_datasets\new-york-160101.osm.pbf...
Processing hotel data for year 2017 using NY_osm_datasets\new-york-170101.osm.pbf...
Processing hotel data for year 2018 using NY_osm_datasets\new-york-180101.osm.pbf...
Processing hotel data for year 2019 using NY_osm_datasets\new-york-190101.osm.pbf...


In [13]:
# Convert results to DataFrame
hotel_df = pd.DataFrame(hotel_results)

# Merge hotel count info into the original station dataset
station_df = station_df.merge(hotel_df, on=['name', 'year'], how='left')

# Save updated dataset
station_df.to_csv("stations_with_hotels.csv", index=False)

# Display first few rows
print(station_df.head())

                   name   latitude  longitude  year  hotels_within_0km  \
0  111th St at 50th Ave  40.745630  -73.85250  2022                  0   
1  111th St at 50th Ave  40.745630  -73.85250  2023                  0   
2  111th St at 50th Ave  40.745630  -73.85250  2024                  0   
3   8th Ave at 50th St.  40.762348  -73.98612  2020                 16   
4   8th Ave at 50th St.  40.762348  -73.98612  2021                 17   

   hotels_within_1km  hotels_within_2km  hotels_within_5km  
0                  2                  4                 15  
1                  2                  4                 17  
2                  2                  4                 17  
3                 35                 77                147  
4                 34                 74                146  


## Number of Education within a radius of 0.5, 1, 2 & 5km

In [2]:
# Let's define the folder containing OSM files
osm_folder = "NY_osm_datasets"

# And, load the station data
station_df = pd.read_csv("unique_station_year_new_york.csv")

# And, ensure the year column is treated as an integer
station_df['year'] = station_df['year'].astype(int)

In [3]:
# Map each year to its corresponding OSM file for New York datasets
year_to_osm = {year: f"new-york-{str(year)[-2:]}0101.osm.pbf" for year in range(2015, 2025)}

In [4]:
# Let's define the search radii in meters
radii = [500, 1000, 2000, 5000]  # 0.5km, 1km, 2km, 5km

# And, the haversine function to calculate distances
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in meters between two lat/lon points."""
    R = 6371000  # Radius of Earth in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return R * c


In [5]:
# And, also define a class to extract educational institutions from OSM
class EducationHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.education_centers = []

    def node(self, n):
        if 'amenity' in n.tags and n.tags['amenity'] in [
            'kindergarten', 'school', 'driving_school', 'college', 'university', 'music_school',
            'childcare', 'research_institute', 'language_school', 'dancing_school', 'sailing_school',
            'sport_school', 'boat_school', 'first_aid_school', 'art_school'
        ]:
            self.education_centers.append({
                'lat': n.location.lat,
                'lon': n.location.lon,
                'type': n.tags['amenity']  # Store the type of educational institute
            })

def extract_educational_institutes(osm_file):
    """Extract educational institution locations from a given OSM file."""
    handler = EducationHandler()
    
    # Enable location lookup
    handler.apply_file(osm_file, locations=True)

    return pd.DataFrame(handler.education_centers)


In [6]:
# List to store educational institute count results
education_results = []

# Iterate over unique years
for year in station_df['year'].unique():
    osm_file_path = os.path.join(osm_folder, year_to_osm[year])
    
    if not os.path.exists(osm_file_path):
        print(f"Warning: OSM file for year {year} not found: {osm_file_path}")
        continue
    
    print(f"Processing educational institute data for year {year} using {osm_file_path}...")

    # Extract educational institution locations
    education_df = extract_educational_institutes(osm_file_path)

    # Iterate over stations for this year
    for index, station in station_df[station_df['year'] == year].iterrows():
        lat_s, lon_s, station_name = station['latitude'], station['longitude'], station['name']

        # Compute distances to all educational institutions
        distances = education_df.apply(lambda edu: haversine(lat_s, lon_s, edu['lat'], edu['lon']), axis=1)

        # Count institutions within each radius
        education_counts = {f'education_within_{r//1000}km': (distances <= r).sum() for r in radii}

        # Store results
        education_results.append({
            'name': station_name,
            'year': year,
            **education_counts
        })


Processing educational institute data for year 2022 using NY_osm_datasets\new-york-220101.osm.pbf...
Processing educational institute data for year 2023 using NY_osm_datasets\new-york-230101.osm.pbf...
Processing educational institute data for year 2024 using NY_osm_datasets\new-york-240101.osm.pbf...
Processing educational institute data for year 2020 using NY_osm_datasets\new-york-200101.osm.pbf...
Processing educational institute data for year 2021 using NY_osm_datasets\new-york-210101.osm.pbf...
Processing educational institute data for year 2015 using NY_osm_datasets\new-york-150101.osm.pbf...
Processing educational institute data for year 2016 using NY_osm_datasets\new-york-160101.osm.pbf...
Processing educational institute data for year 2017 using NY_osm_datasets\new-york-170101.osm.pbf...
Processing educational institute data for year 2018 using NY_osm_datasets\new-york-180101.osm.pbf...
Processing educational institute data for year 2019 using NY_osm_datasets\new-york-190101.o

In [7]:
# Convert results to DataFrame
education_df = pd.DataFrame(education_results)

# Merge educational institute count info into the original station dataset
station_df = station_df.merge(education_df, on=['name', 'year'], how='left')

# Save updated dataset
station_df.to_csv("stations_with_education_counts.csv", index=False)

# Display first few rows
print(station_df.head())

                   name   latitude  longitude  year  education_within_0km  \
0  111th St at 50th Ave  40.745630  -73.85250  2022                     0   
1  111th St at 50th Ave  40.745630  -73.85250  2023                     2   
2  111th St at 50th Ave  40.745630  -73.85250  2024                     2   
3   8th Ave at 50th St.  40.762348  -73.98612  2020                     2   
4   8th Ave at 50th St.  40.762348  -73.98612  2021                     2   

   education_within_1km  education_within_2km  education_within_5km  
0                     0                     1                    55  
1                     2                     7                    66  
2                     2                     8                    75  
3                    12                    39                   194  
4                    13                    46                   203  


## Number of Hospitals within a radius of 0.5, 1, 2 & 5km

In [8]:
# Let's define the folder containing OSM files
osm_folder = "NY_osm_datasets"

# And, load the station data
station_df = pd.read_csv("unique_station_year_new_york.csv")

# And, ensure the year column is treated as an integer
station_df['year'] = station_df['year'].astype(int)

In [9]:
# Map each year to its corresponding OSM file for New York datasets
year_to_osm = {year: f"new-york-{str(year)[-2:]}0101.osm.pbf" for year in range(2015, 2025)}

In [10]:
# Define the search radii in meters
radii = [500, 1000, 2000, 5000]  # 0.5km, 1km, 2km, 5km

### Haversine function to calculate distances
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in meters between two lat/lon points."""
    R = 6371000  # Radius of Earth in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return R * c


In [11]:
### Define a class to extract hospital-related data from OSM
class HospitalHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.hospitals = []

    def node(self, n):
        if ('amenity' in n.tags and n.tags['amenity'] in [
            'hospital', 'clinic', 'doctor', 'medical_center'
        ]) or ('healthcare' in n.tags and n.tags['healthcare'] in [
            'hospital', 'clinic', 'doctor', 'health_center', 'pharmacy'
        ]):
            self.hospitals.append({
                'lat': n.location.lat,
                'lon': n.location.lon,
                'type': n.tags.get('amenity', n.tags.get('healthcare', 'unknown'))  # Store the type of hospital
            })

def extract_hospitals(osm_file):
    """Extract hospital locations from a given OSM file."""
    handler = HospitalHandler()
    
    # Enable location lookup
    handler.apply_file(osm_file, locations=True)

    return pd.DataFrame(handler.hospitals)


In [12]:
# List to store hospital count results
hospital_results = []

# Iterate over unique years
for year in station_df['year'].unique():
    osm_file_path = os.path.join(osm_folder, year_to_osm[year])
    
    if not os.path.exists(osm_file_path):
        print(f"Warning: OSM file for year {year} not found: {osm_file_path}")
        continue
    
    print(f"Processing hospital data for year {year} using {osm_file_path}...")

    # Extract hospital locations
    hospital_df = extract_hospitals(osm_file_path)

    # Iterate over stations for this year
    for index, station in station_df[station_df['year'] == year].iterrows():
        lat_s, lon_s, station_name = station['latitude'], station['longitude'], station['name']

        # Compute distances to all hospitals
        distances = hospital_df.apply(lambda hosp: haversine(lat_s, lon_s, hosp['lat'], hosp['lon']), axis=1)

        # Count hospitals within each radius
        hospital_counts = {f'hospitals_within_{r//1000}km': (distances <= r).sum() for r in radii}

        # Store results
        hospital_results.append({
            'name': station_name,
            'year': year,
            **hospital_counts
        })


Processing hospital data for year 2022 using NY_osm_datasets\new-york-220101.osm.pbf...
Processing hospital data for year 2023 using NY_osm_datasets\new-york-230101.osm.pbf...
Processing hospital data for year 2024 using NY_osm_datasets\new-york-240101.osm.pbf...
Processing hospital data for year 2020 using NY_osm_datasets\new-york-200101.osm.pbf...
Processing hospital data for year 2021 using NY_osm_datasets\new-york-210101.osm.pbf...
Processing hospital data for year 2015 using NY_osm_datasets\new-york-150101.osm.pbf...
Processing hospital data for year 2016 using NY_osm_datasets\new-york-160101.osm.pbf...
Processing hospital data for year 2017 using NY_osm_datasets\new-york-170101.osm.pbf...
Processing hospital data for year 2018 using NY_osm_datasets\new-york-180101.osm.pbf...
Processing hospital data for year 2019 using NY_osm_datasets\new-york-190101.osm.pbf...


In [13]:
# Convert results to DataFrame
hospital_df = pd.DataFrame(hospital_results)

# Merge hospital count info into the original station dataset
station_df = station_df.merge(hospital_df, on=['name', 'year'], how='left')

# Save updated dataset
station_df.to_csv("stations_with_hospitals.csv", index=False)

# Display first few rows
print(station_df.head())

                   name   latitude  longitude  year  hospitals_within_0km  \
0  111th St at 50th Ave  40.745630  -73.85250  2022                     0   
1  111th St at 50th Ave  40.745630  -73.85250  2023                     0   
2  111th St at 50th Ave  40.745630  -73.85250  2024                     0   
3   8th Ave at 50th St.  40.762348  -73.98612  2020                     4   
4   8th Ave at 50th St.  40.762348  -73.98612  2021                     4   

   hospitals_within_1km  hospitals_within_2km  hospitals_within_5km  
0                     0                    26                   149  
1                     2                    32                   178  
2                     2                    35                   221  
3                    24                    69                   247  
4                    25                    83                   297  


## Distance to the City Centre

In [15]:
# Let's load the cycling dataset
df = pd.read_csv("daily_cycling_data_newyork_07042025.csv")

In [16]:
# Let's add the distance to city center variable to the dataset
# Let's first define the coordinates for the city center - Midtown Manhattan
city_center = (40.7549, -73.9840)

# Compute distance (in km) to city center
df['distance_to_center_km'] = df.apply(
    lambda row: geodesic(city_center, (row['latitude'], row['longitude'])).km,
    axis=1
)

In [17]:
# And save the cycling dataset
df.to_csv("daily_cycling_data_newyork_07042025.csv", index=False)

## Merge all the Infrastructure Files

In [18]:
# Let's load each of the files
df_speed = pd.read_csv('stations_with_maxspeed_03042025.csv')
df_lane_type = pd.read_csv('stations_with_bicycle_lane_type.csv')
df_shops = pd.read_csv('stations_with_shops.csv')
df_hotels = pd.read_csv('stations_with_hotels.csv')
df_edu = pd.read_csv('stations_with_education_counts.csv')
df_hospitals = pd.read_csv('stations_with_hospitals.csv')

In [19]:
# Now, let's define common merge keys
merge_keys = ['name', 'latitude', 'longitude', 'year']

# And, merge all dataframes one by one on common keys
df_merged = df_speed.merge(df_lane_type, on=merge_keys, how='outer') \
                    .merge(df_shops, on=merge_keys, how='outer') \
                    .merge(df_hotels, on=merge_keys, how='outer') \
                    .merge(df_edu, on=merge_keys, how='outer') \
                    .merge(df_hospitals, on=merge_keys, how='outer')

# And, finally save to a new CSV file
df_merged.to_csv('infrastructure_data_newyork.csv', index=False)

In [20]:
# Now, let's merge with the final bicycle dataset
# Let's load the infrastructure data
infra_df = pd.read_csv('infrastructure_data_newyork.csv')

# Now, let's load the cycling data
cycling_df = pd.read_csv('daily_cycling_data_newyork_07042025.csv')

In [21]:
# before the merge process, let's add the year column to the bicycle data
cycling_df['date'] = pd.to_datetime(cycling_df['date'])
cycling_df['year'] = cycling_df['date'].dt.year

In [22]:
cycling_df.head()

Unnamed: 0,name,date,latitude,longitude,counts,year,distance_to_center_km
0,111th St at 50th Ave,2022-05-09,40.74563,-73.8525,120,2022,11.153039
1,111th St at 50th Ave,2022-05-10,40.74563,-73.8525,165,2022,11.153039
2,111th St at 50th Ave,2022-05-11,40.74563,-73.8525,197,2022,11.153039
3,111th St at 50th Ave,2022-05-12,40.74563,-73.8525,206,2022,11.153039
4,111th St at 50th Ave,2022-05-13,40.74563,-73.8525,178,2022,11.153039


In [23]:
# And, merge on common keys
merge_keys = ['name', 'latitude', 'longitude', 'year']
merged_df = cycling_df.merge(infra_df, on=merge_keys, how='left')

In [24]:
merged_df.head()

Unnamed: 0,name,date,latitude,longitude,counts,year,distance_to_center_km,maxspeed_near_station,bicycle_lane_type,shops_within_0km,...,hotels_within_2km,hotels_within_5km,education_within_0km,education_within_1km,education_within_2km,education_within_5km,hospitals_within_0km,hospitals_within_1km,hospitals_within_2km,hospitals_within_5km
0,111th St at 50th Ave,2022-05-09,40.74563,-73.8525,120,2022,11.153039,25.0,unknown,8,...,4,15,0,0,1,55,0,0,26,149
1,111th St at 50th Ave,2022-05-10,40.74563,-73.8525,165,2022,11.153039,25.0,unknown,8,...,4,15,0,0,1,55,0,0,26,149
2,111th St at 50th Ave,2022-05-11,40.74563,-73.8525,197,2022,11.153039,25.0,unknown,8,...,4,15,0,0,1,55,0,0,26,149
3,111th St at 50th Ave,2022-05-12,40.74563,-73.8525,206,2022,11.153039,25.0,unknown,8,...,4,15,0,0,1,55,0,0,26,149
4,111th St at 50th Ave,2022-05-13,40.74563,-73.8525,178,2022,11.153039,25.0,unknown,8,...,4,15,0,0,1,55,0,0,26,149


In [25]:
# And, save the dataset with infra data
merged_df.to_csv('cycle_infra_data_newyork_04042025.csv', index=False)

In [26]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36461 entries, 0 to 36460
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   name                   36461 non-null  object        
 1   date                   36461 non-null  datetime64[ns]
 2   latitude               36461 non-null  float64       
 3   longitude              36461 non-null  float64       
 4   counts                 36461 non-null  int64         
 5   year                   36461 non-null  int32         
 6   distance_to_center_km  36461 non-null  float64       
 7   maxspeed_near_station  36461 non-null  float64       
 8   bicycle_lane_type      36461 non-null  object        
 9   shops_within_0km       36461 non-null  int64         
 10  shops_within_1km       36461 non-null  int64         
 11  shops_within_2km       36461 non-null  int64         
 12  shops_within_5km       36461 non-null  int64         
 13  h