In [1]:
import os
import osmium
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt

## Distance to the City Center

In [2]:
# Let's load the cycling data
df = pd.read_csv("cycling_data_berlin_07032025.csv")

In [3]:
# Let's define the city center coordinates - Alexanderplatz
city_center = (52.5220, 13.4133)

In [4]:
# Now, let's take out the unique stations
unique_stations = df[['station_name', 'latitude', 'longitude']].drop_duplicates()

In [5]:
# Now, let's define a function to calculate distance
def calculate_distance_to_center(lat, lon, center=city_center):
    return geodesic((lat, lon), center).kilometers

# And, apply the function to each station
unique_stations['distance_to_center'] = unique_stations.apply(
    lambda row: calculate_distance_to_center(row['latitude'], row['longitude']), axis=1
)

In [6]:
# And, now merge back to the original dataset
df = df.merge(unique_stations, on=['station_name', 'latitude', 'longitude'], how='left')

In [7]:
# Let's check how the dataset looks now
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1248009 entries, 0 to 1248008
Data columns (total 6 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   timestamp           1248009 non-null  object 
 1   station_name        1248009 non-null  object 
 2   latitude            1248009 non-null  float64
 3   longitude           1248009 non-null  float64
 4   cycling_volume      1248009 non-null  float64
 5   distance_to_center  1248009 non-null  float64
dtypes: float64(4), object(2)
memory usage: 57.1+ MB


In [8]:
df.head()

Unnamed: 0,timestamp,station_name,latitude,longitude,cycling_volume,distance_to_center
0,2015-01-01 00:00:00,Schwedter Steg,52.549072,13.400367,8.0,3.137726
1,2015-01-01 01:00:00,Schwedter Steg,52.549072,13.400367,10.0,3.137726
2,2015-01-01 02:00:00,Schwedter Steg,52.549072,13.400367,8.0,3.137726
3,2015-01-01 03:00:00,Schwedter Steg,52.549072,13.400367,6.0,3.137726
4,2015-01-01 04:00:00,Schwedter Steg,52.549072,13.400367,6.0,3.137726


In [9]:
# And, save the updated dataset
df.to_csv("cycling_data_berlin_08032025.csv", index=False)

## Unique Stations and Unique Year for the Bicycle Dataset

In [2]:
# Let's load the bicycle data
data = pd.read_csv("cycling_data_berlin_08032025.csv")

In [3]:
# And make sure that the timestamp is in the required format
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Now, let's add the year column to the dataset
data['year'] = data['timestamp'].dt.year

# Extract unique station-year combinations
unique_station_year = data[['station_name', 'latitude', 'longitude', 'year']].drop_duplicates()

# Save the sheet for future use
unique_station_year.to_csv("unique_station_year_berlin.csv", index=False)

## Max Speed Near the Counting Station

In [4]:
# Let's define the folder containing OSM files
osm_folder = "berlin_osm_datasets"

# And, load the station data
station_df = pd.read_csv("unique_station_year_berlin.csv")

# And, ensure the year column is treated as an integer
station_df['year'] = station_df['year'].astype(int)

In [5]:
# Map each year to its corresponding OSM file
year_to_osm = {year: f"berlin-{str(year)[-2:]}0101.osm.pbf" for year in range(2015, 2024)}

In [6]:
# Now, let's define the maxspeed search radius (in meters)
MAXSPEED_RADIUS = 100  

# And also define the haversine function to calculate distances between two lat/lon points
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in meters between two lat/lon points."""
    R = 6371000  # Radius of Earth in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return R * c


In [9]:
# Let's also define a class to extract roads with maxspeed values from OSM
class MaxSpeedHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.roads = []

    def way(self, w):
        if 'maxspeed' in w.tags and w.nodes:
            try:
                maxspeed = int(w.tags['maxspeed'].split()[0])  # Extract numeric value
            except ValueError:
                return  # Skip invalid values
            
            # Ensure node location is valid before accessing
            if w.nodes[0].location.valid():
                self.roads.append({
                    'lat': w.nodes[0].location.lat,
                    'lon': w.nodes[0].location.lon,
                    'maxspeed': maxspeed
                })

def extract_maxspeed(osm_file):
    """Extract road segments with maxspeed information from a given OSM file."""
    handler = MaxSpeedHandler()

    # Enable location lookup
    handler.apply_file(osm_file, locations=True)  

    return pd.DataFrame(handler.roads)

In [10]:
# List to store maxspeed results
maxspeed_results = []

# Iterate over unique years in the station data
for year in station_df['year'].unique():
    osm_file_path = os.path.join(osm_folder, year_to_osm[year])
    
    if not os.path.exists(osm_file_path):
        print(f"Warning: OSM file for year {year} not found: {osm_file_path}")
        continue
    
    print(f"Processing maxspeed data for year {year} using {osm_file_path}...")

    # Extract maxspeed information
    maxspeed_df = extract_maxspeed(osm_file_path)

    # Iterate over stations for this year
    for index, station in station_df[station_df['year'] == year].iterrows():
        lat_s, lon_s, station_name = station['latitude'], station['longitude'], station['station_name']

        # Compute distances to all road segments with maxspeed
        distances = maxspeed_df.apply(lambda road: haversine(lat_s, lon_s, road['lat'], road['lon']), axis=1)

        # Get maxspeed values within 500m
        valid_maxspeeds = maxspeed_df.loc[distances <= MAXSPEED_RADIUS, 'maxspeed']

        # Get the maximum maxspeed found near the station (or NaN if none found)
        max_maxspeed = valid_maxspeeds.max() if not valid_maxspeeds.empty else None

        # Store results
        maxspeed_results.append({
            'station_name': station_name,
            'year': year,
            'maxspeed_near_station': max_maxspeed
        })
        

Processing maxspeed data for year 2015 using berlin_osm_datasets\berlin-150101.osm.pbf...
Processing maxspeed data for year 2016 using berlin_osm_datasets\berlin-160101.osm.pbf...
Processing maxspeed data for year 2017 using berlin_osm_datasets\berlin-170101.osm.pbf...
Processing maxspeed data for year 2018 using berlin_osm_datasets\berlin-180101.osm.pbf...
Processing maxspeed data for year 2019 using berlin_osm_datasets\berlin-190101.osm.pbf...
Processing maxspeed data for year 2020 using berlin_osm_datasets\berlin-200101.osm.pbf...
Processing maxspeed data for year 2021 using berlin_osm_datasets\berlin-210101.osm.pbf...
Processing maxspeed data for year 2022 using berlin_osm_datasets\berlin-220101.osm.pbf...
Processing maxspeed data for year 2023 using berlin_osm_datasets\berlin-230101.osm.pbf...


In [11]:
# Convert results to DataFrame
maxspeed_df = pd.DataFrame(maxspeed_results)

# Merge maxspeed info into the original station dataset
station_df = station_df.merge(maxspeed_df, on=['station_name', 'year'], how='left')

# Save updated dataset
station_df.to_csv("stations_with_maxspeed.csv", index=False)

# Display first few rows
print(station_df.head())

          station_name   latitude  longitude  year  maxspeed_near_station
0       Schwedter Steg  52.549072  13.400367  2015                   70.0
1      Jannowitzbrücke  52.513936  13.417722  2015                   80.0
2  Prinzregentenstraße  52.488136  13.333120  2015                   50.0
3          Yorckstraße  52.492110  13.373341  2015                  120.0
4           Markstraße  52.558190  13.364944  2015                   50.0


## Bicycle Lane Type Near the Counting Stations

In [12]:
# Let's define the folder containing OSM files
osm_folder = "berlin_osm_datasets"

# And, load the station data
station_df = pd.read_csv("unique_station_year_berlin.csv")

# And, ensure the year column is treated as an integer
station_df['year'] = station_df['year'].astype(int)

In [13]:
# Map each year to its corresponding OSM file
year_to_osm = {year: f"berlin-{str(year)[-2:]}0101.osm.pbf" for year in range(2015, 2024)}

In [14]:
# Let's define the search radius (in meters)
CYCLEWAY_RADIUS = 100 

# Let's define the haversine function to calculate distances between two lat/lon points
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in meters between two lat/lon points."""
    R = 6371000  # Radius of Earth in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return R * c


In [15]:
# Let's also define a class to extract bicycle lane type with maxspeed values from OSM
class CyclewayHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.cycleways = []

    def way(self, w):
        # Extract cycleway-related ways
        if any(tag in w.tags for tag in ['cycleway', 'cycleway:left', 'cycleway:right', 'cycleway:both']):
            # Ensure node has valid location before accessing
            if w.nodes[0].location.valid():
                self.cycleways.append({
                    'lat': w.nodes[0].location.lat,
                    'lon': w.nodes[0].location.lon,
                    'bicycle_lane_type': w.tags.get('cycleway', 'unknown')  # Default to 'unknown' if missing
                })

def extract_cycleways(osm_file):
    """Extract cycleway-related data from a given OSM file with location resolution."""
    handler = CyclewayHandler()
    
    # Enable location lookup to resolve node positions
    handler.apply_file(osm_file, locations=True)

    return pd.DataFrame(handler.cycleways)


In [16]:
# List to store cycleway results
cycleway_results = []

# Iterate over unique years in the station data
for year in station_df['year'].unique():
    osm_file_path = os.path.join(osm_folder, year_to_osm[year])
    
    if not os.path.exists(osm_file_path):
        print(f"Warning: OSM file for year {year} not found: {osm_file_path}")
        continue
    
    print(f"Processing bicycle lane data for year {year} using {osm_file_path}...")

    # Extract cycleway information
    cycleway_df = extract_cycleways(osm_file_path)

    # Iterate over stations for this year
    for index, station in station_df[station_df['year'] == year].iterrows():
        lat_s, lon_s, station_name = station['latitude'], station['longitude'], station['station_name']

        # Compute distances to all cycleways
        distances = cycleway_df.apply(lambda way: haversine(lat_s, lon_s, way['lat'], way['lon']), axis=1)

        # Get cycleway types within 500m
        valid_cycleways = cycleway_df.loc[distances <= CYCLEWAY_RADIUS, 'bicycle_lane_type']

        # Select the most common bicycle lane type near the station
        most_common_lane = valid_cycleways.mode()[0] if not valid_cycleways.empty else "none"

        # Store results
        cycleway_results.append({
            'station_name': station_name,
            'year': year,
            'bicycle_lane_type': most_common_lane
        })


Processing bicycle lane data for year 2015 using berlin_osm_datasets\berlin-150101.osm.pbf...
Processing bicycle lane data for year 2016 using berlin_osm_datasets\berlin-160101.osm.pbf...
Processing bicycle lane data for year 2017 using berlin_osm_datasets\berlin-170101.osm.pbf...
Processing bicycle lane data for year 2018 using berlin_osm_datasets\berlin-180101.osm.pbf...
Processing bicycle lane data for year 2019 using berlin_osm_datasets\berlin-190101.osm.pbf...
Processing bicycle lane data for year 2020 using berlin_osm_datasets\berlin-200101.osm.pbf...
Processing bicycle lane data for year 2021 using berlin_osm_datasets\berlin-210101.osm.pbf...
Processing bicycle lane data for year 2022 using berlin_osm_datasets\berlin-220101.osm.pbf...
Processing bicycle lane data for year 2023 using berlin_osm_datasets\berlin-230101.osm.pbf...


In [17]:
# Convert results to DataFrame
cycleway_df = pd.DataFrame(cycleway_results)

# Merge bicycle lane info into the original station dataset
station_df = station_df.merge(cycleway_df, on=['station_name', 'year'], how='left')

# Save updated dataset
station_df.to_csv("stations_with_bicycle_lane_type.csv", index=False)

# Display first few rows
print(station_df.head())

          station_name   latitude  longitude  year bicycle_lane_type
0       Schwedter Steg  52.549072  13.400367  2015              none
1      Jannowitzbrücke  52.513936  13.417722  2015              lane
2  Prinzregentenstraße  52.488136  13.333120  2015              lane
3          Yorckstraße  52.492110  13.373341  2015              none
4           Markstraße  52.558190  13.364944  2015             track


## Number of Shops within a radius of 0.5, 1, 2 & 5km

In [18]:
# Let's define the folder containing OSM files
osm_folder = "berlin_osm_datasets"

# And, load the station data
station_df = pd.read_csv("unique_station_year_berlin.csv")

# And, ensure the year column is treated as an integer
station_df['year'] = station_df['year'].astype(int)

In [19]:
# Map each year to its corresponding OSM file
year_to_osm = {year: f"berlin-{str(year)[-2:]}0101.osm.pbf" for year in range(2015, 2024)}

In [20]:
# Let's define the haversine function to calculate distances between two lat/lon points
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in meters between two lat/lon points."""
    R = 6371000  # Radius of Earth in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return R * c


In [21]:
# Let's also define a class to extract number of shops from OSM
class ShopHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.shops = []

    def node(self, n):
        if 'shop' in n.tags:
            self.shops.append({'lat': n.location.lat, 'lon': n.location.lon})

def extract_shops(osm_file):
    """Extract shop locations from a given OSM file."""
    handler = ShopHandler()
    handler.apply_file(osm_file)
    return pd.DataFrame(handler.shops)


In [22]:
# Define the search radii (in meters)
radii = [500, 1000, 2000, 5000]

# Create an empty list to store results
results = []

# Iterate over unique years in the station data
for year in station_df['year'].unique():
    osm_file_path = os.path.join(osm_folder, year_to_osm[year])
    
    if not os.path.exists(osm_file_path):
        print(f"Warning: OSM file for year {year} not found: {osm_file_path}")
        continue
    
    print(f"Processing year {year} using {osm_file_path}...")
    
    # Extract shop locations for the given year
    shop_df = extract_shops(osm_file_path)
    
    # Iterate over stations for the given year
    for index, station in station_df[station_df['year'] == year].iterrows():
        lat_s, lon_s, station_name = station['latitude'], station['longitude'], station['station_name']
        
        # Compute distances to all shops
        distances = shop_df.apply(lambda shop: haversine(lat_s, lon_s, shop['lat'], shop['lon']), axis=1)
        
        # Count shops within each radius
        shop_counts = {f'shops_within_{r//1000}km': (distances <= r).sum() for r in radii}
        
        # Store results
        results.append({
            'station_name': station_name,
            'year': year,
            **shop_counts
        })


Processing year 2015 using berlin_osm_datasets\berlin-150101.osm.pbf...
Processing year 2016 using berlin_osm_datasets\berlin-160101.osm.pbf...
Processing year 2017 using berlin_osm_datasets\berlin-170101.osm.pbf...
Processing year 2018 using berlin_osm_datasets\berlin-180101.osm.pbf...
Processing year 2019 using berlin_osm_datasets\berlin-190101.osm.pbf...
Processing year 2020 using berlin_osm_datasets\berlin-200101.osm.pbf...
Processing year 2021 using berlin_osm_datasets\berlin-210101.osm.pbf...
Processing year 2022 using berlin_osm_datasets\berlin-220101.osm.pbf...
Processing year 2023 using berlin_osm_datasets\berlin-230101.osm.pbf...


In [23]:
# Convert results to DataFrame
result_df = pd.DataFrame(results)

# Merge bicycle lane info into the original station dataset
station_df = station_df.merge(result_df, on=['station_name', 'year'], how='left')

# Save updated dataset
station_df.to_csv("stations_with_shops.csv", index=False)

# Display the first few rows
print(result_df.head())

          station_name  year  shops_within_0km  shops_within_1km  \
0       Schwedter Steg  2015                 6               162   
1      Jannowitzbrücke  2015                31               100   
2  Prinzregentenstraße  2015                43               187   
3          Yorckstraße  2015                31               155   
4           Markstraße  2015                 7                93   

   shops_within_2km  shops_within_5km  
0               743              2421  
1               759              3729  
2              1141              3386  
3               676              4073  
4               484              2091  


## Number of Hotels within a radius of 0.5, 1, 2 & 5km

In [24]:
# Let's define the folder containing OSM files
osm_folder = "berlin_osm_datasets"

# And, load the station data
station_df = pd.read_csv("unique_station_year_berlin.csv")

# And, ensure the year column is treated as an integer
station_df['year'] = station_df['year'].astype(int)

In [25]:
# Map each year to its corresponding OSM file
year_to_osm = {year: f"berlin-{str(year)[-2:]}0101.osm.pbf" for year in range(2015, 2024)}

In [26]:
# Let's define the search radii in meters
radii = [500, 1000, 2000, 5000]  # 0.5km, 1km, 2km, 5km

# And, let's define the haversine function to calculate distances
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in meters between two lat/lon points."""
    R = 6371000  # Radius of Earth in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return R * c


In [27]:
# Now, let's define a class to extract hotel locations from OSM
class HotelHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.hotels = []

    def node(self, n):
        if 'tourism' in n.tags and n.tags['tourism'] == 'hotel':
            self.hotels.append({
                'lat': n.location.lat,
                'lon': n.location.lon
            })

def extract_hotels(osm_file):
    """Extract hotel locations from a given OSM file."""
    handler = HotelHandler()
    
    # Enable location lookup
    handler.apply_file(osm_file, locations=True)

    return pd.DataFrame(handler.hotels)

In [28]:
# List to store hotel count results
hotel_results = []

# Iterate over unique years
for year in station_df['year'].unique():
    osm_file_path = os.path.join(osm_folder, year_to_osm[year])
    
    if not os.path.exists(osm_file_path):
        print(f"Warning: OSM file for year {year} not found: {osm_file_path}")
        continue
    
    print(f"Processing hotel data for year {year} using {osm_file_path}...")

    # Extract hotel locations
    hotel_df = extract_hotels(osm_file_path)

    # Iterate over stations for this year
    for index, station in station_df[station_df['year'] == year].iterrows():
        lat_s, lon_s, station_name = station['latitude'], station['longitude'], station['station_name']

        # Compute distances to all hotels
        distances = hotel_df.apply(lambda hotel: haversine(lat_s, lon_s, hotel['lat'], hotel['lon']), axis=1)

        # Count hotels within each radius
        hotel_counts = {f'hotels_within_{r//1000}km': (distances <= r).sum() for r in radii}

        # Store results
        hotel_results.append({
            'station_name': station_name,
            'year': year,
            **hotel_counts
        })


Processing hotel data for year 2015 using berlin_osm_datasets\berlin-150101.osm.pbf...
Processing hotel data for year 2016 using berlin_osm_datasets\berlin-160101.osm.pbf...
Processing hotel data for year 2017 using berlin_osm_datasets\berlin-170101.osm.pbf...
Processing hotel data for year 2018 using berlin_osm_datasets\berlin-180101.osm.pbf...
Processing hotel data for year 2019 using berlin_osm_datasets\berlin-190101.osm.pbf...
Processing hotel data for year 2020 using berlin_osm_datasets\berlin-200101.osm.pbf...
Processing hotel data for year 2021 using berlin_osm_datasets\berlin-210101.osm.pbf...
Processing hotel data for year 2022 using berlin_osm_datasets\berlin-220101.osm.pbf...
Processing hotel data for year 2023 using berlin_osm_datasets\berlin-230101.osm.pbf...


In [29]:
# Convert results to DataFrame
hotel_df = pd.DataFrame(hotel_results)

# Merge hotel count info into the original station dataset
station_df = station_df.merge(hotel_df, on=['station_name', 'year'], how='left')

# Save updated dataset
station_df.to_csv("stations_with_hotels.csv", index=False)

# Display first few rows
print(station_df.head())

          station_name   latitude  longitude  year  hotels_within_0km  \
0       Schwedter Steg  52.549072  13.400367  2015                  0   
1      Jannowitzbrücke  52.513936  13.417722  2015                  3   
2  Prinzregentenstraße  52.488136  13.333120  2015                  2   
3          Yorckstraße  52.492110  13.373341  2015                  0   
4           Markstraße  52.558190  13.364944  2015                  0   

   hotels_within_1km  hotels_within_2km  hotels_within_5km  
0                  4                 11                115  
1                  8                 46                168  
2                  4                 81                199  
3                 10                 38                242  
4                  0                  2                 77  


## Number of Education within a radius of 0.5, 1, 2 & 5km

In [2]:
# Let's define the folder containing OSM files
osm_folder = "berlin_osm_datasets"

# And, load the station data
station_df = pd.read_csv("unique_station_year_berlin.csv")

# And, ensure the year column is treated as an integer
station_df['year'] = station_df['year'].astype(int)

In [3]:
# Map each year to its corresponding OSM file
year_to_osm = {year: f"berlin-{str(year)[-2:]}0101.osm.pbf" for year in range(2015, 2024)}

In [4]:
# Let's define the search radii in meters
radii = [500, 1000, 2000, 5000]  # 0.5km, 1km, 2km, 5km

# And, the haversine function to calculate distances
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in meters between two lat/lon points."""
    R = 6371000  # Radius of Earth in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return R * c


In [5]:
# And, also define a class to extract educational institutions from OSM
class EducationHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.education_centers = []

    def node(self, n):
        if 'amenity' in n.tags and n.tags['amenity'] in [
            'kindergarten', 'school', 'driving_school', 'college', 'university', 'music_school',
            'childcare', 'research_institute', 'language_school', 'dancing_school', 'sailing_school',
            'sport_school', 'boat_school', 'first_aid_school', 'art_school'
        ]:
            self.education_centers.append({
                'lat': n.location.lat,
                'lon': n.location.lon,
                'type': n.tags['amenity']  # Store the type of educational institute
            })

def extract_educational_institutes(osm_file):
    """Extract educational institution locations from a given OSM file."""
    handler = EducationHandler()
    
    # Enable location lookup
    handler.apply_file(osm_file, locations=True)

    return pd.DataFrame(handler.education_centers)


In [6]:
# List to store educational institute count results
education_results = []

# Iterate over unique years
for year in station_df['year'].unique():
    osm_file_path = os.path.join(osm_folder, year_to_osm[year])
    
    if not os.path.exists(osm_file_path):
        print(f"Warning: OSM file for year {year} not found: {osm_file_path}")
        continue
    
    print(f"Processing educational institute data for year {year} using {osm_file_path}...")

    # Extract educational institution locations
    education_df = extract_educational_institutes(osm_file_path)

    # Iterate over stations for this year
    for index, station in station_df[station_df['year'] == year].iterrows():
        lat_s, lon_s, station_name = station['latitude'], station['longitude'], station['station_name']

        # Compute distances to all educational institutions
        distances = education_df.apply(lambda edu: haversine(lat_s, lon_s, edu['lat'], edu['lon']), axis=1)

        # Count institutions within each radius
        education_counts = {f'education_within_{r//1000}km': (distances <= r).sum() for r in radii}

        # Store results
        education_results.append({
            'station_name': station_name,
            'year': year,
            **education_counts
        })


Processing educational institute data for year 2015 using berlin_osm_datasets\berlin-150101.osm.pbf...
Processing educational institute data for year 2016 using berlin_osm_datasets\berlin-160101.osm.pbf...
Processing educational institute data for year 2017 using berlin_osm_datasets\berlin-170101.osm.pbf...
Processing educational institute data for year 2018 using berlin_osm_datasets\berlin-180101.osm.pbf...
Processing educational institute data for year 2019 using berlin_osm_datasets\berlin-190101.osm.pbf...
Processing educational institute data for year 2020 using berlin_osm_datasets\berlin-200101.osm.pbf...
Processing educational institute data for year 2021 using berlin_osm_datasets\berlin-210101.osm.pbf...
Processing educational institute data for year 2022 using berlin_osm_datasets\berlin-220101.osm.pbf...
Processing educational institute data for year 2023 using berlin_osm_datasets\berlin-230101.osm.pbf...


In [7]:
# Convert results to DataFrame
education_df = pd.DataFrame(education_results)

# Merge educational institute count info into the original station dataset
station_df = station_df.merge(education_df, on=['station_name', 'year'], how='left')

# Save updated dataset
station_df.to_csv("stations_with_education_counts.csv", index=False)

# Display first few rows
print(station_df.head())

          station_name   latitude  longitude  year  education_within_0km  \
0       Schwedter Steg  52.549072  13.400367  2015                    11   
1      Jannowitzbrücke  52.513936  13.417722  2015                     3   
2  Prinzregentenstraße  52.488136  13.333120  2015                     9   
3          Yorckstraße  52.492110  13.373341  2015                     8   
4           Markstraße  52.558190  13.364944  2015                     3   

   education_within_1km  education_within_2km  education_within_5km  
0                    30                   114                   438  
1                    13                   116                   630  
2                    59                   191                   618  
3                    32                   146                   706  
4                    16                    94                   402  


## Number of Hospitals within a radius of 0.5, 1, 2 & 5km

In [8]:
# Let's define the folder containing OSM files
osm_folder = "berlin_osm_datasets"

# And, load the station data
station_df = pd.read_csv("unique_station_year_berlin.csv")

# And, ensure the year column is treated as an integer
station_df['year'] = station_df['year'].astype(int)

In [9]:
# Map each year to its corresponding OSM file
year_to_osm = {year: f"berlin-{str(year)[-2:]}0101.osm.pbf" for year in range(2015, 2024)}

In [10]:
# Define the search radii in meters
radii = [500, 1000, 2000, 5000]  # 0.5km, 1km, 2km, 5km

### Haversine function to calculate distances
def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance in meters between two lat/lon points."""
    R = 6371000  # Radius of Earth in meters
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    return R * c


In [11]:
### Define a class to extract hospital-related data from OSM
class HospitalHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.hospitals = []

    def node(self, n):
        if ('amenity' in n.tags and n.tags['amenity'] in [
            'hospital', 'clinic', 'doctor', 'medical_center'
        ]) or ('healthcare' in n.tags and n.tags['healthcare'] in [
            'hospital', 'clinic', 'doctor', 'health_center', 'pharmacy'
        ]):
            self.hospitals.append({
                'lat': n.location.lat,
                'lon': n.location.lon,
                'type': n.tags.get('amenity', n.tags.get('healthcare', 'unknown'))  # Store the type of hospital
            })

def extract_hospitals(osm_file):
    """Extract hospital locations from a given OSM file."""
    handler = HospitalHandler()
    
    # Enable location lookup
    handler.apply_file(osm_file, locations=True)

    return pd.DataFrame(handler.hospitals)


In [12]:
# List to store hospital count results
hospital_results = []

# Iterate over unique years
for year in station_df['year'].unique():
    osm_file_path = os.path.join(osm_folder, year_to_osm[year])
    
    if not os.path.exists(osm_file_path):
        print(f"Warning: OSM file for year {year} not found: {osm_file_path}")
        continue
    
    print(f"Processing hospital data for year {year} using {osm_file_path}...")

    # Extract hospital locations
    hospital_df = extract_hospitals(osm_file_path)

    # Iterate over stations for this year
    for index, station in station_df[station_df['year'] == year].iterrows():
        lat_s, lon_s, station_name = station['latitude'], station['longitude'], station['station_name']

        # Compute distances to all hospitals
        distances = hospital_df.apply(lambda hosp: haversine(lat_s, lon_s, hosp['lat'], hosp['lon']), axis=1)

        # Count hospitals within each radius
        hospital_counts = {f'hospitals_within_{r//1000}km': (distances <= r).sum() for r in radii}

        # Store results
        hospital_results.append({
            'station_name': station_name,
            'year': year,
            **hospital_counts
        })


Processing hospital data for year 2015 using berlin_osm_datasets\berlin-150101.osm.pbf...
Processing hospital data for year 2016 using berlin_osm_datasets\berlin-160101.osm.pbf...
Processing hospital data for year 2017 using berlin_osm_datasets\berlin-170101.osm.pbf...
Processing hospital data for year 2018 using berlin_osm_datasets\berlin-180101.osm.pbf...
Processing hospital data for year 2019 using berlin_osm_datasets\berlin-190101.osm.pbf...
Processing hospital data for year 2020 using berlin_osm_datasets\berlin-200101.osm.pbf...
Processing hospital data for year 2021 using berlin_osm_datasets\berlin-210101.osm.pbf...
Processing hospital data for year 2022 using berlin_osm_datasets\berlin-220101.osm.pbf...
Processing hospital data for year 2023 using berlin_osm_datasets\berlin-230101.osm.pbf...


In [13]:
# Convert results to DataFrame
hospital_df = pd.DataFrame(hospital_results)

# Merge hospital count info into the original station dataset
station_df = station_df.merge(hospital_df, on=['station_name', 'year'], how='left')

# Save updated dataset
station_df.to_csv("stations_with_hospitals.csv", index=False)

# Display first few rows
print(station_df.head())

          station_name   latitude  longitude  year  hospitals_within_0km  \
0       Schwedter Steg  52.549072  13.400367  2015                     0   
1      Jannowitzbrücke  52.513936  13.417722  2015                     0   
2  Prinzregentenstraße  52.488136  13.333120  2015                     0   
3          Yorckstraße  52.492110  13.373341  2015                     1   
4           Markstraße  52.558190  13.364944  2015                     0   

   hospitals_within_1km  hospitals_within_2km  hospitals_within_5km  
0                     0                     0                     4  
1                     0                     0                     6  
2                     0                     2                     6  
3                     1                     3                     7  
4                     0                     1                     3  


## Merge the Files

In [2]:
# Now first let's merge all the infrastructure files
# Let's load each of the files
df_speed = pd.read_csv('stations_with_maxspeed.csv')
df_lane_type = pd.read_csv('stations_with_bicycle_lane_type.csv')
df_shops = pd.read_csv('stations_with_shops.csv')
df_hotels = pd.read_csv('stations_with_hotels.csv')
df_edu = pd.read_csv('stations_with_education_counts.csv')
df_hospitals = pd.read_csv('stations_with_hospitals.csv')

# Now, let's define common merge keys
merge_keys = ['station_name', 'latitude', 'longitude', 'year']

# And, merge all dataframes one by one on common keys
df_merged = df_speed.merge(df_lane_type, on=merge_keys, how='outer') \
                    .merge(df_shops, on=merge_keys, how='outer') \
                    .merge(df_hotels, on=merge_keys, how='outer') \
                    .merge(df_edu, on=merge_keys, how='outer') \
                    .merge(df_hospitals, on=merge_keys, how='outer')

# And, finally save to a new CSV file
df_merged.to_csv('infrastructure_data_berlin.csv', index=False)

In [3]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   station_name           150 non-null    object 
 1   latitude               150 non-null    float64
 2   longitude              150 non-null    float64
 3   year                   150 non-null    int64  
 4   maxspeed_near_station  146 non-null    float64
 5   bicycle_lane_type      150 non-null    object 
 6   shops_within_0km       150 non-null    int64  
 7   shops_within_1km       150 non-null    int64  
 8   shops_within_2km       150 non-null    int64  
 9   shops_within_5km       150 non-null    int64  
 10  hotels_within_0km      150 non-null    int64  
 11  hotels_within_1km      150 non-null    int64  
 12  hotels_within_2km      150 non-null    int64  
 13  hotels_within_5km      150 non-null    int64  
 14  education_within_0km   150 non-null    int64  
 15  educat

In [4]:
# Now, let's merge with the final bicycle dataset
# Let's load the infrastructure data
infra_df = pd.read_csv('infrastructure_data_berlin.csv')

# Now, let's load the cycling data
cycling_df = pd.read_csv('cycling_data_berlin_08032025.csv')

In [5]:
# before the merge process, let's add the year column to the bicycle data
cycling_df['timestamp'] = pd.to_datetime(cycling_df['timestamp'])
cycling_df['year'] = cycling_df['timestamp'].dt.year

In [6]:
# And, merge on common keys
merge_keys = ['station_name', 'latitude', 'longitude', 'year']
merged_df = cycling_df.merge(infra_df, on=merge_keys, how='left')

# And, save the dataset with infra data
merged_df.to_csv('cycle_infra_data_berlin.csv', index=False)

In [7]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1248009 entries, 0 to 1248008
Data columns (total 25 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   timestamp              1248009 non-null  datetime64[ns]
 1   station_name           1248009 non-null  object        
 2   latitude               1248009 non-null  float64       
 3   longitude              1248009 non-null  float64       
 4   cycling_volume         1248009 non-null  float64       
 5   distance_to_center     1248009 non-null  float64       
 6   year                   1248009 non-null  int32         
 7   maxspeed_near_station  1216569 non-null  float64       
 8   bicycle_lane_type      1248009 non-null  object        
 9   shops_within_0km       1248009 non-null  int64         
 10  shops_within_1km       1248009 non-null  int64         
 11  shops_within_2km       1248009 non-null  int64         
 12  shops_within_5km       12480

In [8]:
merged_df.head()

Unnamed: 0,timestamp,station_name,latitude,longitude,cycling_volume,distance_to_center,year,maxspeed_near_station,bicycle_lane_type,shops_within_0km,...,hotels_within_2km,hotels_within_5km,education_within_0km,education_within_1km,education_within_2km,education_within_5km,hospitals_within_0km,hospitals_within_1km,hospitals_within_2km,hospitals_within_5km
0,2015-01-01 00:00:00,Schwedter Steg,52.549072,13.400367,8.0,3.137726,2015,70.0,none,6,...,11,115,11,30,114,438,0,0,0,4
1,2015-01-01 01:00:00,Schwedter Steg,52.549072,13.400367,10.0,3.137726,2015,70.0,none,6,...,11,115,11,30,114,438,0,0,0,4
2,2015-01-01 02:00:00,Schwedter Steg,52.549072,13.400367,8.0,3.137726,2015,70.0,none,6,...,11,115,11,30,114,438,0,0,0,4
3,2015-01-01 03:00:00,Schwedter Steg,52.549072,13.400367,6.0,3.137726,2015,70.0,none,6,...,11,115,11,30,114,438,0,0,0,4
4,2015-01-01 04:00:00,Schwedter Steg,52.549072,13.400367,6.0,3.137726,2015,70.0,none,6,...,11,115,11,30,114,438,0,0,0,4
