# Get Data

This notebook is the crucial first step in creating a graph network of linear infrastructures and natural corridors within a specific area. Here, you will define the area you want to model, and we will retrieve all relevant data. It is important to save the boundaries of your area, as you will need to use exactly the same boundaries in other parts of the workflow to avoid errors.

A clear selection was made in this research regarding which types of data to retrieve, all of which are described and justified in the research report. However, this notebook gives you the flexibility to expand or reduce the data types as needed for your own purposes.

If you want to collect data for different cities or areas, you can simply rerun this notebook with a different boundary and city name, and it will retrieve all the data for you.



In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import osmnx as ox
import pandas as pd
import geopandas as gpd
import os

In [None]:
# Fixed column layout shared by every dataset in this notebook.
# Each category is reduced to exactly these columns (in this order) so the
# frames can be concatenated without mismatching schemas.
columns_to_keep = [
    'geometry',
    'name',
    'id',
    'description',
    'area_type',
    'category',
]

In [None]:
# Toggle for the optional per-category exports:
# when True, each category below is also written to its own GeoJSON file;
# when False, the per-category save cells do nothing.
save_separate_files = False

## 0. Set boundaries

Adapt the boundaries and city name to match the area from which you want to retrieve data. The three boundaries already listed are examples that are used throughout this repository and in the thesis research. The output folder for the city (`output/<city>`) is created automatically by this notebook if it does not exist yet.

In [None]:
# Place names that together define the area of interest.
# Every entry is passed to OSMnx as a place query, so each one must be a name
# that can be geocoded. Replace the list with your own area(s) as needed.
# The active list and the commented-out alternatives below are the example
# areas used throughout this repository and in the thesis research.

boundaries = [
    'Alphen aan den Rijn, Zuid-Holland, Netherlands',
    'Waddinxveen, Zuid-Holland, Netherlands',
    'Boskoop, Zuid-Holland, Netherlands'
]

# boundaries = ['Breda, Noord-Brabant, Netherlands']

# boundaries = ['Borsele, Zeeland, Netherlands']

In [None]:
# Short identifier for the selected area.
# It names the output folder (output/<city>) and is embedded in every output
# file name, so keep it in sync with the boundaries chosen above.

city = 'alphen-waddinxveen'
# city = 'breda'
# city = 'borsele'

In [None]:
# Common suffix of the per-category output files, e.g. 'transportation_<city>.geojson'
file_name = f'{city}.geojson'

In [None]:
# Folder (relative to the working directory) that receives this city's output.
# It is created up front; exist_ok=True makes re-running the notebook harmless.
output_path = f"output/{city}"
os.makedirs(output_path, exist_ok=True)

## 1. Mapping
This section shows how we mapped all OSM data to the categories used in the study. We group them in this way to make the analysis easier. You can adjust these mappings to suit your own needs.

In [None]:
# Function to map the area type based on the row data

# Ordered (tag, accepted values, area type) rules; the first matching rule wins.
# ``None`` as accepted values means "any value present for this tag".
_AREA_TYPE_RULES = (
    # Transportation
    ("highway", ("motorway", "trunk", "primary"), "Motorways and major roads"),
    ("highway", ("secondary", "tertiary", "residential"), "Regional roads"),
    ("highway", ("track", "unclassified", "service"), "Tracks and rural access roads"),
    ("highway", ("footway", "cycleway", "path", "bridleway", "pedestrian"),
     "Pedestrian and cycling paths"),
    ("highway", ("living_street",), "Living and residential streets"),
    ("railway", ("rail", "light_rail"), "Railways"),
    # Power-related
    ("power", ("line", "tower"), "Power lines"),
    ("power", ("plant",), "Power plants"),
    # Man-made infrastructure
    ("man_made", ("waste_water_plant",), "Power plants"),
    ("man_made", ("communications_tower", "mast"), "Communication towers"),
    ("man_made", ("water_tower", "silo", "chimney", "tank", "windmill", "wind_turbine"),
     "High infrastructures"),
    # Land use types
    ("landuse", ("industrial",), "Industrial zones"),
    ("landuse", ("commercial",), "Commercial zones"),
    ("landuse", ("retail",), "Retail zones"),
    ("landuse", ("residential",), "Residential areas"),
    ("landuse", ("recreation_ground",), "Recreational zones"),
    ("landuse", ("farmland", "orchard"), "Agricultural lands"),
    # Nature areas
    ("landuse", ("forest", "wood"), "Forests and woodlands"),
    ("landuse", ("meadow", "scrub", "grassland", "grass"), "Meadows and open grass"),
    ("natural", ("wetland",), "Wetlands"),
    ("wetland", ("marsh", "bog", "fen"), "Wetlands"),
    # Water areas
    ("waterway", ("river", "stream", "canal"), "Rivers, canals and streams"),
    # Reservoirs are grouped with lakes and ponds. A separate "Reservoirs" label
    # placed after this rule could never be reached, so it is not listed.
    ("water", ("lake", "reservoir", "pond"), "Lakes and ponds"),
    # Public facilities
    ("amenity", ("school", "kindergarten", "university"), "Schools and universities"),
    ("amenity", ("hospital",), "Hospitals"),
    ("amenity", ("prison",), "Prisons"),
    ("amenity", ("place_of_worship",), "Religious sites"),
    ("historic", None, "Cultural sites"),
    ("tourism", ("attraction",), "Cultural sites"),
    ("landuse", ("cemetery",), "Cemeteries"),
    ("leisure", ("park",), "Parks"),
)


def determine_area_type_from_row(row):
    """Return the study's area type for one OSM feature.

    Parameters
    ----------
    row : mapping-like
        Tag values keyed by OSM tag name. Anything that supports ``key in row``
        and ``row[key]`` works, e.g. a dict or a pandas Series produced by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        The area type of the first matching rule in ``_AREA_TYPE_RULES``, or
        ``"Unknown"`` when no rule matches.

    Notes
    -----
    A tag counts as present only when it has a real value. In a DataFrame row
    every column exists for every feature, and unset tags hold ``NaN``, so a
    pure key-existence test would treat e.g. ``historic`` as set on all rows.
    The public-facility rules (historic, tourism, cemetery, park) are evaluated
    independently of whether an ``amenity`` key is present.
    """

    def tag_value(key):
        # Absent key, None and float NaN all mean "tag not set".
        if key not in row:
            return None
        value = row[key]
        if value is None or (isinstance(value, float) and value != value):
            return None
        return value

    for key, accepted, area_type in _AREA_TYPE_RULES:
        value = tag_value(key)
        if value is None:
            continue
        # Tuple membership (==) rather than hashing, so unusual values cannot raise.
        if accepted is None or value in accepted:
            return area_type

    return "Unknown"



## 2. Get all data from OSM

We have divided the data into several subgroups. For each subgroup, you need to specify which OSM tags you want to retrieve. If you want to add more tags or need more information, you can visit https://wiki.openstreetmap.org/ for background information about the tags and their corresponding infrastructure.

### Transportation infrastructure

In [None]:
# OSM tags requested for the transportation category (key -> accepted values)
transportation_tags = dict(
    highway=[
        # Motorways and major roads
        "motorway", "trunk", "primary",
        # Regional/local roads
        "secondary", "tertiary", "residential",
        # Living streets
        "living_street",
        # Tracks and rural access roads
        "track", "unclassified", "service",
        # Pedestrian/cycling paths
        "footway", "cycleway", "path", "bridleway", "pedestrian",
    ],
    # Railways
    railway=["rail", "light_rail"],
)

In [None]:
# Download all OSM features within the configured boundaries that carry one of the transportation tags
transportation_data = ox.features_from_place(boundaries, tags=transportation_tags)

In [None]:
# Move the index levels into regular columns, then label every feature:
# area_type comes from the tag mapping, category is constant for this dataset
transportation_data = transportation_data.reset_index()
transportation_area_types = transportation_data.apply(determine_area_type_from_row, axis=1)
transportation_data["area_type"] = transportation_area_types
transportation_data["category"] = "Transportation"

In [None]:
# Add any standard column that OSM did not return (filled with None),
# then keep only the standard columns, in their fixed order
missing_columns = [col for col in columns_to_keep if col not in transportation_data.columns]
for col in missing_columns:
    transportation_data[col] = None

transportation_data = transportation_data[columns_to_keep]

In [None]:
# Optionally export this category to its own GeoJSON file.
# The file is written into output_path (the folder created with os.makedirs),
# so the save location always matches the folder that exists. The driver is
# named explicitly so the format does not depend on extension inference.
if save_separate_files:
    transportation_data.to_file(
        os.path.join(output_path, f'transportation_{file_name}'),
        driver='GeoJSON',
    )

### Energy/Industrial

In [None]:
# OSM tags requested for the energy/industrial category (key -> accepted values)
energy_industrial_tags = dict(
    power=["line", "plant", "tower"],
    man_made=[
        "waste_water_plant",
        "communications_tower", "mast",
        "windmill", "wind_turbine",
        "water_tower", "silo", "chimney", "tank",
    ],
)

In [None]:
# Download all OSM features within the configured boundaries that carry one of the energy/industrial tags
industrial_data = ox.features_from_place(boundaries, tags=energy_industrial_tags)

In [None]:
# Move the index levels into regular columns, then label every feature:
# category is constant for this dataset, area_type comes from the tag mapping
industrial_data = industrial_data.reset_index()
industrial_data["category"] = "Energy/Industrial"
industrial_area_types = industrial_data.apply(determine_area_type_from_row, axis=1)
industrial_data["area_type"] = industrial_area_types


In [None]:
# Add any standard column that OSM did not return (filled with None),
# then keep only the standard columns, in their fixed order
missing_columns = [col for col in columns_to_keep if col not in industrial_data.columns]
for col in missing_columns:
    industrial_data[col] = None

industrial_data = industrial_data[columns_to_keep]

In [None]:
# Optionally export this category to its own GeoJSON file.
# The file is written into output_path (the folder created with os.makedirs),
# so the save location always matches the folder that exists. The driver is
# named explicitly so the format does not depend on extension inference.
if save_separate_files:
    industrial_data.to_file(
        os.path.join(output_path, f'energy_industrial_{file_name}'),
        driver='GeoJSON',
    )

### Land use

In [None]:
# OSM tags requested for the land-use category (key -> accepted values)
land_use_tags = dict(
    landuse=[
        "industrial", "commercial", "retail",
        "residential", "recreation_ground",
        "farmland", "orchard",
    ],
)

In [None]:
# Download all OSM features within the configured boundaries that carry one of the land-use tags
land_use_data = ox.features_from_place(boundaries, tags=land_use_tags)

In [None]:
# Move the index levels into regular columns, then label every feature:
# category is constant for this dataset, area_type comes from the tag mapping
land_use_data = land_use_data.reset_index()
land_use_data["category"] = "Land use"
land_use_area_types = land_use_data.apply(determine_area_type_from_row, axis=1)
land_use_data["area_type"] = land_use_area_types

In [None]:
# Add any standard column that OSM did not return (filled with None),
# then keep only the standard columns, in their fixed order
missing_columns = [col for col in columns_to_keep if col not in land_use_data.columns]
for col in missing_columns:
    land_use_data[col] = None

land_use_data = land_use_data[columns_to_keep]

In [None]:
# Optionally export this category to its own GeoJSON file.
# The file is written into output_path (the folder created with os.makedirs),
# so the save location always matches the folder that exists. The driver is
# named explicitly so the format does not depend on extension inference.
if save_separate_files:
    land_use_data.to_file(
        os.path.join(output_path, f'land_use_{file_name}'),
        driver='GeoJSON',
    )

### Nature

In [None]:
# OSM tags requested for the nature category (key -> accepted values)
# NOTE(review): woodland is commonly mapped as natural=wood in OSM; confirm
# that landuse=wood is the intended tag here.
nature_tags = dict(
    landuse=["forest", "wood", "scrub", "meadow", "grass", "grassland"],
    natural=["wetland"],
    wetland=["marsh", "bog", "fen"],
)

In [None]:
# Download all OSM features within the configured boundaries that carry one of the nature tags
nature_data = ox.features_from_place(boundaries, tags=nature_tags)


In [None]:
# Move the index levels into regular columns, then label every feature:
# area_type comes from the tag mapping, category is constant for this dataset
nature_data = nature_data.reset_index()
nature_area_types = nature_data.apply(determine_area_type_from_row, axis=1)
nature_data["area_type"] = nature_area_types
nature_data["category"] = "Nature"

In [None]:
# Add any standard column that OSM did not return (filled with None),
# then keep only the standard columns, in their fixed order
missing_columns = [col for col in columns_to_keep if col not in nature_data.columns]
for col in missing_columns:
    nature_data[col] = None

nature_data = nature_data[columns_to_keep]

In [None]:
# Optionally export this category to its own GeoJSON file.
# The file is written into output_path (the folder created with os.makedirs),
# so the save location always matches the folder that exists. The driver is
# named explicitly so the format does not depend on extension inference.
if save_separate_files:
    nature_data.to_file(
        os.path.join(output_path, f'nature_{file_name}'),
        driver='GeoJSON',
    )

### Water

In [None]:
# OSM tags requested for the water category (key -> accepted values)
# NOTE(review): water=bog and water=fen are requested, but the area-type mapping
# only recognises lake, reservoir and pond for the "water" key, so such features
# end up as "Unknown" unless another of their tags matches - confirm intent.
water_tags = dict(
    waterway=["river", "stream", "canal"],
    water=["lake", "reservoir", "pond", "bog", "fen"],
)

In [None]:
# Download all OSM features within the configured boundaries that carry one of the water tags
water_data = ox.features_from_place(boundaries, tags=water_tags)

In [None]:
# Move the index levels into regular columns, then label every feature:
# area_type comes from the tag mapping, category is constant for this dataset
water_data = water_data.reset_index()
water_area_types = water_data.apply(determine_area_type_from_row, axis=1)
water_data["area_type"] = water_area_types
water_data["category"] = "Water"

In [None]:
# Add any standard column that OSM did not return (filled with None),
# then keep only the standard columns, in their fixed order
missing_columns = [col for col in columns_to_keep if col not in water_data.columns]
for col in missing_columns:
    water_data[col] = None

water_data = water_data[columns_to_keep]

In [None]:
# Optionally export this category to its own GeoJSON file.
# The file is written into output_path (the folder created with os.makedirs),
# so the save location always matches the folder that exists. The driver is
# named explicitly so the format does not depend on extension inference.
if save_separate_files:
    water_data.to_file(
        os.path.join(output_path, f'water_{file_name}'),
        driver='GeoJSON',
    )

### Public facility

In [None]:
# Define the tags for the category of data to be retrieved from OSM
# (key -> list of accepted values, or True to accept any value of that key)
public_facility_tags = {
    "amenity": [
        "school",
        "kindergarten",
        "university",
        "hospital",
        "prison",
        "place_of_worship",
    ],
    # Any historic feature, whatever its value. OSMnx treats a list entry as a
    # literal tag value, so ["*"] would only match historic="*"; True is the
    # documented way to request every feature that has the key.
    "historic": True,
    "tourism": ["attraction"],
    "landuse": ["cemetery"],
    "leisure": ["park"],
}

In [None]:
# Download all OSM features within the configured boundaries that carry one of the public-facility tags
public_data = ox.features_from_place(boundaries, tags=public_facility_tags)

In [None]:
# Move the index levels into regular columns, then label every feature:
# category is constant for this dataset, area_type comes from the tag mapping
public_data = public_data.reset_index()
public_data["category"] = 'Public facility'
public_area_types = public_data.apply(determine_area_type_from_row, axis=1)
public_data["area_type"] = public_area_types

In [None]:
# Add any standard column that OSM did not return (filled with None),
# then keep only the standard columns, in their fixed order
missing_columns = [col for col in columns_to_keep if col not in public_data.columns]
for col in missing_columns:
    public_data[col] = None

public_data = public_data[columns_to_keep]

In [None]:
# Optionally export this category to its own GeoJSON file.
# The file is written into output_path (the folder created with os.makedirs),
# so the save location always matches the folder that exists. The driver is
# named explicitly so the format does not depend on extension inference.
if save_separate_files:
    public_data.to_file(
        os.path.join(output_path, f'public_facilities_{file_name}'),
        driver='GeoJSON',
    )

### PostNL

In [None]:
# OSM tags requested for the postal points (key -> accepted values)
post_tags = dict(
    amenity=["post_office", "post_box", "post_depot"],
)

In [None]:
# Download all OSM features within the configured boundaries that carry one of the postal amenity tags
post_data = ox.features_from_place(boundaries, tags=post_tags)

In [None]:
# Move the index levels into regular columns (modifies post_data in place)
post_data.reset_index(inplace=True)

In [None]:
# Add any standard column that OSM did not return (filled with None),
# then keep only the standard columns, in their fixed order
missing_columns = [col for col in columns_to_keep if col not in post_data.columns]
for col in missing_columns:
    post_data[col] = None

post_data = post_data[columns_to_keep]

In [None]:
# Every postal feature gets the same fixed labels (no tag-based mapping here);
# both columns already exist, so their values are simply overwritten
post_data = post_data.assign(area_type='postnl point', category='PostNL')

In [None]:
# Optionally export this category to its own GeoJSON file.
# The file is written into output_path (the folder created with os.makedirs),
# so the save location always matches the folder that exists. The driver is
# named explicitly so the format does not depend on extension inference.
if save_separate_files:
    post_data.to_file(
        os.path.join(output_path, f'post_{file_name}'),
        driver='GeoJSON',
    )

## 3. Concatenate all data

Now that we have collected all the data, we want to combine it into a clean dataframe.

In [None]:
# Per-category frames, in the order they should appear in the combined table
categories = [
    transportation_data, industrial_data, land_use_data,
    nature_data, water_data, public_data, post_data,
]

# Skip any category that is None
categories = [frame for frame in categories if frame is not None]

# Stack the remaining frames into one table with a fresh 0..n-1 index.
# NOTE: when nothing is left, only a message is printed and df is not defined.
if not categories:
    print("No data to combine.")
else:
    df = pd.concat(categories, ignore_index=True)


In [None]:
# Show how many features ended up in each area type; the display limits are
# lifted temporarily so the listing is not truncated
area_type_counts = df['area_type'].value_counts()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(area_type_counts)

In [None]:
# Wrap the combined table in a GeoDataFrame and declare its coordinates as
# EPSG:4326 (WGS 84 longitude/latitude)
gdf = gpd.GeoDataFrame(df, crs="EPSG:4326")

In [None]:
# Drop features that are duplicates on geometry, name, id and description
# (the same feature can be returned by more than one category query), and
# report the row counts so the number of removed rows is visible.
rows_before = len(gdf)
print(f"Number of rows before dropping duplicates: {rows_before}")

gdf = gdf.drop_duplicates(subset=['geometry', 'name', 'id', 'description'])
rows_after = len(gdf)
print(f"Number of rows after dropping duplicates: {rows_after} ({rows_before - rows_after} dropped)")


In [None]:
# Save the combined data to a GeoJSON file.
# The file is written into output_path (the folder created with os.makedirs),
# so the save location always matches the folder that exists, regardless of
# how the surrounding directory is named.
gdf.to_file(os.path.join(output_path, f'osm_data_{city}.geojson'), driver='GeoJSON')