# Data collector
Tool for downloading river data from OpenStreetMap and converting it to CSV format.

## Usage
Set the name of a river if you want to collect one specific river, or leave the name empty and set the coordinates of an area to download all rivers from that area.

And run the script.

In [None]:
# Name of the single river to collect; leave it as "" to collect every river
# inside rivers_area instead.
river_name = "Wisła"
# Bounding box given as the southwest corner followed by the northeast corner
# (latitude, longitude for each). The values below are the example box for
# Polish rivers.
rivers_area = (50.0, 14.0, 54.0, 24.0)

In [None]:
import requests
import json
import os
from unidecode import unidecode


# Root directory under which one sub-directory per collection is created
DATA_PATH = "data"
# Define the Overpass API endpoint (HTTPS, so the query and the response are
# not sent in clear text)
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
# Seconds to wait for the server before giving up. Without a timeout,
# requests.post() can block forever on an unresponsive server.
REQUEST_TIMEOUT = 300


if river_name:
    collection_name = unidecode(river_name.lower())
    # Escape backslashes and double quotes so the name cannot terminate the
    # quoted string inside the Overpass QL query
    escaped_river_name = river_name.replace("\\", "\\\\").replace('"', '\\"')
    # Define the Overpass query to get the specified river
    overpass_query = f"""
[out:json];
way["waterway"="river"]["name"="{escaped_river_name}"];
out body;
>;
out skel qt;
"""

elif rivers_area:
    if len(rivers_area) != 4:
        raise ValueError(
            "rivers_area must contain exactly four coordinates: "
            "south, west, north, east"
        )
    collection_name = "rivers"
    # Build the bounding-box filter explicitly instead of relying on the
    # tuple's repr, so lists and other sequences work as well
    bbox = ",".join(str(coordinate) for coordinate in rivers_area)
    # Define the Overpass query to get all rivers using the coordinates
    overpass_query = f"""
[out:json];
way["waterway"="river"]({bbox});
out body;
>;
out skel qt;
"""
else:
    raise ValueError("Either river_name or rivers_area must be provided")

collection_path = os.path.join(DATA_PATH, collection_name)
os.makedirs(collection_path, exist_ok=True)

# Define the output file
raw_geojson_file = os.path.join(collection_path, "raw.geojson")


# Send the request to Overpass API
response = requests.post(
    OVERPASS_URL, data={"data": overpass_query}, timeout=REQUEST_TIMEOUT
)

# Check if the response is successful. Raise instead of calling exit(): the
# exception stops the run with a clear message and does not shut down a
# notebook kernel.
if response.status_code != 200:
    raise RuntimeError(f"Error: Received response code {response.status_code}")

# Parse the JSON response
data = response.json()

# Check if the response contains data
if not data or "elements" not in data or not data["elements"]:
    raise RuntimeError("Error: No elements found in the response.")

# Save the response to the output file
with open(raw_geojson_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

# Check if the output file is not empty
if os.path.getsize(raw_geojson_file) > 0:
    print("Download complete:", raw_geojson_file)
else:
    print("Error: Output file is empty.")

The downloaded JSON has a nested structure, but the interesting part is the "elements" property. It stores objects of two types (property "type"): "way" for river segments and "node" for points.
One river is split into many smaller parts called ways. Every way is defined by a sequence of nodes, from its start to its end, and the nodes hold the coordinates. Let's create separate files for ways and nodes.

In [None]:
# Step 1: Read the raw Overpass response saved by the download step
with open(raw_geojson_file, "r", encoding="utf-8") as f:
    data = json.load(f)


ways_file = os.path.join(collection_path, "raw_ways.json")
nodes_file = os.path.join(collection_path, "raw_nodes.json")

# Step 2: Filter elements into nodes and ways; elements of any other type
# are ignored
elements = data.get("elements", [])
nodes = [element for element in elements if element.get("type") == "node"]
ways = [element for element in elements if element.get("type") == "way"]

# Step 3: Write nodes to a file
with open(nodes_file, "w", encoding="utf-8") as f:
    json.dump(nodes, f, indent=4)

# Write ways to a file
with open(ways_file, "w", encoding="utf-8") as f:
    json.dump(ways, f, indent=4)

# Report the paths that were actually written (raw_nodes.json / raw_ways.json)
print("Files created:", nodes_file, "and", ways_file)

Now let's simplify the structure and split the data into the final datasets:

**ways**
* id
* name
* boat (https://wiki.openstreetmap.org/wiki/Key:boat)
* waterway (https://wiki.openstreetmap.org/wiki/Key:waterway)
* motorboat (https://wiki.openstreetmap.org/wiki/Key:motorboat)
* fixme (https://wiki.openstreetmap.org/wiki/Key:fixme)

**nodes**
* id
* lat
* lon

**ways_names**
* way_id
* language_code
* name

**ways_nodes**
* way_id
* node_id
* sequence 

In [None]:
import json
import pandas as pd
import os

# Read the raw ways written by the split step
with open(ways_file, "r", encoding="utf-8") as f:
    raw_ways_data = json.load(f)

# Column layouts of the relation tables. Passing them to the DataFrame
# constructor keeps the CSV header present even when no rows were collected.
WAYS_NODES_COLUMNS = ["way_id", "node_id", "sequence"]
WAYS_NAMES_COLUMNS = ["way_id", "language_code", "name"]
NAME_TAG_PREFIX = "name:"

# Prepare lists to hold processed data
ways_data = []
ways_nodes_data = []
ways_names_data = []

# Extract relevant information from each way
for way in raw_ways_data:
    way_id = way.get("id")
    tags = way.get("tags", {})

    # One row per way: its id plus every tag as its own column
    ways_data.append({"id": way_id, **tags})

    # One row per (way, node) pair; "sequence" keeps the node order in the way
    ways_nodes_data.extend(
        {"way_id": way_id, "node_id": node_id, "sequence": sequence}
        for sequence, node_id in enumerate(way.get("nodes", []))
    )

    # Extract names with language codes. Everything after the "name:" prefix
    # is kept, so keys with several parts (e.g. "name:zh:pinyin") do not
    # collapse onto the same language code.
    ways_names_data.extend(
        {
            "way_id": way_id,
            "language_code": key[len(NAME_TAG_PREFIX):],
            "name": value,
        }
        for key, value in tags.items()
        if key.startswith(NAME_TAG_PREFIX)
    )


# Create DataFrames from the processed data
ways_df_full = pd.DataFrame(ways_data)
ways_nodes_df = pd.DataFrame(ways_nodes_data, columns=WAYS_NODES_COLUMNS)
ways_names_df = pd.DataFrame(ways_names_data, columns=WAYS_NAMES_COLUMNS)


# Specify the columns to keep in the CSV
ways_columns_reduced = ["id", "name", "boat", "waterway", "motorboat", "fixme"]
# Filter out columns that no downloaded way provided, to avoid a KeyError
ways_columns_reduced = [
    col for col in ways_columns_reduced if col in ways_df_full.columns
]

ways_df_reduced = ways_df_full[ways_columns_reduced]
# Save only specified columns to a CSV file
ways_csv = os.path.join(collection_path, "ways.csv")
ways_df_reduced.to_csv(ways_csv, index=False)

# Save way-to-node relations to a CSV file
ways_nodes_csv = os.path.join(collection_path, "ways_nodes.csv")
ways_nodes_df.to_csv(ways_nodes_csv, index=False)

# Save names data to a CSV file
ways_names_csv = os.path.join(collection_path, "ways_names.csv")
ways_names_df.to_csv(ways_names_csv, index=False)

print("CSV file 'ways.csv' created successfully!")
print("CSV file 'ways_nodes.csv' created successfully!")
print("CSV file 'ways_names.csv' created successfully!")

In [None]:
import pandas as pd
import json


# Read back the raw node elements produced by the split step
with open(nodes_file, "r") as source:
    nodes_data = json.load(source)

# Only the identifier and the coordinates go into the final dataset
node_columns = ["id", "lat", "lon"]
nodes_df = pd.DataFrame(nodes_data).loc[:, node_columns]

# Write the reduced table next to the other CSV files of this collection
nodes_csv = os.path.join(collection_path, "nodes.csv")
nodes_df.to_csv(nodes_csv, index=False)

print("CSV file 'nodes.csv' created successfully!")

In [None]:
# Print the number of rows in each exported dataset
for label, frame in (
    ("ways", ways_df_reduced),
    ("ways_nodes", ways_nodes_df),
    ("ways_names", ways_names_df),
    ("nodes", nodes_df),
):
    print(f"{label}: {len(frame)}")