In [2]:
import osmium
import geopandas as gpd
from shapely.geometry import LineString
import os
import time

class HighwayGeometryExtractor(osmium.SimpleHandler):
    """Collect every way carrying a ``highway`` tag as a record with a LineString.

    Attributes set in ``__init__``:
        highways: list of dicts with the keys ``id``, ``highway``, ``surface``,
            ``name`` and ``geometry`` (a shapely ``LineString``).
        count: number of highway-tagged ways seen so far.
        error_count: number of highway ways that produced no record, either
            because fewer than two coordinates could be read or because
            building the record raised.
        skipped_node_count: number of individual nodes whose coordinates could
            not be read and were therefore left out of a geometry.
        start_time: wall-clock time at construction, used for progress output.
        debug_samples: ids of up to three ways rejected for having fewer than
            two usable coordinates.
    """

    def __init__(self):
        super().__init__()
        self.highways = []
        self.count = 0
        self.error_count = 0
        self.skipped_node_count = 0
        self.start_time = time.time()
        self.debug_samples = []

    def way(self, w):
        """Handle one way: record it if it is a highway with >= 2 readable nodes."""
        if 'highway' not in w.tags:
            return

        self.count += 1
        if self.count % 10000 == 0:
            elapsed = time.time() - self.start_time
            print(f"Processed {self.count} highways in {elapsed:.2f} seconds")
        if self.count <= 5:
            # Guard the index: a way without node refs must not abort the
            # whole run from inside a purely diagnostic print.
            first_ref = w.nodes[0].ref if len(w.nodes) > 0 else None
            print(f"Way {w.id}: {len(w.nodes)} nodes, first node: {first_ref}")

        try:
            coords = []
            for n in w.nodes:
                try:
                    coords.append((n.lon, n.lat))
                except Exception:
                    # Best effort: leave out a node whose coordinates cannot be
                    # read, but keep a tally so truncated geometries are
                    # detectable. `Exception` (not a bare except) lets
                    # KeyboardInterrupt/SystemExit propagate.
                    self.skipped_node_count += 1

            if len(coords) >= 2:
                self.highways.append({
                    'id': w.id,
                    'highway': w.tags.get('highway', ''),
                    'surface': w.tags.get('surface', ''),
                    'name': w.tags.get('name', ''),
                    'geometry': LineString(coords)
                })
            else:
                # Keep a few example ids so rejected ways can be inspected later.
                if len(self.debug_samples) < 3:
                    self.debug_samples.append(w.id)
                self.error_count += 1
        except Exception:
            self.error_count += 1


In [4]:
# Source folder with the regional .osm.pbf extracts, and the folder that
# receives one GeoPackage per extract.
input_dir = r"C:\Users\Arnell\OneDrive - Food and Agriculture Organization\project_work\p0002_primary_forest_support\raw\roads\osm\osm_regional_250521\europe"
output_dir = r"C:\Users\Arnell\OneDrive - Food and Agriculture Organization\project_work\p0002_primary_forest_support\work_in_progress\roads\osm\osm_regional_250521\europe_gpkg"

os.makedirs(output_dir, exist_ok=True)

# Queue every extract whose GeoPackage is not on disk yet; anything that
# already has an output file is reported and left alone.
input_files = [
    entry for entry in os.listdir(input_dir)
    if entry.endswith("-latest.osm.pbf")
]
files_to_process = []

for file in input_files:
    country_name = file.replace("-latest.osm.pbf", "").replace("-", "_")
    input_path = os.path.join(input_dir, file)
    output_path = os.path.join(output_dir, f"{country_name}_highways.gpkg")

    if not os.path.exists(output_path):
        files_to_process.append((input_path, output_path))
        print(f"Will process: {country_name}")
        continue

    print(f"Output already exists for {country_name}. Skipping.")

print(f"Found {len(files_to_process)} files to process out of {len(input_files)}")

# Process each queued extract: read the highways, then write them to a GeoPackage.
for input_path, output_path in files_to_process:
    file = os.path.basename(input_path)
    country_name = file.replace("-latest.osm.pbf", "").replace("-", "_")
    print(f"\nProcessing: {file} -> {output_path}")

    # Write to a sibling "<name>.partial.gpkg" first and rename on success.
    # A failed or interrupted write must never leave a file at output_path,
    # because the skip-if-exists check would then treat it as finished.
    output_root, output_ext = os.path.splitext(output_path)
    partial_path = f"{output_root}.partial{output_ext}"

    try:
        osm = osmium.io.Reader(input_path)
        try:
            idx = osmium.index.create_map("sparse_mem_array")
            lh = osmium.NodeLocationsForWays(idx)
            handler = HighwayGeometryExtractor()
            osmium.apply(osm, lh, handler)
        finally:
            # Release the file handle even when reading fails part-way.
            osm.close()

        print(f"Found {len(handler.highways)} highways, Errors: {handler.error_count}")
        if handler.highways:
            # Clear any leftover from an earlier interrupted run.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            gdf = gpd.GeoDataFrame(handler.highways, crs="EPSG:4326")
            gdf.to_file(partial_path, driver="GPKG")
            os.replace(partial_path, output_path)
            print(f"Saved to {output_path}")
        else:
            print("No valid highways to save.")
    except Exception as e:
        print(f"Error processing {file}: {e}")
        # Best-effort cleanup so a broken partial file does not accumulate.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass

# NB: it took 256 minutes to process the 50 national files for Europe (31.2 GB in total). Slow, but no memory errors.


Output already exists for albania. Skipping.
Output already exists for andorra. Skipping.
Output already exists for austria. Skipping.
Output already exists for azores. Skipping.
Output already exists for belarus. Skipping.
Output already exists for belgium. Skipping.
Output already exists for bosnia_herzegovina. Skipping.
Output already exists for bulgaria. Skipping.
Output already exists for croatia. Skipping.
Output already exists for cyprus. Skipping.
Output already exists for czech_republic. Skipping.
Output already exists for denmark. Skipping.
Output already exists for estonia. Skipping.
Output already exists for faroe_islands. Skipping.
Output already exists for finland. Skipping.
Output already exists for france. Skipping.
Output already exists for georgia. Skipping.
Output already exists for germany. Skipping.
Output already exists for greece. Skipping.
Output already exists for guernsey_jersey. Skipping.
Output already exists for hungary. Skipping.
Output already exists for 