In [None]:
# Placeholder cell: the HighwayGeometryExtractor class is defined in the next cell.

In [None]:
import pandas as pd
from shapely.geometry import LineString
from tqdm.notebook import tqdm
from multiprocessing import Pool, cpu_count
import osmium
import geopandas as gpd
import time
import os

class HighwayGeometryExtractor(osmium.SimpleHandler):
    """Osmium handler that gathers every way carrying a ``highway`` tag.

    Each accepted way is stored in ``self.highways`` as a dict with the keys
    ``id``, ``highway``, ``surface``, ``name`` and ``geometry`` (a shapely
    ``LineString`` built from the way's nodes that have a valid location).
    """

    def __init__(self):
        super().__init__()
        self.start_time = time.time()
        self.count = 0          # ways seen that carry a 'highway' tag
        self.error_count = 0    # ways whose conversion raised an exception
        self.highways = []      # one record (dict) per extracted highway
        self.debug_samples = []

    def way(self, w):
        """Record ``w`` when it is a highway with at least two located nodes."""
        tags = w.tags
        if 'highway' not in tags:
            return

        self.count += 1
        try:
            # Keep only nodes whose location is valid; the others are skipped.
            points = []
            for node in w.nodes:
                if node.location.valid():
                    points.append((node.lon, node.lat))

            # A line needs two or more vertices.
            if len(points) < 2:
                return

            record = {
                'id': w.id,
                'highway': tags.get('highway', ''),
                'surface': tags.get('surface', ''),
                'name': tags.get('name', ''),
                'geometry': LineString(points)
            }
            self.highways.append(record)
        except Exception:
            # Failures are only counted; the way is left out of the result.
            self.error_count += 1

# Folder holding the downloaded *.osm.pbf extracts, folder receiving the
# GeoPackages, and the CSV log written next to the outputs.
input_dir = r"C:\Users\Arnell\OneDrive - Food and Agriculture Organization\project_work\p0002_primary_forest_support\raw\roads\osm\osm_regional_250521\europe"
output_dir = r"C:\Users\Arnell\OneDrive - Food and Agriculture Organization\project_work\p0002_primary_forest_support\work_in_progress\roads\osm\osm_regional_250521\europe_gpkg"
log_path = os.path.join(output_dir, "processing_log.csv")

os.makedirs(output_dir, exist_ok=True)

# Collect the extracts that do not have an output GeoPackage yet
pbf_suffix = "-latest.osm.pbf"
input_files = [entry for entry in os.listdir(input_dir) if entry.endswith(pbf_suffix)]
files_to_process = []

for file in input_files:
    # "some-country-latest.osm.pbf" -> "some_country"
    country_name = file.replace(pbf_suffix, "").replace("-", "_")
    input_path = os.path.join(input_dir, file)
    output_path = os.path.join(output_dir, f"{country_name}_highways.gpkg")

    # An existing output means this extract is skipped.
    if os.path.exists(output_path):
        continue

    job = (country_name, input_path, output_path)
    files_to_process.append(job)

print(f"Processing {len(files_to_process)} out of {len(input_files)} files with multiprocessing...")

# Function to run in parallel
def process_file(args):
    """Extract the highways of one OSM extract into a GeoPackage.

    Args:
        args: A ``(country_name, input_path, output_path)`` tuple, packed into
            a single argument so the function can be mapped over a pool.

    Returns:
        dict: A log record with the keys ``country``, ``input_file``,
        ``output_file``, ``status`` (``"Success"``, ``"Empty"`` or
        ``"Error: <message>"``), ``highways_found`` and ``errors``.

    Exceptions are not propagated; they are reported through ``status``.
    """
    country_name, input_path, output_path = args
    log = {
        "country": country_name,
        "input_file": os.path.basename(input_path),
        "output_file": os.path.basename(output_path),
        "status": "",
        "highways_found": 0,
        "errors": 0
    }

    # The GeoPackage is written under a temporary sibling name and only moved
    # to output_path once complete, so a failed write never leaves a file that
    # looks like a finished output.
    root, ext = os.path.splitext(output_path)
    partial_path = f"{root}.partial{ext}"

    try:
        reader = osmium.io.Reader(input_path)
        try:
            idx = osmium.index.create_map("sparse_mem_array")
            lh = osmium.NodeLocationsForWays(idx)
            # Do not abort the whole file when a way references a node that is
            # missing from the extract; the handler skips invalid locations.
            lh.ignore_errors()
            handler = HighwayGeometryExtractor()
            osmium.apply(reader, lh, handler)
        finally:
            # Always release the input file handle, even when parsing fails.
            reader.close()

        log["highways_found"] = len(handler.highways)
        log["errors"] = handler.error_count

        if handler.highways:
            gdf = gpd.GeoDataFrame(handler.highways, crs="EPSG:4326")
            # Drop leftovers of an earlier interrupted run before writing.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            gdf.to_file(partial_path, driver="GPKG")
            os.replace(partial_path, output_path)
            log["status"] = "Success"
        else:
            log["status"] = "Empty"

    except Exception as e:
        log["status"] = f"Error: {e}"
        # Best-effort cleanup of an incomplete output; the original error is
        # what gets reported, so a failed removal is deliberately ignored.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass

    return log

# Run multiprocessing
# The __main__ guard keeps worker processes from starting the pool again when
# the "spawn" start method (the default on Windows) re-imports this module.
if __name__ == "__main__":
    # Never start more workers than there are files; Pool needs at least one.
    worker_count = max(1, min(cpu_count(), len(files_to_process)))
    with Pool(worker_count) as pool:
        # imap_unordered yields each log record as soon as its file is done,
        # which lets the progress bar advance per finished file.
        results = list(tqdm(pool.imap_unordered(process_file, files_to_process), total=len(files_to_process)))

    # Save log
    log_df = pd.DataFrame(results)
    log_df.to_csv(log_path, index=False)
    print(f"Processing complete. Log saved to {log_path}")
