# In [1]:
import multiprocessing as mp
import time

import geopandas as gpd
import osmium as osm

from shapely.geometry import MultiPolygon, Polygon



class StateMultipolygonHandler(osm.SimpleHandler):
    """Collect ``admin_level=4`` administrative boundaries as MultiPolygons.

    Every area handed to :meth:`area` that is tagged
    ``boundary=administrative`` and ``admin_level=4`` is converted to a
    shapely ``MultiPolygon`` and appended to ``self.polygons``.
    """

    def __init__(self):
        super().__init__()
        # One dict per matching area with the keys "id", "name", "geometry".
        self.polygons = []

    def area(self, a):
        """Record ``a`` if it is an ``admin_level=4`` administrative boundary.

        Args:
            a: Area object supplied by pyosmium. Ring nodes are read through
                ``lon``/``lat``, so node locations must be available
                (the file is applied with ``locations=True``).
        """
        tags = a.tags
        if tags.get("boundary") != "administrative":
            return
        if tags.get("admin_level") != "4":
            return

        parts = []
        for outer in a.outer_rings():
            shell = [(n.lon, n.lat) for n in outer]
            # Inner rings belong to a specific outer ring: pyosmium's
            # inner_rings() takes that ring as its argument, and shapely
            # models holes per Polygon, not per MultiPolygon.
            holes = [
                [(n.lon, n.lat) for n in inner] for inner in a.inner_rings(outer)
            ]
            parts.append(Polygon(shell, holes))

        if not parts:
            return

        # Only plain Python values and shapely objects are stored; the
        # pyosmium object itself is not kept beyond this callback.
        self.polygons.append(
            {
                "id": a.id,
                "name": tags.get("name", "unknown"),
                "geometry": MultiPolygon(parts),
            }
        )



def process_chunk(osm_file, start, size):
    """Return the boundary polygons for one slice of a fan-out over a file.

    pyosmium's ``apply_file`` has no byte-range parameters, so a slice
    cannot be read on its own. To keep the existing call pattern working,
    the slice that begins at offset 0 reads the complete file and every
    other slice returns an empty list; concatenating the results of a set
    of disjoint slices therefore yields each polygon exactly once.

    Args:
        osm_file: Path of the OSM file to read.
        start: Byte offset of the slice. Only ``0`` triggers a read.
        size: Length of the slice in bytes. Accepted for interface
            compatibility; it does not limit how much is read.

    Returns:
        list[dict]: The ``polygons`` list collected by
        ``StateMultipolygonHandler`` (empty when ``start`` is not 0).
    """
    if start != 0:
        return []

    handler = StateMultipolygonHandler()
    # locations=True makes node coordinates available on the ring nodes;
    # "flex_mem" selects the in-memory location index.
    handler.apply_file(osm_file, locations=True, idx="flex_mem")
    return handler.polygons



def save_to_files(
    polygons,
    geojson_path="us_states_multipolygons.geojson",
    csv_path="us_states_multipolygons.csv",
):
    """Write the collected polygons to a GeoJSON file and a CSV file.

    Args:
        polygons: List of dicts with the keys ``"id"``, ``"name"`` and
            ``"geometry"`` (a shapely geometry).
        geojson_path: Destination of the GeoJSON output.
        csv_path: Destination of the CSV output; its ``geometry`` column
            holds the WKT text of each geometry.

    Returns:
        None. When ``polygons`` is empty nothing is written.
    """
    if not polygons:
        # An empty list would produce a frame without a geometry column and
        # fail further down; report it instead of crashing.
        print("Нет данных для сохранения: список полигонов пуст")
        return

    # Coordinates are lon/lat pairs; assumed to be WGS84 as OSM data is.
    gdf = gpd.GeoDataFrame(polygons, geometry="geometry", crs="EPSG:4326")
    gdf.to_file(geojson_path, driver="GeoJSON")

    # Build the CSV table from the non-geometry columns so that the WKT
    # strings are never assigned to the GeoDataFrame's geometry column.
    wkt_column = [geom.wkt for geom in gdf.geometry]
    table = gdf[["id", "name"]].assign(geometry=wkt_column)
    table.to_csv(csv_path, index=False)

    print(f"Данные сохранены в файлы {geojson_path} и {csv_path}")



if __name__ == "__main__":
    import os

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during a long run.
    start_time = time.perf_counter()

    osm_file = r"C:\Users\Lenovo\Downloads\us-latest.osm.pbf"

    # Use the real size of the file rather than a hard-coded estimate; this
    # also fails early with FileNotFoundError when the path is wrong.
    file_size = os.path.getsize(osm_file)

    # pyosmium reads an OSM file as a whole and areas are assembled from
    # nodes, ways and relations spread across it, so the file is processed
    # as a single slice covering [0, file_size) instead of being split by
    # byte offset across worker processes.
    polygons = process_chunk(osm_file, 0, file_size)

    save_to_files(polygons)

    end_time = time.perf_counter()

    print(f"Время выполнения: {end_time - start_time} секунд")