In [5]:
import os
import geopandas as gpd
import pandas as pd
from shapely import wkt
import math
import glob

def split_geopackage_to_csv(
    input_gpkg,
    output_folder,
    chunk_size=10000,
    prefix="chunk",
    columns=None
):
    """
    Splits a GeoPackage into multiple CSV files with WKT geometries.

    Each chunk is written to
    ``<prefix>_<start>_to_<end>_of_<total>.csv`` inside ``output_folder``.
    A chunk whose output file already exists is skipped, so an interrupted
    run can be resumed by calling the function again with the same arguments.

    Any exception raised while reading or writing is caught, reported with
    ``print`` and swallowed, so a batch caller can continue with the next
    file. The function always returns ``None``.

    Parameters:
    -----------
    input_gpkg : str
        Path to the input GeoPackage file
    output_folder : str
        Folder to save the output CSV files (created if missing)
    chunk_size : int
        Number of rows per chunk; must be a positive integer
    prefix : str
        Prefix for output CSV files
    columns : list
        List of columns to include in output. If None, includes all columns.
        'geometry' is always exported; the list passed in is not modified.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    try:
        # A zero size would divide by zero and a negative one would silently
        # produce no chunks at all, so reject both with a clear message
        # before the (potentially slow) read.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

        # Read the GeoPackage
        print(f"Reading GeoPackage: {input_gpkg}")
        gdf = gpd.read_file(input_gpkg)

        # Get total number of rows
        total_rows = len(gdf)
        print(f"Total features: {total_rows}")

        # Calculate number of chunks
        num_chunks = math.ceil(total_rows / chunk_size)
        print(f"Splitting into {num_chunks} chunks of {chunk_size} rows each")

        # If columns specified, ensure 'geometry' is included. Work on a copy
        # so the caller's list is never changed (it may be reused for other files).
        selected_columns = None
        if columns is not None:
            selected_columns = list(columns)
            if 'geometry' not in selected_columns:
                selected_columns.append('geometry')

        # Process each chunk
        for i in range(num_chunks):
            start_idx = i * chunk_size
            end_idx = min((i + 1) * chunk_size, total_rows)

            # Create output filename with row information
            output_file = os.path.join(output_folder, f"{prefix}_{start_idx}_to_{end_idx}_of_{total_rows}.csv")

            # Check if file already exists
            if os.path.exists(output_file):
                print(f"Chunk {i+1}/{num_chunks} (rows {start_idx} to {end_idx}) already exists - skipping")
                continue

            print(f"Processing chunk {i+1}/{num_chunks} (rows {start_idx} to {end_idx})")

            # Extract chunk
            chunk = gdf.iloc[start_idx:end_idx]

            # Filter columns if specified
            if selected_columns is not None:
                chunk = chunk[selected_columns]

            # Convert to a plain DataFrame with WKT geometry. The explicit
            # copy makes the assignment below write to an independent frame
            # rather than to a slice of gdf.
            df = pd.DataFrame(chunk).copy()
            # Falsy values (e.g. None for a missing geometry) become "".
            df['geometry'] = df['geometry'].apply(lambda geom: geom.wkt if geom else "")

            # Save to CSV via a temporary name and rename only once the write
            # has finished: the "already exists" check above must never see a
            # half-written file left behind by an interrupted run.
            partial_file = output_file + ".part"
            try:
                df.to_csv(partial_file, index=False)
                os.replace(partial_file, output_file)
            finally:
                # Only still present if the write or the rename failed.
                if os.path.exists(partial_file):
                    os.remove(partial_file)
            print(f"Saved to {output_file}")

        print(f"Processing complete! {num_chunks} CSV files created in {output_folder}")

    except Exception as e:
        # Deliberate best-effort: report and return so batch callers keep going.
        print(f"Error processing {input_gpkg}: {str(e)}")
def _chunk_total_rows(chunk_name, prefix):
    """Return TOTAL from '<prefix>_<start>_to_<end>_of_<TOTAL>.csv', else None.

    The whole name must match, so a chunk that belongs to a longer prefix
    (e.g. 'georgia_us_0_to_7_of_7.csv' when prefix is 'georgia') is rejected.
    """
    head, tail = f"{prefix}_", ".csv"
    if not (chunk_name.startswith(head) and chunk_name.endswith(tail)):
        return None
    parts = chunk_name[len(head):-len(tail)].split("_")
    if len(parts) != 5 or parts[1] != "to" or parts[3] != "of":
        return None
    if not all(parts[k].isdecimal() for k in (0, 2, 4)):
        return None
    return int(parts[4])


def _all_chunks_exist(output_folder, prefix, total_rows, chunk_size):
    """True when every chunk file expected for total_rows/chunk_size is on disk."""
    num_chunks = math.ceil(total_rows / chunk_size)
    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, total_rows)
        expected = os.path.join(output_folder, f"{prefix}_{start_idx}_to_{end_idx}_of_{total_rows}.csv")
        if not os.path.exists(expected):
            return False
    return True


def process_all_geopackages_in_folder(
    input_folder,
    output_folder,
    pattern="*_highways.gpkg",
    chunk_size=10000,
    columns=None,
    name_suffix="_highways.gpkg"
):
    """
    Process all geopackage files in a folder that match a pattern.
    Skips files where all chunks already exist.

    For each input file the output prefix is the file name with
    ``name_suffix`` removed. A file is skipped without being read when the
    output folder already holds a complete set of chunk files for that
    prefix, following the naming scheme used by ``split_geopackage_to_csv``
    (``<prefix>_<start>_to_<end>_of_<total>.csv``) for the given
    ``chunk_size``. The total row count is recovered from the existing
    chunk file names.

    Parameters:
    -----------
    input_folder : str
        Folder searched (non-recursively) for input GeoPackages
    output_folder : str
        Folder to save the output CSV files (created if missing)
    pattern : str
        Glob pattern selecting the input files inside ``input_folder``
    chunk_size : int
        Number of rows per chunk; must be a positive integer
    columns : list
        Columns to export, passed through to ``split_geopackage_to_csv``
    name_suffix : str
        Text removed from each input file name to form the output prefix.
        Should agree with ``pattern``; the default matches the default pattern.

    Raises:
    -------
    ValueError
        If ``chunk_size`` is not positive.
    """
    # Fail fast: a zero size would otherwise surface as a ZeroDivisionError
    # part-way through the scan below.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Find all matching files (sorted so the processing order is reproducible)
    search_pattern = os.path.join(input_folder, pattern)
    geopackage_files = sorted(glob.glob(search_pattern))

    if not geopackage_files:
        print(f"No files matching '{pattern}' found in {input_folder}")
        return

    print(f"Found {len(geopackage_files)} files to process")
    files_to_process = []  # (path, file name, output prefix)

    # Check which files need processing by examining existing output chunks
    for gpkg_file in geopackage_files:
        filename = os.path.basename(gpkg_file)
        prefix = filename.replace(name_suffix, "")

        # Candidate chunks for this prefix. The glob is only a coarse filter
        # (it also matches longer prefixes), so every hit is re-checked by
        # _chunk_total_rows. Escaping keeps '[', '*' or '?' in folder or
        # prefix from being read as wildcards.
        candidates = glob.glob(
            os.path.join(glob.escape(output_folder), f"{glob.escape(prefix)}_*.csv")
        )
        totals = set()
        for chunk_path in candidates:
            total_rows = _chunk_total_rows(os.path.basename(chunk_path), prefix)
            if total_rows:
                totals.add(total_rows)

        # Skip the file if any recorded total has its full set of chunks.
        complete_total = None
        for total_rows in sorted(totals):
            if _all_chunks_exist(output_folder, prefix, total_rows, chunk_size):
                complete_total = total_rows
                break

        if complete_total is not None:
            num_chunks = math.ceil(complete_total / chunk_size)
            print(f"Skipping {filename} - all {num_chunks} chunks already exist")
            continue

        # If we get here, this file needs processing
        files_to_process.append((gpkg_file, filename, prefix))

    print(f"Need to process {len(files_to_process)} of {len(geopackage_files)} files")

    # Process each file that needs processing
    for i, (gpkg_file, filename, prefix) in enumerate(files_to_process):
        print(f"\nProcessing file {i+1}/{len(files_to_process)}: {filename}")
        print(f"Using prefix: {prefix}")

        # Process this file
        split_geopackage_to_csv(
            input_gpkg=gpkg_file,
            output_folder=output_folder,
            chunk_size=chunk_size,
            prefix=prefix,
            columns=columns
        )

    print(f"\nAll files processed! Check results in {output_folder}")

In [11]:
# in_folder = r"C:\Users\Arnell\OneDrive - Food and Agriculture Organization\project_work\p0002_primary_forest_support\work_in_progress\roads\osm\osm_regional_250521"
# input_gpkg_name = "africa_highways.gpkg"
# input_gpkg_path = os.path.join(in_folder, input_gpkg_name)
# print (f"Input GeoPackage path: {input_gpkg_path}")



In [None]:

# Example usage: batch-convert every "*_highways.gpkg" in one regional folder
# to WKT CSVs. The guard keeps the batch from starting if this code is ever
# imported as a module instead of being run directly.
if __name__ == "__main__":
    # Path to your input folder with geopackages.
    # Alternative (parent folder holding all regions), kept for reference:
    # in_folder = r"C:\Users\Arnell\OneDrive - Food and Agriculture Organization\project_work\p0002_primary_forest_support\work_in_progress\roads\osm\osm_regional_250521"
    # NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
    in_folder = r"C:\Users\Arnell\OneDrive - Food and Agriculture Organization\project_work\p0002_primary_forest_support\work_in_progress\roads\osm\osm_regional_250521\europe_gpkg"
    # Where to save the CSV files.
    # Alternative (a "geo_csvs" sub-folder of the input folder), kept for reference:
    # output_folder = os.path.join(in_folder, "geo_csvs")
    
    # Output folder sits next to the regional input folder rather than inside it.
    output_folder = r"C:\Users\Arnell\OneDrive - Food and Agriculture Organization\project_work\p0002_primary_forest_support\work_in_progress\roads\osm\osm_regional_250521\geo_csvs"
    
    # Process all files. chunk_size is set far above the feature counts seen
    # in the run log below, so each input ends up as a single CSV chunk.
    # columns is left at its default (None), i.e. every column is exported;
    # uncomment the last argument to restrict the output.
    process_all_geopackages_in_folder(
        input_folder=in_folder,
        output_folder=output_folder,
        pattern="*_highways.gpkg",
        chunk_size=10000000,
        # columns=['osm_id', 'highway', 'geometry']
    )

Found 50 files to process
Skipping albania_highways.gpkg - all 1 chunks already exist
Skipping andorra_highways.gpkg - all 1 chunks already exist
Skipping austria_highways.gpkg - all 1 chunks already exist
Skipping azores_highways.gpkg - all 1 chunks already exist
Skipping belarus_highways.gpkg - all 1 chunks already exist
Skipping belgium_highways.gpkg - all 1 chunks already exist
Skipping bosnia_herzegovina_highways.gpkg - all 1 chunks already exist
Skipping bulgaria_highways.gpkg - all 1 chunks already exist
Skipping croatia_highways.gpkg - all 1 chunks already exist
Skipping cyprus_highways.gpkg - all 1 chunks already exist
Need to process 40 of 50 files

Processing file 1/40: czech_republic_highways.gpkg
Using prefix: czech_republic
Reading GeoPackage: C:\Users\Arnell\OneDrive - Food and Agriculture Organization\project_work\p0002_primary_forest_support\work_in_progress\roads\osm\osm_regional_250521\europe_gpkg\czech_republic_highways.gpkg
Total features: 1779557
Splitting into 1 