# Processing Disaster Risk Maps

## Flood Risk

In [2]:
# Extracting Zip Files

import zipfile
import os
import glob
import geopandas as gpd
import pandas as pd
import shutil

# Define root directory
root_dir = "../00_data/flood_risk/"

# Folders that have been extracted successfully (one per ZIP archive)
extracted_folders = []

# Expected layout: <root_dir>/<flood scenario>/<return period>/<archive>.zip
for flood_scenario in sorted(os.listdir(root_dir)):  # e.g., "Flood 1", "Flood 2"
    scenario_path = os.path.join(root_dir, flood_scenario)
    if not os.path.isdir(scenario_path):
        continue

    for return_period in sorted(os.listdir(scenario_path)):  # e.g., "5yr", "25yr", "100yr"
        return_period_path = os.path.join(scenario_path, return_period)
        if not os.path.isdir(return_period_path):
            continue

        # Extract every ZIP file inside this return period
        for zip_file in sorted(glob.glob(os.path.join(return_period_path, "*.zip"))):
            # splitext removes only the trailing extension. str.replace(".zip", "")
            # would also rewrite any ".zip" occurring earlier in the path, and
            # would leave the name unchanged for an upper-case ".ZIP" suffix.
            extract_folder = os.path.splitext(zip_file)[0]

            with zipfile.ZipFile(zip_file, "r") as zip_ref:
                zip_ref.extractall(extract_folder)

            # Record the folder only once extraction finished without error,
            # so the count printed below reflects completed extractions.
            extracted_folders.append(extract_folder)

print(f"Extracted {len(extracted_folders)} ZIP files.")


Extracted 220 ZIP files.


In [7]:
# Delete ZIP files

import os
import glob

# Root of the flood-risk download tree
root_dir = "../00_data/flood_risk/"

# "**" combined with recursive=True matches archives at any depth below root_dir
zip_pattern = os.path.join(root_dir, "**", "*.zip")
zip_files = glob.glob(zip_pattern, recursive=True)

# Remove the archives one by one, logging each path after it is deleted
for zip_file in zip_files:
    os.remove(zip_file)
    print(f"Deleted: {zip_file}")

print("All .zip files have been removed.")

Deleted: ../00_data/flood_risk/Flood 2/100yr/AgusanDelNorte.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/DavaoDelSur.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Kalinga.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/MisamisOccidental.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Batangas.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/SultanKudarat.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Cebu.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/LanaoDelSur.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Catanduanes.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Aklan.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/DavaoOccidental.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/MountainProvince.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Sarangani.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/MetroManila.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Leyte.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Misamis Oriental.zip
Deleted: ../00_data/flood_r

In [6]:
import geopandas as gpd

# Shapefile to inspect, given relative to root_dir
shp_file = "Flood 1/5yr/MountainProvince/MountainProvince_Flood_5year.shp"

# root_dir comes from an earlier cell and already ends with "/",
# so the two parts are joined by plain concatenation.
shp_path = f"{root_dir}{shp_file}"
gdf = gpd.read_file(shp_path)

# Show the first rows, then the column index
print(gdf.head())
print(gdf.columns)

   Var                                           geometry
0  1.0  MULTIPOLYGON (((121.52562 17.26903, 121.52534 ...
1  2.0  MULTIPOLYGON (((121.52618 17.26985, 121.5259 1...
2  3.0  MULTIPOLYGON (((121.52618 17.26985, 121.52618 ...
Index(['Var', 'geometry'], dtype='object')


### Merging Files
The first run below stopped prematurely because some shapefiles use a different CRS (WGS 84 / UTM zone 51N instead of WGS 84), so they could not be concatenated.

In [4]:
import geopandas as gpd
import pandas as pd
import os
import glob

# Merge every province shapefile found under
#   <root_dir>/<flood folder>/<risk period>/<province>/*.shp
# into one Parquet file per risk period (5yr / 25yr / 100yr).
#
# Shapefiles are buffered in `all_gdfs` and flushed to disk in batches. Each
# flush re-reads the existing Parquet file, concatenates the new batch and
# rewrites the whole file, so re-running the cell appends to whatever an
# earlier run already wrote.
#
# NOTE: no reprojection happens here. The recorded run of this cell aborted
# with "ValueError: Cannot determine common CRS for concatenation inputs"
# when shapefiles with different CRSs reached pd.concat.

# Root directories
root_dir = "../00_data/flood_risk/"
output_path = "../01_processed_data/flood_risk/"

# Ensure output directory exists
os.makedirs(output_path, exist_ok=True)

# Paths for output files (the keys double as the risk-period folder names)
output_files = {
    "5yr": os.path.join(output_path, "FloodRisk_5yr.parquet"),
    "25yr": os.path.join(output_path, "FloodRisk_25yr.parquet"),
    "100yr": os.path.join(output_path, "FloodRisk_100yr.parquet"),
}

# Processing settings
SAVE_INTERVAL = 10  # Save every 10 provinces
processed_count = 0  # Shapefiles read so far, across all folders
test_limit = None  # Set to an integer (e.g., 20) for testing, or None for full processing

# Loop through each Flood folder (Flood 1 to Flood 5)
for flood_folder in sorted(os.listdir(root_dir)):
    flood_path = os.path.join(root_dir, flood_folder)

    if os.path.isdir(flood_path):  # Ensure it's a directory
        print(f"📂 Processing: {flood_folder}")

        # Loop through risk levels (5yr, 25yr, 100yr)
        for risk_period in output_files.keys():
            risk_path = os.path.join(flood_path, risk_period)

            if os.path.isdir(risk_path):  # Ensure it exists
                print(f"  ⏳ Processing Risk Period: {risk_period}")

                # Buffer of GeoDataFrames waiting to be written
                all_gdfs = []
                # Incremented once per shapefile read; reset after every flush
                province_counter = 0  # Track processed provinces

                # Loop through province folders inside risk period
                for province in sorted(os.listdir(risk_path)):
                    province_path = os.path.join(risk_path, province)

                    if os.path.isdir(province_path):  # Ensure it's a province folder
                        print(f"    🏙 Processing Province: {province}")

                        # Find the shapefile inside the province folder
                        shapefiles = glob.glob(os.path.join(province_path, "*.shp"))

                        for shp in shapefiles:
                            # This break leaves only the shapefile loop; the
                            # province loop keeps iterating afterwards.
                            if test_limit is not None and processed_count >= test_limit:
                                break  # Stop after reaching test limit

                            print(f"      📂 Reading file: {shp}")

                            try:
                                # Load shapefile
                                gdf = gpd.read_file(shp)

                                # Rename 'Var' to 'FloodRisk'
                                gdf = gdf.rename(columns={"Var": "FloodRisk"})

                                # Add metadata: Province and Flood Return Period
                                gdf["Province"] = province
                                gdf["FloodReturnPeriod"] = risk_period

                                # Collect data for batch processing
                                all_gdfs.append(gdf)
                                processed_count += 1
                                province_counter += 1

                            except Exception as e:
                                # A shapefile that fails to load is reported and skipped
                                print(f"      ❌ Error processing {shp}: {e}")

                    # **Save after every 10 provinces**
                    if province_counter >= SAVE_INTERVAL:
                        if all_gdfs:
                            combined_gdf = gpd.GeoDataFrame(pd.concat(all_gdfs, ignore_index=True))
                            parquet_path = output_files[risk_period]

                            # Append by reading old data and merging before saving
                            if os.path.exists(parquet_path):
                                old_gdf = gpd.read_parquet(parquet_path)
                                combined_gdf = pd.concat([old_gdf, combined_gdf], ignore_index=True)

                            # Save
                            combined_gdf.to_parquet(parquet_path, index=False)
                            print(f"      ✅ Saved {province_counter} provinces to {parquet_path}")

                            # Reset buffer
                            all_gdfs = []
                            province_counter = 0

                # **Final save if any data remains**
                if all_gdfs:
                    combined_gdf = gpd.GeoDataFrame(pd.concat(all_gdfs, ignore_index=True))
                    parquet_path = output_files[risk_period]

                    if os.path.exists(parquet_path):
                        old_gdf = gpd.read_parquet(parquet_path)
                        combined_gdf = pd.concat([old_gdf, combined_gdf], ignore_index=True)

                    combined_gdf.to_parquet(parquet_path, index=False)
                    print(f"      ✅ Final save for {risk_period} at {parquet_path}")

# Final message
print("✅ Incremental processing with auto-saving complete!")


📂 Processing: Flood 1
  ⏳ Processing Risk Period: 5yr
    🏙 Processing Province: Kalinga
      📂 Reading file: ../00_data/flood_risk/Flood 1/5yr/Kalinga/Kalinga_Flood_5year.shp
    🏙 Processing Province: Marinduque
      📂 Reading file: ../00_data/flood_risk/Flood 1/5yr/Marinduque/Marinduque_Flood_5year.shp
    🏙 Processing Province: MountainProvince
      📂 Reading file: ../00_data/flood_risk/Flood 1/5yr/MountainProvince/MountainProvince_Flood_5year.shp
    🏙 Processing Province: Quirino
      📂 Reading file: ../00_data/flood_risk/Flood 1/5yr/Quirino/Quirino_Flood_5year.shp
    🏙 Processing Province: Sarangani
      📂 Reading file: ../00_data/flood_risk/Flood 1/5yr/Sarangani/Sarangani_Flood_5year.shp
      ✅ Final save for 5yr at ../01_processed_data/flood_risk/FloodRisk_5yr.parquet
  ⏳ Processing Risk Period: 25yr
    🏙 Processing Province: Abra
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Abra/Abra_Flood_25year.shp
    🏙 Processing Province: AgusanDelNorte
      📂 Readin

  return ogr_read(


      ✅ Final save for 5yr at ../01_processed_data/flood_risk/FloodRisk_5yr.parquet
  ⏳ Processing Risk Period: 25yr
    🏙 Processing Province: Bulacan
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/Bulacan/Bulacan_Flood_25year.shp
    🏙 Processing Province: Cavite
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/Cavite/Cavite_Flood_25year.shp
    🏙 Processing Province: IlocosNorte
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/IlocosNorte/IlocosNorte_Flood_25year.shp
    🏙 Processing Province: IlocosSur
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/IlocosSur/IlocosSur_Flood_25year.shp
    🏙 Processing Province: Isabela
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/Isabela/Isabela_Flood_25year.shp
    🏙 Processing Province: LaUnion
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/LaUnion/LaUnion_Flood_25year.shp
    🏙 Processing Province: Pampanga
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/Pampanga/Pampanga_Flood_2

ValueError: Cannot determine common CRS for concatenation inputs, got ['WGS 84', 'WGS 84 / UTM zone 51N']. Use `to_crs()` to transform geometries to the same CRS before merging.

### Merging WITH Checkpoint

In [1]:
import geopandas as gpd
import pandas as pd
import os
import glob

# Resume the shapefile merge from a checkpoint, reprojecting every shapefile
# to COMMON_CRS before it is appended to the Parquet file of its risk period.

# Root directories
root_dir = "../00_data/flood_risk/"
output_path = "../01_processed_data/flood_risk/"

# 📌 Common CRS
COMMON_CRS = "EPSG:4326"

# CHECKPOINT: the first (flood group, risk level, province) still to be processed.
# Everything that sorts before it is skipped.
start_with_flood_group = 2
start_with_risk_level = 100
start_with_province = "DavaoDelSur"
reached_start = False
first_pass = True  # True while the checkpoint has not been passed yet

# Ensure output directory exists
os.makedirs(output_path, exist_ok=True)

# Paths for output files (the keys double as the risk-period folder names)
output_files = {
    "5yr": os.path.join(output_path, "FloodRisk_5yr.parquet"),
    "25yr": os.path.join(output_path, "FloodRisk_25yr.parquet"),
    "100yr": os.path.join(output_path, "FloodRisk_100yr.parquet"),
}

# Processing settings
SAVE_INTERVAL = 10  # Flush the buffer once it holds this many shapefiles
processed_count = 0
test_limit = None  # Set to an integer (e.g., 20) for testing, or None for full processing

CHECKPOINT_NOT_FOUND = (
    f"⚠️ Checkpoint Flood {start_with_flood_group} / {start_with_risk_level}yr / "
    f"{start_with_province} was not found; processing everything from here on"
)


def _flood_group_number(folder_name):
    """Return the integer N of a folder named like "Flood N", or None if the name has no such number."""
    try:
        return int(folder_name.split(" ")[1])
    except (IndexError, ValueError):
        return None


def _append_to_parquet(gdfs, parquet_path):
    """Concatenate `gdfs` and append the result to `parquet_path`.

    Parquet files cannot be appended in place, so an existing file is read
    back, merged with the new rows and rewritten as a whole.
    """
    combined_gdf = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))

    if os.path.exists(parquet_path):
        old_gdf = gpd.read_parquet(parquet_path)
        combined_gdf = pd.concat([old_gdf, combined_gdf], ignore_index=True)

    combined_gdf.to_parquet(parquet_path, index=False)


# Loop through each Flood folder (Flood 1 to Flood 5)
for flood_folder in sorted(os.listdir(root_dir)):
    flood_path = os.path.join(root_dir, flood_folder)
    if not os.path.isdir(flood_path):
        continue

    flood_group = _flood_group_number(flood_folder)
    if flood_group is None:
        # e.g. a hidden or stray directory; previously this crashed the run
        print(f"Skipping folder that is not a flood group: {flood_folder}")
        continue

    if first_pass and flood_group < start_with_flood_group:
        print(f"Skipping Flood Group: {flood_group}")
        continue  # Skip if before the start group

    if first_pass and flood_group > start_with_flood_group:
        # The checkpoint group was never seen, so stop skipping instead of
        # silently dropping the rest of the data.
        print(CHECKPOINT_NOT_FOUND)
        first_pass = False

    print(f"📂 Processing: {flood_folder}")

    # Loop through risk levels (5yr, 25yr, 100yr)
    for risk_period in output_files.keys():
        risk_path = os.path.join(flood_path, risk_period)
        if not os.path.isdir(risk_path):
            continue

        risk_level = int(risk_period[:-2])  # "100yr" -> 100
        if first_pass and risk_level < int(start_with_risk_level):
            print(f"Skipping Risk Level: {risk_level}")
            continue

        if first_pass and risk_level > int(start_with_risk_level):
            # The checkpoint risk-level folder does not exist in the start group
            print(CHECKPOINT_NOT_FOUND)
            first_pass = False

        print(f"  ⏳ Processing Risk Period: {risk_period}")

        # Buffer of GeoDataFrames waiting to be written
        all_gdfs = []
        province_counter = 0  # Shapefiles buffered since the last flush

        # Loop through province folders inside risk period
        for province in sorted(os.listdir(risk_path)):
            province_path = os.path.join(risk_path, province)
            if not os.path.isdir(province_path):
                continue

            if first_pass and not reached_start:
                if province == start_with_province:
                    reached_start = True
                    first_pass = False
                    print(f"Reached start province: {province}")
                else:
                    print(f"Skipping Province: {province}")
                    continue  # Skip provinces before the start point

            print(f"    🏙 Processing Province: {province}")

            # Find the shapefile inside the province folder
            for shp in glob.glob(os.path.join(province_path, "*.shp")):
                if test_limit is not None and processed_count >= test_limit:
                    break  # Stop after reaching test limit

                print(f"      📂 Reading file: {shp}")

                try:
                    # Load shapefile
                    gdf = gpd.read_file(shp)

                    # ✅ Convert CRS if needed
                    if gdf.crs != COMMON_CRS:
                        print(f"🔄 Converting CRS from {gdf.crs} to {COMMON_CRS}")
                        gdf = gdf.to_crs(COMMON_CRS)

                    # Rename 'Var' to 'FloodRisk'
                    gdf = gdf.rename(columns={"Var": "FloodRisk"})

                    # Add metadata: Province and Flood Return Period
                    gdf["Province"] = province
                    gdf["FloodReturnPeriod"] = risk_period

                    # Collect data for batch processing
                    all_gdfs.append(gdf)
                    processed_count += 1
                    province_counter += 1

                except Exception as e:
                    # A shapefile that cannot be read or reprojected is reported and skipped
                    print(f"      ❌ Error processing {shp}: {e}")

            # **Save after every SAVE_INTERVAL shapefiles**
            if province_counter >= SAVE_INTERVAL and all_gdfs:
                parquet_path = output_files[risk_period]
                _append_to_parquet(all_gdfs, parquet_path)
                print(f"      ✅ Saved {province_counter} provinces to {parquet_path}")

                # Reset buffer
                all_gdfs = []
                province_counter = 0

        if first_pass:
            # The checkpoint period was scanned without meeting the start
            # province (e.g. a misspelled name). Clear the flag so later
            # periods and groups are processed rather than skipped.
            print(CHECKPOINT_NOT_FOUND)
            first_pass = False

        # **Final save if any data remains**
        if all_gdfs:
            parquet_path = output_files[risk_period]
            _append_to_parquet(all_gdfs, parquet_path)
            print(f"      ✅ Final save for {risk_period} at {parquet_path}")

# Final message
print("✅ Incremental processing with auto-saving complete!")


Skipping Flood Group: 1
📂 Processing: Flood 2
Skipping Risk Level: 5
Skipping Risk Level: 25
  ⏳ Processing Risk Period: 100yr
Skipping Province: Abra
Skipping Province: AgusanDelNorte
Skipping Province: Aklan
Skipping Province: Apayao
Skipping Province: Batangas
Skipping Province: Benguet
Skipping Province: CamarinesSur
Skipping Province: Camiguin
Skipping Province: Catanduanes
Skipping Province: Cebu
Skipping Province: CompostelaValley
Skipping Province: DavaoDelNorte
Reached start province: DavaoDelSur
    🏙 Processing Province: DavaoDelSur
      📂 Reading file: ../00_data/flood_risk/Flood 2/100yr/DavaoDelSur/DavaoDelSur_Flood_100year.shp
    🏙 Processing Province: DavaoOccidental
      📂 Reading file: ../00_data/flood_risk/Flood 2/100yr/DavaoOccidental/DavaoOccidental_Flood_100year.shp
    🏙 Processing Province: DinagatIslands
      📂 Reading file: ../00_data/flood_risk/Flood 2/100yr/DinagatIslands/DinagatIslands_Flood_100year.shp
    🏙 Processing Province: Kalinga
      📂 Reading 

### Updating Empty Folders

Some province folders do not contain the shapefiles directly: they were extracted into an extra subfolder named after the province, so the merge above found no `.shp` file in them. \
e.g. ../00_data/flood_risk/Flood 1/100yr/Albay/Albay/

In [12]:
# Check empty folders
# Province folders without a shapefile directly inside them are recorded, and
# the shapefiles found one level deeper are appended to the merged Parquet files.
import glob
import os

import geopandas as gpd
import pandas as pd

# Define the root directory where the flood risk data is stored
root_dir = "../00_data/flood_risk/"

# Output location and target CRS. They carry the same values as in the merge
# cell and are declared here so this cell does not rely on leftover kernel state.
output_path = "../01_processed_data/flood_risk/"
COMMON_CRS = "EPSG:4326"
os.makedirs(output_path, exist_ok=True)

# List to store empty province folders (no .shp file directly inside)
empty_province_folders = []

# Processing settings
SAVE_INTERVAL = 10  # Flush the buffer once it holds this many shapefiles
processed_count = 0

# Paths for output files (the keys double as the return-period folder names)
output_files = {
    "5yr": os.path.join(output_path, "FloodRisk_5yr.parquet"),
    "25yr": os.path.join(output_path, "FloodRisk_25yr.parquet"),
    "100yr": os.path.join(output_path, "FloodRisk_100yr.parquet"),
}


# Iterate through Flood 1 to Flood 5
for flood_group in sorted(os.listdir(root_dir)):
    flood_group_path = os.path.join(root_dir, flood_group)

    # Skip if not a directory
    if not os.path.isdir(flood_group_path):
        continue

    # Iterate through return periods (5yr, 25yr, 100yr)
    for return_period in sorted(os.listdir(flood_group_path)):
        return_period_path = os.path.join(flood_group_path, return_period)

        if not os.path.isdir(return_period_path):
            continue

        # A folder with no matching output file would otherwise raise KeyError
        # at save time, after all of its shapefiles had already been read.
        if return_period not in output_files:
            print(f"Skipping unknown return period folder: {return_period_path}")
            continue

        # Buffer of GeoDataFrames waiting to be written
        all_gdfs = []
        province_counter = 0  # Shapefiles buffered since the last flush

        # Iterate through province folders
        for province in sorted(os.listdir(return_period_path)):
            province_path = os.path.join(return_period_path, province)

            if not os.path.isdir(province_path):
                continue

            # Check for .shp files in the province folder
            shp_files = [f for f in os.listdir(province_path) if f.endswith(".shp")]

            if not shp_files:
                # If no shapefile, there is likely another subfolder containing the shapefile
                empty_province_folders.append(province_path)

                for province2 in sorted(os.listdir(province_path)):
                    province2_path = os.path.join(province_path, province2)

                    if os.path.isdir(province2_path):

                        # Find the shapefile inside the nested province folder
                        shapefiles = glob.glob(os.path.join(province2_path, "*.shp"))

                        for shp in shapefiles:

                            print(f"      📂 Reading file: {shp}")

                            try:
                                # Load shapefile
                                gdf = gpd.read_file(shp)

                                # ✅ Convert CRS if needed
                                if gdf.crs != COMMON_CRS:
                                    print(f"🔄 Converting CRS from {gdf.crs} to {COMMON_CRS}")
                                    gdf = gdf.to_crs(COMMON_CRS)

                                # Rename 'Var' to 'FloodRisk'
                                gdf = gdf.rename(columns={"Var": "FloodRisk"})

                                # Add metadata: Province (taken from the nested
                                # folder name) and Flood Return Period
                                gdf["Province"] = province2
                                gdf["FloodReturnPeriod"] = return_period

                                # Collect data for batch processing
                                all_gdfs.append(gdf)
                                processed_count += 1
                                province_counter += 1

                            except Exception as e:
                                # A shapefile that cannot be read or reprojected is reported and skipped
                                print(f"      ❌ Error processing {shp}: {e}")

            # **Save after every SAVE_INTERVAL shapefiles**
            if province_counter >= SAVE_INTERVAL:
                if all_gdfs:
                    combined_gdf = gpd.GeoDataFrame(pd.concat(all_gdfs, ignore_index=True))
                    parquet_path = output_files[return_period]

                    # Append by reading old data and merging before saving
                    if os.path.exists(parquet_path):
                        old_gdf = gpd.read_parquet(parquet_path)
                        combined_gdf = pd.concat([old_gdf, combined_gdf], ignore_index=True)

                    # Save
                    combined_gdf.to_parquet(parquet_path, index=False)
                    print(f"      ✅ Saved {province_counter} provinces to {parquet_path}")

                    # Reset buffer
                    all_gdfs = []
                    province_counter = 0

        # **Final save if any data remains**
        if all_gdfs:
            combined_gdf = gpd.GeoDataFrame(pd.concat(all_gdfs, ignore_index=True))
            parquet_path = output_files[return_period]

            if os.path.exists(parquet_path):
                old_gdf = gpd.read_parquet(parquet_path)
                combined_gdf = pd.concat([old_gdf, combined_gdf], ignore_index=True)

            combined_gdf.to_parquet(parquet_path, index=False)
            print(f"      ✅ Final save for {return_period} at {parquet_path}")


# Final message
print("✅ Incremental processing with auto-saving complete!")



      📂 Reading file: ../00_data/flood_risk/Flood 1/100yr/Albay/Albay/PH050500000_FH_100yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/100yr/Bataan/Bataan/PH030800000_FH_100yr.shp
      ✅ Final save for 100yr at ../01_processed_data/flood_risk/FloodRisk_100yr.parquet
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Aklan/Aklan/PH060400000_FH_25yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Albay/Albay/PH050500000_FH_25yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Bataan/Bataan/PH030800000_FH_25yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Cebu/Cebu/PH072200000_FH_25yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Leyte/Leyte/PH083700000_FH_25yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Misamis Oriental/Misamis Oriental/PH104300000_FH_25yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Pangasinan/Pangasinan/PH015500000_FH_25yr.shp
      ✅ Final save for 25yr at ../01_

## Read Parquet Files and Split

In [1]:
import dask_geopandas as dgpd
# Each merged file is read through dask-geopandas and then materialised with
# .compute(), one return period at a time.
# NOTE(review): no cell visible in this notebook writes the *_reprojected.parquet
# files — confirm where the reprojection step lives.
ddf_flood_5 = dgpd.read_parquet("../01_processed_data/flood_risk/FloodRisk_5yr_reprojected.parquet")
gdf_flood_5 = ddf_flood_5.compute()

ddf_flood_25 = dgpd.read_parquet("../01_processed_data/flood_risk/FloodRisk_25yr_reprojected.parquet")
gdf_flood_25 = ddf_flood_25.compute()

ddf_flood_100 = dgpd.read_parquet("../01_processed_data/flood_risk/FloodRisk_100yr_reprojected.parquet")
gdf_flood_100 = ddf_flood_100.compute()

In [3]:
# Display the 100-year flood frame (bare expression -> notebook rich repr)
gdf_flood_100

Unnamed: 0,FloodRisk,geometry
0,1.0,"MULTIPOLYGON (((759330 910470, 759300 910470, ..."
1,2.0,"MULTIPOLYGON (((759330 910500, 759300 910500, ..."
2,3.0,"MULTIPOLYGON (((759540 910950, 759510 910950, ..."
3,1.0,"MULTIPOLYGON (((387400 694390, 387390 694390, ..."
4,2.0,"MULTIPOLYGON (((385510 695320, 385500 695320, ..."
...,...,...
248,2.0,"MULTIPOLYGON (((669415 902377.571, 669385 9023..."
249,3.0,"MULTIPOLYGON (((671125 902415, 671095 902415, ..."
250,1.0,"MULTIPOLYGON Z (((205417 1728585 0, 205417 172..."
251,2.0,"MULTIPOLYGON Z (((205427 1728565 0, 205427 172..."


In [2]:
# Drop Province and FloodReturnPeriod columns.
# errors="ignore" makes the cell safe to re-run: the frames are reassigned in
# place, so on a second run the columns are already gone and a plain drop()
# would raise KeyError.
metadata_columns = ["Province", "FloodReturnPeriod"]
gdf_flood_5 = gdf_flood_5.drop(columns=metadata_columns, errors="ignore")
gdf_flood_25 = gdf_flood_25.drop(columns=metadata_columns, errors="ignore")
gdf_flood_100 = gdf_flood_100.drop(columns=metadata_columns, errors="ignore")

In [5]:
import geopandas as gpd
import pandas as pd
from tqdm import tqdm
import os

def simplify_and_save(single_row, filepath, initial_tolerance=0.001, max_retries=5, size_limit_mb=100):
    """Write a simplified copy of `single_row` to `filepath`, retrying until it is small enough.

    Every attempt simplifies the geometry of a fresh copy (the input object is
    not modified), writes it with `to_parquet` and checks the resulting file
    size. If the file is still larger than the limit, the tolerance is doubled
    for the next attempt.

    Parameters
    ----------
    single_row : object with `copy()`, item access on "geometry" and `to_parquet()`
        The frame to simplify and save.
    filepath : str
        Destination file; overwritten on every attempt.
    initial_tolerance : float
        Tolerance of the first attempt.
    max_retries : int
        Maximum number of attempts.
    size_limit_mb : float
        Largest accepted file size, in MiB (1024 * 1024 bytes).

    Returns
    -------
    The tolerance of the first attempt whose file fits the limit, or None if
    no attempt did. In that case the file from the last attempt stays on disk.
    """
    bytes_per_mb = 1024 * 1024
    current_tolerance = initial_tolerance

    for _attempt in range(max_retries):
        candidate = single_row.copy()
        candidate["geometry"] = candidate["geometry"].simplify(
            tolerance=current_tolerance, preserve_topology=True
        )
        candidate.to_parquet(filepath, index=False)

        written_mb = os.path.getsize(filepath) / bytes_per_mb
        if written_mb <= size_limit_mb:
            return current_tolerance

        # Still too large: simplify more aggressively on the next attempt
        current_tolerance *= 2

    return None

def split_gdf_to_files(gdf, output_dir, prefix="part", file_ext="parquet", initial_tolerance=0.001):
    """Save every row of `gdf` as its own file and log the tolerance that was needed.

    Files are named "<prefix>_<index label, zero-padded to 4 digits>.<file_ext>"
    inside `output_dir`, which is created if missing. Each one-row frame is
    handed to `simplify_and_save`. `file_ext` only affects the file name.

    Returns a DataFrame with one record per row: "filename" and
    "tolerance_used" (the value returned by `simplify_and_save`).
    """
    os.makedirs(output_dir, exist_ok=True)
    log_records = []

    rows_with_progress = tqdm(gdf.iterrows(), total=len(gdf), desc=f"Saving {output_dir} files")
    for i, row in rows_with_progress:
        # Rebuild a one-row GeoDataFrame that keeps the CRS of the source frame
        one_row_gdf = gpd.GeoDataFrame([row], crs=gdf.crs)

        filename = f"{prefix}_{i:04d}.{file_ext}"
        tolerance_used = simplify_and_save(
            one_row_gdf,
            os.path.join(output_dir, filename),
            initial_tolerance=initial_tolerance,
        )
        log_records.append({"filename": filename, "tolerance_used": tolerance_used})

    return pd.DataFrame(log_records)

# Split each return-period frame into one Parquet file per row
results_5 = split_gdf_to_files(gdf=gdf_flood_5, output_dir="flood_5_split_parquet", prefix="part")
results_25 = split_gdf_to_files(gdf=gdf_flood_25, output_dir="flood_25_split_parquet", prefix="part")
results_100 = split_gdf_to_files(gdf=gdf_flood_100, output_dir="flood_100_split_parquet", prefix="part")

# Optional: keep the per-file tolerance logs as CSV
tolerance_logs = (
    (results_5, "flood_5_tolerance_log.csv"),
    (results_25, "flood_25_tolerance_log.csv"),
    (results_100, "flood_100_tolerance_log.csv"),
)
for log_frame, log_path in tolerance_logs:
    log_frame.to_csv(log_path, index=False)

Saving flood_5_split_parquet files: 100%|██████████| 207/207 [10:08<00:00,  2.94s/it]
Saving flood_25_split_parquet files: 100%|██████████| 222/222 [12:06<00:00,  3.27s/it]
Saving flood_100_split_parquet files: 100%|██████████| 253/253 [24:59<00:00,  5.93s/it]


## Read and Merge Parquet Files

In [13]:
import geopandas as gpd
import os
from glob import glob

def merge_parquet_folder(folder_path):
    """Read every ``*.parquet`` file in `folder_path` and merge them into one GeoDataFrame.

    Files are read in sorted filename order and concatenated with a fresh
    0..n-1 index. The CRS of the first file is passed on to the result.

    Raises
    ------
    ValueError
        If `folder_path` contains no ``.parquet`` file (this includes a folder
        that does not exist). Without this check the failure would surface as
        pandas' generic "No objects to concatenate".
    """
    # Get all .parquet files in the folder
    parquet_files = sorted(glob(os.path.join(folder_path, "*.parquet")))
    if not parquet_files:
        raise ValueError(f"No .parquet files found in {folder_path}")

    # Read each file into a GeoDataFrame
    gdfs = [gpd.read_parquet(f) for f in parquet_files]

    # Concatenate into one GeoDataFrame
    merged_gdf = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=gdfs[0].crs)
    return merged_gdf

# Rebuild one GeoDataFrame per return period from its folder of split files
merged_flood_5 = merge_parquet_folder(folder_path="flood_5_split_parquet")
merged_flood_25 = merge_parquet_folder(folder_path="flood_25_split_parquet")
merged_flood_100 = merge_parquet_folder(folder_path="flood_100_split_parquet")



In [15]:
# Display the re-merged 100-year frame (bare expression -> notebook rich repr)
merged_flood_100

Unnamed: 0,FloodRisk,geometry
0,1.0,"MULTIPOLYGON (((759330 910470, 759300 910470, ..."
1,2.0,"MULTIPOLYGON (((759330 910500, 759300 910500, ..."
2,3.0,"MULTIPOLYGON (((759540 910950, 759510 910950, ..."
3,1.0,"MULTIPOLYGON (((387400 694390, 387390 694390, ..."
4,2.0,"MULTIPOLYGON (((385510 695320, 385500 695320, ..."
...,...,...
248,2.0,"MULTIPOLYGON (((669415 902377.571, 669385 9023..."
249,3.0,"MULTIPOLYGON (((671125 902415, 671095 902415, ..."
250,1.0,"MULTIPOLYGON Z (((205417 1728585 0, 205417 172..."
251,2.0,"MULTIPOLYGON Z (((205427 1728565 0, 205427 172..."


## Split and Save as GeoJSON (some files can't be opened)

In [29]:
import os

def find_large_files(directory, threshold):
    """Return ``(filepath, size_in_bytes)`` for every file under `directory` larger than `threshold`.

    The tree is walked recursively with ``os.walk``; a directory that does not
    exist therefore yields an empty list. Files that disappear between listing
    and ``os.stat`` are skipped. Results keep the order in which they were found.
    """
    oversized = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(root, file)
            try:
                size = os.stat(filepath).st_size
            except FileNotFoundError:
                # In case the file was removed during scanning
                continue
            if size > threshold:
                oversized.append((filepath, size))
    return oversized


# Define the directory to scan
directory = "flood_5_split"
threshold = 100 * 1024 * 1024  # 100 MB in bytes

large_files = find_large_files(directory, threshold)
for filepath, size in large_files:
    print(f"{filepath} - {size / (1024 * 1024):.2f} MB")

count = len(large_files)
print(f"\nTotal files exceeding 100MB: {count}")

flood_5_split/part_0141.geojson - 101.27 MB
flood_5_split/part_0144.geojson - 136.30 MB
flood_5_split/part_0156.geojson - 128.01 MB
flood_5_split/part_0198.geojson - 161.39 MB
flood_5_split/part_0171.geojson - 167.75 MB
flood_5_split/part_0204.geojson - 135.76 MB
flood_5_split/part_0091.geojson - 105.86 MB
flood_5_split/part_0127.geojson - 115.01 MB
flood_5_split/part_0126.geojson - 144.05 MB
flood_5_split/part_0106.geojson - 116.33 MB
flood_5_split/part_0105.geojson - 169.96 MB
flood_5_split/part_0099.geojson - 106.70 MB
flood_5_split/part_0102.geojson - 114.95 MB
flood_5_split/part_0090.geojson - 168.83 MB
flood_5_split/part_0150.geojson - 129.66 MB
flood_5_split/part_0048.geojson - 116.16 MB
flood_5_split/part_0174.geojson - 122.44 MB

Total files exceeding 100MB: 17


In [30]:
# Directory to scan and the size limit (100 MB expressed in bytes)
directory = "flood_25_split"
threshold = 100 * 1024 * 1024
count = 0

# Walk the tree and report every file above the limit
for root, dirs, files in os.walk(directory):
    for filepath in (os.path.join(root, file) for file in files):
        try:
            size = os.stat(filepath).st_size
        except FileNotFoundError:
            # The file vanished between listing and stat; nothing to report
            continue

        if size <= threshold:
            continue

        print(f"{filepath} - {size / (1024 * 1024):.2f} MB")
        count += 1

print(f"\nTotal files exceeding 100MB: {count}")

flood_25_split/part_0213.geojson - 165.15 MB
flood_25_split/part_0123.geojson - 170.22 MB
flood_25_split/part_0156.geojson - 142.59 MB
flood_25_split/part_0198.geojson - 132.39 MB
flood_25_split/part_0219.geojson - 142.37 MB
flood_25_split/part_0186.geojson - 180.84 MB
flood_25_split/part_0127.geojson - 102.01 MB
flood_25_split/part_0069.geojson - 136.58 MB
flood_25_split/part_0124.geojson - 131.56 MB
flood_25_split/part_0126.geojson - 125.83 MB
flood_25_split/part_0157.geojson - 124.42 MB
flood_25_split/part_0117.geojson - 109.51 MB
flood_25_split/part_0106.geojson - 122.17 MB
flood_25_split/part_0216.geojson - 102.82 MB
flood_25_split/part_0105.geojson - 171.17 MB
flood_25_split/part_0214.geojson - 104.30 MB
flood_25_split/part_0199.geojson - 104.56 MB
flood_25_split/part_0084.geojson - 109.86 MB
flood_25_split/part_0033.geojson - 116.32 MB
flood_25_split/part_0174.geojson - 130.67 MB
flood_25_split/part_0120.geojson - 119.46 MB

Total files exceeding 100MB: 21


In [31]:
# Folder of split 100-year files, and the reporting limit in bytes (100 MB)
directory = "flood_100_split"
threshold = 100 * 1024 * 1024
count = 0

for root, dirs, files in os.walk(directory):
    for file in files:
        filepath = os.path.join(root, file)
        try:
            size = os.path.getsize(filepath)
        except FileNotFoundError:
            # The file was removed while the scan was running
            continue
        else:
            if size > threshold:
                print(f"{filepath} - {size / (1024 * 1024):.2f} MB")
                count += 1

print(f"\nTotal files exceeding 100MB: {count}")

flood_100_split/part_0193.geojson - 142.55 MB
flood_100_split/part_0108.geojson - 111.67 MB
flood_100_split/part_0213.geojson - 200.00 MB
flood_100_split/part_0132.geojson - 110.82 MB
flood_100_split/part_0239.geojson - 185.90 MB
flood_100_split/part_0238.geojson - 242.11 MB
flood_100_split/part_0123.geojson - 131.54 MB
flood_100_split/part_0202.geojson - 173.72 MB
flood_100_split/part_0112.geojson - 131.10 MB
flood_100_split/part_0031.geojson - 106.93 MB
flood_100_split/part_0087.geojson - 114.76 MB
flood_100_split/part_0229.geojson - 105.67 MB
flood_100_split/part_0211.geojson - 200.50 MB
flood_100_split/part_0171.geojson - 175.62 MB
flood_100_split/part_0125.geojson - 107.58 MB
flood_100_split/part_0155.geojson - 169.59 MB
flood_100_split/part_0227.geojson - 173.72 MB
flood_100_split/part_0127.geojson - 133.21 MB
flood_100_split/part_0153.geojson - 196.64 MB
flood_100_split/part_0250.geojson - 161.03 MB
flood_100_split/part_0124.geojson - 199.83 MB
flood_100_split/part_0126.geojson 

In [43]:
import os
import geopandas as gpd
import fiona

# Check every expected part file in `folder`; any file Fiona cannot open is
# regenerated from the matching row of gdf_flood_5 with a simplified geometry.

# Base folder and DataFrame
folder = "flood_5_split"

# Parameters
# Simplification tolerance, expressed in the coordinate units of gdf_flood_5.crs
tolerance = 0.01

# Loop over expected filenames
for i in tqdm(range(len(gdf_flood_5)), desc="Checking GeoJSON files"):
    filename = f"part_{i:04d}.geojson"
    filepath = os.path.join(folder, filename)

    if not os.path.exists(filepath):
        print(f"Missing: {filename} — skipping")
        continue

    # Try to open with Fiona
    try:
        with fiona.open(filepath):
            pass  # File is readable
    except Exception as e:
        print(f"{filename} is broken. Attempting to simplify and re-save...")
        # Surface the reason the open failed; previously `e` was captured and discarded,
        # so there was no way to tell a corrupt file from e.g. a driver size limit.
        print(f"  Reason: {type(e).__name__}: {e}")

        try:
            # NOTE(review): `i` is a position from range(len(...)) but `.loc` looks rows up
            # by index label; the two only agree for a default 0..n-1 index — confirm,
            # otherwise `.iloc[i]` is the matching accessor.
            row = gdf_flood_5.loc[i]
            simplified_geom = row.geometry.simplify(tolerance=tolerance, preserve_topology=True)
            new_gdf = gpd.GeoDataFrame([row], geometry=[simplified_geom], crs=gdf_flood_5.crs)

            new_gdf.to_file(filepath, driver="GeoJSON")

            # Only claim success once the rewritten file passes the same readability
            # check that flagged it; a failure here is reported by the handler below.
            with fiona.open(filepath):
                pass
            print(f"✔ Fixed: {filename}")

        except Exception as e2:
            print(f"❌ Failed to fix {filename}: {e2}")


part_0048.geojson is broken. Attempting to simplify and re-save...
✔ Fixed: part_0048.geojson
part_0069.geojson is broken. Attempting to simplify and re-save...
✔ Fixed: part_0069.geojson
part_0084.geojson is broken. Attempting to simplify and re-save...
✔ Fixed: part_0084.geojson
part_0090.geojson is broken. Attempting to simplify and re-save...
✔ Fixed: part_0090.geojson
part_0091.geojson is broken. Attempting to simplify and re-save...
✔ Fixed: part_0091.geojson
part_0096.geojson is broken. Attempting to simplify and re-save...
✔ Fixed: part_0096.geojson
part_0099.geojson is broken. Attempting to simplify and re-save...
✔ Fixed: part_0099.geojson
part_0102.geojson is broken. Attempting to simplify and re-save...
✔ Fixed: part_0102.geojson
part_0105.geojson is broken. Attempting to simplify and re-save...
✔ Fixed: part_0105.geojson
part_0106.geojson is broken. Attempting to simplify and re-save...
✔ Fixed: part_0106.geojson
part_0126.geojson is broken. Attempting to simplify and re-s

In [44]:
import os
import geopandas as gpd
import fiona

# Check every expected part file in `folder`; any file Fiona cannot open is
# regenerated from the matching row of gdf_flood_25 with a simplified geometry.

# Base folder and DataFrame
folder = "flood_25_split"

# Parameters
# Simplification tolerance, expressed in the coordinate units of gdf_flood_25.crs
tolerance = 0.01

# Loop over expected filenames
for i in tqdm(range(len(gdf_flood_25)), desc="Checking GeoJSON files"):
    filename = f"part_{i:04d}.geojson"
    filepath = os.path.join(folder, filename)

    if not os.path.exists(filepath):
        print(f"Missing: {filename} — skipping")
        continue

    # Try to open with Fiona
    try:
        with fiona.open(filepath):
            pass  # File is readable
    except Exception as e:
        print(f"{filename} is broken. Attempting to simplify and re-save...")
        # Surface the reason the open failed; previously `e` was captured and discarded,
        # so there was no way to tell a corrupt file from e.g. a driver size limit.
        print(f"  Reason: {type(e).__name__}: {e}")

        try:
            # NOTE(review): `i` is a position from range(len(...)) but `.loc` looks rows up
            # by index label; the two only agree for a default 0..n-1 index — confirm,
            # otherwise `.iloc[i]` is the matching accessor.
            row = gdf_flood_25.loc[i]
            simplified_geom = row.geometry.simplify(tolerance=tolerance, preserve_topology=True)
            new_gdf = gpd.GeoDataFrame([row], geometry=[simplified_geom], crs=gdf_flood_25.crs)

            new_gdf.to_file(filepath, driver="GeoJSON")

            # Only claim success once the rewritten file passes the same readability
            # check that flagged it; a failure here is reported by the handler below.
            with fiona.open(filepath):
                pass
            print(f"✔ Fixed: {filename}")

        except Exception as e2:
            print(f"❌ Failed to fix {filename}: {e2}")


Checking GeoJSON files:  15%|█▍        | 33/222 [00:19<02:56,  1.07it/s]

part_0033.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  15%|█▌        | 34/222 [00:43<24:28,  7.81s/it]

✔ Fixed: part_0033.geojson
part_0034.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  16%|█▌        | 35/222 [00:58<31:48, 10.21s/it]

✔ Fixed: part_0034.geojson


Checking GeoJSON files:  24%|██▍       | 54/222 [01:09<01:23,  2.01it/s]

part_0054.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  25%|██▍       | 55/222 [01:24<12:53,  4.63s/it]

✔ Fixed: part_0054.geojson


Checking GeoJSON files:  31%|███       | 68/222 [01:31<01:09,  2.21it/s]

part_0069.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  32%|███▏      | 70/222 [01:58<13:42,  5.41s/it]

✔ Fixed: part_0069.geojson
part_0070.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  32%|███▏      | 71/222 [02:13<18:49,  7.48s/it]

✔ Fixed: part_0070.geojson


Checking GeoJSON files:  38%|███▊      | 84/222 [02:19<00:58,  2.34it/s]

part_0084.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  38%|███▊      | 85/222 [02:43<16:36,  7.27s/it]

✔ Fixed: part_0084.geojson


Checking GeoJSON files:  46%|████▌     | 102/222 [02:53<00:55,  2.14it/s]

part_0105.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  48%|████▊     | 106/222 [03:23<09:28,  4.90s/it]

✔ Fixed: part_0105.geojson
part_0106.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  48%|████▊     | 107/222 [03:43<14:35,  7.62s/it]

✔ Fixed: part_0106.geojson


Checking GeoJSON files:  50%|█████     | 111/222 [03:46<05:45,  3.11s/it]

part_0111.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  50%|█████     | 112/222 [04:03<12:35,  6.87s/it]

✔ Fixed: part_0111.geojson


Checking GeoJSON files:  51%|█████▏    | 114/222 [04:05<07:06,  3.95s/it]

part_0117.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  53%|█████▎    | 118/222 [04:27<08:20,  4.82s/it]

✔ Fixed: part_0117.geojson
part_0118.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  54%|█████▎    | 119/222 [04:42<11:38,  6.78s/it]

✔ Fixed: part_0118.geojson


Checking GeoJSON files:  54%|█████▍    | 120/222 [04:43<09:31,  5.60s/it]

part_0120.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  55%|█████▍    | 121/222 [05:05<15:36,  9.27s/it]

✔ Fixed: part_0120.geojson


Checking GeoJSON files:  55%|█████▌    | 123/222 [05:07<09:10,  5.56s/it]

part_0123.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  56%|█████▌    | 124/222 [05:37<19:56, 12.21s/it]

✔ Fixed: part_0123.geojson
part_0124.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  56%|█████▋    | 125/222 [05:59<23:58, 14.83s/it]

✔ Fixed: part_0124.geojson


Checking GeoJSON files:  57%|█████▋    | 126/222 [06:00<17:34, 10.98s/it]

part_0126.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  57%|█████▋    | 127/222 [06:21<21:59, 13.89s/it]

✔ Fixed: part_0126.geojson
part_0127.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  58%|█████▊    | 128/222 [06:37<22:56, 14.65s/it]

✔ Fixed: part_0127.geojson


Checking GeoJSON files:  70%|███████   | 156/222 [06:50<00:18,  3.48it/s]

part_0156.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  71%|███████   | 157/222 [07:20<09:48,  9.05s/it]

✔ Fixed: part_0156.geojson
part_0157.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  71%|███████   | 158/222 [07:45<14:46, 13.85s/it]

✔ Fixed: part_0157.geojson


Checking GeoJSON files:  76%|███████▌  | 168/222 [07:54<00:51,  1.05it/s]

part_0168.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  76%|███████▌  | 169/222 [08:14<05:55,  6.70s/it]

✔ Fixed: part_0168.geojson
part_0169.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  77%|███████▋  | 170/222 [08:30<08:07,  9.37s/it]

✔ Fixed: part_0169.geojson


Checking GeoJSON files:  78%|███████▊  | 174/222 [08:33<02:17,  2.87s/it]

part_0174.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  79%|███████▉  | 175/222 [08:58<07:30,  9.58s/it]

✔ Fixed: part_0174.geojson
part_0175.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  79%|███████▉  | 176/222 [09:16<09:13, 12.03s/it]

✔ Fixed: part_0175.geojson


Checking GeoJSON files:  81%|████████  | 180/222 [09:20<02:27,  3.51s/it]

part_0180.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  82%|████████▏ | 181/222 [09:36<04:53,  7.15s/it]

✔ Fixed: part_0180.geojson


Checking GeoJSON files:  84%|████████▍ | 186/222 [09:41<01:11,  1.98s/it]

part_0186.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  84%|████████▍ | 187/222 [10:30<09:28, 16.24s/it]

✔ Fixed: part_0186.geojson
part_0187.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  85%|████████▍ | 188/222 [10:48<09:29, 16.76s/it]

✔ Fixed: part_0187.geojson


Checking GeoJSON files:  89%|████████▉ | 198/222 [10:55<00:30,  1.26s/it]

part_0198.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  90%|████████▉ | 199/222 [11:22<03:25,  8.94s/it]

✔ Fixed: part_0198.geojson
part_0199.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  90%|█████████ | 200/222 [11:41<04:27, 12.15s/it]

✔ Fixed: part_0199.geojson


Checking GeoJSON files:  96%|█████████▌| 213/222 [11:51<00:06,  1.44it/s]

part_0213.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  96%|█████████▋| 214/222 [12:22<01:19,  9.91s/it]

✔ Fixed: part_0213.geojson
part_0214.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  97%|█████████▋| 215/222 [12:40<01:26, 12.36s/it]

✔ Fixed: part_0214.geojson


Checking GeoJSON files:  97%|█████████▋| 216/222 [12:41<00:53,  8.94s/it]

part_0216.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  98%|█████████▊| 217/222 [13:00<00:58, 11.76s/it]

✔ Fixed: part_0216.geojson


Checking GeoJSON files:  99%|█████████▊| 219/222 [13:02<00:18,  6.28s/it]

part_0219.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  99%|█████████▉| 220/222 [13:34<00:27, 13.94s/it]

✔ Fixed: part_0219.geojson


Checking GeoJSON files: 100%|█████████▉| 221/222 [13:34<00:09,  9.93s/it]

part_0221.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files: 100%|██████████| 222/222 [13:50<00:00,  3.74s/it]

✔ Fixed: part_0221.geojson





In [45]:
import os
import geopandas as gpd
import fiona

# Check every expected part file in `folder`; any file Fiona cannot open is
# regenerated from the matching row of gdf_flood_100 with a simplified geometry.

# Base folder and DataFrame
folder = "flood_100_split"

# Parameters
# Simplification tolerance, expressed in the coordinate units of gdf_flood_100.crs
tolerance = 0.01

# Loop over expected filenames
for i in tqdm(range(len(gdf_flood_100)), desc="Checking GeoJSON files"):
    filename = f"part_{i:04d}.geojson"
    filepath = os.path.join(folder, filename)

    if not os.path.exists(filepath):
        print(f"Missing: {filename} — skipping")
        continue

    # Try to open with Fiona
    try:
        with fiona.open(filepath):
            pass  # File is readable
    except Exception as e:
        print(f"{filename} is broken. Attempting to simplify and re-save...")
        # Surface the reason the open failed; previously `e` was captured and discarded,
        # so there was no way to tell a corrupt file from e.g. a driver size limit.
        print(f"  Reason: {type(e).__name__}: {e}")

        try:
            # NOTE(review): `i` is a position from range(len(...)) but `.loc` looks rows up
            # by index label; the two only agree for a default 0..n-1 index — confirm,
            # otherwise `.iloc[i]` is the matching accessor.
            row = gdf_flood_100.loc[i]
            simplified_geom = row.geometry.simplify(tolerance=tolerance, preserve_topology=True)
            new_gdf = gpd.GeoDataFrame([row], geometry=[simplified_geom], crs=gdf_flood_100.crs)

            new_gdf.to_file(filepath, driver="GeoJSON")

            # Only claim success once the rewritten file passes the same readability
            # check that flagged it; a failure here is reported by the handler below.
            with fiona.open(filepath):
                pass
            print(f"✔ Fixed: {filename}")

        except Exception as e2:
            print(f"❌ Failed to fix {filename}: {e2}")


Checking GeoJSON files:   6%|▋         | 16/253 [00:05<01:46,  2.22it/s]

part_0016.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:   7%|▋         | 17/253 [00:27<20:40,  5.26s/it]

✔ Fixed: part_0016.geojson


Checking GeoJSON files:   9%|▉         | 24/253 [00:32<04:09,  1.09s/it]

part_0024.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  10%|▉         | 25/253 [01:03<37:28,  9.86s/it]

✔ Fixed: part_0024.geojson
part_0025.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  10%|█         | 26/253 [01:29<55:59, 14.80s/it]

✔ Fixed: part_0025.geojson


Checking GeoJSON files:  11%|█         | 28/253 [01:31<29:04,  7.75s/it]

part_0028.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  11%|█▏        | 29/253 [01:46<36:39,  9.82s/it]

✔ Fixed: part_0028.geojson


Checking GeoJSON files:  12%|█▏        | 30/253 [01:47<26:38,  7.17s/it]

part_0030.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  12%|█▏        | 31/253 [02:13<47:27, 12.83s/it]

✔ Fixed: part_0030.geojson
part_0031.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  13%|█▎        | 32/253 [02:32<54:32, 14.81s/it]

✔ Fixed: part_0031.geojson


Checking GeoJSON files:  18%|█▊        | 45/253 [02:38<02:37,  1.32it/s]

part_0045.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  18%|█▊        | 46/253 [02:58<22:16,  6.46s/it]

✔ Fixed: part_0045.geojson
part_0046.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  19%|█▊        | 47/253 [03:13<31:08,  9.07s/it]

✔ Fixed: part_0046.geojson


Checking GeoJSON files:  25%|██▍       | 63/253 [03:21<01:59,  1.59it/s]

part_0063.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  25%|██▌       | 64/253 [03:42<21:07,  6.71s/it]

✔ Fixed: part_0063.geojson


Checking GeoJSON files:  31%|███       | 78/253 [03:50<01:13,  2.39it/s]

part_0078.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  31%|███       | 79/253 [04:25<27:15,  9.40s/it]

✔ Fixed: part_0078.geojson
part_0079.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  32%|███▏      | 80/253 [04:50<39:23, 13.66s/it]

✔ Fixed: part_0079.geojson


Checking GeoJSON files:  33%|███▎      | 83/253 [04:52<16:05,  5.68s/it]

part_0084.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  34%|███▎      | 85/253 [05:12<21:01,  7.51s/it]

✔ Fixed: part_0084.geojson
part_0085.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  34%|███▍      | 86/253 [05:29<26:32,  9.54s/it]

✔ Fixed: part_0085.geojson


Checking GeoJSON files:  34%|███▍      | 87/253 [05:30<20:47,  7.52s/it]

part_0087.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  35%|███▍      | 88/253 [05:51<29:44, 10.82s/it]

✔ Fixed: part_0087.geojson


Checking GeoJSON files:  40%|████      | 102/253 [06:02<01:45,  1.44it/s]

part_0102.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  41%|████      | 103/253 [06:32<23:31,  9.41s/it]

✔ Fixed: part_0102.geojson
part_0103.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  41%|████      | 104/253 [06:59<36:25, 14.67s/it]

✔ Fixed: part_0103.geojson


Checking GeoJSON files:  43%|████▎     | 108/253 [07:03<10:01,  4.15s/it]

part_0108.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  43%|████▎     | 109/253 [07:22<20:59,  8.75s/it]

✔ Fixed: part_0108.geojson
part_0109.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  43%|████▎     | 110/253 [07:36<24:35, 10.32s/it]

✔ Fixed: part_0109.geojson


Checking GeoJSON files:  44%|████▍     | 112/253 [07:39<13:21,  5.68s/it]

part_0112.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  45%|████▍     | 113/253 [08:01<24:39, 10.57s/it]

✔ Fixed: part_0112.geojson


Checking GeoJSON files:  47%|████▋     | 120/253 [08:04<03:05,  1.39s/it]

part_0120.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  48%|████▊     | 121/253 [08:17<10:14,  4.66s/it]

✔ Fixed: part_0120.geojson


Checking GeoJSON files:  49%|████▊     | 123/253 [08:19<06:00,  2.77s/it]

part_0123.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  49%|████▉     | 124/253 [08:43<19:30,  9.07s/it]

✔ Fixed: part_0123.geojson
part_0124.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  49%|████▉     | 125/253 [09:22<38:02, 17.84s/it]

✔ Fixed: part_0124.geojson
part_0125.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  50%|████▉     | 126/253 [09:41<38:17, 18.09s/it]

✔ Fixed: part_0125.geojson
part_0126.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  50%|█████     | 127/253 [10:11<45:04, 21.46s/it]

✔ Fixed: part_0126.geojson
part_0127.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  51%|█████     | 128/253 [10:35<46:32, 22.34s/it]

✔ Fixed: part_0127.geojson


Checking GeoJSON files:  51%|█████     | 129/253 [10:36<33:11, 16.06s/it]

part_0129.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  51%|█████▏    | 130/253 [11:03<39:19, 19.18s/it]

✔ Fixed: part_0129.geojson
part_0130.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  52%|█████▏    | 131/253 [11:24<40:23, 19.87s/it]

✔ Fixed: part_0130.geojson


Checking GeoJSON files:  52%|█████▏    | 132/253 [11:26<28:49, 14.30s/it]

part_0132.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  53%|█████▎    | 133/253 [11:51<35:18, 17.65s/it]

✔ Fixed: part_0132.geojson


Checking GeoJSON files:  53%|█████▎    | 135/253 [11:53<18:05,  9.20s/it]

part_0135.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  54%|█████▍    | 136/253 [12:08<20:57, 10.75s/it]

✔ Fixed: part_0135.geojson


Checking GeoJSON files:  58%|█████▊    | 147/253 [12:17<01:33,  1.13it/s]

part_0147.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  58%|█████▊    | 148/253 [12:34<09:49,  5.61s/it]

✔ Fixed: part_0147.geojson
part_0148.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  59%|█████▉    | 149/253 [12:50<14:59,  8.65s/it]

✔ Fixed: part_0148.geojson


Checking GeoJSON files:  59%|█████▉    | 150/253 [12:51<10:54,  6.36s/it]

part_0150.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  60%|█████▉    | 151/253 [13:05<14:49,  8.72s/it]

✔ Fixed: part_0150.geojson


Checking GeoJSON files:  60%|██████    | 153/253 [13:07<08:00,  4.80s/it]

part_0153.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  61%|██████    | 154/253 [13:45<24:30, 14.85s/it]

✔ Fixed: part_0153.geojson


Checking GeoJSON files:  61%|██████▏   | 155/253 [13:47<17:33, 10.75s/it]

part_0155.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  62%|██████▏   | 156/253 [14:17<26:47, 16.57s/it]

✔ Fixed: part_0155.geojson


Checking GeoJSON files:  62%|██████▏   | 157/253 [14:18<19:00, 11.88s/it]

part_0157.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  62%|██████▏   | 158/253 [14:33<20:16, 12.80s/it]

✔ Fixed: part_0157.geojson
part_0158.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  63%|██████▎   | 159/253 [15:01<27:15, 17.39s/it]

✔ Fixed: part_0158.geojson


Checking GeoJSON files:  64%|██████▎   | 161/253 [15:03<13:54,  9.07s/it]

part_0161.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  64%|██████▍   | 162/253 [15:33<23:08, 15.26s/it]

✔ Fixed: part_0161.geojson


Checking GeoJSON files:  64%|██████▍   | 163/253 [15:34<16:26, 10.96s/it]

part_0163.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  65%|██████▍   | 164/253 [15:50<18:43, 12.62s/it]

✔ Fixed: part_0163.geojson


Checking GeoJSON files:  66%|██████▌   | 166/253 [15:52<09:39,  6.67s/it]

part_0166.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  66%|██████▌   | 167/253 [16:27<21:56, 15.31s/it]

✔ Fixed: part_0166.geojson


Checking GeoJSON files:  66%|██████▋   | 168/253 [16:29<15:44, 11.11s/it]

part_0168.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  67%|██████▋   | 170/253 [17:02<17:12, 12.44s/it]

✔ Fixed: part_0168.geojson


Checking GeoJSON files:  68%|██████▊   | 171/253 [17:03<12:27,  9.12s/it]

part_0171.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  68%|██████▊   | 172/253 [17:52<28:08, 20.85s/it]

✔ Fixed: part_0171.geojson
part_0172.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  68%|██████▊   | 173/253 [18:13<28:10, 21.13s/it]

✔ Fixed: part_0172.geojson


Checking GeoJSON files:  69%|██████▉   | 174/253 [18:14<19:51, 15.08s/it]

part_0174.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  69%|██████▉   | 175/253 [18:43<25:03, 19.28s/it]

✔ Fixed: part_0174.geojson
part_0175.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  70%|██████▉   | 176/253 [19:09<27:04, 21.10s/it]

✔ Fixed: part_0175.geojson


Checking GeoJSON files:  76%|███████▌  | 192/253 [19:18<00:44,  1.36it/s]

part_0192.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  76%|███████▋  | 193/253 [19:40<06:50,  6.84s/it]

✔ Fixed: part_0192.geojson
part_0193.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  77%|███████▋  | 194/253 [20:03<11:38, 11.85s/it]

✔ Fixed: part_0193.geojson


Checking GeoJSON files:  77%|███████▋  | 195/253 [20:05<08:25,  8.72s/it]

part_0195.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  77%|███████▋  | 196/253 [20:23<11:03, 11.63s/it]

✔ Fixed: part_0195.geojson
part_0196.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  78%|███████▊  | 197/253 [20:38<11:48, 12.64s/it]

✔ Fixed: part_0196.geojson


Checking GeoJSON files:  78%|███████▊  | 198/253 [20:39<08:24,  9.16s/it]

part_0198.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  79%|███████▊  | 199/253 [20:58<10:53, 12.10s/it]

✔ Fixed: part_0198.geojson
part_0199.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  79%|███████▉  | 200/253 [21:15<12:06, 13.71s/it]

✔ Fixed: part_0199.geojson


Checking GeoJSON files:  79%|███████▉  | 201/253 [21:16<08:33,  9.88s/it]

part_0201.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  80%|███████▉  | 202/253 [21:43<12:43, 14.98s/it]

✔ Fixed: part_0201.geojson
part_0202.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  80%|████████  | 203/253 [22:13<16:07, 19.36s/it]

✔ Fixed: part_0202.geojson


Checking GeoJSON files:  81%|████████  | 204/253 [22:14<11:23, 13.94s/it]

part_0204.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  81%|████████  | 205/253 [22:30<11:39, 14.56s/it]

✔ Fixed: part_0204.geojson
part_0205.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  81%|████████▏ | 206/253 [22:51<12:59, 16.58s/it]

✔ Fixed: part_0205.geojson


Checking GeoJSON files:  82%|████████▏ | 207/253 [22:53<09:09, 11.94s/it]

part_0207.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  82%|████████▏ | 208/253 [23:35<15:45, 21.02s/it]

✔ Fixed: part_0207.geojson


Checking GeoJSON files:  83%|████████▎ | 209/253 [23:36<11:05, 15.12s/it]

part_0209.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  83%|████████▎ | 210/253 [24:11<15:06, 21.07s/it]

✔ Fixed: part_0209.geojson


Checking GeoJSON files:  83%|████████▎ | 211/253 [24:12<10:32, 15.06s/it]

part_0211.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  84%|████████▍ | 212/253 [24:48<14:33, 21.32s/it]

✔ Fixed: part_0211.geojson
part_0212.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  84%|████████▍ | 213/253 [25:02<12:43, 19.10s/it]

✔ Fixed: part_0212.geojson
part_0213.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  85%|████████▍ | 214/253 [25:37<15:35, 23.98s/it]

✔ Fixed: part_0213.geojson


Checking GeoJSON files:  85%|████████▍ | 215/253 [25:38<10:41, 16.88s/it]

part_0215.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  85%|████████▌ | 216/253 [25:52<09:56, 16.11s/it]

✔ Fixed: part_0215.geojson
part_0216.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  86%|████████▌ | 217/253 [26:24<12:31, 20.87s/it]

✔ Fixed: part_0216.geojson
part_0217.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  86%|████████▌ | 218/253 [26:51<13:13, 22.68s/it]

✔ Fixed: part_0217.geojson


Checking GeoJSON files:  87%|████████▋ | 220/253 [26:53<06:24, 11.65s/it]

part_0220.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  87%|████████▋ | 221/253 [27:23<09:06, 17.07s/it]

✔ Fixed: part_0220.geojson


Checking GeoJSON files:  88%|████████▊ | 222/253 [27:23<06:18, 12.21s/it]

part_0222.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  88%|████████▊ | 223/253 [27:54<08:52, 17.77s/it]

✔ Fixed: part_0222.geojson
part_0223.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  89%|████████▊ | 224/253 [28:32<11:27, 23.70s/it]

✔ Fixed: part_0223.geojson


Checking GeoJSON files:  89%|████████▉ | 225/253 [28:33<07:55, 16.97s/it]

part_0225.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  89%|████████▉ | 226/253 [28:50<07:40, 17.07s/it]

✔ Fixed: part_0225.geojson
part_0226.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  90%|████████▉ | 227/253 [29:17<08:41, 20.04s/it]

✔ Fixed: part_0226.geojson
part_0227.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  90%|█████████ | 228/253 [29:47<09:33, 22.95s/it]

✔ Fixed: part_0227.geojson


Checking GeoJSON files:  91%|█████████ | 229/253 [29:48<06:33, 16.40s/it]

part_0229.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  91%|█████████ | 230/253 [30:09<06:51, 17.87s/it]

✔ Fixed: part_0229.geojson
part_0230.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  91%|█████████▏| 231/253 [30:24<06:08, 16.75s/it]

✔ Fixed: part_0230.geojson


Checking GeoJSON files:  94%|█████████▍| 238/253 [30:31<00:34,  2.27s/it]

part_0238.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  94%|█████████▍| 239/253 [31:15<03:28, 14.91s/it]

✔ Fixed: part_0238.geojson
part_0239.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  95%|█████████▍| 240/253 [31:47<04:20, 20.02s/it]

✔ Fixed: part_0239.geojson


Checking GeoJSON files:  95%|█████████▌| 241/253 [31:48<02:52, 14.39s/it]

part_0241.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  96%|█████████▌| 242/253 [32:11<03:06, 16.93s/it]

✔ Fixed: part_0241.geojson
part_0242.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  96%|█████████▌| 243/253 [32:26<02:43, 16.37s/it]

✔ Fixed: part_0242.geojson


Checking GeoJSON files:  96%|█████████▋| 244/253 [32:27<01:45, 11.71s/it]

part_0244.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  97%|█████████▋| 245/253 [33:28<03:31, 26.39s/it]

✔ Fixed: part_0244.geojson
part_0245.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  97%|█████████▋| 246/253 [34:22<04:03, 34.79s/it]

✔ Fixed: part_0245.geojson
part_0246.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  98%|█████████▊| 247/253 [34:38<02:55, 29.20s/it]

✔ Fixed: part_0246.geojson
part_0247.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  98%|█████████▊| 248/253 [34:59<02:12, 26.59s/it]

✔ Fixed: part_0247.geojson
part_0248.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  98%|█████████▊| 249/253 [35:13<01:31, 22.85s/it]

✔ Fixed: part_0248.geojson


Checking GeoJSON files:  99%|█████████▉| 250/253 [35:14<00:48, 16.26s/it]

part_0250.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files:  99%|█████████▉| 251/253 [35:52<00:45, 22.74s/it]

✔ Fixed: part_0250.geojson
part_0251.geojson is broken. Attempting to simplify and re-save...


Checking GeoJSON files: 100%|█████████▉| 252/253 [36:13<00:22, 22.28s/it]

✔ Fixed: part_0251.geojson


Checking GeoJSON files: 100%|██████████| 253/253 [36:14<00:00,  8.59s/it]


In [None]:
# NOTE(review): every line of this cell is commented out, so it does nothing when run.
# If re-enabled it would read each file matching flood_5_split/part_*.geojson, concatenate
# them into one GeoDataFrame (`merged_gdf`) and take the CRS from the first file read.
# With no matching files the list is empty and the concat / `gdf_list[0]` step would fail.
# import geopandas as gpd
# import glob
# import os
# import pandas as pd

# # Adjust the path and extension based on your format (e.g., .geojson, .shp, .parquet)
# files = sorted(glob.glob("flood_5_split/part_*.geojson"))  # or "*.shp", "*.parquet", etc.

# # Read and concatenate all individual GeoDataFrames
# gdf_list = [gpd.read_file(file) for file in files]
# merged_gdf = gpd.GeoDataFrame(pd.concat(gdf_list, ignore_index=True), crs=gdf_list[0].crs)

# # Optionally save the merged file
# # merged_gdf.to_file("merged_output.geojson", driver="GeoJSON")


In [None]:
# NOTE(review): every line of this cell is commented out, so neither helper below is
# defined at runtime. If re-enabled it would provide:
#   - get_gdf_file_size: write a GeoDataFrame into an in-memory buffer and return its size in MB.
#   - split_and_save_gdf: accumulate rows and write a chunk to
#     "<output_dir>/<base_filename>_<part_index>.geojson" whenever adding the next row
#     pushes the serialised size above max_mb.
# Output names here are not zero-padded ("part_0.geojson"), unlike the "part_0000.geojson"
# files checked elsewhere in this notebook, so this is presumably not the code that
# produced the *_split folders — confirm before relying on it.
# import geopandas as gpd
# import io
# import os

# def get_gdf_file_size(gdf, driver="GeoJSON"):
#     """Estimate the file size of a GeoDataFrame in MB."""
#     buffer = io.BytesIO()
#     gdf.to_file(buffer, driver=driver)
#     return len(buffer.getvalue()) / (1024 * 1024)

# def split_and_save_gdf(gdf, output_dir="output", base_filename="part", max_mb=95, driver="GeoJSON"):
#     """Split GeoDataFrame into chunks under max_mb and save each to disk."""
#     if not os.path.exists(output_dir):
#         os.makedirs(output_dir)

#     # NOTE(review): `chunks` is assigned but never read anywhere below.
#     chunks = []
#     current_chunk = []
#     part_index = 0

#     # NOTE(review): the whole current chunk is re-serialised after every appended row,
#     # so the work per chunk grows roughly quadratically with its row count.
#     for _, row in gdf.iterrows():
#         current_chunk.append(row)
#         temp_gdf = gpd.GeoDataFrame(current_chunk, crs=gdf.crs)

#         if get_gdf_file_size(temp_gdf, driver=driver) > max_mb:
#             # Save previous chunk
#             # NOTE(review): if a single row alone exceeds max_mb, current_chunk[:-1] is empty
#             # here and an empty frame is written (or the write fails) — verify.
#             final_gdf = gpd.GeoDataFrame(current_chunk[:-1], crs=gdf.crs)
#             # NOTE(review): the ".geojson" extension is fixed even when `driver` is not GeoJSON.
#             file_path = os.path.join(output_dir, f"{base_filename}_{part_index}.geojson")
#             final_gdf.to_file(file_path, driver=driver)
#             print(f"Saved {file_path} ({get_gdf_file_size(final_gdf):.2f} MB)")
#             part_index += 1
#             current_chunk = [row]  # Start new chunk

#     # Save last chunk
#     if current_chunk:
#         final_gdf = gpd.GeoDataFrame(current_chunk, crs=gdf.crs)
#         file_path = os.path.join(output_dir, f"{base_filename}_{part_index}.geojson")
#         final_gdf.to_file(file_path, driver=driver)
#         print(f"Saved {file_path} ({get_gdf_file_size(final_gdf):.2f} MB)")

## GEE Dataset

In [5]:
!pip uninstall ee

Found existing installation: ee 0.2
Uninstalling ee-0.2:
  Would remove:
    /raid/students/ryan/anaconda3/envs/capstone_venv/bin/ee
    /raid/students/ryan/anaconda3/envs/capstone_venv/lib/python3.13/site-packages/ee-0.2.dist-info/*
    /raid/students/ryan/anaconda3/envs/capstone_venv/lib/python3.13/site-packages/ee/*
Proceed (Y/n)? [31mERROR: Operation cancelled by user[0m[31m
[0m^C


In [10]:
import ee
# Set up the Earth Engine client for this kernel session before any ee.* objects are built.
# NOTE(review): presumably relies on credentials saved by an earlier `ee.Authenticate()`
# (or `earthengine authenticate`) run — confirm; no authentication step is visible here.
ee.Initialize()


In [18]:
# Collapse the "flooded" band of every image in the collection into a single
# per-pixel mean image.
# NOTE(review): presumably "flooded" is a 0/1 flag, making the mean the share of
# events in which a pixel was flooded — confirm against the dataset description.
flood = (
    ee.ImageCollection("GLOBAL_FLOOD_DB/MODIS_EVENTS/V1")
    .select("flooded")
    .mean()
)

In [19]:
# Echo the object's repr to check what `flood` is bound to
flood

<ee.image.Image at 0x7f97bf53d1d0>