# Processing Disaster Risk Maps

## Flood Risk

In [2]:
# Extracting Zip Files

import zipfile
import os
import glob
import geopandas as gpd
import pandas as pd
import shutil

# Define root directory
root_dir = "../00_data/flood_risk/"

# Folders that archives were successfully extracted into
extracted_folders = []

# Expected layout: <root_dir>/<flood scenario>/<return period>/<archive>.zip
for flood_scenario in sorted(os.listdir(root_dir)):  # e.g., "Flood 1", "Flood 2"
    scenario_path = os.path.join(root_dir, flood_scenario)
    if not os.path.isdir(scenario_path):
        continue

    for return_period in sorted(os.listdir(scenario_path)):  # e.g., "5yr", "25yr", "100yr"
        return_period_path = os.path.join(scenario_path, return_period)
        if not os.path.isdir(return_period_path):
            continue

        # Extract every ZIP archive found directly inside this return period
        for zip_file in sorted(glob.glob(os.path.join(return_period_path, "*.zip"))):
            # Strip only the trailing extension. str.replace(".zip", "") would
            # also remove ".zip" appearing anywhere else in the path.
            extract_folder = os.path.splitext(zip_file)[0]

            with zipfile.ZipFile(zip_file, "r") as zip_ref:
                zip_ref.extractall(extract_folder)

            # Record the folder only once extraction finished without raising,
            # so the reported count reflects archives actually extracted.
            extracted_folders.append(extract_folder)

print(f"Extracted {len(extracted_folders)} ZIP files.")


Extracted 220 ZIP files.


In [7]:
# Delete ZIP files

import os
import glob

# Root of the flood-risk data tree
root_dir = "../00_data/flood_risk/"

# Recursive pattern: matches *.zip at any depth below root_dir
zip_pattern = os.path.join(root_dir, "**", "*.zip")
zip_files = glob.glob(zip_pattern, recursive=True)

# Remove each archive and report it; this is irreversible, so run it only
# after the archives have been extracted.
for zip_file in zip_files:
    os.remove(zip_file)
    print("Deleted:", zip_file)

print("All .zip files have been removed.")

Deleted: ../00_data/flood_risk/Flood 2/100yr/AgusanDelNorte.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/DavaoDelSur.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Kalinga.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/MisamisOccidental.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Batangas.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/SultanKudarat.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Cebu.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/LanaoDelSur.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Catanduanes.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Aklan.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/DavaoOccidental.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/MountainProvince.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Sarangani.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/MetroManila.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Leyte.zip
Deleted: ../00_data/flood_risk/Flood 2/100yr/Misamis Oriental.zip
Deleted: ../00_data/flood_r

In [6]:
import geopandas as gpd

# Sample shapefile to inspect, given relative to root_dir
# (root_dir is defined by an earlier cell)
shp_file = "Flood 1/5yr/MountainProvince/MountainProvince_Flood_5year.shp"

# Read the sample into a GeoDataFrame
gdf = gpd.read_file(f"{root_dir}{shp_file}")

# Show the first rows, then the column index, each as plain text
for preview in (gdf.head(), gdf.columns):
    print(preview)

   Var                                           geometry
0  1.0  MULTIPOLYGON (((121.52562 17.26903, 121.52534 ...
1  2.0  MULTIPOLYGON (((121.52618 17.26985, 121.5259 1...
2  3.0  MULTIPOLYGON (((121.52618 17.26985, 121.52618 ...
Index(['Var', 'geometry'], dtype='object')


### Merging Files
The first run below stopped prematurely with a `ValueError`: some shapefiles use a different CRS (`WGS 84 / UTM zone 51N`) than the rest (`WGS 84`), so they could not be concatenated.

In [4]:
import geopandas as gpd
import pandas as pd
import os
import glob

# Input tree and output folder
root_dir = "../00_data/flood_risk/"
output_path = "../01_processed_data/flood_risk/"

# Create the output folder if it is missing
os.makedirs(output_path, exist_ok=True)

# One parquet file per return period; the keys double as the folder names
# looked up inside every flood folder.
output_files = {
    period: os.path.join(output_path, f"FloodRisk_{period}.parquet")
    for period in ("5yr", "25yr", "100yr")
}

# Processing settings
SAVE_INTERVAL = 10  # flush the buffer once this many shapefiles were collected
processed_count = 0
test_limit = None  # Set to an integer (e.g., 20) for testing, or None for full processing


def _flush_to_parquet(frames, target):
    """Concatenate `frames`, put them after any rows already stored in `target`,
    write the result back to `target` and return it.

    No CRS harmonisation is done here: the frames are concatenated as read.
    """
    merged = gpd.GeoDataFrame(pd.concat(frames, ignore_index=True))
    if os.path.exists(target):
        previous = gpd.read_parquet(target)
        merged = pd.concat([previous, merged], ignore_index=True)
    merged.to_parquet(target, index=False)
    return merged


# Walk <root_dir>/<flood folder>/<return period>/<province>/*.shp
for flood_folder in sorted(os.listdir(root_dir)):
    flood_path = os.path.join(root_dir, flood_folder)
    if not os.path.isdir(flood_path):
        continue  # only directories are flood folders

    print(f"📂 Processing: {flood_folder}")

    for risk_period in output_files:
        risk_path = os.path.join(flood_path, risk_period)
        if not os.path.isdir(risk_path):
            continue  # this flood folder has no such return period

        print(f"  ⏳ Processing Risk Period: {risk_period}")

        # Buffer of frames waiting to be written, and how many it holds
        all_gdfs = []
        province_counter = 0

        for province in sorted(os.listdir(risk_path)):
            province_path = os.path.join(risk_path, province)
            if not os.path.isdir(province_path):
                continue  # stray file, not a province folder

            print(f"    🏙 Processing Province: {province}")

            # Shapefiles sitting directly inside the province folder
            shapefiles = glob.glob(os.path.join(province_path, "*.shp"))

            for shp in shapefiles:
                if test_limit is not None and processed_count >= test_limit:
                    break  # test limit reached for this province's files

                print(f"      📂 Reading file: {shp}")

                try:
                    # Load, rename 'Var' -> 'FloodRisk', tag with metadata
                    gdf = gpd.read_file(shp)
                    gdf = gdf.rename(columns={"Var": "FloodRisk"})
                    gdf["Province"] = province
                    gdf["FloodReturnPeriod"] = risk_period

                    all_gdfs.append(gdf)
                    processed_count += 1
                    province_counter += 1
                except Exception as e:
                    print(f"      ❌ Error processing {shp}: {e}")

            # Periodic flush once the buffer is large enough
            if province_counter >= SAVE_INTERVAL and all_gdfs:
                parquet_path = output_files[risk_period]
                combined_gdf = _flush_to_parquet(all_gdfs, parquet_path)
                print(f"      ✅ Saved {province_counter} provinces to {parquet_path}")

                all_gdfs = []
                province_counter = 0

        # Flush whatever is left for this return period
        if all_gdfs:
            parquet_path = output_files[risk_period]
            combined_gdf = _flush_to_parquet(all_gdfs, parquet_path)
            print(f"      ✅ Final save for {risk_period} at {parquet_path}")

# Final message
print("✅ Incremental processing with auto-saving complete!")


📂 Processing: Flood 1
  ⏳ Processing Risk Period: 5yr
    🏙 Processing Province: Kalinga
      📂 Reading file: ../00_data/flood_risk/Flood 1/5yr/Kalinga/Kalinga_Flood_5year.shp
    🏙 Processing Province: Marinduque
      📂 Reading file: ../00_data/flood_risk/Flood 1/5yr/Marinduque/Marinduque_Flood_5year.shp
    🏙 Processing Province: MountainProvince
      📂 Reading file: ../00_data/flood_risk/Flood 1/5yr/MountainProvince/MountainProvince_Flood_5year.shp
    🏙 Processing Province: Quirino
      📂 Reading file: ../00_data/flood_risk/Flood 1/5yr/Quirino/Quirino_Flood_5year.shp
    🏙 Processing Province: Sarangani
      📂 Reading file: ../00_data/flood_risk/Flood 1/5yr/Sarangani/Sarangani_Flood_5year.shp
      ✅ Final save for 5yr at ../01_processed_data/flood_risk/FloodRisk_5yr.parquet
  ⏳ Processing Risk Period: 25yr
    🏙 Processing Province: Abra
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Abra/Abra_Flood_25year.shp
    🏙 Processing Province: AgusanDelNorte
      📂 Readin

  return ogr_read(


      ✅ Final save for 5yr at ../01_processed_data/flood_risk/FloodRisk_5yr.parquet
  ⏳ Processing Risk Period: 25yr
    🏙 Processing Province: Bulacan
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/Bulacan/Bulacan_Flood_25year.shp
    🏙 Processing Province: Cavite
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/Cavite/Cavite_Flood_25year.shp
    🏙 Processing Province: IlocosNorte
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/IlocosNorte/IlocosNorte_Flood_25year.shp
    🏙 Processing Province: IlocosSur
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/IlocosSur/IlocosSur_Flood_25year.shp
    🏙 Processing Province: Isabela
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/Isabela/Isabela_Flood_25year.shp
    🏙 Processing Province: LaUnion
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/LaUnion/LaUnion_Flood_25year.shp
    🏙 Processing Province: Pampanga
      📂 Reading file: ../00_data/flood_risk/Flood 2/25yr/Pampanga/Pampanga_Flood_2

ValueError: Cannot determine common CRS for concatenation inputs, got ['WGS 84', 'WGS 84 / UTM zone 51N']. Use `to_crs()` to transform geometries to the same CRS before merging.

### Merging WITH Checkpoint

In [1]:
import geopandas as gpd
import pandas as pd
import os
import glob

# Root directories
root_dir = "../00_data/flood_risk/"
output_path = "../01_processed_data/flood_risk/"

# 📌 Common CRS: every shapefile is reprojected to this before concatenation,
# because concatenating frames with different CRSs raises a ValueError.
COMMON_CRS = "EPSG:4326"

# CHECKPOINT: position at which to resume. Everything that sorts before
# (flood group, risk level, province) is skipped. Set the group to the lowest
# flood number, the level to 5 and the province to "" to process everything.
start_with_flood_group = 2
start_with_risk_level = 100
start_with_province = "DavaoDelSur"
reached_start = False  # set to True once the first province is processed
first_pass = True  # kept for compatibility; always the inverse of reached_start

# Ensure output directory exists
os.makedirs(output_path, exist_ok=True)

# Paths for output files; the keys double as the return-period folder names
output_files = {
    "5yr": os.path.join(output_path, "FloodRisk_5yr.parquet"),
    "25yr": os.path.join(output_path, "FloodRisk_25yr.parquet"),
    "100yr": os.path.join(output_path, "FloodRisk_100yr.parquet"),
}

# Processing settings
SAVE_INTERVAL = 10  # flush the buffer once this many shapefiles were collected
processed_count = 0
test_limit = None  # Set to an integer (e.g., 20) for testing, or None for full processing


def _flood_group_number(folder_name):
    """Return the integer after the first space in `folder_name` ("Flood 2" -> 2),
    or None when the name does not have that shape (e.g. ".ipynb_checkpoints")."""
    try:
        return int(folder_name.split(" ")[1])
    except (IndexError, ValueError):
        return None


def _limit_reached():
    """True when a test limit is set and that many shapefiles were processed."""
    return test_limit is not None and processed_count >= test_limit


def _append_to_parquet(frames, parquet_path):
    """Concatenate `frames`, place them after any rows already stored in
    `parquet_path`, write the result back and return it.

    The whole existing file is read and rewritten on every call, and nothing
    is de-duplicated: re-running over the same inputs appends them again.
    """
    combined = gpd.GeoDataFrame(pd.concat(frames, ignore_index=True))
    if os.path.exists(parquet_path):
        old = gpd.read_parquet(parquet_path)
        combined = pd.concat([old, combined], ignore_index=True)
    combined.to_parquet(parquet_path, index=False)
    return combined


# Loop through each Flood folder (Flood 1 to Flood 5)
for flood_folder in sorted(os.listdir(root_dir)):
    if _limit_reached():
        break  # stop walking the tree once the test limit is hit

    flood_path = os.path.join(root_dir, flood_folder)
    if not os.path.isdir(flood_path):
        continue  # only directories are flood folders

    flood_group = _flood_group_number(flood_folder)
    if flood_group is None:
        # Previously an unexpected directory name crashed the whole run here.
        print(f"Skipping unrecognised folder: {flood_folder}")
        continue

    if flood_group < start_with_flood_group:
        print(f"Skipping Flood Group: {flood_group}")
        continue  # entirely before the checkpoint

    print(f"📂 Processing: {flood_folder}")
    in_start_group = flood_group == start_with_flood_group

    # Loop through risk levels (5yr, 25yr, 100yr) in ascending order
    for risk_period in output_files:
        if _limit_reached():
            break

        risk_path = os.path.join(flood_path, risk_period)
        if not os.path.isdir(risk_path):
            continue  # this flood folder has no such return period

        risk_level = int(risk_period[:-2])  # "100yr" -> 100

        # Risk levels are only skipped inside the checkpoint's own flood
        # group; later groups are always processed in full.
        if in_start_group and risk_level < int(start_with_risk_level):
            print(f"Skipping Risk Level: {risk_level}")
            continue

        print(f"  ⏳ Processing Risk Period: {risk_period}")

        # Provinces are only skipped in the exact checkpoint group and level
        at_checkpoint = in_start_group and risk_level == int(start_with_risk_level)

        # Buffer of frames waiting to be written, and how many it holds
        all_gdfs = []
        province_counter = 0

        # Loop through province folders inside risk period (sorted order)
        for province in sorted(os.listdir(risk_path)):
            if _limit_reached():
                break

            province_path = os.path.join(risk_path, province)
            if not os.path.isdir(province_path):
                continue  # stray file, not a province folder

            # Compare by sort order rather than exact name, so a missing or
            # misspelled start province resumes at the next one instead of
            # silently skipping everything that follows.
            if at_checkpoint and province < start_with_province:
                print(f"Skipping Province: {province}")
                continue

            if not reached_start:
                reached_start = True
                first_pass = False
                if province == start_with_province:
                    print(f"Reached start province: {province}")
                else:
                    print(f"Resuming at province: {province}")

            print(f"    🏙 Processing Province: {province}")

            # Find the shapefile(s) directly inside the province folder
            shapefiles = glob.glob(os.path.join(province_path, "*.shp"))

            for shp in shapefiles:
                if _limit_reached():
                    break  # Stop after reaching test limit

                print(f"      📂 Reading file: {shp}")

                try:
                    # Load shapefile
                    gdf = gpd.read_file(shp)

                    # ✅ Convert CRS if needed. A file without any CRS makes
                    # to_crs raise; it is then reported below and skipped.
                    if gdf.crs != COMMON_CRS:
                        print(f"🔄 Converting CRS from {gdf.crs} to {COMMON_CRS}")
                        gdf = gdf.to_crs(COMMON_CRS)

                    # Rename 'Var' to 'FloodRisk'
                    gdf = gdf.rename(columns={"Var": "FloodRisk"})

                    # Add metadata: Province and Flood Return Period
                    gdf["Province"] = province
                    gdf["FloodReturnPeriod"] = risk_period

                    # Collect data for batch processing
                    all_gdfs.append(gdf)
                    processed_count += 1
                    province_counter += 1

                except Exception as e:
                    # Best effort: report the failing file and carry on
                    print(f"      ❌ Error processing {shp}: {e}")

            # **Save once the buffer holds SAVE_INTERVAL shapefiles**
            if province_counter >= SAVE_INTERVAL and all_gdfs:
                parquet_path = output_files[risk_period]
                combined_gdf = _append_to_parquet(all_gdfs, parquet_path)
                print(f"      ✅ Saved {province_counter} provinces to {parquet_path}")

                # Reset buffer
                all_gdfs = []
                province_counter = 0

        # **Final save if any data remains**
        if all_gdfs:
            parquet_path = output_files[risk_period]
            combined_gdf = _append_to_parquet(all_gdfs, parquet_path)
            print(f"      ✅ Final save for {risk_period} at {parquet_path}")

if not reached_start:
    # Nothing at or after the checkpoint was found: make that visible
    print(
        "⚠️ No province was processed: nothing found at or after checkpoint "
        f"(Flood {start_with_flood_group}, {start_with_risk_level}yr, {start_with_province})."
    )

# Final message
print("✅ Incremental processing with auto-saving complete!")


Skipping Flood Group: 1
📂 Processing: Flood 2
Skipping Risk Level: 5
Skipping Risk Level: 25
  ⏳ Processing Risk Period: 100yr
Skipping Province: Abra
Skipping Province: AgusanDelNorte
Skipping Province: Aklan
Skipping Province: Apayao
Skipping Province: Batangas
Skipping Province: Benguet
Skipping Province: CamarinesSur
Skipping Province: Camiguin
Skipping Province: Catanduanes
Skipping Province: Cebu
Skipping Province: CompostelaValley
Skipping Province: DavaoDelNorte
Reached start province: DavaoDelSur
    🏙 Processing Province: DavaoDelSur
      📂 Reading file: ../00_data/flood_risk/Flood 2/100yr/DavaoDelSur/DavaoDelSur_Flood_100year.shp
    🏙 Processing Province: DavaoOccidental
      📂 Reading file: ../00_data/flood_risk/Flood 2/100yr/DavaoOccidental/DavaoOccidental_Flood_100year.shp
    🏙 Processing Province: DinagatIslands
      📂 Reading file: ../00_data/flood_risk/Flood 2/100yr/DinagatIslands/DinagatIslands_Flood_100year.shp
    🏙 Processing Province: Kalinga
      📂 Reading 

### Updating Empty Folders

In some province folders, the shapefiles are erroneously stored one level deeper, inside a second subfolder named after the province, \
e.g. `../00_data/flood_risk/Flood 1/100yr/Albay/Albay/`

In [12]:
# Check empty folders
# Imports and settings are repeated here so this cell also runs on a fresh
# kernel instead of relying on names left behind by earlier cells.
import glob
import os

import geopandas as gpd
import pandas as pd

# Define the root directory where the flood risk data is stored
root_dir = "../00_data/flood_risk/"
output_path = "../01_processed_data/flood_risk/"

# CRS every shapefile is reprojected to before concatenation
COMMON_CRS = "EPSG:4326"

# Ensure output directory exists
os.makedirs(output_path, exist_ok=True)

# Province folders that contain no shapefile at their top level
empty_province_folders = []

# Processing settings
SAVE_INTERVAL = 10  # flush the buffer once this many shapefiles were collected
processed_count = 0

# Paths for output files, keyed by return-period folder name
output_files = {
    "5yr": os.path.join(output_path, "FloodRisk_5yr.parquet"),
    "25yr": os.path.join(output_path, "FloodRisk_25yr.parquet"),
    "100yr": os.path.join(output_path, "FloodRisk_100yr.parquet"),
}


def _append_nested_batch(frames, parquet_path):
    """Concatenate `frames`, place them after any rows already stored in
    `parquet_path`, write the result back and return it.

    Nothing is de-duplicated: running this cell twice appends the same
    provinces twice.
    """
    combined = gpd.GeoDataFrame(pd.concat(frames, ignore_index=True))
    if os.path.exists(parquet_path):
        old = gpd.read_parquet(parquet_path)
        combined = pd.concat([old, combined], ignore_index=True)
    combined.to_parquet(parquet_path, index=False)
    return combined


# Iterate through Flood 1 to Flood 5
for flood_group in sorted(os.listdir(root_dir)):
    flood_group_path = os.path.join(root_dir, flood_group)

    # Skip if not a directory
    if not os.path.isdir(flood_group_path):
        continue

    # Iterate through return periods (5yr, 25yr, 100yr)
    for return_period in sorted(os.listdir(flood_group_path)):
        return_period_path = os.path.join(flood_group_path, return_period)

        if not os.path.isdir(return_period_path):
            continue

        # Folder names come from the file system; any name without an output
        # file used to raise KeyError when its data was saved.
        if return_period not in output_files:
            print(f"  ⚠️ Skipping unknown return period folder: {return_period_path}")
            continue

        # Buffer of frames waiting to be written, and how many it holds
        all_gdfs = []
        province_counter = 0

        # Iterate through province folders
        for province in sorted(os.listdir(return_period_path)):
            province_path = os.path.join(return_period_path, province)

            if not os.path.isdir(province_path):
                continue

            # Check for .shp files directly in the province folder
            shp_files = [f for f in os.listdir(province_path) if f.endswith(".shp")]

            if not shp_files:
                # No shapefile at the top level: remember the folder and look
                # one level deeper, where the shapefile is likely nested.
                empty_province_folders.append(province_path)

                for province2 in sorted(os.listdir(province_path)):
                    province2_path = os.path.join(province_path, province2)

                    if not os.path.isdir(province2_path):
                        continue

                    # Find the shapefile(s) inside the nested folder
                    shapefiles = glob.glob(os.path.join(province2_path, "*.shp"))

                    for shp in shapefiles:

                        print(f"      📂 Reading file: {shp}")

                        try:
                            # Load shapefile
                            gdf = gpd.read_file(shp)

                            # ✅ Convert CRS if needed
                            if gdf.crs != COMMON_CRS:
                                print(f"🔄 Converting CRS from {gdf.crs} to {COMMON_CRS}")
                                gdf = gdf.to_crs(COMMON_CRS)

                            # Rename 'Var' to 'FloodRisk'
                            gdf = gdf.rename(columns={"Var": "FloodRisk"})

                            # Add metadata; the province label is the nested
                            # folder's name
                            gdf["Province"] = province2
                            gdf["FloodReturnPeriod"] = return_period

                            # Collect data for batch processing
                            all_gdfs.append(gdf)
                            processed_count += 1
                            province_counter += 1

                        except Exception as e:
                            # Best effort: report the failing file and carry on
                            print(f"      ❌ Error processing {shp}: {e}")

            # **Save once the buffer holds SAVE_INTERVAL shapefiles**
            if province_counter >= SAVE_INTERVAL and all_gdfs:
                parquet_path = output_files[return_period]
                combined_gdf = _append_nested_batch(all_gdfs, parquet_path)
                print(f"      ✅ Saved {province_counter} provinces to {parquet_path}")

                # Reset buffer
                all_gdfs = []
                province_counter = 0

        # **Final save if any data remains**
        if all_gdfs:
            parquet_path = output_files[return_period]
            combined_gdf = _append_nested_batch(all_gdfs, parquet_path)
            print(f"      ✅ Final save for {return_period} at {parquet_path}")


# Final message
print("✅ Incremental processing with auto-saving complete!")



      📂 Reading file: ../00_data/flood_risk/Flood 1/100yr/Albay/Albay/PH050500000_FH_100yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/100yr/Bataan/Bataan/PH030800000_FH_100yr.shp
      ✅ Final save for 100yr at ../01_processed_data/flood_risk/FloodRisk_100yr.parquet
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Aklan/Aklan/PH060400000_FH_25yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Albay/Albay/PH050500000_FH_25yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Bataan/Bataan/PH030800000_FH_25yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Cebu/Cebu/PH072200000_FH_25yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Leyte/Leyte/PH083700000_FH_25yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Misamis Oriental/Misamis Oriental/PH104300000_FH_25yr.shp
      📂 Reading file: ../00_data/flood_risk/Flood 1/25yr/Pangasinan/Pangasinan/PH015500000_FH_25yr.shp
      ✅ Final save for 25yr at ../01_