In [1]:
import geopandas as gpd
from utils import fill_holes, find_overlap_groups, get_min_year_from_group

In [2]:
wdpa = gpd.read_file('../data/WDPA_polygons.geojson').to_crs("ESRI:54009")
len(wdpa) # 6358

6358

In [3]:
# Fill holes smaller than 1500m x 1500m = 2250000 sq meters
filled = fill_holes(wdpa, max_hole_area=2250000) 

In [4]:
# Remove duplicates with >90% overlap (takes ~13min)
overlap_groups = find_overlap_groups(filled, overlap_threshold=90)

# Select best row from each group and create new dataset
selected_rows = []
for group_indices in overlap_groups:
    if len(group_indices) == 1:
        # Single geometry, keep as is
        selected_rows.append(filled.loc[group_indices[0]])
    else:
        # Keep the boundary with the minimum year attribute
        group_df = filled.loc[group_indices]
        best_row = get_min_year_from_group(group_df)
        selected_rows.append(best_row)

deduped_overlaps = gpd.GeoDataFrame(selected_rows, crs=filled.crs)

print(f"Removed {len(filled) - len(deduped_overlaps)} overlapping geometries")  # 729

Finding overlap groups with >90% overlap...
Removed 729 overlapping geometries


In [5]:
# Remove duplicate names by grouping and dissolving
# For each ORIG_NAME group, select the one with minimum year
deduped_names = deduped_overlaps.groupby('ORIG_NAME').apply(
    lambda x: get_min_year_from_group(x)).reset_index(drop=True)
dissolved = deduped_names.dissolve(by='ORIG_NAME', as_index=False)

print(f"Removed {len(deduped_overlaps) - len(dissolved)} duplicate names")  # 60

Removed 60 duplicate names


  deduped_names = deduped_overlaps.groupby('ORIG_NAME').apply(


In [6]:
# Recalculate dissolved area per geometry 
# And remove narrow polygons based on perimeter-to-area ratio
dissolved["AREA_DISSO"] = dissolved.geometry.area
dissolved["PERIMETER"] = dissolved.geometry.length  # length in CRS units
dissolved["PA_RATIO"]  = dissolved["PERIMETER"] / dissolved["AREA_DISSO"]
q75 = dissolved["PA_RATIO"].quantile(0.75)
wdpa_filtered = dissolved[dissolved["PA_RATIO"] < q75]

print(f"75th percentile of PA_RATIO: {q75}")  #0.00039173069233858557
print(f"Removed {len(wdpa) - len(wdpa_filtered['WDPA_PID'])} PAs in upper quantile") # 2182
print(f"Remaining PAs after filtering: {len(wdpa_filtered['WDPA_PID'])}") # 4176

75th percentile of PA_RATIO: 0.00038816387065234174
Removed 2182 PAs in upper quantile
Remaining PAs after filtering: 4176


In [7]:
wdpa_filtered.to_file("../data/wdpa_filtered/wdpa_filtered.shp")
#check = dissolved[dissolved["PA_RATIO"] >= q75]
#check.to_file("../data/q75/q75.shp")

  wdpa_filtered.to_file("../data/wdpa_filtered/wdpa_filtered.shp")
  write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
 