In [None]:
import os

import ee
import geopandas as gpd

from utils import get_pa_filter, set_geometry_type, get_biome, fill_holes, find_overlap_groups, get_min_year_from_group

# Earth Engine session setup: authenticate first, then bind the session to
# the cloud project that the requests are issued under.
ee.Authenticate()
ee.Initialize(project="dse-staff")

# WDPA polygon collection (the 202106 asset) used as the source of protected areas.
PROTECTED_AREAS = ee.FeatureCollection('WCMC/WDPA/202106/polygons')

In [None]:
# Build the (lazy) Earth Engine pipeline over the WDPA collection:
#   1. set_geometry_type is mapped over every feature,
#   2. the collection is filtered with get_pa_filter("Polygon"),
#   3. get_biome is mapped over the surviving features.
# All three helpers come from utils; see that module for what each one sets.
filtered_polygon = (
    PROTECTED_AREAS
    .map(set_geometry_type)
    .filter(get_pa_filter("Polygon"))
    .map(get_biome)
)

# Fetch the WDPA_PID of every retained feature to the client; the length of
# that list is the number of features that passed the filter.
wdpa_pids_ee = filtered_polygon.aggregate_array('WDPA_PID')
wdpaids = wdpa_pids_ee.getInfo()
len(wdpaids)  # 6358

In [None]:
# Queue an Earth Engine batch export of the filtered collection to Google
# Drive as GeoJSON. The task runs remotely; start() only submits it.
export_kwargs = dict(
    collection=filtered_polygon,
    description='WDPA_polygons',
    fileFormat='GeoJSON',
)
task = ee.batch.Export.table.toDrive(**export_kwargs)
task.start()

# Manual step: once the export has finished, download the file from Google
# Drive into this repository's ../data/ folder (the next cell reads
# ../data/WDPA_polygons.geojson), then continue.

In [None]:
# Load the exported polygons and reproject them to ESRI:54009 (World
# Mollweide, metre units) so later area/length values are in m^2 / m.
target_crs = "ESRI:54009"
wdpa = gpd.read_file('../data/WDPA_polygons.geojson')
wdpa = wdpa.to_crs(target_crs)
len(wdpa)  # 6358

In [None]:
# Fill interior holes below the area cutoff. The cutoff is the area of a
# 1500 m x 1500 m square, i.e. 2,250,000 square metres.
filled = fill_holes(wdpa, max_hole_area=1500 * 1500)

# Sanity check: hole filling should not change the number of features.
print(f"Number of polygons after filling holes: {len(filled)}")  # 6358
print(f"Original polygon count: {len(wdpa)}")  # 6358

In [None]:
# Collapse groups of heavily overlapping protected areas to a single record.
#
# The grouping and the row selection run on `filled` (the hole-filled
# geometries from the previous cell), not on the raw `wdpa` frame. Running on
# `wdpa` discarded the hole-filling step entirely, so the perimeter/area
# filtering further down would still see the small interior holes.
# NOTE(review): assumes fill_holes returns a GeoDataFrame with the same index
# as `wdpa` -- confirm against utils.fill_holes.

# Find overlap groups (takes ~30min). Each group is a sequence of index labels;
# overlap_threshold=90 is the percentage cutoff (see the ">90%" summary below).
overlap_groups = find_overlap_groups(filled, overlap_threshold=90)

# Pick one row per group.
selected_rows = []
for group_indices in overlap_groups:
    if len(group_indices) == 1:
        # Nothing overlaps this geometry: keep the row unchanged.
        selected_rows.append(filled.loc[group_indices[0]])
    else:
        # Several geometries overlap: let get_min_year_from_group choose the
        # representative row for the group.
        group_df = filled.loc[group_indices]
        best_row = get_min_year_from_group(group_df)
        selected_rows.append(best_row)
        print(f"Overlap group of {len(group_indices)} geometries - selected WDPA_PID: {best_row['WDPA_PID']}")

# Rebuild a GeoDataFrame from the selected rows. The CRS is taken from `wdpa`,
# the projected frame that `filled` was derived from.
deduped_overlaps = gpd.GeoDataFrame(selected_rows, crs=wdpa.crs)

print(f"Original count: {len(wdpa)}")  # 6358
print(f"After removing >90% overlaps: {len(deduped_overlaps)}")  # 5629
print(f"Removed {len(wdpa) - len(deduped_overlaps)} overlapping geometries")  # 729

In [None]:
# Count rows whose ORIG_NAME already appeared earlier in the frame
# (the first occurrence of each name is not counted).
print("Checking for duplicate ORIG_NAME...")
repeated_name_mask = deduped_overlaps['ORIG_NAME'].duplicated()
name_duplicates = repeated_name_mask.sum()
print(f"Duplicate ORIG_NAME: {name_duplicates}")  # 60

In [None]:
# Collapse features that share an ORIG_NAME into one feature per name.
#
#   attributes: the single row chosen by get_min_year_from_group for that name
#               (presumably the earliest-year record -- see utils)
#   geometry:   the union of ALL geometries carrying that name
#
# The union must be computed from deduped_overlaps itself. Dissolving the
# already-reduced one-row-per-name table is a geometric no-op and silently
# drops the shapes of every non-selected duplicate.
geom_col = deduped_overlaps.geometry.name

# One attribute row per ORIG_NAME.
grouped = (
    deduped_overlaps
    .groupby('ORIG_NAME')
    .apply(get_min_year_from_group)
    .reset_index(drop=True)
)

# One unioned geometry per ORIG_NAME, built from every row with that name.
merged_geoms = deduped_overlaps[['ORIG_NAME', geom_col]].dissolve(by='ORIG_NAME', as_index=False)

# Attach the selected attributes to the unioned geometries. The GeoDataFrame is
# the left operand so the result stays a GeoDataFrame with its CRS; validate
# guards against either side carrying more than one row per name.
dissolved = merged_geoms.merge(
    grouped.drop(columns=geom_col),
    on='ORIG_NAME',
    validate='one_to_one',
)
print(f"Number of polygons after dissolving by ORIG_NAME: {len(dissolved)}")  # 5569

In [None]:
# Shape metrics used to identify narrow polygons via perimeter-to-area ratio.
# Values are in CRS units, so recompute them from the dissolved geometries
# rather than reusing any pre-existing area attribute.
dissolved_geoms = dissolved.geometry
dissolved["AREA_DISSO"] = dissolved_geoms.area
dissolved["PERIMETER"] = dissolved_geoms.length  # length in CRS units
# A higher ratio means more boundary per unit of area.
dissolved["PA_RATIO"] = dissolved["PERIMETER"].div(dissolved["AREA_DISSO"])

In [None]:
# 75th percentile of the perimeter-to-area ratio; used as the cutoff above
# which polygons are treated as too narrow.
q75 = dissolved["PA_RATIO"].quantile(0.75)  # 0.00039173069233858557
print(f"75th percentile of PA_RATIO: {q75}")

# Polygons at or above the cutoff are written to their own shapefile so they
# can be checked separately.
check = dissolved[dissolved["PA_RATIO"] >= q75]

# to_file does not create missing parent folders, so make sure the output
# directory exists (needed on a fresh checkout).
os.makedirs("../data/q75", exist_ok=True)
check.to_file("../data/q75/q75.shp")

In [None]:
# Keep only polygons strictly below the 75th-percentile ratio cutoff.
below_cutoff = dissolved["PA_RATIO"] < q75
wdpa_filtered = dissolved.loc[below_cutoff]
len(wdpa_filtered['WDPA_PID'])  # 4176

In [None]:
# Write the final filtered polygons as a shapefile. to_file does not create
# missing parent folders, so make sure the output directory exists first.
os.makedirs("../data/wdpa_filtered", exist_ok=True)
wdpa_filtered.to_file("../data/wdpa_filtered/wdpa_filtered.shp")