IMPORTS

In [1]:
import importlib
import sys

# Remove the module from cache if it exists
if 'utils2' in sys.modules:
    del sys.modules['utils2']

from utils2 import GeometryOperations

import ee
import pandas as pd
import pyproj
import geopandas as gpd
from shapely.geometry import Polygon

ee.Authenticate()
ee.Initialize(project='dse-staff')

PROTECTED_AREAS = ee.FeatureCollection("WCMC/WDPA/202106/polygons")
ECOREGIONS = gpd.read_file("../data/Ecoregions2017/Ecoregions2017.shp")
WATER = ee.Image("JRC/GSW1_4/GlobalSurfaceWater")
HM_IMAGE = ee.ImageCollection('CSP/HM/GlobalHumanModification').mean()
MODIS = ee.ImageCollection('MODIS/006/MOD09A1')

geom_ops = GeometryOperations()


Attention required for MODIS/006/MOD09A1! You are using a deprecated asset.
To make sure your code keeps working, please update it.
Learn more: https://developers.google.com/earth-engine/datasets/catalog/MODIS_006_MOD09A1



In [None]:
filtered_polygon = PROTECTED_AREAS.map(geom_ops.set_geometry_type) \
    .filter(geom_ops.get_pa_filter("Polygon")) \
    .map(geom_ops.get_biome) #6358

In [None]:
wdpa_pids_ee = filtered_polygon.aggregate_array('WDPA_PID')
wdpaids = wdpa_pids_ee.getInfo()
len(wdpaids) 

In [None]:
task = ee.batch.Export.table.toDrive(
    collection=filtered_polygon,
    description='WDPA_filtered_polygons',
    fileFormat='SHP'
)
task.start()

In [None]:
target_crs = "ESRI:54009"
wdpa = gpd.read_file('../data/WDPA_filtered_polygons.geojson').to_crs(target_crs)
len(wdpa) #6358

In [None]:
# Fill holes smaller than 1500m x 1500m

filled = geom_ops.fill_holes(wdpa, max_hole_area=2250000)  # 1500m * 1500m = 2250000 sq meters
print(f"Number of polygons after filling holes: {len(filled)}") #6358
print(f"Original polygon count: {len(wdpa)}") #6358

In [None]:
# Find overlap groups
overlap_groups = geom_ops.find_overlap_groups(wdpa, overlap_threshold=90)

# Select best row from each group and create new dataset
selected_rows = []
for group_indices in overlap_groups:
    if len(group_indices) == 1:
        # Single geometry, keep as is
        selected_rows.append(wdpa.loc[group_indices[0]])
    else:
        # Multiple overlapping geometries, select best one
        group_df = wdpa.loc[group_indices]
        best_row = geom_ops.get_min_year_from_group(group_df)
        selected_rows.append(best_row)
        print(f"Overlap group of {len(group_indices)} geometries - selected WDPA_PID: {best_row['WDPA_PID']}")

# Create new GeoDataFrame with selected rows
deduped_overlaps = gpd.GeoDataFrame(selected_rows, crs=wdpa.crs)

print(f"Original count: {len(wdpa)}") #6358
print(f"After removing >90% overlaps: {len(deduped_overlaps)}") #5629
print(f"Removed {len(wdpa) - len(deduped_overlaps)} overlapping geometries") #729

In [None]:
# Check for duplicate ORIG_NAME next
print("Checking for duplicate ORIG_NAME...")
name_duplicates = deduped_overlaps['ORIG_NAME'].duplicated().sum()
print(f"Duplicate ORIG_NAME: {name_duplicates}") #60

In [None]:
# Group by ORIG_NAME and apply the function
grouped = deduped_overlaps.groupby('ORIG_NAME').apply(lambda x: geom_ops.get_min_year_from_group(x)).reset_index(drop=True)

# Now dissolve the geometries while keeping the selected attributes
dissolved = grouped.dissolve(by='ORIG_NAME', as_index=False)
print(f"Number of polygons after dissolving by ORIG_NAME: {len(dissolved)}") #5569

After deduping, finish filtering out narrow PAs

In [None]:
# Remove narrow polygons based on perimeter-to-area ratio
# Recalculate area per geometry 
dissolved["AREA_DISSOLVED"] = dissolved.geometry.area
dissolved["PERIMETER"] = dissolved.geometry.length  # length in CRS units
dissolved["PA_RATIO"]  = dissolved["PERIMETER"] / dissolved["AREA_DISSOLVED"]

In [None]:
q75 = dissolved["PA_RATIO"].quantile(0.75) #0.00039173069233858557
print(f"75th percentile of PA_RATIO: {q75}")
check = dissolved[dissolved["PA_RATIO"] >= q75]
check.to_file("../data/q75.shp")

In [None]:
q90 = dissolved["PA_RATIO"].quantile(0.90) #0.0005701899361271124
print(f"90th percentile of PA_RATIO: {q90}")
check2 = dissolved[dissolved["PA_RATIO"] >= q90]
check2.to_file("../data/q90.shp")

In [None]:
wdpa_final = dissolved[dissolved["PA_RATIO"] < q90]
len(wdpa_final['WDPA_PID']) #5012

In [None]:
wdpa_final.to_file("../data/wdpa_final.shp")

CREATE ZONES

In [None]:
wdpa_final = gpd.read_file("../data/wdpa_final.shp")

In [None]:
len(wdpa_final)  # 5012

In [None]:
zones_list = []

for idx, park in wdpa_final.iterrows():
    geom = park.geometry
    
    # Outer zones (rings extending outward from park boundary)
    center = geom.buffer(1000).difference(geom.buffer(-1000))  # -1 to +1km 
    outer = geom.buffer(3000).difference(geom.buffer(1000))    # +1 to +3km
    far_outer = geom.buffer(5000).difference(geom.buffer(3000)) # +3 to +5km
    
    # Inner zones (rings extending inward from park boundary)
    inner = geom.buffer(-1000).difference(geom.buffer(-3000))   # -1 to -3km (ring inside park)
    far_inner = geom.buffer(-3000).difference(geom.buffer(-5000)) # -3 to -5km (ring inside park)

    base_props = {col: park[col] for col in park.index if col not in ['geometry', 'geometry_t']}
    
    zones_list.extend([
        {**base_props, 'zone': '-1_1km', 'geometry': center},
        {**base_props, 'zone': '1_3km', 'geometry': outer},
        {**base_props, 'zone': '3_5km', 'geometry': far_outer},
        {**base_props, 'zone': '-1_-3km', 'geometry': inner},
        {**base_props, 'zone': '-3_-5km', 'geometry': far_inner}
    ])

zones = gpd.GeoDataFrame(zones_list, crs=wdpa_final.crs)

In [None]:
# Filter to just the 3_5km zones
zones_3_5 = zones[zones['zone'] == '3_5km'].copy()

# Use spatial index for efficient intersection checking
sindex = zones_3_5.sindex

shared_border = []
for idx, zone in zones_3_5.iterrows():
    # Find potential intersections using spatial index
    possible_matches_idx = list(sindex.intersection(zone.geometry.bounds))
    possible_matches = zones_3_5.iloc[possible_matches_idx]
    
    # Check if any other zone (different WDPA_PID) intersects
    has_border = False
    for other_idx, other_zone in possible_matches.iterrows():
        if (idx != other_idx and 
            zone['WDPA_PID'] != other_zone['WDPA_PID'] and
            zone.geometry.intersects(other_zone.geometry)):
            has_border = True
            break
    
    shared_border.append(has_border)

zones_3_5['SHARED_BORDER'] = shared_border

# Merge back to full zones dataset if needed
zones = zones.merge(
    zones_3_5[['WDPA_PID', 'zone', 'SHARED_BORDER']], 
    on=['WDPA_PID', 'zone'], 
    how='left'
)

In [None]:
zones.to_file('/workspace/data/zones/zones.shp', driver='ESRI Shapefile')

RUN ANALYSIS

In [2]:
zones = ee.FeatureCollection('projects/dse-staff/assets/zones')

In [3]:
## More efficient approach: Calculate water percentage with reduceRegions (batch processing)
#permanent_water = WATER.select('occurrence').gte(90)
#
## Create a binary image: 1 for water, 0 for land
#water_binary = permanent_water.unmask(0)
#
## Use reduceRegions to calculate mean (which gives % water when binary)
#zones_with_water = water_binary.reduceRegions(
#    collection=zones,
#    reducer=ee.Reducer.mean().setOutputs(['perc_water']),
#    scale=30,
#    tileScale=4
#).map(lambda f: f.set('perc_water', ee.Number(f.get('perc_water')).multiply(100)))

# Now use this for HM calculation
hm_results = HM_IMAGE.reduceRegions(
    collection=zones,
    reducer=ee.Reducer.mean().combine(ee.Reducer.stdDev(), '', True).combine(ee.Reducer.median(), '', True)
        .setOutputs(['hm_mean', 'hm_std', 'hm_med']),
    scale=500,
    tileScale=8
)

In [4]:
# Process multiple years of MODIS gradient data with task queue management
import time

years = list(range(2001, 2022)) 
max_concurrent_tasks = 15
submitted_tasks = []

def check_task_status():
    """Check status of submitted tasks and remove completed ones"""
    global submitted_tasks
    active_tasks = []
    for task_obj, year in submitted_tasks:
        task_status = task_obj.status()
        if task_status['state'] in ['COMPLETED', 'FAILED', 'CANCELLED']:
            print(f"Task {year} {task_status['state']}")
        else:
            active_tasks.append((task_obj, year))
    submitted_tasks = active_tasks
    return len(submitted_tasks)

for i, year in enumerate(years):
    # Wait if we have too many active tasks
    while check_task_status() >= max_concurrent_tasks:
        print(f"Waiting... {len(submitted_tasks)} tasks active")
        time.sleep(30)  # Check every 30 seconds
    
    print(f"Processing year {year} ({i+1}/{len(years)})...")
    
    # Get MODIS and calculate NDVI
    modis = MODIS.filterDate(f'{year}-01-01', f'{year}-12-31') \
        .max() \
        .select(['sur_refl_b01', 'sur_refl_b02','sur_refl_b03','sur_refl_b04','sur_refl_b05','sur_refl_b06','sur_refl_b07'])  
    
    # Calculate NDVI
    #ndvi = modis.normalizedDifference(['sur_refl_b02', 'sur_refl_b01']).rename('ndvi').select('ndvi')
    
    # Calculate gradient 
    grad = modis.gradient()
    magnitude = grad.expression('sqrt(x*x + y*y)', {'x': grad.select('x'), 'y': grad.select('y')})

    # Reduce with explicit CRS and scale matching MODIS
    final_results = magnitude.reduceRegions(
        collection=hm_results,
        reducer=ee.Reducer.mean().combine(ee.Reducer.stdDev(), '', True).combine(ee.Reducer.median(), '', True)
            .setOutputs(['grad_mean', 'grad_std', 'grad_med']),
        scale=500,   
        tileScale=8
    )
    
    # Export results and track task
    export_task = ee.batch.Export.table.toDrive(
        collection=final_results,
        description=f'final_results_{year}',
        folder='pa_results_20251112',  # This will create/use a 'results' folder in your Google Drive
        fileNamePrefix=f'final_results_{year}',
        fileFormat='CSV',
        selectors=['WDPA_PID', 'ORIG_NAME', 'GOV_TYPE', 'OWN_TYPE',
        'STATUS_YR', 'IUCN_CAT', 'GIS_AREA', 'CONS_OBJ', 'DESIG', 'DESIG_ENG',
        'DESIG_TYPE', 'GIS_M_AREA', 'INT_CRIT', 'ISO3', 'MANG_AUTH',
        'MANG_PLAN', 'MARINE', 'METADATAID', 'NAME', 'NO_TAKE',
        'NO_TK_AREA', 'PARENT_ISO', 'PA_DEF', 'REP_AREA', 'REP_M_AREA',
        'STATUS', 'SUB_LOC', 'SUPP_INFO', 'VERIF', 'WDPAID','BIOME_NAME',
        'PA_RATIO', 'AREA_DISSO', 'PERIMETER', 'SHARED_BOR', 
        'zone',
        'hm_mean', 'hm_std', 'hm_med', 
        'grad_mean', 'grad_std', 'grad_med']
    )
    export_task.start()
    submitted_tasks.append((export_task, year))
    print(f"Export task started: {export_task.id} for {year}")

# Wait for remaining tasks to complete
print("Waiting for remaining tasks to complete...")
while check_task_status() > 0:
    print(f"Still waiting for {len(submitted_tasks)} tasks...")
    time.sleep(30)

print("All export tasks completed!")

Processing year 2001 (1/21)...
Export task started: G3VP6W32FPCVQPTK74UHV757 for 2001
Processing year 2002 (2/21)...
Export task started: SYAZUGFJEJI4FRNEGMC64BBO for 2002
Processing year 2003 (3/21)...
Export task started: E72OQEK3O5SJUSCHJRV4DHZE for 2003
Processing year 2004 (4/21)...
Export task started: V23LHWETVIN74KYEHEIY3APY for 2004
Processing year 2005 (5/21)...
Export task started: M5PUHDVVFUOT3HDSKZDQ7QMD for 2005
Processing year 2006 (6/21)...
Export task started: B6NAKOQSXH2L6CNF4U5BR2NX for 2006
Processing year 2007 (7/21)...
Export task started: JQ5SRQHHP3NSLSBJFI6UA53Q for 2007
Processing year 2008 (8/21)...
Export task started: DO3AQ75Y6IBOQPRQFQ537FKJ for 2008
Processing year 2009 (9/21)...
Export task started: FXZ5BIW6KSAWJ3LKSM3P2ALZ for 2009
Processing year 2010 (10/21)...
Export task started: ZJZQUL7NSGA4BEP45WFZDCFL for 2010
Processing year 2011 (11/21)...
Export task started: K423FW75PX3FMFORBU376GKA for 2011
Processing year 2012 (12/21)...
Export task started: