IMPORTS

In [2]:
import importlib
import sys

# Remove the module from cache if it exists
if 'utils2' in sys.modules:
    del sys.modules['utils2']

from utils2 import GeometryOperations

import ee
import pandas as pd
import pyproj
import geopandas as gpd
from shapely.geometry import Polygon

ee.Authenticate()
ee.Initialize(project='dse-staff')

PROTECTED_AREAS = ee.FeatureCollection("WCMC/WDPA/202106/polygons")
ECOREGIONS = gpd.read_file("../data/Ecoregions2017/Ecoregions2017.shp")
WATER_MASK = ee.Image("JRC/GSW1_4/GlobalSurfaceWater")
HM_IMAGE = ee.ImageCollection('CSP/HM/GlobalHumanModification').mean()
MODIS = ee.ImageCollection('MODIS/006/MOD09A1')

geom_ops = GeometryOperations()


Attention required for MODIS/006/MOD09A1! You are using a deprecated asset.
To make sure your code keeps working, please update it.
Learn more: https://developers.google.com/earth-engine/datasets/catalog/MODIS_006_MOD09A1



In [None]:
filtered_polygon = PROTECTED_AREAS.map(geom_ops.set_geometry_type) \
    .filter(geom_ops.get_pa_filter("Polygon")) \
    .map(geom_ops.get_biome) #6358

In [None]:
wdpa_pids_ee = filtered_polygon.aggregate_array('WDPA_PID')
wdpaids = wdpa_pids_ee.getInfo()
len(wdpaids) 

In [None]:
task = ee.batch.Export.table.toDrive(
    collection=filtered_polygon,
    description='WDPA_filtered_polygons',
    fileFormat='SHP'
)
task.start()

In [None]:
target_crs = "ESRI:54009"
wdpa = gpd.read_file('../data/WDPA_filtered_polygons.geojson').to_crs(target_crs)
len(wdpa) #6358

In [None]:
# Fill holes smaller than 1500m x 1500m

filled = geom_ops.fill_holes(wdpa, max_hole_area=2250000)  # 1500m * 1500m = 2250000 sq meters
print(f"Number of polygons after filling holes: {len(filled)}") #6358
print(f"Original polygon count: {len(wdpa)}") #6358

In [None]:
# Find overlap groups
overlap_groups = geom_ops.find_overlap_groups(wdpa, overlap_threshold=90)

# Select best row from each group and create new dataset
selected_rows = []
for group_indices in overlap_groups:
    if len(group_indices) == 1:
        # Single geometry, keep as is
        selected_rows.append(wdpa.loc[group_indices[0]])
    else:
        # Multiple overlapping geometries, select best one
        group_df = wdpa.loc[group_indices]
        best_row = geom_ops.get_min_year_from_group(group_df)
        selected_rows.append(best_row)
        print(f"Overlap group of {len(group_indices)} geometries - selected WDPA_PID: {best_row['WDPA_PID']}")

# Create new GeoDataFrame with selected rows
deduped_overlaps = gpd.GeoDataFrame(selected_rows, crs=wdpa.crs)

print(f"Original count: {len(wdpa)}") #6358
print(f"After removing >90% overlaps: {len(deduped_overlaps)}") #5629
print(f"Removed {len(wdpa) - len(deduped_overlaps)} overlapping geometries") #729

In [None]:
# Check for duplicate ORIG_NAME next
print("Checking for duplicate ORIG_NAME...")
name_duplicates = deduped_overlaps['ORIG_NAME'].duplicated().sum()
print(f"Duplicate ORIG_NAME: {name_duplicates}") #60

In [None]:
# Group by ORIG_NAME and apply the function
grouped = deduped_overlaps.groupby('ORIG_NAME').apply(lambda x: geom_ops.get_min_year_from_group(x)).reset_index(drop=True)

# Now dissolve the geometries while keeping the selected attributes
dissolved = grouped.dissolve(by='ORIG_NAME', as_index=False)
print(f"Number of polygons after dissolving by ORIG_NAME: {len(dissolved)}") #5569

After deduping, finish filtering out narrow PAs

In [None]:
# Remove narrow polygons based on perimeter-to-area ratio
# Recalculate area per geometry 
dissolved["AREA_DISSOLVED"] = dissolved.geometry.area
dissolved["PERIMETER"] = dissolved.geometry.length  # length in CRS units
dissolved["PA_RATIO"]  = dissolved["PERIMETER"] / dissolved["AREA_DISSOLVED"]

In [None]:
q75 = dissolved["PA_RATIO"].quantile(0.75) #0.00039173069233858557
print(f"75th percentile of PA_RATIO: {q75}")
check = dissolved[dissolved["PA_RATIO"] >= q75]
check.to_file("../data/q75.shp")

In [None]:
q90 = dissolved["PA_RATIO"].quantile(0.90) #0.0005701899361271124
print(f"90th percentile of PA_RATIO: {q90}")
check2 = dissolved[dissolved["PA_RATIO"] >= q90]
check2.to_file("../data/q90.shp")

In [None]:
wdpa_final = dissolved[dissolved["PA_RATIO"] < q90]
len(wdpa_final['WDPA_PID']) #5012

In [None]:
wdpa_final.to_file("../data/wdpa_final.shp")

CREATE ZONES

In [3]:
wdpa_final = gpd.read_file("../data/wdpa_final.shp")

In [4]:
len(wdpa_final)  # 5012

5012

In [None]:
zones_list = []

for idx, park in wdpa_final.iterrows():
    geom = park.geometry
    
    # Outer zones (rings extending outward from park boundary)
    center = geom.buffer(1000).difference(geom.buffer(-1000))  # -1 to +1km 
    outer = geom.buffer(3000).difference(geom.buffer(1000))    # +1 to +3km
    far_outer = geom.buffer(5000).difference(geom.buffer(3000)) # +3 to +5km
    
    # Inner zones (rings extending inward from park boundary)
    inner = geom.buffer(-1000).difference(geom.buffer(-3000))   # -1 to -3km (ring inside park)
    far_inner = geom.buffer(-3000).difference(geom.buffer(-5000)) # -3 to -5km (ring inside park)

    base_props = {col: park[col] for col in park.index if col not in ['geometry', 'geometry_t']}
    
    zones_list.extend([
        {**base_props, 'zone': '-1_1km', 'geometry': center},
        {**base_props, 'zone': '1_3km', 'geometry': outer},
        {**base_props, 'zone': '3_5km', 'geometry': far_outer},
        {**base_props, 'zone': '-1_-3km', 'geometry': inner},
        {**base_props, 'zone': '-3_-5km', 'geometry': far_inner}
    ])

zones = gpd.GeoDataFrame(zones_list, crs=wdpa_final.crs)

In [6]:
# Filter to just the 3_5km zones
zones_3_5 = zones[zones['zone'] == '3_5km'].copy()

# Use spatial index for efficient intersection checking
sindex = zones_3_5.sindex

shared_border = []
for idx, zone in zones_3_5.iterrows():
    # Find potential intersections using spatial index
    possible_matches_idx = list(sindex.intersection(zone.geometry.bounds))
    possible_matches = zones_3_5.iloc[possible_matches_idx]
    
    # Check if any other zone (different WDPA_PID) intersects
    has_border = False
    for other_idx, other_zone in possible_matches.iterrows():
        if (idx != other_idx and 
            zone['WDPA_PID'] != other_zone['WDPA_PID'] and
            zone.geometry.intersects(other_zone.geometry)):
            has_border = True
            break
    
    shared_border.append(has_border)

zones_3_5['SHARED_BORDER'] = shared_border

# Merge back to full zones dataset if needed
zones = zones.merge(
    zones_3_5[['WDPA_PID', 'zone', 'SHARED_BORDER']], 
    on=['WDPA_PID', 'zone'], 
    how='left'
)

In [None]:
len(zones)/5 #5012

5012.0

In [12]:
zones.to_file('/workspace/data/zones/zones.shp', driver='ESRI Shapefile')

  zones.to_file('/workspace/data/zones/zones.shp', driver='ESRI Shapefile')
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
 

RUN ANALYSIS

In [None]:
zones = ee.FeatureCollection('projects/dse-staff/assets/zones')

In [None]:
# Get permanent water layer from JRC Global Surface Water
# occurrence >= 90 is considered permanent water
permanent_water = WATER_MASK.select('occurrence').gte(90)

# Calculate percentage of permanent water pixels in each zone
def add_water_percentage(feature):
    # Get the geometry
    geom = feature.geometry()
    
    # Calculate total pixels in the zone
    total_pixels = ee.Image.constant(1).reduceRegion(
        reducer=ee.Reducer.count(),
        geometry=geom,
        scale=30,  # JRC GSW resolution
        maxPixels=1e9
    ).get('constant')
    
    # Calculate permanent water pixels
    water_pixels = permanent_water.reduceRegion(
        reducer=ee.Reducer.sum(),
        geometry=geom,
        scale=30,
        maxPixels=1e9
    ).get('occurrence')
    
    # Calculate percentage
    perc_water = ee.Number(water_pixels).divide(ee.Number(total_pixels)).multiply(100)
    
    return feature.set('PERC_PIX_PERM_WATER', perc_water)

# Apply to all zones
zones_with_water = zones.map(add_water_percentage)

In [None]:
# Use this in your analysis instead of the original zones
hm_masked = HM_IMAGE.updateMask(WATER_MASK)

hm_results = hm_masked.reduceRegions(
    collection=zones_with_water,  # Use updated zones
    reducer=ee.Reducer.mean().combine(ee.Reducer.stdDev(), '', True).combine(ee.Reducer.median(), '', True)
        .setOutputs(['hm_mean', 'hm_stddev', 'hm_median']),
    scale=500,
    tileScale=8
)

In [None]:
hm_masked = HM_IMAGE.updateMask(WATER_MASK)

hm_results = hm_masked.reduceRegions(
    collection=zones,
    reducer=ee.Reducer.mean().combine(ee.Reducer.stdDev(), '', True).combine(ee.Reducer.median(), '', True)
        .setOutputs(['hm_mean', 'hm_stddev', 'hm_median']),
    scale=500,
    tileScale=8
)

In [None]:
# Process multiple years of MODIS gradient data with task queue management
import time

years = list(range(2001, 2021)) 
max_concurrent_tasks = 10
submitted_tasks = []

def check_task_status():
    """Check status of submitted tasks and remove completed ones"""
    global submitted_tasks
    active_tasks = []
    for task_obj, year in submitted_tasks:
        task_status = task_obj.status()
        if task_status['state'] in ['COMPLETED', 'FAILED', 'CANCELLED']:
            print(f"Task {year} {task_status['state']}")
        else:
            active_tasks.append((task_obj, year))
    submitted_tasks = active_tasks
    return len(submitted_tasks)

for i, year in enumerate(years):
    # Wait if we have too many active tasks
    while check_task_status() >= max_concurrent_tasks:
        print(f"Waiting... {len(submitted_tasks)} tasks active")
        time.sleep(30)  # Check every 30 seconds
    
    print(f"Processing year {year} ({i+1}/{len(years)})...")
    
    # Get MODIS and calculate NDVI
    modis = MODIS.filterDate(f'{year}-01-01', f'{year}-12-31') \
        .max() \
        .select(['sur_refl_b01', 'sur_refl_b02'])  # Red and NIR bands
    
    # Calculate NDVI
    ndvi = modis.normalizedDifference(['sur_refl_b02', 'sur_refl_b01']).rename('ndvi').select('ndvi')
    
    # Calculate gradient of NDVI
    grad = ndvi.gradient()
    magnitude = grad.expression('sqrt(x*x + y*y)', {'x': grad.select('x'), 'y': grad.select('y')}).rename('gradient_magnitude')
    magnitude_masked = magnitude.updateMask(WATER_MASK)

    # Reduce with explicit CRS and scale matching MODIS
    final_results = magnitude_masked.reduceRegions(
        collection=hm_results,
        reducer=ee.Reducer.mean().combine(ee.Reducer.stdDev(), '', True).combine(ee.Reducer.median(), '', True)
            .setOutputs(['gradient_mean', 'gradient_stddev', 'gradient_median']),
        scale=500,   
        tileScale=8
    )
    
    # Export results and track task
    export_task = ee.batch.Export.table.toCloudStorage(
        collection=final_results,
        description=f'results_{year}',
        bucket='dse-staff',
        fileNamePrefix=f'protected_areas/results2/results_{year}',
        fileFormat='CSV',
        selectors=['WDPA_PID', 'ORIG_NAME', 'GOV_TYPE', 'OWN_TYPE',
                   'STATUS_YR', 'IUCN_CAT', 'GIS_AREA', 'PA_RATIO', 'BIOME_NAME',
                   'zone', 'SHARED_BORDER', 'PERC_PIX_PERM_WATER',
                   'hm_mean', 'hm_stddev', 'hm_median', 
                   'gradient_mean', 'gradient_stddev', 'gradient_median']
    )
    export_task.start()
    submitted_tasks.append((export_task, year))
    print(f"Export task started: {export_task.id} for {year}")

# Wait for remaining tasks to complete
print("Waiting for remaining tasks to complete...")
while check_task_status() > 0:
    print(f"Still waiting for {len(submitted_tasks)} tasks...")
    time.sleep(30)

print("All export tasks completed!")

VISUALIZATION

In [None]:
# Simplified version - just use gsutil and handle errors better
import subprocess
import tempfile
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt

# Simple download with error handling
temp_dir = tempfile.mkdtemp()
try:
    result = subprocess.run(['gsutil', 'ls', 'gs://dse-staff/protected_areas/results/*.csv'], 
                          capture_output=True, text=True, check=True)
    files = result.stdout.strip().split('\n')
    print(f"Found {len(files)} files")
    
    # Download all files
    subprocess.run(['gsutil', '-m', 'cp'] + files + [temp_dir], check=True)
    
    # Load and combine
    all_data = []
    for file in glob.glob(os.path.join(temp_dir, '*.csv')):
        year = int(os.path.basename(file).split('_')[-1].split('.')[0])
        df = pd.read_csv(file)
        df['year'] = year
        all_data.append(df)
    
    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df['pid_zone'] = combined_df['WDPA_PID'].astype(str) + '_' + combined_df['zone']
    
    # Plot
    plt.figure(figsize=(15, 8))
    for pid_zone in combined_df['pid_zone'].unique():
        subset = combined_df[combined_df['pid_zone'] == pid_zone].sort_values('year')
        plt.plot(subset['year'], subset['gradient_mean'], alpha=0.7, linewidth=1)
    
    plt.xlabel('Year')
    plt.ylabel('NDVI Gradient Mean')
    plt.title('Timeline of NDVI Gradient by Protected Area and Buffer Zone')
    plt.grid(True, alpha=0.3)
    plt.show()
    
except subprocess.CalledProcessError as e:
    print(f"Error: {e}")
    print("Maybe no files exist yet or gsutil auth issue")