In [1]:
# Core analysis functions - modular and efficient
import ee
from utils import *
# Process geometries
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import pyproj

# Initialize GEE
ee.Authenticate()
ee.Initialize(project='dse-staff')

# Global datasets
PROTECTED_AREAS = ee.FeatureCollection('WCMC/WDPA/current/polygons')
ECOREGIONS = ee.FeatureCollection("RESOLVE/ECOREGIONS/2017")
WATER_MASK = ee.Image("JRC/GSW1_0/GlobalSurfaceWater").select('max_extent').eq(0)
HM_IMAGE = ee.ImageCollection('CSP/HM/GlobalHumanModification').mean()
MODIS = ee.ImageCollection('MODIS/006/MOD09A1')


Attention required for JRC/GSW1_0/GlobalSurfaceWater! You are using a deprecated asset.
To make sure your code keeps working, please update it.
Learn more: https://developers.google.com/earth-engine/datasets/catalog/JRC_GSW1_0_GlobalSurfaceWater


Attention required for MODIS/006/MOD09A1! You are using a deprecated asset.
To make sure your code keeps working, please update it.
Learn more: https://developers.google.com/earth-engine/datasets/catalog/MODIS_006_MOD09A1



In [None]:
from config import * 
def set_geometry_type(feature):
    """
    Sets a 'geometry_type' property on a feature based on its geometry type.
    """
    return feature.set('geometry_type', feature.geometry().type())



filtered = filter_protected_areas()

filtered_collection = filtered.map(set_geometry_type).filter(
    ee.Filter.equals('geometry_type', 'Polygon')
)

task = ee.batch.Export.table.toDrive(
    collection=filtered_collection,
    description='WDPA_June2021_filtered_wo_perimeter',
    fileFormat='SHP'
)

task.start()

#wdpa_pids_ee = filtered_collection.aggregate_array('WDPA_PID')
#wdpaids = wdpa_pids_ee.getInfo()
#len(wdpaids). #6570

In [None]:
gdf = gpd.read_file('../data/global_wdpa_June2021/WDPA_June2021_filtered_wo_perimeter.shp',encoding='latin1')
len(gdf['WDPA_PID']) #6570

In [None]:
# Ensure PA_DEF is numeric (convert to float)
gdf["PA_DEF"] = gdf["PA_DEF"].astype(float)

# Compute perimeter and perimeter-area ratio
gdf["PERIMETER"] = gdf.geometry.length        # length in CRS units
gdf["PA_RATIO"]  = gdf["PERIMETER"] / gdf["GIS_AREA"]

# Filter: keep only features with PA_RATIO < 75th percentile
q75 = gdf["PA_RATIO"].quantile(0.75)
gdf_filtered = gdf[gdf["PA_RATIO"] < q75]
len(gdf_filtered['WDPA_PID']) #4927

In [None]:
import geopandas as gpd

# --- Load data ---
ecoregions = gpd.read_file("../data/Ecoregions2017/Ecoregions2017.shp")
wdpa = gdf_filtered

# --- CRS: project to Mollweide ---
target_crs = "ESRI:54009"
ecoregions = ecoregions.to_crs(target_crs)
wdpa = wdpa.to_crs(target_crs)

# --- Dissolve ecoregions into biomes ---
biomes = ecoregions.dissolve(by="BIOME_NAME").reset_index()

# --- Fix invalid geometries ---
biomes["geometry"] = biomes.buffer(0)
wdpa["geometry"] = wdpa.buffer(0)

# --- Overlay (vectorized intersection) ---
intersections = gpd.overlay(wdpa, biomes, how="intersection")

# --- Compute intersection areas ---
intersections["area"] = intersections.geometry.area

# --- Keep the largest overlapping biome per WDPA polygon ---
idx = intersections.groupby("WDPA_PID")["area"].idxmax()
largest_intersections = intersections.loc[idx].copy()

# --- Create final output: merge BIOME_NAME back to original WDPA geometries ---
wdpa_with_biome = wdpa.merge(
    largest_intersections[['WDPA_PID', 'BIOME_NAME']], 
    on='WDPA_PID', 
    how='left'
)

print(f"WDPA polygons with assigned biome: {len(wdpa_with_biome)}")
print(wdpa_with_biome.head())

In [None]:
wdpa_with_biome.to_file("../data/wdpa_filtered_with_biome.shp")

In [None]:
wdpa_with_biome = gpd.read_file("../data/wdpa_filtered_with_biome.shp")

In [None]:
import geopandas as gpd
from shapely.geometry import Polygon

def fill_holes_vector(gdf, max_hole_area=250000):  # 500m * 500m = 250k sq meters
    """Fill small holes in polygons using vector operations"""
    filled_geoms = []
    
    for geom in gdf.geometry:
        if hasattr(geom, 'interiors') and geom.interiors:
            # Get exterior boundary
            exterior = geom.exterior
            
            # Keep only large holes (filter out small ones)
            large_holes = [interior for interior in geom.interiors 
                          if Polygon(interior).area > max_hole_area]
            
            # Create new polygon with exterior + large holes only
            filled_geoms.append(Polygon(exterior, large_holes))
        else:
            # No holes - keep original geometry
            filled_geoms.append(geom)
    
    result = gdf.copy()
    result['geometry'] = filled_geoms
    return result

# Load WDPA polygons
target_crs = "ESRI:54009"
wdpa = wdpa_with_biome.to_crs(target_crs)

# Vector-based hole filling (much safer and faster)
wdpa_final = fill_holes_vector(wdpa, max_hole_area=250000)

print(f"Number of polygons after filling holes: {len(wdpa_final)}")
print(f"Original polygon count: {len(wdpa)}")
print("Vector approach preserves exact 1:1 relationship and all attributes!")


In [None]:
wdpa_final.to_file("../data/wdpa_final.shp")

In [2]:
wdpa_final = gpd.read_file("../data/wdpa_final.shp")

In [None]:
# Find and export duplicate geometries to shapefile for QGIS review
print("Finding duplicate geometries...")

# Get boolean mask of duplicated geometries (keep=False marks ALL duplicates)
duplicate_mask = wdpa_final['geometry'].duplicated(keep=False)

# Get all rows with duplicate geometries
duplicates_df = wdpa_final[duplicate_mask].copy()

print(f"Found {len(duplicates_df)} rows with duplicate geometries")

# Add a group ID to identify which geometries are duplicates of each other
duplicates_df['geom_hash'] = duplicates_df['geometry'].apply(lambda x: hash(x.wkt))
group_mapping = {hash_val: f"group_{i}" for i, hash_val in enumerate(duplicates_df['geom_hash'].unique())}
duplicates_df['duplicate_group'] = duplicates_df['geom_hash'].map(group_mapping)

# Drop the hash column (not needed in shapefile)
duplicates_df = duplicates_df.drop('geom_hash', axis=1)

# Export to shapefile for QGIS
output_path = '/workspace/data/duplicate_geometries.shp'
duplicates_df.to_file(output_path, driver='ESRI Shapefile')

print(f"Duplicate geometries exported to: {output_path}")
print(f"Total duplicate groups: {duplicates_df['duplicate_group'].nunique()}")
print(f"You can now open this in QGIS to visually inspect and decide which WDPA_PIDs to keep/drop")

# Show summary of each duplicate group
print("\nSummary of duplicate groups:")
for group in duplicates_df['duplicate_group'].unique():
    group_data = duplicates_df[duplicates_df['duplicate_group'] == group]
    pids = group_data['WDPA_PID'].tolist()
    names = group_data['ORIG_NAME'].tolist()
    print(f"{group}: WDPA_PIDs {pids} | Names: {names}")

In [10]:
# Find geometries with ≥90% overlap - ALL PARKS
print("Finding overlapping geometries (≥90%) across ALL parks...")

def calculate_overlap_percentage(geom1, geom2):
    """Calculate what percentage of each geometry overlaps with the other"""
    try:
        intersection = geom1.intersection(geom2)
        if intersection.is_empty:
            return 0, 0
        intersection_area = intersection.area
        overlap_pct_1 = (intersection_area / geom1.area) * 100
        overlap_pct_2 = (intersection_area / geom2.area) * 100
        return overlap_pct_1, overlap_pct_2
    except:
        return 0, 0

# Find all pairs with ≥90% overlap using brute force
overlap_pids = set()
total_comparisons = len(wdpa_final) * (len(wdpa_final) - 1) // 2
comparisons_done = 0

for idx, park in wdpa_final.iterrows():
    if idx % 500 == 0:
        print(f"Processing park {idx}/{len(wdpa_final)} - Found {len(overlap_pids)} overlapping parks so far")
    
    current_geom = park['geometry']
    current_pid = park['WDPA_PID']
    
    for other_idx, other_park in wdpa_final.iterrows():
        if other_idx <= idx:  # Skip already checked pairs and self
            continue
            
        other_pid = other_park['WDPA_PID']
        other_geom = other_park['geometry']
        
        overlap_1, overlap_2 = calculate_overlap_percentage(current_geom, other_geom)
        max_overlap = max(overlap_1, overlap_2)
        
        if max_overlap >= 90:
            overlap_pids.add(current_pid)
            overlap_pids.add(other_pid)
            print(f"Found ≥90% overlap: {current_pid} vs {other_pid}: {max_overlap:.1f}%")

# Get all rows with overlapping geometries
duplicates_df = wdpa_final[wdpa_final['WDPA_PID'].isin(overlap_pids)].copy()

print(f"\nFound {len(duplicates_df)} rows with ≥90% overlapping geometries")

# Add a group ID
duplicates_df['duplicate_group'] = 'overlap_group'

# Export to shapefile
output_path = '/workspace/data/overlap_geometries_all.shp'
duplicates_df.to_file(output_path, driver='ESRI Shapefile')

print(f"Overlapping geometries exported to: {output_path}")
print(f"You can now open this in QGIS to review all parks with ≥90% overlap")

Finding overlapping geometries (≥90%) across ALL parks...
Processing park 0/4927 - Found 0 overlapping parks so far
Found ≥90% overlap: 555542778 vs 3219: 98.5%
Found ≥90% overlap: 351788 vs 198356: 100.0%
Found ≥90% overlap: 19737 vs 351787: 100.0%
Found ≥90% overlap: 61611 vs 389012: 99.9%
Found ≥90% overlap: 555542497 vs 17713: 99.7%
Found ≥90% overlap: 860 vs 555531072: 99.8%
Found ≥90% overlap: 196219 vs 555722895: 100.0%
Found ≥90% overlap: 196219 vs 555548908: 100.0%
Found ≥90% overlap: 196219 vs 555588812: 100.0%
Found ≥90% overlap: 349468 vs 555722880: 100.0%
Found ≥90% overlap: 349468 vs 555580765: 100.0%
Found ≥90% overlap: 349468 vs 555588799: 100.0%
Found ≥90% overlap: 1338 vs 555531084: 99.4%
Found ≥90% overlap: 5783 vs 555531085: 99.3%
Found ≥90% overlap: 20673 vs 555531077: 96.6%
Found ≥90% overlap: 20938 vs 555722884: 100.0%
Found ≥90% overlap: 20938 vs 555548906: 100.0%
Found ≥90% overlap: 20938 vs 555588810: 100.0%
Found ≥90% overlap: 20943 vs 555722017: 100.0%
Found

  duplicates_df.to_file(output_path, driver='ESRI Shapefile')
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  

Overlapping geometries exported to: /workspace/data/overlap_geometries_all.shp
You can now open this in QGIS to review all parks with ≥90% overlap


  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [None]:
# Check for duplicate WDPA_PIDs first
print("Checking for duplicate WDPA_PIDs...")
pid_duplicates = wdpa_final['WDPA_PID'].duplicated().sum()
print(f"Duplicate WDPA_PIDs: {pid_duplicates}")

# Simple and reliable spatial relationship check
print("\nAnalyzing spatial relationships (simple method)...")
start_time = time.time()

shares_border = []
for idx, park in wdpa_final.iterrows():
    if idx % 500 == 0:
        print(f"Processing park {idx}/{len(wdpa_final)}")
    
    current_wdpa_pid = park['WDPA_PID']
    
    # Check against all other parks (brute force but reliable)
    has_relationship = False
    for other_idx, other_park in wdpa_final.iterrows():
        other_wdpa_pid = other_park['WDPA_PID']
        
        # Only check different WDPA_PIDs
        if (other_idx != idx and 
            current_wdpa_pid != other_wdpa_pid and 
            park.geometry.intersects(other_park.geometry)):
            has_relationship = True
            break
    
    shares_border.append(has_relationship)

wdpa_final['SHARED_BORDER'] = shares_border

elapsed = time.time() - start_time
print(f"Completed in {elapsed:.1f} seconds")
print(f"Parks with spatial relationships: {sum(shares_border)}")
print(f"Parks without spatial relationships: {len(shares_border) - sum(shares_border)}")

In [None]:
rings_list = []

for idx, park in wdpa_final.iterrows():
    geom = park.geometry
    
    # Fast local buffer operations
    small_ring = geom.buffer(1000).difference(geom.buffer(-1000))
    large_buffer = geom.buffer(5000).difference(geom.buffer(-5000))
    large_ring = large_buffer.difference(small_ring)
    
     # Create base_props from all columns except geometry columns
    base_props = {col: park[col] for col in park.index if col not in ['geometry', 'geometry_t']}
    
    rings_list.extend([
        {**base_props, 'zone': '1_km', 'geometry': small_ring},
        {**base_props, 'zone': '5_km', 'geometry': large_ring}
    ])

rings_gdf = gpd.GeoDataFrame(rings_list, crs=wdpa_final.crs)
rings_gdf.to_file('/workspace/data/rings/rings.shp', driver='ESRI Shapefile')
#ADD EXPORT TO ASSETS

In [None]:
zones = ee.FeatureCollection('projects/dse-staff/assets/zones')

In [None]:
hm_masked = HM_IMAGE.updateMask(WATER_MASK)

hm_results = hm_masked.reduceRegions(
    collection=zones,
    reducer=ee.Reducer.mean().combine(ee.Reducer.stdDev(), '', True)
        .setOutputs(['hm_mean', 'hm_stddev']),
    scale=500,
    tileScale=8
)

In [None]:
# Process multiple years of MODIS gradient data with task queue management
import time

years = list(range(2001, 2024))  # 2001 to 2023
max_concurrent_tasks = 10
submitted_tasks = []

def check_task_status():
    """Check status of submitted tasks and remove completed ones"""
    global submitted_tasks
    active_tasks = []
    for task_obj, year in submitted_tasks:
        task_status = task_obj.status()
        if task_status['state'] in ['COMPLETED', 'FAILED', 'CANCELLED']:
            print(f"Task {year} {task_status['state']}")
        else:
            active_tasks.append((task_obj, year))
    submitted_tasks = active_tasks
    return len(submitted_tasks)

for i, year in enumerate(years):
    # Wait if we have too many active tasks
    while check_task_status() >= max_concurrent_tasks:
        print(f"Waiting... {len(submitted_tasks)} tasks active")
        time.sleep(30)  # Check every 30 seconds
    
    print(f"Processing year {year} ({i+1}/{len(years)})...")
    
    # Get MODIS and calculate NDVI
    modis = MODIS.filterDate(f'{year}-01-01', f'{year}-12-31') \
        .median() \
        .select(['sur_refl_b01', 'sur_refl_b02'])  # Red and NIR bands
    
    # Calculate NDVI
    ndvi = modis.normalizedDifference(['sur_refl_b02', 'sur_refl_b01']).rename('ndvi').select('ndvi')
    
    # Calculate gradient of NDVI
    grad = ndvi.gradient()
    magnitude = grad.expression('sqrt(x*x + y*y)', {'x': grad.select('x'), 'y': grad.select('y')}).rename('gradient_magnitude')
    magnitude_masked = magnitude.updateMask(WATER_MASK)

    # Reduce with explicit CRS and scale matching MODIS
    final_results = magnitude_masked.reduceRegions(
        collection=hm_results,
        reducer=ee.Reducer.mean().combine(ee.Reducer.stdDev(), '', True)
            .setOutputs(['gradient_mean', 'gradient_stddev']),
        scale=500,  
        tileScale=8
    )
    
    # Export results and track task
    export_task = ee.batch.Export.table.toCloudStorage(
        collection=final_results,
        description=f'results_{year}',
        bucket='dse-staff',
        fileNamePrefix=f'protected_areas/results2/results_{year}',
        fileFormat='CSV',
        selectors=['WDPA_PID', 'ORIG_NAME', 'GOV_TYPE', 'OWN_TYPE',
                   'STATUS_YR', 'IUCN_CAT', 'GIS_AREA', 'PA_RATIO', 'BIOME_NAME',
                   'zone', 'hm_mean', 'hm_stddev', 'gradient_mean', 'gradient_stddev']
    )
    export_task.start()
    submitted_tasks.append((export_task, year))
    print(f"Export task started: {export_task.id} for {year}")

# Wait for remaining tasks to complete
print("Waiting for remaining tasks to complete...")
while check_task_status() > 0:
    print(f"Still waiting for {len(submitted_tasks)} tasks...")
    time.sleep(30)

print("All export tasks completed!")

In [None]:
# Simplified version - just use gsutil and handle errors better
import subprocess
import tempfile
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt

# Simple download with error handling
temp_dir = tempfile.mkdtemp()
try:
    result = subprocess.run(['gsutil', 'ls', 'gs://dse-staff/protected_areas/results/*.csv'], 
                          capture_output=True, text=True, check=True)
    files = result.stdout.strip().split('\n')
    print(f"Found {len(files)} files")
    
    # Download all files
    subprocess.run(['gsutil', '-m', 'cp'] + files + [temp_dir], check=True)
    
    # Load and combine
    all_data = []
    for file in glob.glob(os.path.join(temp_dir, '*.csv')):
        year = int(os.path.basename(file).split('_')[-1].split('.')[0])
        df = pd.read_csv(file)
        df['year'] = year
        all_data.append(df)
    
    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df['pid_zone'] = combined_df['WDPA_PID'].astype(str) + '_' + combined_df['zone']
    
    # Plot
    plt.figure(figsize=(15, 8))
    for pid_zone in combined_df['pid_zone'].unique():
        subset = combined_df[combined_df['pid_zone'] == pid_zone].sort_values('year')
        plt.plot(subset['year'], subset['gradient_mean'], alpha=0.7, linewidth=1)
    
    plt.xlabel('Year')
    plt.ylabel('NDVI Gradient Mean')
    plt.title('Timeline of NDVI Gradient by Protected Area and Buffer Zone')
    plt.grid(True, alpha=0.3)
    plt.show()
    
except subprocess.CalledProcessError as e:
    print(f"Error: {e}")
    print("Maybe no files exist yet or gsutil auth issue")