In [None]:
import geopandas as gpd
from utils import remove_pa_transects_in_chunks, transform_chunks_crs, combine_chunks_to_files

In [2]:
# Load the filtered WDPA protected-area layer and keep its CRS for later reference.
wdpa_filtered = gpd.read_file("../data/wdpa_filtered/wdpa_filtered.shp")
crs = wdpa_filtered.crs
wdpa_filtered.shape[0]  # row count (one row per protected area); 4176 on the recorded run

4176

In [3]:
# Transect sampling parameters (all distances in meters)
sample_dist = 500     # transect spacing
transect_unit = 2500  # distance between samples along a transect
transect_pts = 2      # number of points on each side of boundary point

# Size of the inner buffer used to evaluate point validity: the length of one
# side of a transect (transect_pts * transect_unit) plus an additional 500.
buffer_dist = transect_pts * transect_unit + 500

In [4]:
# Build every interior buffer in a single buffer() call (takes ~2min)
print("Creating inner buffers for all protected areas...")
# Map each WDPA_PID to its geometry buffered by -buffer_dist (negative distance
# = shrink inward). Only the dictionary is kept, so no intermediate
# GeoDataFrame copy has to be created and freed afterwards.
wdpa_buffer_dict = dict(
    zip(wdpa_filtered['WDPA_PID'], wdpa_filtered.geometry.buffer(-buffer_dist))
)

Creating inner buffers for all protected areas...


In [5]:
# Process the PAs and write the results to chunk files (~6min)
stats = remove_pa_transects_in_chunks(
    wdpa_gdf=wdpa_filtered,
    wdpa_buffer_dict=wdpa_buffer_dict,
    output_dir="../data/transect_chunks",
    chunk_size=500,
    sample_dist=sample_dist,
    transect_unit=transect_unit,
    transect_pts=transect_pts,
)

# Reference: keys of `stats` and the values recorded on the last run.
# NOTE(review): the recorded cell output already contains this summary, so the
# helper presumably prints it itself -- confirm before re-adding print calls.
#
#   stats['total_pas']          4,176      total PAs in dataset
#   stats['empty_buffer']       237        PAs with empty buffer (too small)
#   stats['all_filtered']       0          PAs with all transects filtered out
#   stats['pas_processed']      3,939      PAs successfully processed
#   stats['bad_inside_buffer']  39,475     transects filtered - inside buffer (bad angle)
#   stats['bad_outside_pa']     403,265    transects filtered - outside PA (crossed)
#   stats['total_points']       8,650,425  total transect points (filtered)
#   stats['total_transects']    1,730,085  total unique transects
#   stats['chunk_files']        8 files    chunk files created
#
# Derived figures:
#   total_transects / pas_processed  = 439.2 average transects per PA (from ~500 per PA)
#   total_points / total_transects   = 5     average points per transect

Processing 4176 protected areas with streaming filter...
  Processed 525/4176 PAs, wrote chunk 1 | Total points: 1,078,330
  Processed 1061/4176 PAs, wrote chunk 2 | Total points: 2,086,905
  Processed 1592/4176 PAs, wrote chunk 3 | Total points: 3,128,855
  Processed 2131/4176 PAs, wrote chunk 4 | Total points: 4,096,590
  Processed 2658/4176 PAs, wrote chunk 5 | Total points: 5,278,985
  Processed 3179/4176 PAs, wrote chunk 6 | Total points: 6,432,460
  Processed 3714/4176 PAs, wrote chunk 7 | Total points: 7,591,200
  Wrote final chunk 8

Processing Complete!
Total PAs in dataset: 4,176
PAs with empty buffer (too small): 237
PAs with all transects filtered out: 0
PAs successfully processed: 3,939
Transects filtered - inside buffer (bad angle): 39,475
Transects filtered - outside PA (crossed): 403,265
Total transect points (filtered): 8,650,425
Total unique transects: 1,730,085
Average transects per PA: 439.2
Average points per transect: 5.0
Created 8 chunk files


In [6]:
# Reproject every chunk from ESRI:54009 to EPSG:4326 for Earth Engine (~5min)
# NOTE(review): source_crs is hard-coded here while the layer's CRS is stored in
# `crs` when the shapefile is loaded -- confirm the two agree.
# NOTE(review): the combine step reads the same chunk_pattern, so this call
# presumably rewrites the chunk files in place; if so, re-running this cell on
# already-transformed chunks would reproject them a second time -- confirm.
transform_chunks_crs(
    source_crs='ESRI:54009',
    target_crs='EPSG:4326',
    chunk_pattern='../data/transect_chunks/chunk_*.csv',
)

Transforming 8 chunks from ESRI:54009 to EPSG:4326...
  Transformed 3/8 chunks
  Transformed 6/8 chunks
  Transformed 8/8 chunks
CRS transformation complete!


In [7]:
# Merge the chunks into two output files (~4min):
#   - transects file: only the essential columns listed in transect_cols
#   - attributes file: the metadata columns
combine_chunks_to_files(
    transect_cols=['WDPA_PID', 'transectID', 'point_position', 'x', 'y'],
    chunk_pattern='../data/transect_chunks/chunk_*.csv',
    transect_output='../data/transects_final.csv',
    attributes_output='../data/attributes_final.csv',
)
# Next step: upload transects_final.csv to Earth Engine as an asset
# (coordinates are x,y; CRS is EPSG:4326). The asset upload takes ~30min.

Found 8 chunk files
Combining into:
  - ../data/transects_final.csv
  - ../data/attributes_final.csv

Transect columns: ['WDPA_PID', 'transectID', 'point_position', 'x', 'y']
Attribute columns: ['ORIG_NAME', 'id', 'BIOME_NAME', 'CONS_OBJ', 'DESIG', 'DESIG_ENG', 'DESIG_TYPE', 'GIS_AREA', 'GIS_M_AREA', 'GOV_TYPE', 'INT_CRIT', 'ISO3', 'IUCN_CAT', 'MANG_AUTH', 'MANG_PLAN', 'MARINE', 'METADATAID', 'NAME', 'NO_TAKE', 'NO_TK_AREA', 'OWN_TYPE', 'PARENT_ISO', 'PA_DEF', 'REP_AREA', 'REP_M_AREA', 'STATUS', 'STATUS_YR', 'SUB_LOC', 'SUPP_INFO', 'VERIF', 'WDPAID', 'WDPA_PID', 'AREA_DISSO', 'PERIMETER', 'PA_RATIO']

Writing transects file...
  Wrote chunk 1/8
  Wrote chunk 2/8
  Wrote chunk 3/8
  Wrote chunk 4/8
  Wrote chunk 5/8
  Wrote chunk 6/8
  Wrote chunk 7/8
  Wrote chunk 8/8
Transects saved: 437.5 MB

Extracting unique attributes by WDPA_PID...
  Processed 3/8 chunks
  Processed 6/8 chunks
  Processed 8/8 chunks

Attributes saved: 1.8 MB
Unique WDPA_PIDs: 3939

Combining complete!
