In [None]:
import glob
import os
import gc
import pandas as pd
import geopandas as gpd
import numpy as np
from utils import create_transects

In [None]:
# Read the filtered WDPA shapefile; its CRS is reused below when transect
# points are turned back into geometries.
wdpa_filtered = gpd.read_file("../data/wdpa_filtered/wdpa_filtered.shp")
crs = wdpa_filtered.crs
wdpa_filtered.shape[0]  # 5012

In [None]:
# Transect sampling parameters (all distances in meters)
sample_dist = 500     # transect spacing
transect_unit = 2500  # distance between samples along a transect
transect_pts = 2      # number of points on each side of boundary point

# Size of the inner buffer used to evaluate point validity:
# transect_pts steps of transect_unit, plus 500 m.
buffer_dist = transect_pts * transect_unit + 500

In [None]:
# Shrink every protected-area polygon inward by buffer_dist in one vectorized
# call, then keep only a WDPA_PID -> inner-buffer-geometry lookup.
print("Creating inner buffers for all protected areas...")
inner_geoms = wdpa_filtered['geometry'].buffer(-buffer_dist)
wdpa_buffer_dict = dict(zip(wdpa_filtered['WDPA_PID'], inner_geoms))
del inner_geoms  # the dictionary is the only thing needed downstream

# Diagnose any buffer issues in a single pass over the lookup
print("Checking buffer validity...")
empty_buffers = 0
invalid_buffers = 0
for geom in wdpa_buffer_dict.values():
    if geom.is_empty:
        empty_buffers += 1
    if not geom.is_valid:
        invalid_buffers += 1

n_buffers = len(wdpa_buffer_dict)
print(f"  Total buffers created: {n_buffers}")
print(f"  Empty buffers (PA too small): {empty_buffers}")
print(f"  Invalid buffers: {invalid_buffers}")
print(f"  Valid non-empty buffers: {n_buffers - empty_buffers - invalid_buffers}")

In [None]:
# Process PAs one at a time, filter immediately, write to chunks
output_dir = "../data/transect_chunks"
os.makedirs(output_dir, exist_ok=True)

# Remove chunk files left over from an earlier run. The later cells pick up
# every chunk_*.csv via glob, so a previous run that produced more chunks than
# this one would otherwise leak stale data into the final output.
for stale_file in glob.glob(f"{output_dir}/chunk_*.csv"):
    os.remove(stale_file)

pas_per_chunk = 500  # number of kept PAs accumulated before a chunk is flushed to disk


def _write_chunk(frames, number):
    """Concatenate per-PA transect frames, write them as one CSV chunk, return its path."""
    path = f"{output_dir}/chunk_{number:03d}.csv"
    pd.concat(frames, ignore_index=True).to_csv(path, index=False)
    return path


# Captured up front: wdpa_filtered is deleted before the summary is printed,
# and the loop variable would be undefined for an empty dataset.
n_parks = len(wdpa_filtered)

print(f"Processing {n_parks} protected areas with streaming filter...")
chunk_files = []
chunk_num = 0
chunk_data = []
total_points = 0
total_transects = 0
pas_processed = 0

# Diagnostic counters
no_transects = 0
no_buffer = 0
empty_buffer = 0
all_filtered = 0

# The loop keeps transects for parks that pass these checks:
#  - transects were generated (create_transects did not return None)
#  - the park has an entry in the inner-buffer dictionary
#  - that inner buffer is not empty
#  - no inner point of the transect lies inside the inner buffer
#  - no inner point of the transect lies outside the PA polygon
#  - at least one transect remains after filtering

for idx, (_, park_row) in enumerate(wdpa_filtered.iterrows()):
    # Generate transects for single PA
    transect_df = create_transects((idx, park_row), sample_dist, transect_unit, transect_pts)

    if transect_df is None:
        no_transects += 1
        continue

    # Filter bad transects immediately for this PA
    pid = park_row['WDPA_PID']
    if pid not in wdpa_buffer_dict:
        no_buffer += 1
        continue

    # Check if buffer is empty, skip adding transects (PA too small)
    buffer_geom = wdpa_buffer_dict[pid]
    if buffer_geom.is_empty:
        empty_buffer += 1
        continue

    # Inner points are the ones with a negative point_position
    inner_pts = transect_df[transect_df['point_position'] < 0].copy()

    if len(inner_pts) > 0:
        # Create minimal geodataframe for spatial check
        inner_gdf = gpd.GeoDataFrame(
            inner_pts[['WDPA_PID', 'transectID']],
            geometry=gpd.points_from_xy(inner_pts['x'], inner_pts['y']),
            crs=crs
        )

        # A transect is bad when any inner point falls inside the inner buffer
        # or outside the PA polygon (crossed to the opposite side).
        inside_buffer = inner_gdf.geometry.within(buffer_geom)
        outside_pa = ~inner_gdf.geometry.within(park_row.geometry)
        bad_transects = inner_gdf.loc[inside_buffer | outside_pa, 'transectID'].unique()

        # Drop every point (inner and outer) of the bad transects
        transect_df = transect_df[~transect_df['transectID'].isin(bad_transects)]

    # If there are any transects left after filtering, add them to the chunk data
    if len(transect_df) > 0:
        total_points += len(transect_df)
        total_transects += transect_df['transectID'].nunique()
        pas_processed += 1
        chunk_data.append(transect_df)
    else:
        all_filtered += 1

    # Flush to disk once enough kept PAs have accumulated
    if len(chunk_data) >= pas_per_chunk:
        chunk_files.append(_write_chunk(chunk_data, chunk_num))
        chunk_data = []
        chunk_num += 1
        gc.collect()
        print(f"  Processed {idx + 1}/{n_parks} PAs, wrote chunk {chunk_num} | Total points: {total_points:,}")

# Write final chunk
if chunk_data:
    chunk_files.append(_write_chunk(chunk_data, chunk_num))
    print(f"  Wrote final chunk {chunk_num + 1}")

# Clean up memory. NOTE: this cell cannot be re-run without first re-running
# the cells that create wdpa_filtered and wdpa_buffer_dict.
del wdpa_filtered, wdpa_buffer_dict, chunk_data
gc.collect()

# Averages are guarded so a run in which nothing survives filtering still
# prints its summary instead of raising ZeroDivisionError.
avg_transects_per_pa = total_transects / pas_processed if pas_processed else 0.0
avg_points_per_transect = total_points / total_transects if total_transects else 0.0

# Print statistics
print(f"\n{'='*60}")
print("Processing Complete!")
print(f"{'='*60}")
print(f"Total PAs in dataset: {n_parks:,}")
print(f"PAs with no transects generated: {no_transects:,}")
print(f"PAs with no buffer: {no_buffer:,}")
print(f"PAs with empty buffer (too small): {empty_buffer:,}")
print(f"PAs with all transects filtered out: {all_filtered:,}")
print(f"PAs successfully processed: {pas_processed:,}")
print("-" * 60)
print(f"Total transect points (filtered): {total_points:,}")
print(f"Total unique transects: {total_transects:,}")
print(f"Average transects per PA: {avg_transects_per_pa:.1f}")
print(f"Average points per transect: {avg_points_per_transect:.1f}")
print(f"Created {len(chunk_files)} chunk files in {output_dir}/")
print(f"{'='*60}")

In [None]:
# Recorded output of the processing cell above (reference run):
#Total PAs in dataset: 5,012
#PAs with no transects generated: 0
#PAs with no buffer: 0
#PAs with empty buffer (too small): 753
#PAs with all transects filtered out: 0
#PAs successfully processed: 4,259
#Total transect points (filtered): 9,029,755
#Total unique transects: 1,805,951
#Average transects per PA: 424.0
#Average points per transect: 5.0
#Created 9 chunk files in ../data/transect_chunks/

In [None]:
# Transform each chunk CRS from ESRI:54009 to EPSG:4326 for earth engine.
# The chunk files are overwritten in place, so the x/y columns hold lon/lat
# degrees once this cell has run.
for chunk_file in sorted(glob.glob('../data/transect_chunks/chunk_*.csv')):
    chunk = pd.read_csv(chunk_file, low_memory=False)

    # Re-run guard: reprojecting a chunk twice would treat lon/lat degrees as
    # ESRI:54009 metres and silently corrupt the coordinates. If every point
    # already fits the lon/lat value range the chunk is assumed to have been
    # converted (heuristic: a whole chunk of projected points within
    # 180 m x 90 m of the projection origin is not expected).
    already_lonlat = chunk['x'].abs().le(180).all() and chunk['y'].abs().le(90).all()
    if already_lonlat:
        print(f"  Skipping {chunk_file}: coordinates already look like EPSG:4326")
        continue

    # Reproject only the point geometries; all other columns pass through unchanged
    points = gpd.GeoSeries(
        gpd.points_from_xy(chunk['x'], chunk['y']),
        index=chunk.index,
        crs='ESRI:54009',
    ).to_crs('EPSG:4326')
    chunk['x'] = points.x
    chunk['y'] = points.y
    chunk.to_csv(chunk_file, index=False)

In [None]:
# Combine chunks, keeping only essential columns for Earth Engine
chunk_files = sorted(glob.glob('../data/transect_chunks/chunk_*.csv'))
print(f"Found {len(chunk_files)} chunk files")
if not chunk_files:
    # Fail with an actionable message instead of an IndexError further down
    raise FileNotFoundError(
        "No chunk_*.csv files found in ../data/transect_chunks/ - run the processing cell first"
    )

output_file = '../data/transects_final.csv'

# Essential columns only (removes problematic text fields with quotes/commas)
essential_cols = ['WDPA_PID', 'transectID', 'point_position', 'x', 'y']

# The first chunk creates/overwrites the file and writes the header;
# every later chunk is appended without a header.
print("Writing combined file with essential columns only...")
for i, chunk_file in enumerate(chunk_files, start=1):
    # usecols avoids parsing unneeded columns; the trailing selection fixes
    # the column order, which usecols alone does not control.
    chunk = pd.read_csv(chunk_file, usecols=essential_cols, low_memory=False)[essential_cols]
    is_first = i == 1
    chunk.to_csv(output_file, index=False, mode='w' if is_first else 'a', header=is_first)
    print(f"  Wrote chunk {i}/{len(chunk_files)}")
    del chunk  # release the chunk before loading the next one

print(f"\nComplete! Saved to {output_file}")
print(f"File size: {os.path.getsize(output_file) / 1024**2:.1f} MB")

# Now upload the combined CSV file as an asset to Earth Engine (x,y), crs is EPSG:4326
# Uploading the asset takes ~40min

Found 9 chunk files
Writing combined file...
  Wrote chunk 1/9
  Wrote chunk 2/9
  Wrote chunk 3/9
  Wrote chunk 4/9
  Wrote chunk 5/9
  Wrote chunk 6/9
  Wrote chunk 7/9
  Wrote chunk 8/9
  Wrote chunk 9/9

Complete! Saved to ../data/all_transects_combined.csv
File size: 4918.8 MB
