In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np

In [None]:
# Load the protected-area polygons that every later cell samples from.
wdpa_final = gpd.read_file("../data/wdpa_final.shp")
# Kept as a notebook-level global: batch_remove_bad_transects reads `crs`
# directly instead of receiving it as an argument.
# NOTE(review): later cells buffer and space points using distances given in
# meters, so this is presumably a projected CRS with meter units — confirm.
crs = wdpa_final.crs
len(wdpa_final)  # number of rows loaded; 5012 noted by the original author

In [None]:
# all functions
def evenspace(xy, sep, start=0):
    """
    Creates points along a line at a fixed spacing.

    The line is the polyline through the rows of ``xy``. Positions are
    generated every ``sep`` units of path length beginning at ``start``; the
    last generated position is dropped so that a closed ring does not get a
    point sitting next to its own starting point.

    Parameters:
    -----------
    xy : array-like
        Nx2 array of coordinates (x, y). Extra columns are ignored.
    sep : float
        Separation distance between points
    start : float, optional
        Starting distance along the line (default=0)

    Returns:
    --------
    DataFrame with columns: x, y, x0, y0, x1, y1, theta
        (x, y) is the interpolated point, (x0, y0) / (x1, y1) are the start
        and end vertices of the segment the point lies on, and theta is the
        angle (radians) of the vector pointing from (x1, y1) back to (x0, y0).
        The frame is empty when the line is too short to hold any point.
    """
    columns = ['x', 'y', 'x0', 'y0', 'x1', 'y1', 'theta']
    xy = np.asarray(xy, dtype=float)

    # Without at least two vertices there is no segment to walk along.
    if xy.ndim != 2 or len(xy) < 2:
        return pd.DataFrame(columns=columns)

    # dseg[i] is the length of the segment ENDING at vertex i (0 for vertex 0),
    # so dtotal[i] is the path length from the first vertex to vertex i.
    dx = np.concatenate([[0], np.diff(xy[:, 0])])
    dy = np.concatenate([[0], np.diff(xy[:, 1])])
    dseg = np.sqrt(dx**2 + dy**2)
    dtotal = np.cumsum(dseg)
    linelength = dtotal[-1]

    # Generate positions along the line
    pos = np.arange(start, linelength, sep)
    pos = pos[:-1]  # Remove last point to avoid enclosed point

    if len(pos) == 0:
        return pd.DataFrame(columns=columns)

    # searchsorted(side='right') counts the vertices with dtotal <= pos. That
    # count is a 1-based vertex number, so subtract 1 to get the 0-based index
    # of the vertex that STARTS the segment containing each position.
    whichseg = np.searchsorted(dtotal, pos, side='right') - 1

    # Safety net only (e.g. a negative start): keep indices on a real segment.
    whichseg = np.clip(whichseg, 0, len(xy) - 2)

    x0, y0 = xy[whichseg, 0], xy[whichseg, 1]
    x1, y1 = xy[whichseg + 1, 0], xy[whichseg + 1, 1]

    # Fraction of the way along the containing segment.
    f = (pos - dtotal[whichseg]) / dseg[whichseg + 1]

    return pd.DataFrame(
        {
            'x': x0 + f * (x1 - x0),
            'y': y0 + f * (y1 - y0),
            'x0': x0,
            'y0': y0,
            'x1': x1,
            'y1': y1,
            'theta': np.arctan2(y0 - y1, x0 - x1),
        },
        columns=columns,
    )


def transect(tpts, tlen, npts=1):
    """
    Creates points perpendicular to a line with set distance.

    For every row of ``tpts`` a transect is laid out through (x, y) at a right
    angle to ``theta``. Each transect holds ``2 * npts + 1`` points: the centre
    (position 0), ``npts`` points at negative positions and ``npts`` points at
    positive positions, each ``tlen`` apart.

    Parameters:
    -----------
    tpts : DataFrame
        DataFrame from evenspace with columns: x, y, theta
    tlen : float
        Length of transect steps
    npts : int, optional
        Number of points on one side in addition to center (default=1).
        ``npts=0`` yields only the centre points.

    Returns:
    --------
    DataFrame with columns: transectID, point_position, x, y
        transectID is 1-based and follows the row order of ``tpts``;
        point_position runs from -npts to +npts as floats. Rows are ordered by
        transectID, then point_position.

    Raises:
    -------
    ValueError
        If ``npts`` is negative.
    """
    if len(tpts) == 0:
        return pd.DataFrame(columns=['transectID', 'point_position', 'x', 'y'])
    if npts < 0:
        raise ValueError("npts must be non-negative")

    x = tpts['x'].to_numpy(dtype=float)
    y = tpts['y'].to_numpy(dtype=float)

    # Unit step perpendicular to the boundary direction, scaled to tlen.
    theta_t = tpts['theta'].to_numpy(dtype=float) + np.pi / 2
    dx = tlen * np.cos(theta_t)
    dy = tlen * np.sin(theta_t)

    # Position p sits at (x - p*dx, y - p*dy): negative positions move along
    # +(dx, dy), positive positions along -(dx, dy), and 0 is the centre.
    steps = np.arange(-npts, npts + 1)
    xx = x[:, None] - steps[None, :] * dx[:, None]
    yy = y[:, None] - steps[None, :] * dy[:, None]

    # Flatten row-major so each transect's points stay contiguous and already
    # sorted by (transectID, point_position).
    n_transects = len(x)
    return pd.DataFrame({
        'transectID': np.repeat(np.arange(1, n_transects + 1), len(steps)),
        'point_position': np.tile(steps.astype(float), n_transects),
        'x': xx.ravel(),
        'y': yy.ravel(),
    })


def extract_coords(geom):
    """
    Extract the exterior-ring coordinates of a polygonal geometry.

    Parameters:
    -----------
    geom : geometry or None
        Object exposing ``geom_type`` and ``is_empty``, plus ``exterior`` for a
        Polygon or ``geoms`` (each with ``area`` and ``exterior``) for a
        MultiPolygon.

    Returns:
    --------
    numpy array of the exterior ring's coordinates. For a MultiPolygon only
    the part with the largest area is used. Returns None for a missing (None)
    geometry, an empty geometry, or any other geometry type.
    """
    # Missing or empty geometries have no boundary to sample; report them the
    # same way as unsupported geometry types instead of raising.
    if geom is None or geom.is_empty:
        return None

    if geom.geom_type == 'Polygon':
        return np.array(geom.exterior.coords)

    if geom.geom_type == 'MultiPolygon':
        # Use the largest polygon for MultiPolygon
        largest = max(geom.geoms, key=lambda p: p.area)
        return np.array(largest.exterior.coords)

    return None


def create_transects(park_row, sample_dist, transect_unit, transect_pts):
    """
    Build the transect points for one park.

    Parameters:
    -----------
    park_row : tuple
        (index, Series) pair as yielded by ``DataFrame.iterrows()``; the
        Series must carry a ``geometry`` entry.
    sample_dist : float
        Spacing between transects along the park boundary.
    transect_unit : float
        Spacing between points along each transect.
    transect_pts : int
        Number of points on each side of the boundary point.

    Returns:
    --------
    DataFrame of transect points with every park attribute (except
    ``geometry`` / ``geometry_t``) repeated on each row, or None when the
    park yields no usable boundary, boundary points or transect points.
    """
    park = park_row[1]  # iterrows() yields (index, Series); keep the Series

    # Boundary ring of the park; anything with fewer than 3 vertices is unusable
    boundary = extract_coords(park.geometry)
    if boundary is None or len(boundary) < 3:
        return None

    # Evenly spaced anchor points along the boundary
    anchors = evenspace(boundary, sample_dist)
    if len(anchors) == 0:
        return None

    # Perpendicular sample points through every anchor
    points = transect(anchors, transect_unit, npts=transect_pts)
    if len(points) == 0:
        return None

    # Carry the park's non-geometry attributes onto every point row
    skipped = ('geometry', 'geometry_t')
    attributes = {}
    for column in park.index:
        if column not in skipped:
            attributes[column] = park[column]

    return points.assign(**attributes)


def batch_remove_bad_transects(batch_list, wdpa_buffer_dict):
    """
    Drop every transect that has an inner point outside its park's buffer.

    Parameters:
    -----------
    batch_list : list of DataFrame
        Per-park transect point frames; each must provide the columns
        WDPA_PID, transectID, point_position, x and y.
    wdpa_buffer_dict : dict
        Maps WDPA_PID to the buffer geometry used for the containment test.

    Returns:
    --------
    The concatenated batch rows (all original columns) with every row of a
    flagged (WDPA_PID, transectID) pair removed.

    Notes:
    ------
    Reads the notebook-level global ``crs`` for both GeoDataFrames.
    """
    batch_df = pd.concat(batch_list, ignore_index=True)
    
    # Create minimal geodataframe for spatial operations
    # (only the key columns plus point geometry; attribute columns stay in batch_df)
    batch_gdf = gpd.GeoDataFrame(
        batch_df[['WDPA_PID', 'transectID', 'point_position']], 
        geometry=gpd.points_from_xy(batch_df['x'], batch_df['y']), 
        crs=crs
    )
    
    # Get pre-computed buffers for this batch
    # IDs missing from wdpa_buffer_dict are skipped here; the inner merge below
    # then drops their points from the check, so they are never flagged.
    # NOTE(review): if no ID in the batch has a buffer, this frame has no
    # 'geometry' column — confirm the GeoDataFrame constructor tolerates that.
    pa_ids = batch_df['WDPA_PID'].unique()
    batch_buffers = pd.DataFrame([
        {'WDPA_PID': pid, 'geometry': wdpa_buffer_dict[pid]} 
        for pid in pa_ids if pid in wdpa_buffer_dict
    ])
    batch_buffers = gpd.GeoDataFrame(batch_buffers, geometry='geometry', crs=crs)
    
    # Find bad points (inner points outside the inner buffer)
    # Both frames have a 'geometry' column, so after the merge the point is
    # geometry_x and the park buffer is geometry_y (pandas' default suffixes).
    # Only negative point_position rows are tested; one failing point flags
    # the whole (WDPA_PID, transectID) pair.
    # NOTE(review): if wdpa_buffer_dict is the one built in the "Generate
    # Transect Points" cell, each buffer is the park shrunk by
    # transect_unit * transect_pts + 500, which is farther from the boundary
    # than any inner point can reach — confirm this does not flag every transect.
    transect_with_buffer = batch_gdf.merge(batch_buffers, on='WDPA_PID')
    bad_points = transect_with_buffer[
        ~transect_with_buffer.geometry_x.within(transect_with_buffer.geometry_y) & 
        (transect_with_buffer['point_position'] < 0)
    ][['WDPA_PID', 'transectID']].drop_duplicates()
    
    # Filter and return
    # Anti-join: left-merge with an indicator and keep only the rows that had
    # no match among the flagged pairs.
    filtered = batch_df.merge(bad_points, on=['WDPA_PID', 'transectID'], how='left', indicator=True).query('_merge == "left_only"').drop(columns='_merge')
    return filtered

In [None]:
# ======================================================
# Generate Transect Points
# ======================================================
# Sampling parameters (all distances in meters)
sample_dist = 500      # spacing between transects along the boundary
transect_unit = 2500   # spacing between sample points along one transect
transect_pts = 2       # sample points on each side of the boundary point
# Inner-buffer distance used to judge point validity: full one-sided
# transect length plus a 500 m margin
buffer_dist = transect_unit * transect_pts + 500

# Shrink every park polygon once up front and index the results by park ID
print("Creating inner buffers for all protected areas...")
inner_geoms = wdpa_final['geometry'].buffer(-buffer_dist)
wdpa_buffer_dict = dict(zip(wdpa_final['WDPA_PID'], inner_geoms))
del inner_geoms

print(f"Processing {len(wdpa_final)} protected areas...")

# One frame of transect points per park; parks that produce nothing
# (create_transects returned None) are left out
transect_list = [
    park_transects
    for park_transects in (
        create_transects(park_row, sample_dist, transect_unit, transect_pts)
        for park_row in wdpa_final.iterrows()
    )
    if park_transects is not None
]
print(f"Successfully created transects for {len(transect_list)} protected areas")

# The source polygons are no longer needed; release the memory
del wdpa_final

In [None]:
# Summary statistics over all generated transects
total_points = sum(map(len, transect_list))
total_transects = sum(
    park_points['transectID'].nunique() for park_points in transect_list
)
avg_transects_per_pa = total_transects / len(transect_list)
avg_points_per_transect = total_points / total_transects

# Report totals first, then the per-park and per-transect averages
print(f"Total transect points: {total_points:,}")
print(f"Total unique transects: {total_transects:,}")
print(f"Average transects per PA: {avg_transects_per_pa:.1f}")
print(f"Average points per transect: {avg_points_per_transect:.1f}")