# collection

> `SiteCollection` for batch operations on multiple sites.

In [None]:
#| default_exp collection

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import json
from pathlib import Path
from typing import Optional, Union, Iterator, Callable, Any, List, TYPE_CHECKING
from dataclasses import dataclass, field

import ee
import pandas as pd

from gee_polygons.site import Site, _detect_crs, _reproject_geometry

if TYPE_CHECKING:
    from gee_polygons.layers import CategoricalLayer, ContinuousLayer

## Chunking Utilities

Internal utilities for splitting large site collections into manageable chunks.

In [None]:
#| export
def calculate_chunk_size(
    n_years: int,
    n_sites: int,
    target_rows: int = 40_000,
    rows_per_site_year: int = 30,
) -> int:
    """Calculate a chunk size (sites per request) from the expected data volume.

    Heuristics:
    - Target ~40,000 result rows per getInfo() call (conservative for GEE)
    - Each site-year produces ~10-50 class rows for categorical data, hence
      the default estimate of 30 rows per site-year
    - For continuous data each site-period produces far fewer rows; pass a
      smaller ``rows_per_site_year`` to get larger chunks

    The result is kept between 10 and 500 sites when a volume estimate is
    available, is never larger than ``n_sites``, and is never smaller than 1
    so that it is always a usable chunk size (even for an empty collection).

    Args:
        n_years: Number of years (or temporal periods) being extracted
        n_sites: Total number of sites
        target_rows: Target maximum rows per chunk (default 40,000)
        rows_per_site_year: Estimated result rows produced by one site in
            one year/period (default 30)

    Returns:
        Recommended number of sites per chunk (always >= 1)
    """
    rows_per_site = n_years * rows_per_site_year

    if rows_per_site <= 0:
        # No usable volume estimate (zero or negative period count, or a
        # non-positive row estimate): fall back to a fixed default.
        chunk_size = 100
    else:
        # At least 10 sites per chunk, but cap at 500 to avoid memory issues.
        chunk_size = min(max(10, target_rows // rows_per_site), 500)

    # Don't exceed the actual site count, but never recommend a size below 1.
    return max(1, min(chunk_size, n_sites))


def chunk_items(items: Any, chunk_size: int) -> Iterator[List]:
    """Split an iterable into consecutive lists of at most ``chunk_size`` items.

    The input is consumed lazily, one item at a time, so generators are
    supported. A chunk is emitted as soon as it is full; a final, shorter
    chunk is emitted if any items remain. A ``chunk_size`` of 1 or less
    results in single-item chunks.

    Args:
        items: Iterable of items to chunk (list, generator, etc.)
        chunk_size: Maximum items per chunk

    Yields:
        Lists of up to chunk_size items
    """
    pending: List = []
    for element in items:
        pending.append(element)
        if len(pending) < chunk_size:
            continue  # Keep filling the current chunk
        yield pending
        pending = []

    # Emit the trailing partial chunk, if there is one.
    if pending:
        yield pending

## ChunkedResult

A result container that tracks both successful extractions and any errors that occurred.

In [None]:
#| export
@dataclass
class ChunkedResult:
    """Result from chunked extraction with error tracking.

    Attributes:
        data: DataFrame containing successful extractions (one or more rows
            per site, identified by a ``site_id`` column)
        errors: List of dicts with site_id, chunk_idx, and error message
    """
    data: pd.DataFrame
    errors: List[dict] = field(default_factory=list)

    def _n_extracted_sites(self) -> int:
        """Count distinct site IDs present in ``data``.

        Returns 0 for an empty frame and also for a frame without a
        ``site_id`` column, so that ``success_rate`` and ``repr()`` never
        raise on an unexpected frame.
        """
        if len(self.data) == 0 or 'site_id' not in self.data.columns:
            return 0
        return len(self.data['site_id'].unique())

    @property
    def success_rate(self) -> float:
        """Fraction of sites successfully extracted.

        Computed as distinct sites in ``data`` divided by that count plus the
        number of error entries. Returns 1.0 when there is nothing to count.
        """
        n_success = self._n_extracted_sites()
        total = n_success + len(self.errors)
        return n_success / total if total > 0 else 1.0

    def __repr__(self) -> str:
        n_sites = self._n_extracted_sites()
        return f"ChunkedResult(sites={n_sites}, errors={len(self.errors)}, success_rate={self.success_rate:.1%})"

## SiteCollection

The `SiteCollection` class enables batch operations on multiple sites. It supports:

- **Eager mode**: All Site objects are created upfront (the default when loading a GeoJSON file with at most 1,000 features)
- **Lazy mode**: Site objects are created on demand to save memory for large collections

Use `SiteCollection` when you need to extract data from hundreds to thousands of sites efficiently.

In [None]:
#| export
class SiteCollection:
    """A collection of Sites for batch operations.

    Supports two modes:
    - Eager: All Site objects created upfront (the default when loading a
      GeoJSON file with at most ``lazy_threshold`` features, 1000 by default)
    - Lazy: Site objects created on-demand from stored feature dicts

    Example:
        # Load and extract from many sites
        sites = SiteCollection.from_geojson('restoration_sites.geojson')

        # Interactive batch extraction (Path A)
        result = sites.extract_categorical(MAPBIOMAS_LULC, years=range(2010, 2024))
        df = result.data

        # Export for large collections (Path B)
        task = sites.export_categorical(
            MAPBIOMAS_LULC,
            years=range(2010, 2024),
            destination=ExportDestination(type='drive', folder='exports')
        )
    """

    def __init__(
        self,
        sites: Optional[List[Site]] = None,
        feature_dicts: Optional[List[dict]] = None,
        source_crs: str = 'EPSG:4326',
        metadata: Optional[dict] = None
    ):
        """Create a SiteCollection.

        If ``sites`` is given the collection is eager and ``feature_dicts``
        is not consulted; otherwise Sites are built from ``feature_dicts``
        each time they are accessed.

        Args:
            sites: List of Site objects (eager mode)
            feature_dicts: List of GeoJSON feature dicts (lazy mode)
            source_crs: CRS for feature_dicts (used in lazy mode)
            metadata: Optional metadata dict (e.g., source file path)
        """
        self._sites = sites
        self._feature_dicts = feature_dicts
        self._source_crs = source_crs
        self._metadata = metadata or {}
        self._fc_cache: Optional[ee.FeatureCollection] = None

    def __len__(self) -> int:
        if self._sites is not None:
            return len(self._sites)
        return len(self._feature_dicts) if self._feature_dicts else 0

    def __iter__(self) -> Iterator[Site]:
        if self._sites is not None:
            yield from self._sites
        elif self._feature_dicts:
            # Lazy mode: a fresh Site is built on every pass over the data.
            for fd in self._feature_dicts:
                yield Site.from_geojson(fd, source_crs=self._source_crs)

    def __getitem__(self, idx: Union[int, slice]) -> Union[Site, 'SiteCollection']:
        """Return one Site (int index) or a sub-collection (slice).

        Slices keep the mode, source CRS and metadata of this collection.

        Raises:
            IndexError: If an int index is out of range (including any index
                into a collection created without sites or feature dicts)
            TypeError: If ``idx`` is neither an int nor a slice
        """
        if isinstance(idx, int):
            if self._sites is not None:
                return self._sites[idx]
            if self._feature_dicts is None:
                # No backing store at all: behave like an empty sequence
                # instead of failing with "'NoneType' is not subscriptable".
                raise IndexError("SiteCollection index out of range")
            return Site.from_geojson(self._feature_dicts[idx], source_crs=self._source_crs)
        elif isinstance(idx, slice):
            if self._sites is not None:
                return SiteCollection(
                    sites=self._sites[idx],
                    source_crs=self._source_crs,
                    metadata=self._metadata
                )
            return SiteCollection(
                feature_dicts=(self._feature_dicts or [])[idx],
                source_crs=self._source_crs,
                metadata=self._metadata
            )
        raise TypeError(f"indices must be integers or slices, not {type(idx).__name__}")

    def __repr__(self) -> str:
        mode = "eager" if self._sites is not None else "lazy"
        return f"SiteCollection(n={len(self)}, mode={mode})"

    @property
    def site_ids(self) -> List[str]:
        """List of all site IDs in the collection.

        Note: In lazy mode this builds a Site for every feature.
        """
        return [site.site_id for site in self]

    @property
    def feature_collection(self) -> ee.FeatureCollection:
        """Get as ee.FeatureCollection (cached).

        Note: This creates ee.Feature objects for all sites, which
        may be slow for very large collections.
        """
        if self._fc_cache is None:
            features = [site.feature for site in self]
            self._fc_cache = ee.FeatureCollection(features)
        return self._fc_cache

    def filter(self, predicate: Callable[[Site], bool]) -> 'SiteCollection':
        """Filter sites by a predicate function.

        The result is always an eager collection, because the predicate
        needs Site objects to be built anyway.

        Args:
            predicate: Function taking a Site, returning True to keep

        Returns:
            New SiteCollection with filtered sites
        """
        filtered = [s for s in self if predicate(s)]
        return SiteCollection(
            sites=filtered,
            source_crs=self._source_crs,
            metadata=self._metadata
        )

    def filter_by_property(self, key: str, value: Any) -> 'SiteCollection':
        """Filter sites by a property value.

        More efficient than filter() for lazy mode, as it operates
        on feature dicts directly without creating Site objects.
        Features whose ``properties`` member is missing or null simply
        do not match.

        Args:
            key: Property name to filter on
            value: Value to match

        Returns:
            New SiteCollection with matching sites
        """
        if self._feature_dicts is not None and self._sites is None:
            filtered = [
                fd for fd in self._feature_dicts
                # GeoJSON allows "properties": null, so guard against None.
                if (fd.get('properties') or {}).get(key) == value
            ]
            return SiteCollection(
                feature_dicts=filtered,
                source_crs=self._source_crs,
                metadata=self._metadata
            )
        return self.filter(lambda s: s.properties.get(key) == value)

    @classmethod
    def from_geojson(
        cls,
        path: Union[str, Path],
        source_crs: Optional[str] = None,
        lazy: bool = False,
        lazy_threshold: int = 1000
    ) -> 'SiteCollection':
        """Load sites from a GeoJSON file.

        Args:
            path: Path to GeoJSON file (FeatureCollection or single Feature)
            source_crs: Override CRS (auto-detected from file if not provided)
            lazy: Force lazy loading mode
            lazy_threshold: Auto-switch to lazy if more features than this

        Returns:
            SiteCollection instance
        """
        path = Path(path)

        # GeoJSON is UTF-8 by specification; don't depend on the locale default.
        with open(path, encoding='utf-8') as fh:
            data = json.load(fh)

        if source_crs is None:
            source_crs = _detect_crs(data)

        # A document without a "features" member is treated as a single Feature.
        features = data.get('features', [data])

        # Auto-switch to lazy for large files
        if lazy or len(features) > lazy_threshold:
            return cls(
                feature_dicts=features,
                source_crs=source_crs,
                metadata={'path': str(path), 'mode': 'lazy'}
            )
        else:
            sites = [Site.from_geojson(feat, source_crs=source_crs) for feat in features]
            return cls(sites=sites, metadata={'path': str(path), 'mode': 'eager'})

    @classmethod
    def from_sites(cls, sites: List[Site], metadata: Optional[dict] = None) -> 'SiteCollection':
        """Create from a list of Site objects.

        Args:
            sites: List of Site instances
            metadata: Optional metadata dict

        Returns:
            SiteCollection instance
        """
        return cls(sites=sites, metadata=metadata)

    @classmethod
    def from_feature_collection(
        cls,
        fc: ee.FeatureCollection,
        site_id_property: str = 'rid',
        start_year_property: str = 'start_year'
    ) -> 'SiteCollection':
        """Create from an ee.FeatureCollection.

        Note: This triggers a getInfo() call to fetch feature data.
        For large collections, prefer from_geojson() or export workflows.

        Features whose geometry is missing, null, or not a Polygon or
        MultiPolygon are skipped.

        Args:
            fc: Earth Engine FeatureCollection
            site_id_property: Property name for site ID
            start_year_property: Property name for start year

        Returns:
            SiteCollection instance
        """
        fc_info = fc.getInfo()
        features = fc_info.get('features', [])

        sites = []
        for f in features:
            # Both members may be present but null; normalize to empty dicts.
            props = f.get('properties') or {}
            geom = f.get('geometry') or {}

            # Reconstruct ee.Feature
            geom_type = geom.get('type')
            coords = geom.get('coordinates')

            if geom_type == 'Polygon':
                ee_geom = ee.Geometry.Polygon(coords)
            elif geom_type == 'MultiPolygon':
                ee_geom = ee.Geometry.MultiPolygon(coords)
            else:
                continue  # Skip unsupported types

            ee_feature = ee.Feature(ee_geom, props)

            site = Site(
                ee_feature,
                site_id=str(props.get(site_id_property, '')),
                start_year=props.get(start_year_property)
            )
            sites.append(site)

        return cls(sites=sites, metadata={'source': 'ee.FeatureCollection'})

## Batch Extraction Methods (Path A)

These methods extract data interactively, returning pandas DataFrames. Best for collections under ~5000 sites.

In [None]:
#| export
from fastcore.basics import patch

@patch
def extract_categorical(
    self: SiteCollection,
    layer: 'CategoricalLayer',
    years: List[int],
    chunk_size: Optional[int] = None,
    max_pixels: int = int(1e9),
    progress: bool = True
) -> ChunkedResult:
    """Extract categorical data from all sites.

    Batches sites into chunks to avoid GEE timeout/memory limits.
    Each chunk is processed with a single getInfo() call. Chunks are
    produced one at a time, so a lazy collection only holds the Site
    objects of the chunk currently being processed.

    If a chunk fails, the exception is not raised; instead one error entry
    is recorded for every site of that chunk and processing continues.

    Args:
        layer: CategoricalLayer to extract from
        years: List of years to extract
        chunk_size: Sites per chunk (auto-calculated if None)
        max_pixels: Max pixels per reduceRegion call
        progress: Show progress bar (requires tqdm)

    Returns:
        ChunkedResult with DataFrame and any errors

    Example:
        result = sites.extract_categorical(MAPBIOMAS_LULC, years=[2020, 2021, 2022])
        print(result)  # ChunkedResult(sites=500, errors=2, success_rate=99.6%)
        df = result.data
    """
    years_list = list(years)

    if chunk_size is None:
        chunk_size = calculate_chunk_size(len(years_list), len(self))

    all_dfs = []
    errors = []

    # Stream the chunks instead of materializing them all: building the full
    # list would instantiate every Site up front and defeat lazy mode.
    chunks = chunk_items(self, chunk_size)
    if progress:
        try:
            from tqdm.auto import tqdm
        except ImportError:
            pass  # tqdm not available; run without a progress bar
        else:
            # A generator has no length, so tell tqdm how many chunks to
            # expect (ceiling division; sizes below 1 behave like 1).
            per_chunk = max(chunk_size, 1)
            n_chunks = -(-len(self) // per_chunk)
            chunks = tqdm(chunks, total=n_chunks, desc="Extracting categorical")

    for chunk_idx, chunk_sites in enumerate(chunks):
        try:
            chunk_df = _extract_categorical_chunk(
                chunk_sites, layer, years_list, max_pixels
            )
            all_dfs.append(chunk_df)
        except Exception as e:
            # Best-effort: record the error for each site in the failed
            # chunk and keep going with the remaining chunks.
            for site in chunk_sites:
                errors.append({
                    'site_id': site.site_id,
                    'chunk_idx': chunk_idx,
                    'error': str(e)
                })

    data = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
    return ChunkedResult(data=data, errors=errors)

In [None]:
#| export
@patch
def extract_continuous(
    self: SiteCollection,
    layer: 'ContinuousLayer',
    start_date: str,
    end_date: str,
    reducer: str = 'mean',
    frequency: str = 'yearly',
    chunk_size: Optional[int] = None,
    max_pixels: int = int(1e9),
    progress: bool = True
) -> ChunkedResult:
    """Extract continuous data from all sites.

    Batches sites into chunks to avoid GEE timeout/memory limits. Chunks
    are produced one at a time, so a lazy collection only holds the Site
    objects of the chunk currently being processed.

    If a chunk fails, the exception is not raised; instead one error entry
    is recorded for every site of that chunk and processing continues.

    Args:
        layer: ContinuousLayer to extract from
        start_date: Start date (YYYY-MM-DD)
        end_date: End date (YYYY-MM-DD)
        reducer: Spatial reducer ('mean', 'median', 'min', 'max')
        frequency: Temporal grouping ('all', 'monthly', 'yearly')
        chunk_size: Sites per chunk (auto-calculated if None)
        max_pixels: Max pixels per reduceRegion call
        progress: Show progress bar (requires tqdm)

    Returns:
        ChunkedResult with DataFrame and any errors
    """
    # Estimate number of temporal periods for chunk size calculation
    start_year = int(start_date[:4])
    end_year = int(end_date[:4])
    n_periods = end_year - start_year + 1
    if frequency == 'monthly':
        n_periods *= 12

    if chunk_size is None:
        chunk_size = calculate_chunk_size(n_periods, len(self))

    all_dfs = []
    errors = []

    # Stream the chunks instead of materializing them all: building the full
    # list would instantiate every Site up front and defeat lazy mode.
    chunks = chunk_items(self, chunk_size)
    if progress:
        try:
            from tqdm.auto import tqdm
        except ImportError:
            pass  # tqdm not available; run without a progress bar
        else:
            # A generator has no length, so tell tqdm how many chunks to
            # expect (ceiling division; sizes below 1 behave like 1).
            per_chunk = max(chunk_size, 1)
            n_chunks = -(-len(self) // per_chunk)
            chunks = tqdm(chunks, total=n_chunks, desc="Extracting continuous")

    for chunk_idx, chunk_sites in enumerate(chunks):
        try:
            chunk_df = _extract_continuous_chunk(
                chunk_sites, layer, start_date, end_date,
                reducer, frequency, max_pixels
            )
            all_dfs.append(chunk_df)
        except Exception as e:
            # Best-effort: record the error for each site in the failed
            # chunk and keep going with the remaining chunks.
            for site in chunk_sites:
                errors.append({
                    'site_id': site.site_id,
                    'chunk_idx': chunk_idx,
                    'error': str(e)
                })

    data = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
    return ChunkedResult(data=data, errors=errors)

## Export Methods (Path B)

These methods export data to Google Drive or Cloud Storage using GEE batch tasks. Best for large collections (>5000 sites).

In [None]:
#| export
@patch
def export_categorical(
    self: SiteCollection,
    layer: 'CategoricalLayer',
    years: List[int],
    destination: 'ExportDestination',
    config: Optional['ExportConfig'] = None,
    max_pixels: int = int(1e9)
) -> 'ExportTask':
    """Export categorical extraction to Google Drive or Cloud Storage.

    For collections larger than ~5000 sites, this is more reliable
    than interactive extraction. One export task is started per chunk of
    sites, each writing its own file in ``destination.file_format``.

    Chunks are produced one at a time, so a lazy collection only holds the
    Site objects of the chunk currently being submitted.

    Args:
        layer: CategoricalLayer to extract from
        years: List of years to extract
        destination: Where to export (Drive or GCS). Any ``type`` other than
            'drive' is handled as Cloud Storage, with ``folder`` read as
            'bucket/optional/prefix'
        config: Export configuration (chunk size, concurrency)
        max_pixels: Max pixels per reduceRegion call

    Returns:
        ExportTask for monitoring progress

    Example:
        from gee_polygons.export import ExportDestination, ExportConfig

        task = sites.export_categorical(
            layer=MAPBIOMAS_LULC,
            years=range(2010, 2024),
            destination=ExportDestination(type='drive', folder='exports'),
            config=ExportConfig(chunk_size=50, max_concurrent=15)
        )

        # Monitor progress
        print(task.status())

        # Wait for completion
        task.wait(timeout_minutes=180)

        # Get result file locations
        print(task.results_info())
    """
    from gee_polygons.export import ExportDestination, ExportConfig, ExportTask, _wait_for_task_slot

    config = config or ExportConfig()
    years_list = list(years)

    task_ids = []
    chunk_mapping = {}
    active_tasks = []
    next_start = 0  # Index (within this collection) of the next chunk's first site

    # Iterate the chunk generator directly: materializing all chunks first
    # would instantiate every Site before the first task is submitted.
    for chunk_idx, chunk_sites in enumerate(chunk_items(self, config.chunk_size)):
        # Wait if at max concurrent tasks
        active_tasks = _wait_for_task_slot(active_tasks, config.max_concurrent)

        # Build server-side FeatureCollection for this chunk
        fc = _build_categorical_export_fc(chunk_sites, layer, years_list, max_pixels)

        # Create export task
        description = f"{config.description_prefix}_cat_chunk_{chunk_idx:04d}"
        file_name = f"{destination.file_prefix}_chunk_{chunk_idx:04d}"

        if destination.type == 'drive':
            task = ee.batch.Export.table.toDrive(
                collection=fc,
                description=description,
                folder=destination.folder,
                fileNamePrefix=file_name,
                fileFormat=destination.file_format
            )
        else:  # cloud_storage
            # The first path segment is the bucket; the rest is the object prefix.
            bucket, *prefix_parts = destination.folder.split('/')
            prefix = '/'.join(prefix_parts + [file_name])
            task = ee.batch.Export.table.toCloudStorage(
                collection=fc,
                description=description,
                bucket=bucket,
                fileNamePrefix=prefix,
                fileFormat=destination.file_format
            )

        task.start()
        task_id = task.id
        task_ids.append(task_id)
        active_tasks.append(task_id)

        # Record which slice of the collection this task covers, using a
        # running offset so the mapping follows the actual chunk lengths.
        chunk_mapping[task_id] = (next_start, next_start + len(chunk_sites))
        next_start += len(chunk_sites)

    return ExportTask(
        task_ids=task_ids,
        destination=destination,
        config=config,
        chunk_mapping=chunk_mapping
    )

In [None]:
#| export
@patch
def export_continuous(
    self: SiteCollection,
    layer: 'ContinuousLayer',
    start_date: str,
    end_date: str,
    destination: 'ExportDestination',
    reducer: str = 'mean',
    frequency: str = 'yearly',
    config: Optional['ExportConfig'] = None,
    max_pixels: int = int(1e9)
) -> 'ExportTask':
    """Export continuous extraction to Google Drive or Cloud Storage.

    One export task is started per chunk of sites. Chunks are produced one
    at a time, so a lazy collection only holds the Site objects of the
    chunk currently being submitted.

    Args:
        layer: ContinuousLayer to extract from
        start_date: Start date (YYYY-MM-DD)
        end_date: End date (YYYY-MM-DD)
        destination: Where to export (Drive or GCS). Any ``type`` other than
            'drive' is handled as Cloud Storage, with ``folder`` read as
            'bucket/optional/prefix'
        reducer: Spatial reducer ('mean', 'median', 'min', 'max')
        frequency: Temporal grouping ('monthly', 'yearly')
        config: Export configuration
        max_pixels: Max pixels per reduceRegion call

    Returns:
        ExportTask for monitoring progress

    Raises:
        ValueError: If ``frequency`` is 'all', which is not supported for export
    """
    from gee_polygons.export import ExportDestination, ExportConfig, ExportTask, _wait_for_task_slot

    if frequency == 'all':
        raise ValueError("frequency='all' not supported for export (too many records)")

    config = config or ExportConfig()

    task_ids = []
    chunk_mapping = {}
    active_tasks = []
    next_start = 0  # Index (within this collection) of the next chunk's first site

    # Iterate the chunk generator directly: materializing all chunks first
    # would instantiate every Site before the first task is submitted.
    for chunk_idx, chunk_sites in enumerate(chunk_items(self, config.chunk_size)):
        # Wait if at max concurrent tasks
        active_tasks = _wait_for_task_slot(active_tasks, config.max_concurrent)

        # Build server-side FeatureCollection
        fc = _build_continuous_export_fc(
            chunk_sites, layer, start_date, end_date,
            reducer, frequency, max_pixels
        )

        description = f"{config.description_prefix}_cont_chunk_{chunk_idx:04d}"
        file_name = f"{destination.file_prefix}_chunk_{chunk_idx:04d}"

        if destination.type == 'drive':
            task = ee.batch.Export.table.toDrive(
                collection=fc,
                description=description,
                folder=destination.folder,
                fileNamePrefix=file_name,
                fileFormat=destination.file_format
            )
        else:
            # The first path segment is the bucket; the rest is the object prefix.
            bucket, *prefix_parts = destination.folder.split('/')
            prefix = '/'.join(prefix_parts + [file_name])
            task = ee.batch.Export.table.toCloudStorage(
                collection=fc,
                description=description,
                bucket=bucket,
                fileNamePrefix=prefix,
                fileFormat=destination.file_format
            )

        task.start()
        task_id = task.id
        task_ids.append(task_id)
        active_tasks.append(task_id)

        # Record which slice of the collection this task covers, using a
        # running offset so the mapping follows the actual chunk lengths.
        chunk_mapping[task_id] = (next_start, next_start + len(chunk_sites))
        next_start += len(chunk_sites)

    return ExportTask(
        task_ids=task_ids,
        destination=destination,
        config=config,
        chunk_mapping=chunk_mapping
    )

## Internal Extraction Functions

These functions handle the actual GEE operations for each chunk.

In [None]:
#| export
def _extract_categorical_chunk(
    sites: List[Site],
    layer: 'CategoricalLayer',
    years: List[int],
    max_pixels: int
) -> pd.DataFrame:
    """Extract categorical data for a chunk of sites.

    Builds a single ee.FeatureCollection containing all site-year combinations
    (each carrying a class-frequency histogram), then calls getInfo() once for
    the entire chunk and converts the histograms to tidy rows.

    Args:
        sites: Sites to process in this chunk
        layer: CategoricalLayer describing the asset, band(s), scale and classes
        years: Years to extract
        max_pixels: Max pixels per reduceRegion call

    Returns:
        DataFrame with one row per site, year and class, with columns
        site_id, year, class_value, count, area_ha, class_name. Site-years
        whose histogram is null are omitted.
    """
    records = []

    for site in sites:
        for year in years:
            if layer.temporal_mode == 'band':
                # One multi-band image; each year is a separate band.
                img = ee.Image(layer.asset_id)
                band = layer.band_name(year)
                classified = img.select(band)
                band_name = band
            else:
                collection = (
                    ee.ImageCollection(layer.asset_id)
                    # filterDate's end bound is exclusive, so use the first
                    # day of the following year to include 31 December.
                    .filterDate(f'{year}-01-01', f'{int(year) + 1}-01-01')
                    .filterBounds(site.geometry)
                    .select(layer.band)
                )
                # Per-pixel most frequent class over the year's images;
                # the reducer appends '_mode' to the band name.
                classified = collection.reduce(ee.Reducer.mode())
                band_name = f'{layer.band}_mode'

            stats = classified.reduceRegion(
                reducer=ee.Reducer.frequencyHistogram(),
                geometry=site.geometry,
                scale=layer.scale,
                maxPixels=max_pixels
            )

            hist = ee.Dictionary(stats.get(band_name))

            records.append(
                ee.Feature(None, {
                    'site_id': site.site_id,
                    'year': year,
                    'histogram': hist
                })
            )

    # Single getInfo call for entire chunk
    fc = ee.FeatureCollection(records).getInfo()

    # Convert to tidy rows
    rows = []
    for f in fc['features']:
        props = f['properties']
        site_id = props['site_id']
        year = props['year']
        hist = props['histogram']

        if hist is None:
            continue

        for cls_str, count in hist.items():
            # Histogram keys arrive as strings; go through float so that a
            # key such as '3.0' is also accepted.
            cls = int(float(cls_str))
            rows.append({
                'site_id': site_id,
                'year': year,
                'class_value': cls,
                'count': count,
                # Pixel count * pixel area (scale squared), converted to
                # hectares; assumes layer.scale is in metres.
                'area_ha': count * (layer.scale ** 2) / 10_000,
                'class_name': layer.class_name(cls)
            })

    return pd.DataFrame(rows)

In [None]:
#| export
def _extract_continuous_chunk(
    sites: List[Site],
    layer: 'ContinuousLayer',
    start_date: str,
    end_date: str,
    reducer: str,
    frequency: str,
    max_pixels: int
) -> pd.DataFrame:
    """Extract continuous data for a chunk of sites.

    Builds a single ee.FeatureCollection for all site-period combinations.
    """
    reducer_fn = getattr(ee.Reducer, reducer)()
    bands = layer.bands

    # Build temporal periods
    if frequency == 'yearly':
        periods = _build_yearly_periods(start_date, end_date)
    elif frequency == 'monthly':
        periods = _build_monthly_periods(start_date, end_date)
    else:
        # 'all' frequency - handled differently below
        periods = None

    records = []

    for site in sites:
        # Load and preprocess collection for this site
        collection = (
            ee.ImageCollection(layer.collection_id)
            .filterDate(start_date, end_date)
            .filterBounds(site.geometry)
        )

        if layer.preprocess is not None:
            collection = collection.map(layer.preprocess)

        collection = collection.select(bands)

        if frequency == 'all':
            # One record per image
            def reduce_image(img):
                stats = img.reduceRegion(
                    reducer=reducer_fn,
                    geometry=site.geometry,
                    scale=layer.scale,
                    maxPixels=max_pixels
                )
                props = {
                    'site_id': site.site_id,
                    'date': img.date().format('YYYY-MM-dd')
                }
                for band in bands:
                    props[band] = stats.get(band)
                return ee.Feature(None, props)

            site_records = collection.map(reduce_image)
            records.append(site_records)

        else:
            # Aggregate by period
            for period in periods:
                filtered = collection.filterDate(period['start'], period['end'])
                composite = filtered.median()

                stats = composite.reduceRegion(
                    reducer=reducer_fn,
                    geometry=site.geometry,
                    scale=layer.scale,
                    maxPixels=max_pixels
                )

                props = {'site_id': site.site_id}
                props.update(period['props'])
                for band in bands:
                    props[band] = stats.get(band)

                records.append(ee.Feature(None, props))

    # Handle 'all' frequency differently (records are FeatureCollections)
    if frequency == 'all':
        merged = ee.FeatureCollection([])
        for fc in records:
            merged = merged.merge(fc)
        fc = merged.getInfo()
    else:
        fc = ee.FeatureCollection(records).getInfo()

    # Convert to rows
    rows = []
    for f in fc['features']:
        props = f['properties']

        # Skip if all band values are None
        if all(props.get(b) is None for b in bands):
            continue

        row = {'site_id': props['site_id']}

        if frequency == 'all':
            row['date'] = props.get('date')
        elif frequency == 'yearly':
            row['year'] = props.get('year')
        elif frequency == 'monthly':
            row['year'] = props.get('year')
            row['month'] = props.get('month')

        for band in bands:
            row[band] = props.get(band)

        rows.append(row)

    return pd.DataFrame(rows)

In [None]:
#| export
def _build_yearly_periods(start_date: str, end_date: str) -> List[dict]:
    """Build list of yearly periods."""
    start_year = int(start_date[:4])
    end_year = int(end_date[:4])

    periods = []
    for year in range(start_year, end_year + 1):
        periods.append({
            'start': f'{year}-01-01',
            'end': f'{year}-12-31',
            'props': {'year': year}
        })
    return periods


def _build_monthly_periods(start_date: str, end_date: str) -> List[dict]:
    """Build list of monthly periods."""
    import datetime

    start = datetime.date.fromisoformat(start_date)
    end = datetime.date.fromisoformat(end_date)

    periods = []
    current = start.replace(day=1)

    while current <= end:
        year, month = current.year, current.month
        if month == 12:
            next_month = datetime.date(year + 1, 1, 1)
        else:
            next_month = datetime.date(year, month + 1, 1)

        periods.append({
            'start': current.isoformat(),
            'end': next_month.isoformat(),
            'props': {'year': year, 'month': month}
        })

        current = next_month

    return periods

## Export Helper Functions

In [None]:
#| export
def _build_categorical_export_fc(
    sites: List[Site],
    layer: 'CategoricalLayer',
    years: List[int],
    max_pixels: int
) -> ee.FeatureCollection:
    """Build FeatureCollection for categorical export.

    Creates one geometry-less feature per site-year combination with the
    properties site_id, year, histogram and scale. ``histogram`` is the
    class-frequency dictionary returned by the frequencyHistogram reducer;
    it is NOT flattened into per-class rows here, so consumers of the
    exported files have to expand it themselves (``scale`` is included so
    pixel counts can be converted to areas).

    Args:
        sites: Sites to process in this chunk
        layer: CategoricalLayer describing the asset, band(s) and scale
        years: Years to extract
        max_pixels: Max pixels per reduceRegion call

    Returns:
        Server-side ee.FeatureCollection (nothing is fetched here)
    """
    records = []

    for site in sites:
        for year in years:
            if layer.temporal_mode == 'band':
                # One multi-band image; each year is a separate band.
                img = ee.Image(layer.asset_id)
                band = layer.band_name(year)
                classified = img.select(band)
                band_name = band
            else:
                collection = (
                    ee.ImageCollection(layer.asset_id)
                    # filterDate's end bound is exclusive, so use the first
                    # day of the following year to include 31 December.
                    .filterDate(f'{year}-01-01', f'{int(year) + 1}-01-01')
                    .filterBounds(site.geometry)
                    .select(layer.band)
                )
                # Per-pixel most frequent class over the year's images;
                # the reducer appends '_mode' to the band name.
                classified = collection.reduce(ee.Reducer.mode())
                band_name = f'{layer.band}_mode'

            stats = classified.reduceRegion(
                reducer=ee.Reducer.frequencyHistogram(),
                geometry=site.geometry,
                scale=layer.scale,
                maxPixels=max_pixels
            )

            hist = ee.Dictionary(stats.get(band_name))

            # The histogram is attached as a dictionary-valued property.
            # NOTE(review): an earlier comment said it should be stored as a
            # JSON string because exports don't handle nested dicts well;
            # confirm how the chosen file format serializes this property.
            records.append(
                ee.Feature(None, {
                    'site_id': site.site_id,
                    'year': year,
                    'histogram': hist,
                    'scale': layer.scale
                })
            )

    return ee.FeatureCollection(records)


def _build_continuous_export_fc(
    sites: List[Site],
    layer: 'ContinuousLayer',
    start_date: str,
    end_date: str,
    reducer: str,
    frequency: str,
    max_pixels: int
) -> ee.FeatureCollection:
    """Build FeatureCollection for continuous export.

    Produces one geometry-less feature per site and temporal period. Each
    feature carries site_id, the period properties (year, or year and
    month) and one property per band holding the spatially reduced value
    of the period's median composite.

    Raises:
        ValueError: If ``frequency`` is neither 'yearly' nor 'monthly'
    """
    spatial_reducer = getattr(ee.Reducer, reducer)()
    band_names = layer.bands

    if frequency == 'monthly':
        windows = _build_monthly_periods(start_date, end_date)
    elif frequency == 'yearly':
        windows = _build_yearly_periods(start_date, end_date)
    else:
        raise ValueError(f"Unsupported frequency for export: {frequency}")

    def site_features(site):
        """Yield one feature per period for a single site."""
        images = (
            ee.ImageCollection(layer.collection_id)
            .filterDate(start_date, end_date)
            .filterBounds(site.geometry)
        )
        if layer.preprocess is not None:
            images = images.map(layer.preprocess)
        images = images.select(band_names)

        for window in windows:
            # Median composite over the period, then reduce over the site.
            reduced = (
                images
                .filterDate(window['start'], window['end'])
                .median()
                .reduceRegion(
                    reducer=spatial_reducer,
                    geometry=site.geometry,
                    scale=layer.scale,
                    maxPixels=max_pixels
                )
            )

            attrs = {'site_id': site.site_id, **window['props']}
            attrs.update({name: reduced.get(name) for name in band_names})
            yield ee.Feature(None, attrs)

    features = [feature for site in sites for feature in site_features(site)]
    return ee.FeatureCollection(features)

## Example Usage

In [None]:
#| eval: false
# Authenticate and initialize Earth Engine before running the examples below.
# Replace "your-project-id" with your own project ID.
ee.Authenticate()
ee.Initialize(project="your-project-id")

In [None]:
#| eval: false
# Load a collection of sites from a GeoJSON file.
# Printing the collection shows the number of sites and the loading mode
# (eager or lazy).
sites = SiteCollection.from_geojson('../data/restoration_sites_subset.geojson')
print(sites)

In [None]:
#| eval: false
# Extract categorical data (Path A - interactive) for the years 2018-2022.
# `result` is a ChunkedResult: printing it summarizes sites, errors and
# success rate, and `result.data` holds the extracted rows as a DataFrame.
from gee_polygons.datasets.mapbiomas import MAPBIOMAS_LULC

result = sites.extract_categorical(MAPBIOMAS_LULC, years=range(2018, 2023))
print(result)
result.data.head()

In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev; nbdev.nbdev_export()