# extract

> The extraction engine: `extract_categorical` and `extract_continuous` for pulling stats from any raster.

In [1]:
#| default_exp extract

In [2]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
import ee
import geemap
import pandas as pd
from fastcore.basics import patch
from typing import Optional, Literal

from gee_polygons.layers import CategoricalLayer, ContinuousLayer
from gee_polygons.site import Site



## The Core Primitive

The heart of gee-polygons is one powerful primitive:

```python
site.extract_categorical(layer, years=[2018, 2019, 2020])
```

Given:
- A polygon (Site)
- A categorical raster descriptor (CategoricalLayer)
- A list of years

It returns a tidy DataFrame with pixel counts and areas per class, per year.

**This function knows nothing about MapBiomas, deforestation, or Brazil.** It's pure geometry + categorical values + time.

In [4]:
#| export
@patch
def extract_categorical(
    self: Site,
    layer: CategoricalLayer,
    years: list[int],
    max_pixels: int = int(1e9)
) -> pd.DataFrame:
    """Extract per-year pixel counts and areas from a categorical raster.

    Supports two temporal modes:
    - 'band': Each year is a band in a single Image (e.g., MapBiomas)
    - 'collection': An ImageCollection filtered by date (e.g., Dynamic World).
      All images of the calendar year are collapsed with a per-pixel mode
      (most frequent class) before the histogram is computed.

    All years are sent to Earth Engine in a single `getInfo` request.

    Args:
        layer: A CategoricalLayer describing the data source
        years: Years to extract (a list, range, or any iterable of ints)
        max_pixels: Maximum pixels for reduction (default 1e9)

    Returns:
        A tidy DataFrame with columns:
        - site_id: Site identifier
        - year: Year of observation
        - class_value: Integer class value
        - count: Pixel count from the frequency histogram. Values can be
          fractional because the reducer weights pixels that are only
          partially covered by the site geometry.
        - area_ha: Area in hectares, computed as count * scale**2 / 10,000
        - class_name: Value returned by `layer.class_name(class_value)`

        The columns are always present, even when there are no rows
        (empty `years`, or no histogram returned for any year).
    """
    columns = ['site_id', 'year', 'class_value', 'count', 'area_ha', 'class_name']

    # Materialize so ranges/generators can be checked for emptiness and iterated.
    years = list(years)
    if not years:
        # Nothing to request: skip the Earth Engine round trip entirely.
        return pd.DataFrame(columns=columns)

    band_mode = layer.temporal_mode == 'band'
    # In 'band' mode every year is a band of the same Image, so build it once.
    source_image = ee.Image(layer.asset_id) if band_mode else None

    records = []
    for year in years:
        if band_mode:
            # MapBiomas style: select the year's band from the single image
            band_name = layer.band_name(year)
            classified = source_image.select(band_name)
        else:
            # Dynamic World style: filter collection and reduce
            collection = (
                ee.ImageCollection(layer.asset_id)
                # filterDate's end is exclusive, so the window must end on
                # 1 January of the following year to include 31 December.
                .filterDate(f'{year}-01-01', f'{int(year) + 1}-01-01')
                .filterBounds(self.geometry)
                .select(layer.band)
            )
            # Use mode (most frequent class) to aggregate across time
            classified = collection.reduce(ee.Reducer.mode())
            # reduce() suffixes the output band with the reducer name
            band_name = f'{layer.band}_mode'

        stats = classified.reduceRegion(
            reducer=ee.Reducer.frequencyHistogram(),
            geometry=self.geometry,
            scale=layer.scale,
            maxPixels=max_pixels
        )

        # Wrap the histogram in a feature so all years share one getInfo call
        records.append(
            ee.Feature(None, {
                'year': year,
                'histogram': ee.Dictionary(stats.get(band_name))
            })
        )

    # Single getInfo call for all years
    fc = ee.FeatureCollection(records).getInfo()

    # Convert to tidy rows
    rows = []
    for feature in fc['features']:
        props = feature['properties']
        # Read with .get(): a year without data may come back with the
        # histogram property missing rather than set to None.
        hist = props.get('histogram')
        if hist is None:
            continue

        year = props['year']
        for cls_str, count in hist.items():
            cls = int(float(cls_str))  # Histogram keys are strings, possibly float-formatted
            rows.append({
                'site_id': self.site_id,
                'year': year,
                'class_value': cls,
                'count': count,
                'area_ha': count * (layer.scale ** 2) / 10_000,
                'class_name': layer.class_name(cls)
            })

    return pd.DataFrame(rows, columns=columns)

## Example Usage

Let's test with a simple example. First, we need to initialize GEE and load a site.

In [5]:
# Initialize Earth Engine
# NOTE(review): the Cloud project ID below is hard-coded; replace it with your
# own project when running this notebook elsewhere.
ee.Authenticate()
ee.Initialize(project='hs-brazilreforestation')

In [6]:
# Load the example sites from GeoJSON and pick one; `site` is reused by the
# example cells below.
from gee_polygons.site import load_sites
sites = load_sites('../data/restoration_sites_subset.geojson')
site = sites[8]
print(site)

Site(id=9368, start_year=2012)


In [7]:
# Define a layer (or use a preset from gee_polygons.datasets)
# `band_pattern` is the per-year band name template; no class names are
# supplied here, so `class_name` is empty in the output below.
layer = CategoricalLayer(
     asset_id='projects/mapbiomas-public/assets/brazil/lulc/collection10/mapbiomas_brazil_collection10_coverage_v2',
     band_pattern='classification_{}',
     scale=30
)

# Extract stats: one row per (year, class) present inside the site
df = site.extract_categorical(layer, years=[2012, 2013, 2014, 2015])
df.head(10)

Unnamed: 0,site_id,year,class_value,count,area_ha,class_name
0,9368,2012,21,147.211765,13.249059,
1,9368,2012,3,9.196078,0.827647,
2,9368,2013,21,99.458824,8.951294,
3,9368,2013,3,56.94902,5.125412,
4,9368,2014,21,86.427451,7.778471,
5,9368,2014,3,69.980392,6.298235,
6,9368,2015,3,156.407843,14.076706,


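The output is always a tidy DataFrame. The values below are illustrative; in real output `count` can be fractional (as in the example above), because pixels only partially covered by the polygon are weighted by their covered fraction: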

| site_id | year | class_value | count | area_ha | class_name |
|---------|------|-------------|-------|---------|------------|
| 3107    | 2020 | 3           | 1520  | 136.8   | None       |
| 3107    | 2020 | 5           | 203   | 18.3    | None       |
| 3107    | 2021 | 3           | 1701  | 153.1   | None       |

If the `CategoricalLayer` has a `class_map`, the `class_name` column will be populated.

We can also use a preset dataset, which already carries its class names.

In [8]:
# Use a preset layer; range(2012, 2015) covers the years 2012, 2013 and 2014.
from gee_polygons.datasets.mapbiomas import MAPBIOMAS_DEFREG
df = site.extract_categorical(MAPBIOMAS_DEFREG, years=range(2012, 2015))
df.head(10)

Unnamed: 0,site_id,year,class_value,count,area_ha,class_name
0,9368,2012,1,147.211765,13.249059,Anthropic
1,9368,2012,2,0.462745,0.041647,Primary Vegetation
2,9368,2012,3,8.733333,0.786,Secondary Vegetation
3,9368,2013,1,99.458824,8.951294,Anthropic
4,9368,2013,2,0.462745,0.041647,Primary Vegetation
5,9368,2013,3,8.733333,0.786,Secondary Vegetation
6,9368,2013,5,47.752941,4.297765,Secondary Veg Regrowth
7,9368,2014,1,86.427451,7.778471,Anthropic
8,9368,2014,2,0.462745,0.041647,Primary Vegetation
9,9368,2014,3,56.486275,5.083765,Secondary Vegetation


## Visualizing Layers

Before extracting stats, it's useful to visually verify the data. The `show_layer` method displays the categorical raster for specified years with the site polygon overlaid.

In [9]:
#| export
@patch
def show_layer(
    self: Site,
    layer: CategoricalLayer,
    years: list[int],
    zoom: int = 14,
    basemap: str = 'SATELLITE',
    site_color: str = 'blue',
    buffer_m: Optional[float] = None
) -> geemap.Map:
    """Display a categorical layer for multiple years with the site overlaid.

    Supports both temporal modes:
    - 'band': Shows each year's band from a single Image
    - 'collection': Shows the mode-reduced composite of each calendar year

    One map layer is added per year, named after the year. Only the last
    year in `years` is visible initially; the others can be toggled on.

    Args:
        layer: A CategoricalLayer to visualize
        years: Years to add as layers (a list, range, or any iterable of ints)
        zoom: Initial zoom level (default 14)
        basemap: Basemap type (default 'SATELLITE')
        site_color: Color for site boundary (default 'blue')
        buffer_m: Optional buffer around site for clipping display; when
            omitted (or 0) the raster is clipped to the site geometry itself

    Returns:
        A geemap.Map with yearly classification layers
    """
    # Materialize so generators work with the len() check below.
    years = list(years)

    m = geemap.Map()
    m.add_basemap(basemap)
    m.center_object(self.geometry, zoom)

    # Determine clip geometry
    clip_geom = self.buffer(buffer_m) if buffer_m else self.geometry

    # Build visualization params from palette
    if layer.palette:
        classes = sorted(layer.palette.keys())
        colors = [layer.palette[c] for c in classes]
        # NOTE(review): Earth Engine stretches a palette linearly between
        # min and max, so colors presumably only line up with classes when
        # the class values are contiguous — confirm for layers with gaps.
        vis_params = {
            'min': min(classes),
            'max': max(classes),
            'palette': colors
        }
    else:
        vis_params = {'min': 0, 'max': 10}

    band_mode = layer.temporal_mode == 'band'
    # In 'band' mode every year is a band of the same Image, so build it once.
    source_image = ee.Image(layer.asset_id) if band_mode else None
    last_index = len(years) - 1

    # Add each year as a layer (last one visible by default)
    for i, year in enumerate(years):
        if band_mode:
            # MapBiomas style
            classified = source_image.select(layer.band_name(year)).clip(clip_geom)
        else:
            # Dynamic World style
            collection = (
                ee.ImageCollection(layer.asset_id)
                # filterDate's end is exclusive, so the window must end on
                # 1 January of the following year to include 31 December.
                .filterDate(f'{year}-01-01', f'{int(year) + 1}-01-01')
                .filterBounds(self.geometry)
                .select(layer.band)
            )
            classified = collection.reduce(ee.Reducer.mode()).clip(clip_geom)

        m.add_layer(classified, vis_params, f'{year}', shown=(i == last_index))

    # Add site boundary on top (transparent fill so the raster stays visible)
    site_style = {'color': site_color, 'fillColor': '#00000000', 'width': 2}
    m.add_layer(self.geometry, site_style, f'Site {self.site_id}')

    return m

In [10]:
# Example: Visualize the layer with a buffer around the site
# (one toggleable map layer per year from 2010 to 2017; only the last is shown initially)
site.show_layer(MAPBIOMAS_DEFREG, years=range(2010, 2018), buffer_m=500)

Map(center=[-22.511306816393418, -42.27634385094392], controls=(WidgetControl(options=['position', 'transparen…

## Continuous Extraction

For continuous data (NDVI, EVI, temperature, etc.), use `extract_continuous`. It supports:
- **Multi-band extraction**: Get multiple bands in one call
- **Preprocessing hooks**: Dataset-specific logic (cloud masking, index computation) via `layer.preprocess`
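- **Temporal aggregation**: `'all'` (one row per image), or `'monthly'` / `'yearly'` (the images of each period are first combined into a per-pixel median composite, then reduced over the site)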

The function is completely dataset-agnostic — all dataset-specific logic lives in the `preprocess` function defined in the layer.

In [11]:
#| export
@patch
def extract_continuous(
    self: Site,
    layer: ContinuousLayer,
    start_date: str,
    end_date: str,
    reducer: Literal['mean', 'median', 'min', 'max'] = 'mean',
    frequency: Literal['all', 'monthly', 'yearly'] = 'all',
    max_pixels: int = int(1e9)
) -> pd.DataFrame:
    """Pull a time series of continuous raster statistics for this site.

    The function itself contains no dataset-specific logic: cloud masking,
    index computation, scaling and the like are delegated to the layer's
    optional `preprocess` callable, which is mapped over every image before
    the requested bands are selected.

    Args:
        layer: ContinuousLayer naming the collection, the bands to extract
            and, optionally, a `preprocess` function
        start_date: Start date (YYYY-MM-DD), passed to `filterDate`
        end_date: End date (YYYY-MM-DD), passed to `filterDate`
        reducer: Name of the spatial reducer ('mean', 'median', 'min', 'max')
        frequency: Temporal output: 'all' (one row per image), 'monthly'
            or 'yearly'
        max_pixels: Maximum pixels for reduction

    Returns:
        DataFrame with columns: site_id, date/year/month, and one column per band

    Raises:
        ValueError: If `frequency` is not one of the supported values.
    """
    # Resolve the reducer name to an ee.Reducer instance, e.g. ee.Reducer.mean()
    spatial_reducer = getattr(ee.Reducer, reducer)()

    # Restrict the collection to the requested period and to the site
    images = ee.ImageCollection(layer.collection_id)
    images = images.filterDate(start_date, end_date)
    images = images.filterBounds(self.geometry)

    # Dataset-specific preparation runs before band selection, so it may
    # create the bands that are selected afterwards
    if layer.preprocess is not None:
        images = images.map(layer.preprocess)
    images = images.select(layer.bands)

    if frequency == 'all':
        return _extract_continuous_all(self, images, layer, spatial_reducer, max_pixels)
    if frequency == 'yearly':
        return _extract_continuous_yearly(
            self, images, layer, spatial_reducer, start_date, end_date, max_pixels
        )
    if frequency == 'monthly':
        return _extract_continuous_monthly(
            self, images, layer, spatial_reducer, start_date, end_date, max_pixels
        )
    raise ValueError(f"Unknown frequency: {frequency}")

In [12]:
#| export
def _extract_continuous_all(site, collection, layer, reducer_fn, max_pixels):
    """Spatially reduce every image in the collection: one output row per image.

    Images for which every requested band came back empty are left out.
    """
    band_names = layer.bands

    def to_feature(image):
        # Server-side: reduce one image over the site and keep its date
        reduced = image.reduceRegion(
            reducer=reducer_fn,
            geometry=site.geometry,
            scale=layer.scale,
            maxPixels=max_pixels
        )
        properties = {
            'date': image.date().format('YYYY-MM-dd'),
            **{name: reduced.get(name) for name in band_names},
        }
        return ee.Feature(None, properties)

    # One request for the whole collection
    result = ee.FeatureCollection(collection.map(to_feature)).getInfo()

    rows = [
        {
            'site_id': site.site_id,
            'date': properties['date'],
            **{name: properties.get(name) for name in band_names},
        }
        for properties in (feature['properties'] for feature in result['features'])
        # Keep the image only if at least one band has a value
        if any(properties.get(name) is not None for name in band_names)
    ]

    return pd.DataFrame(rows)

In [13]:
#| export
def _extract_continuous_yearly(site, collection, layer, reducer_fn, start_date, end_date, max_pixels):
    """Aggregate by year: temporal median, then spatial reduce.

    For every calendar year touched by [start_date, end_date] the images of
    that year are combined into a per-pixel median composite, which is then
    reduced over the site with `reducer_fn`. All years are fetched in a
    single `getInfo` request.

    Args:
        site: Object exposing `geometry` and `site_id`
        collection: ee.ImageCollection, already filtered and band-selected
            by the caller
        layer: Layer exposing `bands` and `scale`
        reducer_fn: ee.Reducer used for the spatial reduction
        start_date: Start date (YYYY-MM-DD); only the year part is used here
        end_date: End date (YYYY-MM-DD); only the year part is used here
        max_pixels: Maximum pixels for reduction

    Returns:
        DataFrame with columns site_id, year, and one column per band.
        Years for which every band came back empty are left out.
    """
    start_year = int(start_date[:4])
    end_year = int(end_date[:4])
    bands = layer.bands

    records = []
    for year in range(start_year, end_year + 1):
        # filterDate's end is exclusive, so the window must end on 1 January
        # of the following year to keep images acquired on 31 December.
        yearly = collection.filterDate(f'{year}-01-01', f'{year + 1}-01-01')
        composite = yearly.median()

        stats = composite.reduceRegion(
            reducer=reducer_fn,
            geometry=site.geometry,
            scale=layer.scale,
            maxPixels=max_pixels
        )

        props = {'year': year}
        for band in bands:
            props[band] = stats.get(band)
        records.append(ee.Feature(None, props))

    # Single getInfo call for all years
    fc = ee.FeatureCollection(records).getInfo()

    rows = []
    for f in fc['features']:
        props = f['properties']
        # Skip years with no data in any band (e.g. no images that year)
        if all(props.get(b) is None for b in bands):
            continue
        row = {'site_id': site.site_id, 'year': props['year']}
        for band in bands:
            row[band] = props.get(band)
        rows.append(row)

    return pd.DataFrame(rows)

In [14]:
#| export
def _extract_continuous_monthly(site, collection, layer, reducer_fn, start_date, end_date, max_pixels):
    """Monthly series: per-pixel median composite per month, then spatial reduce.

    Walks month by month from the month containing `start_date` up to the
    month containing `end_date`, and fetches all months in one request.
    Months for which every band came back empty are left out.
    """
    import datetime

    first_day = datetime.date.fromisoformat(start_date).replace(day=1)
    last_day = datetime.date.fromisoformat(end_date)
    band_names = layer.bands

    features = []
    month_start = first_day
    while month_start <= last_day:
        year, month = month_start.year, month_start.month
        # First day of the following month; December rolls over to January
        following = datetime.date(year + month // 12, month % 12 + 1, 1)

        # [month_start, following) covers exactly one calendar month
        composite = collection.filterDate(
            month_start.isoformat(), following.isoformat()
        ).median()

        reduced = composite.reduceRegion(
            reducer=reducer_fn,
            geometry=site.geometry,
            scale=layer.scale,
            maxPixels=max_pixels
        )

        properties = {
            'year': year,
            'month': month,
            **{name: reduced.get(name) for name in band_names},
        }
        features.append(ee.Feature(None, properties))

        month_start = following

    # Single getInfo call for all months
    result = ee.FeatureCollection(features).getInfo()

    rows = [
        {
            'site_id': site.site_id,
            'year': properties['year'],
            'month': properties['month'],
            **{name: properties.get(name) for name in band_names},
        }
        for properties in (feature['properties'] for feature in result['features'])
        # Keep the month only if at least one band has a value
        if any(properties.get(name) is not None for name in band_names)
    ]

    return pd.DataFrame(rows)

### Example: Continuous Extraction

Using Sentinel-2 NDVI/EVI with preprocessing defined in the dataset module:

In [15]:
# Example with Sentinel-2 NDVI/EVI (requires updated datasets/sentinel2.py)
from gee_polygons.datasets.sentinel2 import SENTINEL2_NDVI_EVI

# Per-image NDVI/EVI time series: frequency='all' returns one row per image,
# each value being the spatial median over the site.
# Use frequency='yearly' or 'monthly' for aggregated summaries.
df = site.extract_continuous(
    SENTINEL2_NDVI_EVI,
    start_date='2018-01-01',
    end_date='2020-12-31',
    reducer='median',
    frequency='all'
)
df.head()

Unnamed: 0,site_id,date,NDVI,EVI
0,9368,2018-01-09,0.806691,0.591811
1,9368,2018-02-08,0.884718,0.589342
2,9368,2018-12-15,0.885487,0.596724
3,9368,2018-12-20,0.870644,0.592598
4,9368,2018-12-25,0.491157,0.659755


In [18]:
# Show a longer slice of the per-image time series
df.head(20)

Unnamed: 0,site_id,date,NDVI,EVI
0,9368,2018-01-09,0.806691,0.591811
1,9368,2018-02-08,0.884718,0.589342
2,9368,2018-12-15,0.885487,0.596724
3,9368,2018-12-20,0.870644,0.592598
4,9368,2018-12-25,0.491157,0.659755
5,9368,2018-12-30,0.873436,0.623111
6,9368,2019-01-04,0.846921,0.589451
7,9368,2019-01-09,0.892792,0.600742
8,9368,2019-01-14,0.89378,0.612306
9,9368,2019-01-19,0.338589,0.800424


In [17]:
#| hide
# Run nbdev's export step for this project (hidden from the rendered docs)
import nbdev; nbdev.nbdev_export()