# extract

> The extraction engine: `extract_categorical` for pulling stats from any categorical raster.

In [1]:
#| default_exp extract

In [1]:
#| hide
from nbdev.showdoc import *

In [2]:
#| export
import ee
import pandas as pd
from fastcore.basics import patch

from gee_polygons.layers import CategoricalLayer
from gee_polygons.site import Site



## The Core Primitive

The heart of gee-polygons is one powerful primitive:

```python
site.extract_categorical(layer, years=[2018, 2019, 2020])
```

Given:
- A polygon (Site)
- A categorical raster descriptor (CategoricalLayer)
- A list of years

It returns a tidy DataFrame with pixel counts and areas per class, per year.

**This function knows nothing about MapBiomas, deforestation, or Brazil.** It's pure geometry + categorical values + time.

In [3]:
#| export
@patch
def extract_categorical(
    self: Site,
    layer: CategoricalLayer,
    years: list[int],
    max_pixels: int = int(1e9)
) -> pd.DataFrame:
    """Extract per-year pixel counts and areas from a categorical raster.

    One `frequencyHistogram` reduction is built per year over the site's
    geometry, and all of them are fetched with a single `getInfo` call.

    Args:
        layer: A CategoricalLayer describing the data source
        years: List of years to extract
        max_pixels: Maximum pixels for reduction (default 1e9)

    Returns:
        A tidy DataFrame with one row per (year, class) and columns:
        - site_id: Site identifier
        - year: Year of observation
        - class_value: Integer class value
        - count: Pixel count. May be fractional, because the histogram
          weights pixels that are only partially covered by the geometry.
        - area_ha: Area in hectares, computed as count * scale**2 / 10,000
          (i.e. using the nominal pixel size given by `layer.scale`)
        - class_name: Human-readable name (if available in layer)

        The columns are always present, even when the result is empty
        (no years requested, or no histogram returned for any year).
    """
    columns = ['site_id', 'year', 'class_value', 'count', 'area_ha', 'class_name']

    years = list(years)
    if not years:
        # Nothing requested: skip the Earth Engine round trip entirely,
        # but still hand back a frame with the documented schema.
        return pd.DataFrame(columns=columns)

    img = ee.Image(layer.asset_id)
    records = []

    for year in years:
        band = layer.band_name(year)
        classified = img.select(band)

        stats = classified.reduceRegion(
            reducer=ee.Reducer.frequencyHistogram(),
            geometry=self.geometry,
            scale=layer.scale,
            maxPixels=max_pixels
        )

        hist = ee.Dictionary(stats.get(band))

        # Build a feature to batch the getInfo call
        records.append(
            ee.Feature(None, {
                'year': year,
                'histogram': hist
            })
        )

    # Single getInfo call for all years
    fc = ee.FeatureCollection(records).getInfo()

    # Convert to tidy rows
    rows = []
    for f in fc['features']:
        year = f['properties']['year']
        hist = f['properties']['histogram']

        # A year with no histogram contributes no rows.
        if hist is None:
            continue

        # Histogram keys arrive as strings; values are the (weighted) counts.
        for cls_str, count in hist.items():
            cls = int(cls_str)
            rows.append({
                'site_id': self.site_id,
                'year': year,
                'class_value': cls,
                'count': count,
                'area_ha': count * (layer.scale ** 2) / 10_000,
                'class_name': layer.class_name(cls)
            })

    # Passing `columns` pins the schema so an empty result still exposes
    # every documented column instead of a column-less DataFrame.
    return pd.DataFrame(rows, columns=columns)

## Example Usage

Let's test with a simple example. First, we need to initialize GEE and load a site.

In [4]:
# Initialize Earth Engine: authenticate, then initialize the client with a project.
# NOTE(review): the project ID below is hard-coded for this example — replace it
# with your own project when running the notebook.
ee.Authenticate()
ee.Initialize(project='hs-brazilreforestation')

In [5]:
# Load the example sites from the GeoJSON subset (path is relative to this notebook)
# and keep the first one as the site used in the rest of the example.
from gee_polygons.site import load_sites
sites = load_sites('../data/restoration_sites_subset.geojson')
site = sites[0]

In [10]:
# Define a layer (or use a preset from gee_polygons.datasets)
# - band_pattern: presumably filled in with the year by `layer.band_name(year)` — verify in CategoricalLayer
# - scale: used both as the reduction scale and as the pixel size for the area computation
# No class map is passed here, so `class_name` is not populated in the output below.
layer = CategoricalLayer(
     asset_id='projects/mapbiomas-public/assets/brazil/lulc/collection10/mapbiomas_brazil_collection10_coverage_v2',
     band_pattern='classification_{}',
     scale=30
)

# Extract stats: one row per class per requested year for this site
df = site.extract_categorical(layer, years=[2020, 2021, 2022])
df.head(10)

Unnamed: 0,site_id,year,class_value,count,area_ha,class_name
0,3107,2020,12,37.521569,3.376941,
1,3107,2020,15,362.745098,32.647059,
2,3107,2020,24,6.701961,0.603176,
3,3107,2020,3,0.015686,0.001412,
4,3107,2020,41,10.0,0.9,
5,3107,2021,12,37.521569,3.376941,
6,3107,2021,15,356.972549,32.127529,
7,3107,2021,24,6.701961,0.603176,
8,3107,2021,3,0.015686,0.001412,
9,3107,2021,41,10.0,0.9,


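The output is always a tidy DataFrame with one row per class per year. Counts can be fractional (as in the output above) because pixels only partially covered by the polygon are weighted by the covered fraction. Schematically, with illustrative values: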

| site_id | year | class_value | count | area_ha | class_name |
|---------|------|-------------|-------|---------|------------|
| 3107    | 2020 | 3           | 1520  | 136.8   | None       |
| 3107    | 2020 | 5           | 203   | 18.3    | None       |
| 3107    | 2021 | 3           | 1701  | 153.1   | None       |

If the `CategoricalLayer` has a `class_map`, the `class_name` column will be populated.

We can also try with a preset dataset.

In [None]:
#| hide
# Export the cells marked `#| export` in this notebook to the library module.
import nbdev; nbdev.nbdev_export()