# Agricultural Fields Data - Zonal Stats Endpoint

#### Notes
This notebook uses the Climate Engine [zonal stats endpoint](https://docs.climateengine.org/docs/build/html/zonal_statistics.html#rst-zonal-stats-temporal-dataset-coordinates).

## Setup

In [None]:
import os
import logging
from dotenv import load_dotenv

# LOGGING
# Configured before the credential check so that check can report problems.
# force=True replaces any handlers already attached to the root logger, which
# keeps re-running this cell from stacking duplicate handlers.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
    force=True
)

logger = logging.getLogger("climateengine.scraper.zonal")

# AUTHENTICATION
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()
CLIMATE_ENGINE_API_KEY = os.environ.get('CLIMATE_ENGINE_API_KEY')

# Without a key the Authorization header below would be None; say so here
# instead of leaving it to surface later as an opaque request failure.
if not CLIMATE_ENGINE_API_KEY:
    logger.warning(
        'CLIMATE_ENGINE_API_KEY is not set; Climate Engine API requests will fail'
    )

# Headers sent with every Climate Engine API request in this notebook
HEADERS = {
    'Accept': 'application/json',
    'Authorization': CLIMATE_ENGINE_API_KEY
}

## Metadata
### Datasets

In [None]:
# Climate Engine dataset identifiers to query.
# Order matters: VARIABLES, YEARS, MONTHS and STATISTICS are looked up by the
# position of the dataset in this list.
DATASETS = [
    'LANDSAT_SR',
    'OPENET_CONUS',
    'SENTINEL2_SR',
]

### Variables

In [None]:
from scrape_utils import synchronous_fetch_with_retry

# Variables to calculate statistics for
# (one list per dataset, in the same order as DATASETS)
VARIABLES = [
    ['NDVI', 'MSAVI', 'NDWI_NIR_SWIR_Gao'],
    ['et_eemetric', 'et_geesebal', 'et_disalexi'],
    ['NDVI', 'MSAVI', 'NDWI_NIR_SWIR_Gao', 'NDRE', 'BSI']
]

# Verify variables are available
for i, dataset in enumerate(DATASETS):
    res = synchronous_fetch_with_retry(
        f'https://api.climateengine.org/metadata/dataset_variables?dataset={dataset}',
        headers=HEADERS
    )

    # Tolerate a failed or malformed response (falsy result, missing 'Data',
    # missing 'variables') instead of raising AttributeError/TypeError.
    payload = (res or {}).get('Data') or {}
    api_variables = set(payload.get('variables') or [])

    if not api_variables:
        logger.warning(f'{dataset}: could not read the variable list from the API response')
        continue

    missing = set(VARIABLES[i]).difference(api_variables)

    if missing:
        # A requested variable the API does not offer is a configuration
        # problem, so report it above INFO level.
        logger.warning(f'{dataset}: ✗ API missing requested variables {missing}')
    else:
        logger.info(f'{dataset}: ✓ all variables available')

### Dates and Statistics

In [None]:
# Inclusive [first_year, last_year] range requested for each dataset
YEARS = [
    [2008, 2024],  # LANDSAT_SR
    [2008, 2024],  # OPENET_CONUS
    [2017, 2024],  # SENTINEL2_SR
]

# Months to process (growing season, April through September).
# Every dataset currently shares the same months; each entry is its own list
# so a single dataset can be overridden without affecting the others.
_GROWING_SEASON_MONTHS = range(4, 10)
MONTHS = [list(_GROWING_SEASON_MONTHS) for _ in YEARS]

# Statistics to calculate for each variable (same set for every dataset)
_TEMPORAL_STATISTICS = ('mean', 'max')
STATISTICS = [list(_TEMPORAL_STATISTICS) for _ in YEARS]

## Prepare Agricultural Fields

In [None]:
import pandas as pd

AG_FIELDS_URL = 'https://wc-prod.bearhive.duckdns.org/weppcloud/runs/copacetic-note/ag-fields/browse/ag_fields/CSB_2008_2024_Hangman_with_Crop_and_Performance.geojson?raw=true'

# Download the field boundaries (GeoJSON with a top-level 'features' list)
fields_data = synchronous_fetch_with_retry(AG_FIELDS_URL)

# Keep only what the zonal-stats requests need: the field identifier and its
# geometry. A feature without a 'field_ID' property gets field_id None.
fields = [
    {
        'field_id': feature['properties'].get('field_ID'),
        'geometry': feature['geometry'],
    }
    for feature in fields_data['features']
]

fields_df = pd.DataFrame(fields)
logger.info(f'Loaded {len(fields_df)} agricultural fields')

## Fetch Monthly Statistics

This uses the `/zonal_stats/temporal_dataset/coordinates` endpoint. Each request computes one temporal statistic (mean or max) for one field, variable, year, and month; the results are then combined into a single table.

In [None]:
import aiohttp
import asyncio
import json
import calendar
from tqdm.asyncio import tqdm
from scrape_utils import asynchronous_fetch_with_retry

# Size of the semaphore handed to asynchronous_fetch_with_retry with every
# request (presumably it caps the number of in-flight requests — confirm in
# scrape_utils).
MAX_CONCURRENT_REQUESTS = 50
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

def _extract_value(response, variable: str):
    """Return the value reported for ``variable`` in one API response, or None.

    Only the first record of ``response['Data']`` is inspected. The value is
    looked up under the exact variable name first, then under the first key
    that starts with the variable name followed by a space.
    """
    if not response or 'Data' not in response:
        return None

    records = response['Data']
    if not records:
        return None

    record = records[0]

    # Try exact match first
    if variable in record:
        return record[variable]

    # Try with units suffix (e.g., "et_eemetric (mm)")
    prefix = variable + ' '
    return next(
        (value for key, value in record.items() if key.startswith(prefix)),
        None,
    )


async def fetch_data(
    dataset: str,
    dataset_index: int,
    variable: str,
    session: aiohttp.ClientSession,
    max_fields: 'int | None' = 10
):
    """Fetch all monthly statistics for a specific dataset/variable combination.

    One request is issued per statistic, year, month and field, using the
    module-level STATISTICS, YEARS, MONTHS, fields_df, HEADERS and semaphore.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` request parameter.
        dataset_index: Position of the dataset in STATISTICS, YEARS and MONTHS.
        variable: Variable name sent as the ``variable`` request parameter.
        session: Open aiohttp session used for every request.
        max_fields: Number of leading rows of ``fields_df`` to process. The
            default of 10 keeps the previous behaviour; pass None to process
            every field.

    Returns:
        A list with one dict per request holding ``field_id``, ``dataset``,
        ``variable``, ``statistic``, ``year``, ``month`` and ``value``
        (``value`` is None when the response carried no usable data).
    """
    # Serialize each field's coordinates once instead of once per
    # statistic/year/month combination.
    selected_fields = fields_df if max_fields is None else fields_df.head(max_fields)
    field_requests = [
        (row['field_id'], json.dumps(row['geometry']['coordinates']))
        for _, row in selected_fields.iterrows()
    ]

    first_year = YEARS[dataset_index][0]
    last_year = YEARS[dataset_index][1]

    tasks = []
    task_metadata = []

    for statistic in STATISTICS[dataset_index]:
        for year in range(first_year, last_year + 1):
            for month in MONTHS[dataset_index]:
                # Request window: first through last calendar day of the month
                last_day = calendar.monthrange(year, month)[1]
                start_date = f'{year}-{month:02d}-01'
                end_date = f'{year}-{month:02d}-{last_day:02d}'

                for field_id, coordinates in field_requests:
                    # Store metadata for this task (same order as `tasks`)
                    task_metadata.append({
                        'field_id': field_id,
                        'dataset': dataset,
                        'variable': variable,
                        'statistic': statistic,
                        'year': year,
                        'month': month
                    })

                    # Create the API call task
                    tasks.append(
                        asynchronous_fetch_with_retry(
                            session=session,
                            url='https://api.climateengine.org/zonal_stats/temporal_dataset/coordinates',
                            semaphore=semaphore,
                            headers=HEADERS,
                            params={
                                'dataset': dataset,
                                'variable': variable,
                                'temporal_statistic': statistic,
                                'area_reducer': 'median',
                                'start_date': start_date,
                                'end_date': end_date,
                                'coordinates': coordinates
                            }
                        )
                    )

    # Gather all results with progress bar; responses come back in task order.
    # NOTE(review): an exception raised by any single task propagates from
    # here and discards the whole batch — confirm the retry helper's behaviour.
    api_responses = await tqdm.gather(*tasks)

    # Combine metadata with API responses
    return [
        {**metadata, 'value': _extract_value(response, variable)}
        for metadata, response in zip(task_metadata, api_responses)
    ]


def convert_results_to_dataframe(all_results: list) -> pd.DataFrame:
    """Convert list of results to a wide pandas DataFrame.

    Args:
        all_results: Dicts with the keys ``field_id``, ``dataset``,
            ``variable``, ``statistic``, ``year``, ``month`` and ``value``.

    Returns:
        One row per field/dataset/year/month and one column per
        ``<variable>_<statistic>`` (e.g. NDVI_mean, NDVI_max). Rows and
        columns whose values are all missing are kept as NaN so that failed
        requests stay visible. An empty DataFrame is returned when
        ``all_results`` is empty.
    """
    if not all_results:
        return pd.DataFrame()

    df = pd.DataFrame(all_results)

    # Create column name: variable_statistic (e.g., NDVI_mean, NDVI_max)
    df['variable_stat'] = df['variable'] + '_' + df['statistic']

    index_columns = ['field_id', 'dataset', 'year', 'month']

    # groupby/unstack instead of pivot_table: pivot_table's default
    # dropna=True silently removes rows and columns whose values are all
    # missing, and dropna=False would add the full cartesian product of the
    # index levels. first() keeps the first non-null value per cell, like
    # aggfunc='first'.
    df_wide = (
        df.groupby(index_columns + ['variable_stat'])['value']
        .first()
        .unstack('variable_stat')
        .reset_index()
    )

    # Flatten column names (drop the leftover 'variable_stat' axis label)
    df_wide.columns.name = None

    return df_wide


# Main execution
all_results = []


async with aiohttp.ClientSession(raise_for_status=True, timeout=aiohttp.ClientTimeout(total=None)) as session:
    for i, dataset in enumerate(DATASETS):
        for variable in VARIABLES[i]:
            logger.info(f'{dataset}-{variable}: starting...')
            results = await fetch_data(
                dataset=dataset,
                dataset_index=i,
                variable=variable,
                session=session
            )
            logger.info(f'{dataset}-{variable}: fetched {len(results)} results')
            
            # Add to combined results list
            all_results.extend(results)

# Convert all results to DataFrame
logger.info(f'Processing {len(all_results)} total results...')
results_df = convert_results_to_dataframe(all_results)
logger.info(f'Created DataFrame with {len(results_df)} rows and {len(results_df.columns)} columns')


## Preview Results

In [None]:
# Log an overview of the combined table
year_column = results_df["year"]
logger.info(f'Results shape: {results_df.shape}')
logger.info(f'Columns: {list(results_df.columns)}')
logger.info(f'Datasets: {results_df["dataset"].unique()}')
logger.info(f'Date range: {year_column.min()}-{year_column.max()}')

# Render the first rows in the notebook
display(results_df.head(10))

# Report only the columns that contain at least one missing value
null_counts = results_df.isna().sum()
columns_with_gaps = null_counts[null_counts > 0]
if not columns_with_gaps.empty:
    logger.info('Missing values by column:')
    for col, count in columns_with_gaps.items():
        logger.info(f'  {col}: {count}')

## Save Results to File

In [None]:
import os

output_dir = 'data/output'
os.makedirs(output_dir, exist_ok=True)

row_count = len(results_df)
column_count = len(results_df.columns)

# Combined table as Parquet
combined_file = f'{output_dir}/zonal_stats_monthly_combined.parquet'
results_df.to_parquet(combined_file, index=False)
logger.info(f'Saved {row_count} rows with {column_count} columns to {combined_file}')

# Same table as CSV for easier viewing
combined_csv = f'{output_dir}/zonal_stats_monthly_combined.csv'
results_df.to_csv(combined_csv, index=False)
logger.info(f'Also saved as CSV to {combined_csv}')

# One additional Parquet file per dataset, named after the lower-cased
# dataset identifier
for dataset_name in results_df['dataset'].unique():
    belongs_to_dataset = results_df['dataset'] == dataset_name
    dataset_df = results_df[belongs_to_dataset]
    output_file = f'{output_dir}/{dataset_name.lower()}_monthly_stats.parquet'
    dataset_df.to_parquet(output_file, index=False)
    logger.info(f'{dataset_name}: saved {len(dataset_df)} rows to {output_file}')