# In [None]:
# ==============================================================================
# notebooks/05_vegetation_indices_calculation.ipynb
# ==============================================================================

# # 05 - Vegetation Indices Calculation and Aggregation
# This notebook processes the exported Sentinel-2 data to calculate monthly vegetation indices
# (NDVI and SAVI) for coffee-growing areas within each woreda.
# It then aggregates these indices to monthly means per woreda.
#
# It covers:
# 1.  Loading required libraries and setting up paths.
# 2.  Downloading Sentinel-2 export CSVs from GCS.
# 3.  Processing each CSV: loading, ensuring correct data types, and handling potential issues.
# 4.  Aggregating the monthly NDVI and SAVI values per woreda.
# 5.  Saving the aggregated data to a single CSV file.

# ## 1. Load Project Setup and Libraries
# Import `pandas`, `numpy`, `os`, `sys`, and the custom GCS I/O module.

import pandas as pd
import numpy as np
import os
import sys

# Put the directory one level above the current working directory on the
# import path so that the `src` package can be imported below.
_parent_of_cwd = os.path.join(os.getcwd(), '../')
project_root = os.path.abspath(_parent_of_cwd)
if project_root not in sys.path:
    sys.path.append(project_root)

from src.gcs_io import download_gcs_files

# Confirmation message; this line is reached only if none of the imports above raised.
print("Libraries and custom modules loaded.")

# Shared configuration for this notebook: GCS bucket, export date range,
# and the local / remote locations of the Sentinel-2 exports.
BUCKET_NAME = 'bensa-coffee-yield'  # must match your GCS bucket
START_DATE, END_DATE = '2017-01-01', '2025-12-31'  # must match the export start / end dates

# Local directories (relative to the notebook's working directory)
gee_exports_dir = '../data/gee_exports/'
processed_data_dir = '../data/processed/'
sentinel2_download_dir = os.path.join(gee_exports_dir, 'sentinel2/')

# Folder inside the bucket that holds the per-woreda Sentinel-2 exports
sentinel2_export_gcs_folder = 'gee_exports/sentinel2_woredas/'

# Create the local download folder up front; a pre-existing folder is not an error.
os.makedirs(sentinel2_download_dir, exist_ok=True)

# Load woreda boundaries to obtain the Woreda_ID -> 'Woreda Name' lookup.
#
# Outcome contract for the rest of the notebook:
#   * success -> `gdf_woredas` is the validated GeoDataFrame and
#                `woreda_name_map` maps string Woreda_ID -> woreda name;
#   * failure -> `gdf_woredas` is None and `woreda_name_map` is an empty dict,
#                so the later steps (guarded by `gdf_woredas is not None`) are skipped.
PROCESSED_WOREDAS_GEOJSON_PATH = os.path.join(processed_data_dir, 'sidama_woredas.geojson')
gdf_woredas = None
woreda_name_map = {}
try:
    # `gpd` is used below but geopandas is not imported in the import section
    # of this file; import it here so the read does not fail with a NameError
    # that the generic handler would otherwise swallow.
    import geopandas as gpd

    # Check for the file explicitly: depending on the I/O engine, read_file may
    # report a missing path with an engine-specific error rather than
    # FileNotFoundError, which would bypass the helpful message below.
    if not os.path.isfile(PROCESSED_WOREDAS_GEOJSON_PATH):
        raise FileNotFoundError(PROCESSED_WOREDAS_GEOJSON_PATH)

    loaded_woredas = gpd.read_file(PROCESSED_WOREDAS_GEOJSON_PATH)
    if 'Woreda_ID' not in loaded_woredas.columns or 'Woreda Name' not in loaded_woredas.columns:
        raise ValueError("Woredas GeoDataFrame must contain 'Woreda_ID' and 'Woreda Name' columns.")
    loaded_woredas['Woreda_ID'] = loaded_woredas['Woreda_ID'].astype(str)  # Ensure ID is string
    woreda_name_map = loaded_woredas.set_index('Woreda_ID')['Woreda Name'].to_dict()

    # Publish the GeoDataFrame only after every validation step succeeded, so a
    # failure above can never leave a half-initialised, non-None `gdf_woredas`.
    gdf_woredas = loaded_woredas
    print(f"Loaded GeoDataFrame with {len(gdf_woredas)} woredas.")
except ImportError as e:
    print(f"Error: geopandas is required to load woreda boundaries but could not be imported: {e}")
except FileNotFoundError:
    print(f"Error: '{PROCESSED_WOREDAS_GEOJSON_PATH}' not found. Please run '00_setup_and_common_data_loading.ipynb' first.")
except ValueError as e:
    print(f"Data error in woreda GeoDataFrame: {e}")
except Exception as e:
    # Notebook-level boundary: report the problem and leave gdf_woredas as None.
    print(f"An unexpected error occurred loading woreda data: {e}")


# ## 2. Download Sentinel-2 Export CSVs from GCS
# Use the `gcs_io` module to download all CSV files exported by `01_gee_sentinel2_export.ipynb`.

# The download is pointless without woreda metadata, so bail out early in that
# case; otherwise hand the bucket name, the GCS folder and the local target
# directory to the project's GCS helper.
if gdf_woredas is None:
    print("Skipping Sentinel-2 CSV download due to missing woreda data.")
else:
    print(f"\nDownloading Sentinel-2 export files from GCS folder '{sentinel2_export_gcs_folder}' to '{sentinel2_download_dir}'...")
    download_gcs_files(
        BUCKET_NAME,
        sentinel2_export_gcs_folder,
        sentinel2_download_dir,
    )
    print("✅ Sentinel-2 CSV downloads complete.")

# ## 3. Process Downloaded CSVs and Aggregate Vegetation Indices
# Iterate through each downloaded CSV, load it, extract relevant columns, and consolidate into a single DataFrame.
# Perform data cleaning and type conversions.

# Consolidate every downloaded Sentinel-2 CSV into one table of monthly mean
# NDVI / SAVI per woreda and write it to the processed-data directory.
# Files that cannot be read or lack required columns are reported and skipped.
if gdf_woredas is not None:
    # Columns every export must provide in addition to the woreda identifier.
    required_columns = ['year', 'month', 'NDVI', 'SAVI']
    file_prefix = 'sentinel2_'
    file_suffix = '.csv'

    all_vi_data = []
    processed_count = 0
    error_count = 0

    print("\nProcessing downloaded Sentinel-2 CSVs...")
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # processing order (and therefore the log output) reproducible.
    for filename in sorted(os.listdir(sentinel2_download_dir)):
        if not (filename.startswith(file_prefix) and filename.endswith(file_suffix)):
            continue

        file_path = os.path.join(sentinel2_download_dir, filename)
        try:
            # Read the identifier columns as text so that IDs are not parsed as
            # numbers (which would strip leading zeros) before being compared
            # with the string keys of woreda_name_map.
            # NOTE(review): assumes the IDs in the CSVs and in the GeoJSON come
            # from the same source property — confirm.
            df = pd.read_csv(file_path, dtype={'Woreda_ID': str, 'woreda_id': str})

            # Normalise the identifier column name. An existing 'woreda_id'
            # wins; renaming 'Woreda_ID' on top of it would create duplicate
            # column labels.
            if 'woreda_id' not in df.columns:
                if 'Woreda_ID' in df.columns:
                    df = df.rename(columns={'Woreda_ID': 'woreda_id'})
                else:
                    # Fall back to the ID embedded in the file name. Slice off
                    # only the leading prefix and trailing extension so that an
                    # ID which itself contains those substrings stays intact.
                    woreda_id_from_filename = filename[len(file_prefix):-len(file_suffix)]
                    df['woreda_id'] = woreda_id_from_filename
                    print(f"  Inferred woreda_id '{woreda_id_from_filename}' from filename for {filename}.")

            # Fail with an explicit message instead of a bare KeyError.
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"missing required column(s) {missing_columns}")

            df['woreda_id'] = df['woreda_id'].astype(str)
            df['year'] = df['year'].astype(int)
            df['month'] = df['month'].astype(int)

            # Keep only the key columns and the two indices, under consistent names.
            df_subset = df[['woreda_id', 'year', 'month', 'NDVI', 'SAVI']].copy()
            df_subset = df_subset.rename(columns={'NDVI': 'avg_ndvi', 'SAVI': 'avg_savi'})

            # Force the indices to numeric; unparseable entries become NaN
            # (ignored by the mean below) instead of breaking the aggregation.
            for vi_column in ('avg_ndvi', 'avg_savi'):
                df_subset[vi_column] = pd.to_numeric(df_subset[vi_column], errors='coerce')

            all_vi_data.append(df_subset)
            processed_count += 1
        except Exception as e:
            # Best-effort per file: report the problem and continue with the rest.
            print(f"  Error processing {filename}: {e}")
            error_count += 1

    if all_vi_data:
        df_monthly_vi = pd.concat(all_vi_data, ignore_index=True)

        # Add woreda_name for clarity
        df_monthly_vi['woreda_name'] = df_monthly_vi['woreda_id'].map(woreda_name_map)

        # Drop rows whose woreda_id has no name in the woreda GeoDataFrame,
        # but say so rather than discarding them silently.
        unmapped_mask = df_monthly_vi['woreda_name'].isna()
        if unmapped_mask.any():
            unmapped_ids = sorted(df_monthly_vi.loc[unmapped_mask, 'woreda_id'].unique())
            print(f"  Warning: dropping {int(unmapped_mask.sum())} record(s) without a matching woreda name; woreda_id(s): {unmapped_ids}")
        df_monthly_vi = df_monthly_vi[~unmapped_mask]

        # Ensure unique entries per woreda-year-month, taking the mean of the
        # two index columns if duplicates exist.
        df_monthly_vi = (
            df_monthly_vi
            .groupby(['woreda_id', 'woreda_name', 'year', 'month'])[['avg_ndvi', 'avg_savi']]
            .mean()
            .reset_index()
        )

        # Sort for better readability and consistency
        df_monthly_vi = df_monthly_vi.sort_values(by=['woreda_id', 'year', 'month']).reset_index(drop=True)

        print(f"\n✅ Finished processing all Sentinel-2 CSVs. {processed_count} files processed, {error_count} errors.")
        print(f"Consolidated monthly vegetation indices: {df_monthly_vi.shape[0]} records.")
        print(df_monthly_vi.head())

        # Save the consolidated data
        output_path = os.path.join(processed_data_dir, 'woreda_monthly_vegetation_indices.csv')
        df_monthly_vi.to_csv(output_path, index=False)
        print(f"✅ Consolidated monthly vegetation indices saved to {output_path}")

    else:
        print("No Sentinel-2 CSVs were processed successfully. Check GCS export tasks and download directory.")
else:
    print("Skipping processing Sentinel-2 CSVs due to missing woreda data.")
