# In [None]:
# ==============================================================================
# notebooks/06_environmental_data_integration.ipynb
# ==============================================================================

# # 06 - Environmental Data Integration
# This notebook integrates the exported ERA5-Land, SRTM, and SMAP data.
# It involves:
# 1.  Downloading the exported CSV files from GCS for ERA5-Land, SRTM, and SMAP.
# 2.  Consolidating the ERA5-Land and SMAP data into monthly time-series per woreda.
# 3.  Integrating the static SRTM (elevation) data.
# 4.  Saving the combined environmental features.

# ## 1. Load Project Setup and Libraries
# Import `pandas`, `numpy`, `os`, and custom GCS I/O module.

import pandas as pd
import numpy as np
import os
import sys

# Add src to path to import custom modules
project_root = os.path.abspath(os.path.join(os.getcwd(), '../'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.gcs_io import download_gcs_files

print("Libraries and custom modules loaded.")

# Shared configuration: bucket name plus the local directory layout.
BUCKET_NAME = 'bensa-coffee-yield'  # Must match the GCS bucket that holds the exports
processed_data_dir = '../data/processed/'
gee_exports_dir = '../data/gee_exports/'

# Bucket-side folders the exports are read from
era5_export_gcs_folder = 'gee_exports/era5/'
srtm_export_gcs_folder = 'gee_exports/srtm/'
smap_export_gcs_folder = 'gee_exports/smap/'

# Local directories mirroring those folders, one per dataset
era5_download_dir, srtm_download_dir, smap_download_dir = (
    os.path.join(gee_exports_dir, dataset_subdir)
    for dataset_subdir in ('era5/', 'srtm/', 'smap/')
)

# Create the download targets up front; existing directories are left alone.
for _download_dir in (era5_download_dir, srtm_download_dir, smap_download_dir):
    os.makedirs(_download_dir, exist_ok=True)

# Load woreda boundaries for woreda_name information.
#
# `gdf_woredas` doubles as the "can we proceed?" flag for every later section,
# so it is only bound to the loaded frame once validation has fully succeeded;
# on any failure it stays None. `woreda_name_map` is always defined (empty on
# failure) so later sections can never hit a NameError on it.
PROCESSED_WOREDAS_GEOJSON_PATH = os.path.join(processed_data_dir, 'sidama_woredas.geojson')
gdf_woredas = None
woreda_name_map = {}
try:
    # The notebook's import cell does not provide `gpd`, so import it here.
    # A missing installation is reported below instead of being swallowed as
    # an anonymous "unexpected error".
    import geopandas as gpd

    # Check the path explicitly: a missing file is not guaranteed to surface
    # from the reader as FileNotFoundError (NOTE(review): the exception type
    # appears to depend on the geopandas I/O engine in use), and the handler
    # below carries the actionable "run notebook 00 first" hint.
    if not os.path.isfile(PROCESSED_WOREDAS_GEOJSON_PATH):
        raise FileNotFoundError(PROCESSED_WOREDAS_GEOJSON_PATH)

    _loaded_woredas = gpd.read_file(PROCESSED_WOREDAS_GEOJSON_PATH)
    if 'Woreda_ID' not in _loaded_woredas.columns or 'Woreda Name' not in _loaded_woredas.columns:
        raise ValueError("Woredas GeoDataFrame must contain 'Woreda_ID' and 'Woreda Name' columns.")
    _loaded_woredas['Woreda_ID'] = _loaded_woredas['Woreda_ID'].astype(str)  # Ensure ID is string
    woreda_name_map = _loaded_woredas.set_index('Woreda_ID')['Woreda Name'].to_dict()

    # Publish the frame only now that nothing else in this block can fail.
    gdf_woredas = _loaded_woredas
    print(f"Loaded GeoDataFrame with {len(gdf_woredas)} woredas.")
except ImportError as e:
    print(f"Error: geopandas is required to load woreda boundaries: {e}")
except FileNotFoundError:
    print(f"Error: '{PROCESSED_WOREDAS_GEOJSON_PATH}' not found. Please run '00_setup_and_common_data_loading.ipynb' first.")
except ValueError as e:
    print(f"Data error in woreda GeoDataFrame: {e}")
except Exception as e:
    # Notebook-level boundary: report and let later sections skip themselves.
    print(f"An unexpected error occurred loading woreda data: {e}")


# ## 2. Download Exported CSVs from GCS
# Fetch the ERA5, SRTM, and SMAP CSV exports, one dataset after another.
# Nothing is downloaded when the woreda boundaries failed to load.

if gdf_woredas is None:
    print("Skipping environmental CSV downloads due to missing woreda data.")
else:
    # (label shown in the log, bucket folder, local destination)
    export_targets = (
        ('ERA5', era5_export_gcs_folder, era5_download_dir),
        ('SRTM', srtm_export_gcs_folder, srtm_download_dir),
        ('SMAP', smap_export_gcs_folder, smap_download_dir),
    )
    for dataset_label, gcs_folder, local_dir in export_targets:
        print(f"\nDownloading {dataset_label} export files from GCS folder '{gcs_folder}' to '{local_dir}'...")
        download_gcs_files(BUCKET_NAME, gcs_folder, local_dir)
        print(f"✅ {dataset_label} CSV downloads complete.")

# ## 3. Process ERA5-Land Data
# Consolidate and clean ERA5-Land data: read every `era5_*.csv` export,
# normalise the key columns, and average to one row per woreda/year/month.

# Bind the result name up front so the merge section can test
# `df_era5 is not None` even when this section is skipped or yields nothing.
df_era5 = None

if gdf_woredas is not None:
    era5_prefix = 'era5_'
    csv_suffix = '.csv'
    # Select relevant ERA5 columns. Adjust names if needed.
    era5_cols = ['woreda_id', 'year', 'month',
                 'era5_total_precipitation', 'era5_temperature_2m',
                 'era5_surface_pressure', 'era5_soil_temperature_level_1',
                 'era5_soil_volume_water_content_level_1']

    all_era5_data = []
    processed_count = 0
    error_count = 0
    print("\nProcessing downloaded ERA5-Land CSVs...")
    # Sorted so the processing order (and the error log) is reproducible;
    # os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(era5_download_dir)):
        if not (filename.startswith(era5_prefix) and filename.endswith(csv_suffix)):
            continue
        file_path = os.path.join(era5_download_dir, filename)
        try:
            df = pd.read_csv(file_path)
            # Prefer an ID column from the file itself; otherwise derive the ID
            # from the filename, removing only the leading prefix and trailing
            # suffix (not every occurrence of those substrings).
            if 'Woreda_ID' in df.columns:
                df = df.rename(columns={'Woreda_ID': 'woreda_id'})
            elif 'woreda_id' not in df.columns:
                df['woreda_id'] = filename[len(era5_prefix):-len(csv_suffix)]

            df['woreda_id'] = df['woreda_id'].astype(str)
            df['year'] = df['year'].astype(int)
            df['month'] = df['month'].astype(int)

            all_era5_data.append(df[era5_cols].copy())
            processed_count += 1
        except Exception as e:
            # Best effort: one malformed export must not abort the whole run.
            print(f"  Error processing {filename}: {e}")
            error_count += 1

    print(f"  ERA5 files processed: {processed_count}, failed: {error_count}")

    if all_era5_data:
        df_era5 = pd.concat(all_era5_data, ignore_index=True)
        df_era5['woreda_name'] = df_era5['woreda_id'].map(woreda_name_map)

        # Rows without a woreda name are dropped; say so instead of losing
        # them silently.
        unmatched_ids = sorted(df_era5.loc[df_era5['woreda_name'].isna(), 'woreda_id'].unique())
        if unmatched_ids:
            print(f"  Warning: dropping ERA5 rows with no woreda name for IDs: {unmatched_ids}")
        df_era5 = df_era5.dropna(subset=['woreda_name'])

        # Collapse any repeated woreda/year/month rows to their mean.
        df_era5 = df_era5.groupby(['woreda_id', 'woreda_name', 'year', 'month']).mean().reset_index()
        df_era5 = df_era5.sort_values(by=['woreda_id', 'year', 'month']).reset_index(drop=True)
        print(f"\n✅ Consolidated monthly ERA5 data: {df_era5.shape[0]} records.")
        print(df_era5.head())
    else:
        print("No ERA5 CSVs were processed successfully.")
else:
    print("Skipping processing ERA5 data due to missing woreda data.")

# ## 4. Process SMAP Data
# Consolidate and clean SMAP data: read every `smap_*.csv` export, normalise
# the key columns, and average to one row per woreda/year/month.

# Bind the result name up front so the merge section can test
# `df_smap is not None` even when this section is skipped or yields nothing.
df_smap = None

if gdf_woredas is not None:
    smap_prefix = 'smap_'
    csv_suffix = '.csv'
    # Select relevant SMAP columns. Adjust names if needed.
    smap_cols = ['woreda_id', 'year', 'month', 'sm_surface', 'sm_rootzone']
    # NOTE(review): per the original author's remark these bands are volumetric
    # soil moisture, not pressure, so the '*_pressure' output names are
    # misleading. They are kept unchanged because they form the saved column
    # schema that other notebooks may read — rename only together with those.
    smap_renames = {'sm_surface': 'smap_sm_surface_pressure',
                    'sm_rootzone': 'smap_sm_rootzone_pressure'}

    all_smap_data = []
    processed_count = 0
    error_count = 0
    print("\nProcessing downloaded SMAP CSVs...")
    # Sorted so the processing order (and the error log) is reproducible;
    # os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(smap_download_dir)):
        if not (filename.startswith(smap_prefix) and filename.endswith(csv_suffix)):
            continue
        file_path = os.path.join(smap_download_dir, filename)
        try:
            df = pd.read_csv(file_path)
            # Prefer an ID column from the file itself; otherwise derive the ID
            # from the filename, removing only the leading prefix and trailing
            # suffix (not every occurrence of those substrings).
            if 'Woreda_ID' in df.columns:
                df = df.rename(columns={'Woreda_ID': 'woreda_id'})
            elif 'woreda_id' not in df.columns:
                df['woreda_id'] = filename[len(smap_prefix):-len(csv_suffix)]

            df['woreda_id'] = df['woreda_id'].astype(str)
            df['year'] = df['year'].astype(int)
            df['month'] = df['month'].astype(int)

            all_smap_data.append(df[smap_cols].rename(columns=smap_renames))
            processed_count += 1
        except Exception as e:
            # Best effort: one malformed export must not abort the whole run.
            print(f"  Error processing {filename}: {e}")
            error_count += 1

    print(f"  SMAP files processed: {processed_count}, failed: {error_count}")

    if all_smap_data:
        df_smap = pd.concat(all_smap_data, ignore_index=True)
        df_smap['woreda_name'] = df_smap['woreda_id'].map(woreda_name_map)

        # Rows without a woreda name are dropped; say so instead of losing
        # them silently.
        unmatched_ids = sorted(df_smap.loc[df_smap['woreda_name'].isna(), 'woreda_id'].unique())
        if unmatched_ids:
            print(f"  Warning: dropping SMAP rows with no woreda name for IDs: {unmatched_ids}")
        df_smap = df_smap.dropna(subset=['woreda_name'])

        # Collapse any repeated woreda/year/month rows to their mean.
        df_smap = df_smap.groupby(['woreda_id', 'woreda_name', 'year', 'month']).mean().reset_index()
        df_smap = df_smap.sort_values(by=['woreda_id', 'year', 'month']).reset_index(drop=True)
        print(f"\n✅ Consolidated monthly SMAP data: {df_smap.shape[0]} records.")
        print(df_smap.head())
    else:
        print("No SMAP CSVs were processed successfully.")
else:
    print("Skipping processing SMAP data due to missing woreda data.")

# ## 5. Process SRTM Elevation Data
# Consolidate SRTM elevation data (which is static per woreda): read every
# `srtm_elevation_*.csv` export and keep one elevation row per woreda.

# Bind the result name up front so the merge section can test
# `df_srtm is not None` even when this section is skipped or yields nothing.
df_srtm = None

if gdf_woredas is not None:
    srtm_prefix = 'srtm_elevation_'
    csv_suffix = '.csv'

    all_srtm_data = []
    processed_count = 0
    error_count = 0
    print("\nProcessing downloaded SRTM CSVs...")
    # Sorted so that the "first row wins" de-duplication below always keeps
    # the same row; os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(srtm_download_dir)):
        if not (filename.startswith(srtm_prefix) and filename.endswith(csv_suffix)):
            continue
        file_path = os.path.join(srtm_download_dir, filename)
        try:
            df = pd.read_csv(file_path)
            # Prefer an ID column from the file itself; otherwise derive the ID
            # from the filename, removing only the leading prefix and trailing
            # suffix (not every occurrence of those substrings).
            if 'Woreda_ID' in df.columns:
                df = df.rename(columns={'Woreda_ID': 'woreda_id'})
            elif 'woreda_id' not in df.columns:
                df['woreda_id'] = filename[len(srtm_prefix):-len(csv_suffix)]

            df['woreda_id'] = df['woreda_id'].astype(str)

            # Select elevation column and rename
            elevation_rows = df[['woreda_id', 'elevation']].rename(columns={'elevation': 'avg_elevation'})
            all_srtm_data.append(elevation_rows)
            processed_count += 1
        except Exception as e:
            # Best effort: one malformed export must not abort the whole run.
            print(f"  Error processing {filename}: {e}")
            error_count += 1

    print(f"  SRTM files processed: {processed_count}, failed: {error_count}")

    if all_srtm_data:
        df_srtm = pd.concat(all_srtm_data, ignore_index=True)
        df_srtm['woreda_name'] = df_srtm['woreda_id'].map(woreda_name_map)

        # Rows without a woreda name are dropped; say so instead of losing
        # them silently.
        unmatched_ids = sorted(df_srtm.loc[df_srtm['woreda_name'].isna(), 'woreda_id'].unique())
        if unmatched_ids:
            print(f"  Warning: dropping SRTM rows with no woreda name for IDs: {unmatched_ids}")
        df_srtm = df_srtm.dropna(subset=['woreda_name'])

        # Elevation is static per woreda, so keep a single row for each ID.
        df_srtm = df_srtm.drop_duplicates(subset=['woreda_id']).reset_index(drop=True)
        print(f"\n✅ Consolidated SRTM elevation data: {df_srtm.shape[0]} records.")
        print(df_srtm.head())
    else:
        print("No SRTM CSVs were processed successfully.")
else:
    print("Skipping processing SRTM data due to missing woreda data.")

# ## 6. Merge All Environmental Data
# Combine ERA5, SMAP (monthly) and SRTM (static) into a single DataFrame.

def merge_environmental_frames(monthly_era5, monthly_smap, static_srtm):
    """Join the monthly ERA5 and SMAP tables and attach per-woreda elevation.

    Args:
        monthly_era5: DataFrame keyed by woreda_id, woreda_name, year, month.
        monthly_smap: DataFrame with the same four key columns.
        static_srtm: DataFrame with at least `woreda_id` and `avg_elevation`.

    Returns:
        A new DataFrame sorted by woreda_id, year, month with a fresh index.
        ERA5 and SMAP are outer-joined, so a woreda-month present in only one
        of them is kept with NaN in the other's columns; elevation is
        left-joined on `woreda_id` alone.
    """
    merged = pd.merge(monthly_era5, monthly_smap,
                      on=['woreda_id', 'woreda_name', 'year', 'month'], how='outer')
    # Elevation is static, so it just needs to be joined by woreda_id
    merged = pd.merge(merged, static_srtm[['woreda_id', 'avg_elevation']],
                      on='woreda_id', how='left')
    return merged.sort_values(by=['woreda_id', 'year', 'month']).reset_index(drop=True)


# An upstream section that was skipped may never have bound its result name,
# so look the frames up defensively instead of risking a NameError here.
_env_frames = [globals().get(_frame_name) for _frame_name in ('df_era5', 'df_smap', 'df_srtm')]

if all(_frame is not None for _frame in _env_frames):
    df_monthly_env = merge_environmental_frames(*_env_frames)

    print(f"\n✅ Consolidated all monthly environmental data: {df_monthly_env.shape[0]} records.")
    print(df_monthly_env.head())

    # Save the consolidated data
    output_path = os.path.join(processed_data_dir, 'woreda_monthly_environmental_data.csv')
    df_monthly_env.to_csv(output_path, index=False)
    print(f"✅ Consolidated monthly environmental data saved to {output_path}")
else:
    print("Skipping full environmental data consolidation due to missing sub-datasets.")
