# In [None]:
# ==============================================================================
# notebooks/07_yield_data_preparation.ipynb
# ==============================================================================

# # 07 - Yield Data Preparation
# This notebook focuses on preparing the coffee yield data for use in the yield prediction model. It involves:
# 1. Loading the raw yield data (which should be provided separately, typically in `data/input/`).
# 2. Cleaning and preprocessing the yield data as necessary.
# 3. Aggregating the yield data to the woreda-year level to align with the temporal and spatial resolution of the satellite and environmental features.
# 4. Merging the yield data with the woreda geometries for spatial context.
# 5. Saving the processed yield data.

# ## 1. Load Project Setup and Libraries
# We'll load the `gdf_woredas` and import `pandas` and `geopandas`.

import pandas as pd
import geopandas as gpd
import os
import sys

# Make the project root (the parent of the current working directory) importable,
# so project-local modules can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../'))
if project_root not in sys.path:
    sys.path.append(project_root)

print("Libraries loaded.")

# Load gdf_woredas saved by '00_setup_and_common_data_loading.ipynb'
processed_data_dir = '../data/processed/'
gdf_woredas_path = os.path.join(processed_data_dir, 'sidama_woredas.geojson')

# Check for the file explicitly instead of relying on `except FileNotFoundError`:
# geopandas' I/O backends report a missing path with their own exception types
# (e.g. fiona's DriverError or pyogrio's DataSourceError), so a FileNotFoundError
# handler would never run and the notebook would stop with a traceback here.
# Downstream cells test `gdf_woredas is not None` to decide whether to run.
if not os.path.exists(gdf_woredas_path):
    print(f"Error: '{gdf_woredas_path}' not found. Please run '00_setup_and_common_data_loading.ipynb' first.")
    gdf_woredas = None
else:
    try:
        gdf_woredas = gpd.read_file(gdf_woredas_path)
        print(f"Loaded GeoDataFrame with {len(gdf_woredas)} woredas from {gdf_woredas_path}")
    except Exception as e:
        # The file exists but could not be parsed (corrupt/partial file, missing
        # driver, ...). Report it and fall back to None so later cells skip cleanly.
        print(f"Error: could not read '{gdf_woredas_path}': {e}")
        gdf_woredas = None

# ## 2. Define Input and Output Paths
# Specify the path to your raw yield data file and where the processed yield data will be saved. You will need to replace `raw_yield_data.csv` with the actual filename of your yield data and ensure it is placed in `data/input/`.

if gdf_woredas is None:
    # Without the woreda boundaries there is nothing to prepare, so no paths are defined.
    print("Skipping path definitions as woreda data is not loaded.")
else:
    # --- IMPORTANT: point this at your own yield data file ---
    # The processing step below reads the columns 'woreda_id', 'year' and
    # 'yield_quintals_ha' (yield in quintals per hectare), so the raw file must
    # provide them, either directly or after renaming in that step.
    RAW_YIELD_DATA_PATH = '../data/input/raw_yield_data.csv' # <<< YOU MUST REPLACE THIS WITH YOUR DATA

    # The processed woreda-year table is written next to the other processed data.
    OUTPUT_YIELD_DATA_PATH = os.path.join(processed_data_dir, 'woreda_annual_yield_data.csv')

    # Create the input and output directories up front if they do not exist yet.
    for _yield_path in (RAW_YIELD_DATA_PATH, OUTPUT_YIELD_DATA_PATH):
        os.makedirs(os.path.dirname(_yield_path), exist_ok=True)

    print(f"Raw yield data expected at: {RAW_YIELD_DATA_PATH}")
    print(f"Processed yield data will be saved to: {OUTPUT_YIELD_DATA_PATH}")

# ## 3. Load and Preprocess Raw Yield Data
# This section loads the raw yield data. You might need to adapt the column names and any initial cleaning steps based on the format of your specific dataset. The example assumes columns like `woreda_id`, `year`, and `yield_quintals_ha`.

# Load the raw yield CSV, aggregate it to one mean yield per (woreda_id, year),
# attach the woreda names from `gdf_woredas`, and write the result to
# OUTPUT_YIELD_DATA_PATH. Problems are reported with a printed message rather
# than a traceback so the notebook can keep running.
if gdf_woredas is not None:
    df_raw_yield = None
    if not os.path.exists(RAW_YIELD_DATA_PATH):
        print(f"Error: Raw yield data file not found at {RAW_YIELD_DATA_PATH}.")
        print("Please ensure your raw yield data (e.g., 'raw_yield_data.csv') is placed in the 'data/input/' directory.")
    else:
        try:
            # Load your raw yield data. Adjust 'sep' and 'encoding' if necessary.
            df_raw_yield = pd.read_csv(RAW_YIELD_DATA_PATH)
            print(f"Loaded raw yield data: {df_raw_yield.shape[0]} records.")
            print("Columns:", df_raw_yield.columns.tolist())
            print(df_raw_yield.head())

            # --- Data Cleaning and Standardization (ADAPT THIS SECTION) ---
            # Rename columns to standardized names if necessary
            # Example: If your yield column is named 'production_q_ha' and woreda ID is 'w_id'
            # df_raw_yield = df_raw_yield.rename(columns={'w_id': 'woreda_id', 'production_q_ha': 'yield_quintals_ha'})

            # Report every missing required column at once instead of failing on
            # the first one that happens to be accessed.
            required_columns = ['woreda_id', 'year', 'yield_quintals_ha']
            missing_columns = [col for col in required_columns if col not in df_raw_yield.columns]
            if missing_columns:
                raise KeyError(", ".join(missing_columns))

            # Rows without a woreda or a year cannot be placed in any woreda-year
            # group. Drop them before casting: `astype(int)` raises on missing
            # values, and `astype(str)` would turn a missing id into the text 'nan'.
            key_missing = df_raw_yield[['woreda_id', 'year']].isna().any(axis=1)
            if key_missing.any():
                print(f"Warning: dropping {int(key_missing.sum())} records with missing 'woreda_id' or 'year'.")
                df_raw_yield = df_raw_yield.loc[~key_missing].copy()

            # Ensure 'woreda_id' and 'year' are appropriate types
            df_raw_yield['woreda_id'] = df_raw_yield['woreda_id'].astype(str)
            df_raw_yield['year'] = df_raw_yield['year'].astype(int)

            # Handle missing or erroneous yield values (e.g., replace 0 with NaN if 0 means no data)
            # df_raw_yield['yield_quintals_ha'] = df_raw_yield['yield_quintals_ha'].replace(0, float('nan'))

            # Aggregate if raw data is not already at woreda-year level
            # For example, if you have multiple entries per woreda-year (e.g., by farm, or different crop types)
            df_yield_processed = df_raw_yield.groupby(['woreda_id', 'year'])['yield_quintals_ha'].mean().reset_index()
            df_yield_processed.rename(columns={'yield_quintals_ha': 'annual_yield_quintals_ha'}, inplace=True)

            print(f"\nProcessed yield data aggregated to annual woreda level: {df_yield_processed.shape[0]} records.")
            print(df_yield_processed.head())

            # --- Merge with Woreda Names ---
            # We need the woreda names for clarity in the final dataset
            df_woreda_names = gdf_woredas[['Woreda_ID', 'Woreda Name']].rename(
                columns={'Woreda_ID': 'woreda_id', 'Woreda Name': 'woreda_name'}
            )
            # The yield ids were cast to str above, so the lookup key must be str
            # as well; otherwise the merge fails or matches nothing when the
            # boundary file stores the ids as numbers.
            df_woreda_names['woreda_id'] = df_woreda_names['woreda_id'].astype(str)
            # One name per woreda: repeated ids in the boundary data would
            # otherwise duplicate yield records in the left merge.
            df_woreda_names = df_woreda_names.drop_duplicates(subset='woreda_id')
            df_yield_processed = pd.merge(df_yield_processed, df_woreda_names, on='woreda_id', how='left')

            # Surface ids that did not receive a name so mismatches between the
            # yield file and the boundary data do not go unnoticed.
            unnamed_ids = df_yield_processed.loc[df_yield_processed['woreda_name'].isna(), 'woreda_id'].unique()
            if len(unnamed_ids) > 0:
                print(f"Warning: {len(unnamed_ids)} woreda_id value(s) have no woreda name (first few: {sorted(unnamed_ids)[:10]}).")

            # Reorder columns
            df_yield_processed = df_yield_processed[['woreda_id', 'woreda_name', 'year', 'annual_yield_quintals_ha']]

            print("\nFinal processed yield data preview:")
            print(df_yield_processed.head())

            # Save the processed yield data
            df_yield_processed.to_csv(OUTPUT_YIELD_DATA_PATH, index=False)
            print(f"✅ Processed annual yield data saved to {OUTPUT_YIELD_DATA_PATH}")

        except FileNotFoundError:
            # Covers the file disappearing between the existence check and the read.
            print(f"Error: The specified raw yield data file '{RAW_YIELD_DATA_PATH}' was not found.")
        except KeyError as e:
            print(f"Error: Missing expected column in raw yield data: {e}. Please check column names and adapt the script.")
        except Exception as e:
            print(f"An unexpected error occurred during yield data processing: {e}")
else:
    print("Skipping yield data preparation as woreda data is not loaded.")
