# In [None]:
# ==============================================================================
# notebooks/00_setup_and_common_data_loading.ipynb
# ==============================================================================

# # 00 - Setup and Common Data Loading
# This notebook performs essential setup steps and loads common data used across the project.
# It covers:
# 1.  Initializing Google Earth Engine (GEE).
# 2.  Authenticating with Google Cloud services (GEE, GCS).
# 3.  Defining global project parameters (e.g., GEE project, GCS bucket, date ranges).
# 4.  Loading and preprocessing administrative boundary data (woredas).
# 5.  Setting up necessary directory structures.

# ## 1. Initialize Google Earth Engine and Authenticate
# Authenticate the Colab session with Google Cloud, then initialize the Earth Engine API.

import ee
from google.colab import auth
import geopandas as gpd
import os
import sys

# Make the parent of the current working directory importable so that
# project-level modules can be imported from this notebook.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), '../')
)
if not sys.path.count(project_root):
    sys.path.append(project_root)

# Obtain Colab user credentials first; the Earth Engine initialization below
# and later GCS access rely on them.
print("Authenticating Google Colab...")
auth.authenticate_user()
print("Authentication complete.")

# Bind the Earth Engine client to the Cloud project used by this work.
# A failure is reported but not re-raised, so the rest of the notebook
# continues to run; change the project ID here if yours differs.
print("Initializing Earth Engine...")
try:
    ee.Initialize(project='bensa-coffee-yield')
except Exception as init_error:
    print(f"Error initializing Earth Engine: {init_error}")
    print("Please ensure you have authenticated and your GEE project ID is correct.")
else:
    print("Earth Engine initialized successfully.")

# ## 2. Define Project Parameters
# Shared configuration values: the GCS bucket name and the date window
# for data export.

# --- GCS Configuration ---
# Name of the Google Cloud Storage bucket; edit this value if your
# project uses a different bucket.
BUCKET_NAME = 'bensa-coffee-yield'
print(f"Using GCS Bucket: {BUCKET_NAME}")

# --- Date Ranges for Data Export ---
# First and last calendar year of the export window. The window is meant to
# span every year with historical yield data plus the prediction year.
START_YEAR, END_YEAR = 2017, 2025

# Dates are formatted as 'YYYY-MM-DD' strings: Jan 1 of the first year
# through Dec 31 of the last year.
# NOTE(review): Earth Engine date filters treat the end date as exclusive;
# confirm whether downstream exports need a later END_DATE to include Dec 31.
START_DATE = f'{START_YEAR}-01-01'
END_DATE = f'{END_YEAR}-12-31'

print(f"Data export period: {START_DATE} to {END_DATE}")

# --- Directory Paths ---
# All data lives under one root, split into input, processed, and
# GEE-export subtrees.
data_dir = '../data/'
input_data_dir, processed_data_dir, gee_exports_dir = (
    os.path.join(data_dir, leaf)
    for leaf in ('input/', 'processed/', 'gee_exports/')
)

# Create every directory the project expects; existing ones are left alone.
for _required_dir in (
    input_data_dir,
    processed_data_dir,
    gee_exports_dir,
    os.path.join(processed_data_dir, 'coffee_extents'),
    *(
        os.path.join(gee_exports_dir, dataset)
        for dataset in ('sentinel2', 'era5', 'srtm', 'smap')
    ),
):
    os.makedirs(_required_dir, exist_ok=True)

print("\nProject directories created/checked.")

# ## 3. Load and Preprocess Administrative Boundaries
# Load the Sidama woreda (district) boundaries into `gdf_woredas`, normalize
# the ID column, repair geometries, and save a GeoJSON copy for later notebooks.
# On any failure `gdf_woredas` is left as None so later cells can skip cleanly.

# Path to the administrative boundary shapefile. The .shp file and its
# companion files (.dbf, .shx, .prj, .cpg) must be placed in `data/input/`.
WOREDAS_SHAPEFILE_PATH = os.path.join(input_data_dir, 'sidama_woredas.shp')

# Output path for the processed GeoJSON
PROCESSED_WOREDAS_GEOJSON_PATH = os.path.join(processed_data_dir, 'sidama_woredas.geojson')

# Load the shapefile
gdf_woredas = None
try:
    if not os.path.exists(WOREDAS_SHAPEFILE_PATH):
        # Missing input is reported rather than raised so the notebook keeps running.
        print(f"Error: Woredas shapefile not found at {WOREDAS_SHAPEFILE_PATH}.")
        print("Please place `sidama_woredas.shp` (and its accompanying files like .dbf, .shx, .prj) in the 'data/input/' directory.")
    else:
        gdf_woredas = gpd.read_file(WOREDAS_SHAPEFILE_PATH)
        print(f"\nLoaded GeoDataFrame with {len(gdf_woredas)} woredas.")
        print(gdf_woredas.head())

        # Ensure 'Woreda_ID' is string for consistent merging later
        if 'Woreda_ID' in gdf_woredas.columns:
            gdf_woredas['Woreda_ID'] = gdf_woredas['Woreda_ID'].astype(str)
            print("Woreda_ID column converted to string type.")
        else:
            print("Warning: 'Woreda_ID' column not found. Ensure your shapefile has a unique ID column for woredas.")

        # A zero-width buffer is used here to repair invalid geometries
        # (no simplification is performed).
        gdf_woredas['geometry'] = gdf_woredas.geometry.buffer(0)

        # Save the processed GeoDataFrame as a GeoJSON for easier loading in subsequent notebooks
        gdf_woredas.to_file(PROCESSED_WOREDAS_GEOJSON_PATH, driver='GeoJSON')
        # The check mark was previously mis-encoded (UTF-8 bytes read as cp1252).
        print(f"✅ Processed woredas saved to {PROCESSED_WOREDAS_GEOJSON_PATH}")

except Exception as e:
    print(f"An error occurred during woreda data loading/preprocessing: {e}")
    gdf_woredas = None # Ensure it's None if loading fails

# ## Common Variables for GEE (as ee.FeatureCollection)
# Convert the woreda GeoDataFrame into an Earth Engine FeatureCollection
# (`ee_woredas`) for use in GEE exports. `ee_woredas` stays None when the
# woreda data is unavailable or the conversion fails.

import json  # local to this cell: parses the GeoJSON text produced by GeoPandas

ee_woredas = None
if gdf_woredas is not None:
    try:
        # Earth Engine interprets bare GeoJSON coordinates as lon/lat
        # (EPSG:4326), so convert a reprojected copy; `gdf_woredas` itself
        # keeps its original CRS.
        if gdf_woredas.crs is None:
            print("Warning: woreda data has no CRS defined; assuming coordinates are already EPSG:4326.")
            gdf_for_ee = gdf_woredas
        else:
            gdf_for_ee = gdf_woredas.to_crs(epsg=4326)

        # ee.FeatureCollection treats a plain string as a table asset ID, so
        # the GeoJSON text must be parsed into a mapping before being passed in.
        geojson_str = gdf_for_ee.to_json()
        ee_woredas = ee.FeatureCollection(json.loads(geojson_str))
        print("\nGeoDataFrame converted to Earth Engine FeatureCollection.")
        # getInfo() forces a server round-trip, which also validates the collection.
        print(f"EE FeatureCollection size: {ee_woredas.size().getInfo()}")
    except Exception as e:
        print(f"Error converting GeoDataFrame to EE FeatureCollection: {e}")
        ee_woredas = None
else:
    print("\nSkipping EE FeatureCollection conversion as woreda data is not loaded.")

# This notebook should be run first to set up the environment and load common data.
# The `gdf_woredas` and `ee_woredas` variables, along with `BUCKET_NAME`, `START_DATE`, `END_DATE`, etc.,
# will be used by subsequent notebooks.
