# In [None]:
# ==============================================================================
# notebooks/09_yield_prediction_model.ipynb
# ==============================================================================

# # 09 - Yield Prediction Model: Training, Evaluation, and Prediction
# This notebook is the culmination of the data pipeline. It focuses on:
# 1.  Loading the consolidated master dataset (`master_woreda_data.csv`).
# 2.  Preparing the data for machine learning (handling missing values, feature scaling).
# 3.  Training and evaluating multiple regression models (Random Forest, XGBoost, Ridge, Lasso, Stacking).
# 4.  Selecting the best performing model based on cross-validation metrics.
# 5.  Using the best model to make yield predictions for 2025.
# 6.  Analyzing feature importance to understand which variables drive yield predictions.

# ## 1. Load Project Setup and Libraries
# We'll load `pandas`, `numpy`, and various modules from `sklearn` and `xgboost` for model training and evaluation. `pickle` is used for saving the trained model.

import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

print("Libraries loaded.")

# Output locations: processed CSV files and pickled model artifacts.
processed_data_dir = '../data/processed/'
model_dir = '../models/'

# Create both directories up front; a directory that already exists is not an error.
for _output_dir in (processed_data_dir, model_dir):
    os.makedirs(_output_dir, exist_ok=True)

print(f"Processed data directory: {processed_data_dir}")
print(f"Models will be saved to: {model_dir}")

# ## 2. Load Master Dataset
# Load the `master_woreda_data.csv` file, which contains all the integrated features and the target yield variable.

master_data_path = os.path.join(processed_data_dir, 'master_woreda_data.csv')

# Read the master CSV. When the file is absent, df_master is set to None so the
# later cells can detect that and skip their work instead of failing.
try:
    _loaded_master = pd.read_csv(master_data_path)
except FileNotFoundError as e:
    print(f"Error loading master data: {e}. Please ensure you have run notebook 08 to generate this file.")
    df_master = None
else:
    df_master = _loaded_master
    # Quick sanity report: size of the table and its first rows.
    print(f"Loaded master dataset: {df_master.shape[0]} records, {df_master.shape[1]} columns.")
    print(df_master.head())

# ## 3. Data Preparation for Modeling
# This section involves:
# -   Separating features (X) and target (y).
# -   Handling missing values (e.g., imputation).
# -   Scaling numerical features.

if df_master is not None:
    target_col = 'annual_yield_quintals_ha'

    # Training rows are those with an observed yield; the prediction set is every row with year == 2025.
    df_train = df_master.dropna(subset=[target_col]).copy()
    df_2025 = df_master[df_master['year'] == 2025].copy()

    print(f"\nTraining data (with yield): {df_train.shape[0]} records.")
    print(f"2025 data (for prediction): {df_2025.shape[0]} records.")

    # Define features: exclude identifiers and the target itself.
    feature_cols = [col for col in df_train.columns if col not in ['woreda_id', 'woreda_name', 'year', target_col]]

    # Take explicit copies so the imputation below writes into independent frames
    # rather than into slices of df_train / df_2025 (avoids chained-assignment warnings).
    X = df_train[feature_cols].copy()
    y = df_train[target_col]
    X_2025 = df_2025[feature_cols].copy()

    # --- Handle Missing Values (Imputation) ---
    # Mean imputation, with the mean always computed from the training rows only.
    # A column is imputed when EITHER frame has gaps: a feature that is complete in
    # training but missing in some 2025 rows must still be filled, otherwise NaNs
    # would flow through the scaler into the prediction step.
    for col in feature_cols:
        if not (X[col].isnull().any() or X_2025[col].isnull().any()):
            continue
        mean_val = X[col].mean()
        X[col] = X[col].fillna(mean_val)
        X_2025[col] = X_2025[col].fillna(mean_val)  # Same training-derived value for the 2025 rows
        print(f"  Imputed missing values in '{col}' with mean {mean_val:.2f}")

    # NaNs can still remain here if a column has no observed value in the training rows
    # (its mean is NaN, so fillna is a no-op).
    if X.isnull().values.any() or X_2025.isnull().values.any():
        print("Warning: Missing values still exist in features after imputation!")

    # --- Feature Scaling ---
    # Standardize features with a scaler fitted ONLY on the training rows.
    # The scaled frames default to None so the later cells' `is not None` checks
    # skip cleanly when there is nothing to train on or nothing to predict;
    # StandardScaler itself raises on an input with zero rows.
    scaler = StandardScaler()
    X_scaled_df = None
    X_2025_scaled_df = None

    if X.empty:
        print("Warning: no training rows or feature columns available; skipping feature scaling.")
    else:
        X_scaled = scaler.fit_transform(X)
        X_scaled_df = pd.DataFrame(X_scaled, columns=feature_cols, index=X.index)

        if X_2025.empty:
            print("Warning: no 2025 rows found in the master dataset; 2025 predictions will be skipped.")
        else:
            # Transform (never re-fit) the 2025 rows with the training-fitted scaler.
            X_2025_scaled = scaler.transform(X_2025)
            X_2025_scaled_df = pd.DataFrame(X_2025_scaled, columns=feature_cols, index=X_2025.index)

        print("\nFeatures scaled.")
        print(X_scaled_df.head())
else:
    print("Skipping data preparation due to missing master dataset.")

# ## 4. Model Training and Cross-Validation
# Train and evaluate different regression models using K-Fold Cross-Validation to get robust performance metrics.

if 'X_scaled_df' in locals() and X_scaled_df is not None:
    # Candidate regressors, keyed by the display name used in the reports below.
    models = {
        'Random Forest': RandomForestRegressor(random_state=42),
        'XGBoost': XGBRegressor(random_state=42, n_estimators=100),
        'Ridge': Ridge(random_state=42),
        'Lasso': Lasso(random_state=42),
        'Stacking': StackingRegressor(
            estimators=[
                ('rf', RandomForestRegressor(random_state=42)),
                ('xgb', XGBRegressor(random_state=42, n_estimators=50)) # Lower n_estimators for base models
            ],
            final_estimator=LinearRegression(),
            cv=5 # Cross-validation for the stacking process
        )
    }

    # One seeded splitter shared by every model, so all candidates are scored on the same folds.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    # All three metrics are collected in a single cross-validation pass per model
    # instead of re-running the full 5-fold fit once per metric.
    cv_scoring = ('neg_mean_squared_error', 'r2', 'neg_mean_absolute_error')

    # NOTE(review): X_scaled_df was imputed and standardized on all training rows before
    # these folds are drawn, so the CV scores may be slightly optimistic. Wrapping the
    # preprocessing in a Pipeline would remove that leakage.

    results = []
    best_rmse = float('inf')
    best_model = None

    print("\nStarting model training and cross-validation...")

    for name, model in models.items():
        print(f"\nTraining {name}...")

        cv_results = cross_validate(model, X_scaled_df, y, cv=kf, scoring=cv_scoring, n_jobs=-1)

        # sklearn reports error metrics negated ("higher is better"); flip the sign back.
        rmse_scores = np.sqrt(-cv_results['test_neg_mean_squared_error'])
        r2_scores = cv_results['test_r2']
        mae_scores = -cv_results['test_neg_mean_absolute_error']

        avg_rmse, std_rmse = np.mean(rmse_scores), np.std(rmse_scores)
        avg_r2, std_r2 = np.mean(r2_scores), np.std(r2_scores)
        avg_mae, std_mae = np.mean(mae_scores), np.std(mae_scores)

        print(f"  {name} - Cross-Validation Results:")
        print(f"    Avg RMSE: {avg_rmse:.3f} (Std: {std_rmse:.3f})")
        print(f"    Avg R2: {avg_r2:.3f} (Std: {std_r2:.3f})")
        print(f"    Avg MAE: {avg_mae:.3f} (Std: {std_mae:.3f})")

        results.append({
            'Model': name,
            'Avg RMSE': avg_rmse,
            'Std RMSE': std_rmse,
            'Avg R2': avg_r2,
            'Std R2': std_r2,
            'Avg MAE': avg_mae,
            'Std MAE': std_mae
        })

        # Cross-validation fits clones, so fit the actual estimator on the full
        # training data for later use (e.g., prediction).
        model.fit(X_scaled_df, y)

        # Lowest mean RMSE wins. A NaN RMSE never compares as smaller, so a model
        # whose folds failed cannot be selected.
        if avg_rmse < best_rmse:
            best_rmse = avg_rmse
            best_model = {'Model': name, 'Trained Model': model}

    df_results = pd.DataFrame(results).sort_values(by='Avg RMSE')
    print("--- Model Comparison ---")
    print(df_results)

    if best_model is None:
        # Only reachable when every model's average RMSE was NaN.
        print("No model produced a finite cross-validated RMSE; nothing was saved.")
    else:
        print(f"✅ Best Model (based on Avg RMSE): {best_model['Model']}")

        # Save the best model
        model_save_path = os.path.join(model_dir, 'best_yield_prediction_model.pkl')
        with open(model_save_path, 'wb') as f:
            pickle.dump(best_model['Trained Model'], f)
        print(f"✅ Best trained model saved to {model_save_path}")

        # Save the scaler as well, needed for future predictions
        scaler_save_path = os.path.join(model_dir, 'feature_scaler.pkl')
        with open(scaler_save_path, 'wb') as f:
            pickle.dump(scaler, f)
        print(f"✅ Feature scaler saved to {scaler_save_path}")

        # Save feature_cols for consistency when loading model later
        feature_cols_path = os.path.join(model_dir, 'feature_columns.pkl')
        with open(feature_cols_path, 'wb') as f:
            pickle.dump(feature_cols, f)
        print(f"✅ Feature column names saved to {feature_cols_path}")

else:
    print("Skipping model training due to missing prepared data.")

# ## 5. 2025 Yield Prediction
# Use the best trained model to predict coffee yields for the year 2025. The same approach applies to any year with features but no observed yield, but this cell only scores the rows where `year == 2025`.

# Runs only when both a selected model and a scaled 2025 feature frame exist.
if 'best_model' in locals() and best_model is not None and 'X_2025_scaled_df' in locals() and X_2025_scaled_df is not None:
    print("\nMaking 2025 yield predictions...")
    predictions_2025 = best_model['Trained Model'].predict(X_2025_scaled_df)

    # Add predictions back to the 2025 DataFrame (row order matches X_2025_scaled_df,
    # which was built from df_2025 with the same index).
    df_2025['predicted_yield_quintals_ha'] = predictions_2025

    # Yields cannot be negative: floor the predictions at zero with a vectorised clip
    # rather than a per-row Python lambda.
    df_2025['predicted_yield_quintals_ha'] = df_2025['predicted_yield_quintals_ha'].clip(lower=0)

    output_path = os.path.join(processed_data_dir, 'sidama_coffee_yield_predictions_2025.csv')
    df_2025[['woreda_id', 'woreda_name', 'year', 'predicted_yield_quintals_ha']].to_csv(output_path, index=False)
    print(f"\n✅ 2025 predictions saved to {output_path}: {df_2025.shape}")
    print(df_2025[['woreda_name', 'year', 'predicted_yield_quintals_ha']].head())
else:
    print("Skipping 2025 prediction due to missing model or 2025 data.")

# ## 6. Feature Importance Analysis
# Understand which features contribute most to the model's predictions. This helps in interpreting the model and potentially identifying key drivers of coffee yield.

# Report what drives the selected model. The kind of report depends on the model family:
# tree ensembles expose importances, linear models expose coefficients, and the stacking
# ensemble exposes the weights its final estimator puts on each base model.
if 'best_model' in locals() and best_model is not None and 'feature_cols' in locals():
    print("\n--- Feature Importance/Coefficients ---")
    _winner_name = best_model['Model']
    _winner = best_model['Trained Model']

    if _winner_name in ('Random Forest', 'XGBoost'):
        # Tree-based models: rank features by their importance score, highest first.
        feature_importance = pd.DataFrame({
            'Feature': feature_cols,
            'Importance': _winner.feature_importances_
        })
        feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
        print(feature_importance.head(10))
    elif _winner_name in ('Ridge', 'Lasso'):
        # Linear models: rank features by absolute coefficient, largest magnitude first.
        coefficients = _winner.coef_
        feature_importance = pd.DataFrame({
            'Feature': feature_cols,
            'Coefficient': coefficients
        })
        feature_importance = feature_importance.sort_values(by='Coefficient', key=abs, ascending=False)
        print(feature_importance.head(10))
    elif _winner_name == 'Stacking':
        # A stacked ensemble has no single per-feature importance; as a simplified view,
        # show the final estimator's coefficient for each base model when it has any.
        print("Feature importance for Stacking Regressor is complex. Consider inspecting base models.")
        _meta_learner = _winner.final_estimator_
        if hasattr(_meta_learner, 'coef_'):
            coefficients = _meta_learner.coef_
            # Base models are labelled by their position in the `estimators` list.
            meta_feature_names = [f'base_model_{i}' for i, _ in enumerate(_winner.estimators)]
            meta_importance = pd.DataFrame({
                'Meta-Feature (Base Model)': meta_feature_names,
                'Coefficient': coefficients
            })
            meta_importance = meta_importance.sort_values(by='Coefficient', key=abs, ascending=False)
            print("Meta-Feature (Base Model) Coefficients:")
            print(meta_importance)
    else:
        print("Feature importance not directly available for this model type.")
else:
    print("Skipping feature importance analysis due to missing model or feature columns.")

