In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
from sklearn.metrics import classification_report, mean_absolute_error, r2_score
import joblib

df = pd.read_csv("../data/processed/crop_risk_insurance_engineered_v1.csv")

#### Features & targets

In [2]:
exclude = ['actual_yield_t_ha', 'expected_yield_t_ha', 'yield_loss_pct',
           'payout_usd_per_ha', 'risk_class', 'risk_class_encoded',
           'season_year', 'yield_potential_t_ha']   # drop non-predictive

X = df.drop(columns=exclude, errors='ignore')
y_class = df['risk_class']
y_loss = df['yield_loss_pct']

# Split
X_train, X_test, y_class_train, y_class_test = train_test_split(
    X, y_class, test_size=0.20, random_state=42, stratify=y_class
)

_, _, y_loss_train, y_loss_test = train_test_split(
    X, y_loss, test_size=0.20, random_state=42
)

#### 1. Random Forest Classifier

In [3]:
rf_clf = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, class_weight='balanced')
rf_clf.fit(X_train, y_class_train)

print("Classification Report:")
print(classification_report(y_class_test, rf_clf.predict(X_test)))

joblib.dump(rf_clf, "../models/risk_classifier_rf_baseline.joblib")

Classification Report:
              precision    recall  f1-score   support

        High       0.82      0.86      0.84       819
         Low       0.82      0.90      0.86      1272
      Medium       0.26      0.12      0.16       309

    accuracy                           0.79      2400
   macro avg       0.63      0.63      0.62      2400
weighted avg       0.75      0.79      0.76      2400



['../models/risk_classifier_rf_baseline.joblib']

#### 2. XGBoost on yield_loss_pct

In [4]:
xgb_reg = XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.05,
                       subsample=0.8, colsample_bytree=0.8, random_state=42)
xgb_reg.fit(X_train, y_loss_train)

y_loss_pred = xgb_reg.predict(X_test).clip(0, 1)
payout_pred = 600 * y_loss_pred
payout_true = 600 * y_loss_test

print("\nPayout metrics:")
print(f"MAE: {mean_absolute_error(payout_true, payout_pred):.2f} USD")
print(f"R²:  {r2_score(payout_true, payout_pred):.3f}")

joblib.dump(xgb_reg, "../models/yield_loss_regressor_xgb.joblib")


Payout metrics:
MAE: 176.65 USD
R²:  -0.057


['../models/yield_loss_regressor_xgb.joblib']