# Main — WAOB Soybean Forecast (National & States)

This notebook runs a compact, reproducible workflow:
1. **Data prep**: load WAOB-style state file; build crop condition features; add **US** weighted line.
2. **National model tests**:
   - Train up to **2013** (no dummy) → predict **2020** and compare to actual.
   - Train up to **2019** (with `dummy_2003`) → predict **2020** and compare to actual.
3. **State models** (7 major producers):
   - Train up to **2019** (with dummy) per state → predict **2020**; export a comparative table (metrics, coefficients, forecast).
4. **Augmented models** adding **`gex_JA_min`**:
   - Repeat the national and state tests with the crop-condition feature `gex_JA_min` added to the baseline regressors.
   
All models are plain OLS with the WAOB feature set:
- Baseline: `yield_bu_acre ~ trend + jun_shortfall + temp_JA + prec_JA + I(prec_JA**2)` (+ `dummy_2003` when used)
- Augmented: baseline + `gex_JA_min`


In [None]:
import os, sys
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# local module providing crop condition features
sys.path.append(os.path.abspath("."))
from src import get_soy_condition_features, SEVEN_STATES

# Parameters
# Year range and state list requested from get_soy_condition_features below.
YEAR_FROM = 1987
YEAR_TO   = 2024
STATES    = SEVEN_STATES  # ("IA","IL","IN","OH","MO","MN","NE")
STATES_FILE = "data/processed/waob_features_states.csv"  # merged WAOB features at state level

# Year whose yield is forecast out-of-sample and compared with the reported actual.
TARGET_YEAR_FORECAST = 2020

# Regression formulas handed to statsmodels' formula API (smf.ols).
# The squared precipitation term is built inline with I(prec_JA**2), so the
# formulas themselves do not read the prec_JA_sq column.
# "_NO_D" / "_WITH_D" = without / with the dummy_2003 regressor;
# "AUGMENTED" = baseline plus the gex_JA_min crop-condition regressor.
BASELINE_FORM_NO_D = "yield_bu_acre ~ trend + jun_shortfall + temp_JA + prec_JA + I(prec_JA**2)"
BASELINE_FORM_WITH_D = BASELINE_FORM_NO_D + " + dummy_2003"
AUGMENTED_FORM_NO_D = BASELINE_FORM_NO_D + " + gex_JA_min"
AUGMENTED_FORM_WITH_D = BASELINE_FORM_WITH_D + " + gex_JA_min"

# Default figure size (width, height in inches) for any matplotlib figure drawn later.
plt.rcParams['figure.figsize'] = (6,4)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


## 1) Load data, build US line, and merge condition features

In [None]:
# Annual crop-condition features per state; the first returned object is not used here
_, cond_annual = get_soy_condition_features(YEAR_FROM, YEAR_TO, STATES)

# Columns carried forward from the WAOB state file: keys, target, area columns, regressors
cols_keep = [
    "state", "year",
    "yield_bu_acre", "harvest_ha", "acres_harvested",
    "trend", "jun_shortfall", "temp_JA", "prec_JA", "prec_JA_sq", "dummy_2003",
]

# Read the state-level file, project onto the needed columns and keep the
# first row seen for each (state, year) pair
states = (
    pd.read_csv(STATES_FILE)
    .loc[:, cols_keep]
    .drop_duplicates(subset=["state", "year"])
    .copy()
)

# Left join: every state-year from the WAOB file survives, with NaN condition
# features where cond_annual has no matching row
df = states.merge(cond_annual, how="left", on=["state", "year"])

# Helper: add US weighted line per year using harvest_ha
def add_us_weighted_row(df_in: pd.DataFrame) -> pd.DataFrame:
    """Append one national ("US") row per year, weighting states by ``harvest_ha``.

    For every year, each numeric column except ``harvest_ha`` and ``year`` is
    replaced by its ``harvest_ha``-weighted mean across the rows of that year.
    ``harvest_ha`` and ``acres_harvested`` on the US row are the plain sums
    (missing values counted as 0; ``acres_harvested`` is 0.0 when the column
    does not exist).

    Rows whose weight is missing or zero are left out of the average, so a
    state with no harvested-area figure cannot turn the national value into
    NaN through its own missing data. A missing value on a row that *does*
    carry weight still yields NaN for that column and year: the aggregate is
    considered undefined rather than silently computed from partial coverage.

    The function is safe to call again on its own output: rows already labelled
    ``state == "US"`` are discarded before aggregating and are replaced by the
    freshly computed ones.

    Parameters
    ----------
    df_in : pd.DataFrame
        Panel with at least ``year`` and ``harvest_ha`` columns. A ``state``
        column is optional on input and always present on output.

    Returns
    -------
    pd.DataFrame
        The non-US input rows followed by one ``state == "US"`` row per year,
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``harvest_ha`` is not a column of ``df_in``.
    """
    if "harvest_ha" not in df_in.columns:
        raise ValueError("harvest_ha is required to compute US weights.")

    # Never average a previously computed aggregate back into the new one.
    if "state" in df_in.columns:
        base = df_in[df_in["state"] != "US"]
    else:
        base = df_in

    # Every numeric column is averaged except the weight itself and the group key.
    value_cols = [
        c for c in base.select_dtypes(include=[np.number]).columns
        if c not in ("harvest_ha", "year")
    ]

    records = []
    for year, grp in base.groupby("year"):
        weights = grp["harvest_ha"].fillna(0.0).astype(float)
        total = float(weights.sum())
        # Zero-weight rows contribute nothing, but their NaNs would still
        # poison np.average (NaN * 0 is NaN), so drop them up front.
        used = weights != 0

        rec = {"year": year}
        for col in value_cols:
            if total > 0:
                rec[col] = float(
                    np.average(grp.loc[used, col].astype(float), weights=weights[used])
                )
            else:
                rec[col] = np.nan

        # Area columns are totals, not averages (overrides any averaged value above).
        rec["harvest_ha"] = total
        if "acres_harvested" in grp.columns:
            rec["acres_harvested"] = float(grp["acres_harvested"].fillna(0).sum())
        else:
            rec["acres_harvested"] = 0.0
        records.append(rec)

    if not records:
        # Nothing to aggregate (no rows, or no usable year keys).
        return base.reset_index(drop=True)

    us = pd.DataFrame(records)
    us["state"] = "US"
    return pd.concat([base, us], ignore_index=True, sort=False)

# Append the harvest-area-weighted national aggregate to the state panel
df_full = add_us_weighted_row(df)

# National series on its own, in chronological order
df_us = df_full.loc[df_full["state"].eq("US")].sort_values(by="year").copy()

# Quick sanity check: overall size and the span of years covered by the US line
print(
    "df_full shape:", df_full.shape,
    "| US years:", df_us["year"].min(), "→", df_us["year"].max(),
)
display(df_us.tail(3))


## 2) Helpers: fit, metrics, and forecast utilities

In [None]:
def fit_ols(formula: str, data: pd.DataFrame):
    """Fit an ordinary least squares model specified by a formula string.

    Parameters
    ----------
    formula : str
        Model formula passed to ``smf.ols`` (e.g. ``"y ~ x1 + x2"``).
    data : pd.DataFrame
        Frame holding the columns named in ``formula``.

    Returns
    -------
    The fitted results object returned by ``smf.ols(...).fit()``.
    """
    return smf.ols(formula=formula, data=data).fit()

def metrics_from_model(m):
    """Collect headline fit statistics from a fitted regression result.

    Parameters
    ----------
    m : object
        Fitted result exposing ``nobs``, ``rsquared`` and ``rsquared_adj``;
        ``mse_resid`` is optional.

    Returns
    -------
    dict
        Keys ``n_obs`` (int), ``r2``, ``r2_adj`` and ``rmse`` (floats).
        ``rmse`` is the square root of ``m.mse_resid`` (in statsmodels this is
        the residual sum of squares divided by the residual degrees of
        freedom), or NaN when the result has no ``mse_resid`` attribute.
    """
    if not hasattr(m, "mse_resid"):
        rmse = np.nan
    else:
        rmse = float(np.sqrt(m.mse_resid))

    return dict(
        n_obs=int(m.nobs),
        r2=float(m.rsquared),
        r2_adj=float(m.rsquared_adj),
        rmse=rmse,
    )

def train_and_predict(formula: str, df_train: pd.DataFrame, df_predict: pd.DataFrame, pred_label: str):
    """Fit an OLS model on ``df_train`` and score the rows of ``df_predict``.

    Parameters
    ----------
    formula : str
        Model formula forwarded to ``fit_ols``.
    df_train : pd.DataFrame
        Rows used to estimate the model.
    df_predict : pd.DataFrame
        Rows to predict; must contain ``state`` and ``year`` plus the regressors.
    pred_label : str
        Name of the column that will hold the predictions.

    Returns
    -------
    tuple
        ``(fitted_model, forecast)`` where ``forecast`` has columns ``state``,
        ``year`` and ``pred_label`` and shares the index of ``df_predict``.
    """
    fitted = fit_ols(formula, df_train)
    predictions = fitted.predict(df_predict)

    # Identify each prediction by its (state, year) key
    forecast = df_predict.loc[:, ["state", "year"]].copy()
    forecast[pred_label] = np.asarray(predictions, dtype=float)
    return fitted, forecast

def ensure_year(df_in: pd.DataFrame, state: str, year: int):
    """Return the rows of ``df_in`` for one state and year, failing loudly if none exist.

    Parameters
    ----------
    df_in : pd.DataFrame
        Frame with ``state`` and ``year`` columns.
    state : str
        Value to match in the ``state`` column.
    year : int
        Value to match in the ``year`` column.

    Returns
    -------
    pd.DataFrame
        The matching rows, with their original index labels.

    Raises
    ------
    ValueError
        If no row matches both ``state`` and ``year``.
    """
    wanted = df_in["state"].eq(state) & df_in["year"].eq(year)
    matched = df_in.loc[wanted]
    if matched.shape[0] == 0:
        raise ValueError(f"No row for {state} {year} in data.")
    return matched


## 3) National model tests

In [None]:
# National baseline tests: fit on the US aggregate, forecast TARGET_YEAR_FORECAST
# and compare with the reported actual. Variable and column names keep their
# "2020" suffix so later cells that read them keep working.

# A) Train through 2013 (no dummy), predict the target year
train_A = df_us[(df_us["year"] >= 1988) & (df_us["year"] <= 2013)].dropna(
    subset=["yield_bu_acre","trend","jun_shortfall","temp_JA","prec_JA","prec_JA_sq"]
)
row_2020 = ensure_year(df_us, "US", TARGET_YEAR_FORECAST).dropna(
    subset=["trend","jun_shortfall","temp_JA","prec_JA","prec_JA_sq"]
)
# ensure_year only guarantees the row exists; dropna can still remove it when a
# predictor is missing, which would otherwise surface as an opaque error below.
if row_2020.empty:
    raise ValueError(f"US {TARGET_YEAR_FORECAST} row has missing predictors; cannot forecast.")

mA, predA = train_and_predict(BASELINE_FORM_NO_D, train_A, row_2020, "forecast_2020_A")
metA = metrics_from_model(mA)

print("=== National A: train ≤2013, no dummy ===")
display(pd.Series(metA))
print(mA.summary())

# Extract the scalar explicitly: float() on a one-element Series is deprecated in pandas >= 2.1
act_2020 = float(row_2020["yield_bu_acre"].iloc[0])
fcst_2020_A = float(predA["forecast_2020_A"].iloc[0])
print(f"Actual {TARGET_YEAR_FORECAST} = {act_2020:.2f}  |  Forecast A = {fcst_2020_A:.2f}  |  Error = {fcst_2020_A - act_2020:.2f}")

# B) Train through 2019 (with dummy), predict the target year
need_cols = ["yield_bu_acre","trend","jun_shortfall","temp_JA","prec_JA","prec_JA_sq","dummy_2003"]
train_B = df_us[(df_us["year"] >= 1988) & (df_us["year"] <= 2019)].dropna(subset=need_cols)
mB, predB = train_and_predict(BASELINE_FORM_WITH_D, train_B, row_2020, "forecast_2020_B")
metB = metrics_from_model(mB)

print("\n=== National B: train ≤2019, with dummy_2003 ===")
display(pd.Series(metB))
print(mB.summary())

fcst_2020_B = float(predB["forecast_2020_B"].iloc[0])
print(f"Actual {TARGET_YEAR_FORECAST} = {act_2020:.2f}  |  Forecast B = {fcst_2020_B:.2f}  |  Error = {fcst_2020_B - act_2020:.2f}")


## 4) State-level models (baseline with dummy, train ≤2019) — predict 2020

In [None]:
# Per-state baseline models (with dummy_2003): train on 1988-2019, forecast
# TARGET_YEAR_FORECAST, and collect fit metrics, forecast error and coefficients.
rows = []
coefs_rows = []
skipped_states_baseline = []  # states with no usable training rows or target-year row

# Columns that must be non-missing for training rows / for the row being predicted
need_train = ["yield_bu_acre","trend","jun_shortfall","temp_JA","prec_JA","prec_JA_sq","dummy_2003"]
need_pred = ["trend","jun_shortfall","temp_JA","prec_JA","prec_JA_sq"]

for st in STATES:
    d = df_full[df_full['state']==st].copy()
    train = d[(d["year"]>=1988)&(d["year"]<=2019)].dropna(subset=need_train)
    try:
        row20 = ensure_year(d, st, TARGET_YEAR_FORECAST).dropna(subset=need_pred)
    except ValueError:
        # No target-year row at all for this state
        skipped_states_baseline.append(st)
        continue

    if train.empty or row20.empty:
        skipped_states_baseline.append(st)
        continue

    m, pred = train_and_predict(BASELINE_FORM_WITH_D, train, row20, "forecast_2020")
    met = metrics_from_model(m)
    # Take the scalar explicitly (float() on a one-element Series is deprecated
    # in pandas >= 2.1); a missing actual stays NaN.
    y_true = float(row20["yield_bu_acre"].iloc[0])
    y_hat = float(pred["forecast_2020"].iloc[0])

    rows.append({
        "state": st,
        "n_obs": met["n_obs"],
        "r2": met["r2"],
        "r2_adj": met["r2_adj"],
        "rmse": met["rmse"],
        "y2020_actual": y_true,
        "y2020_forecast": y_hat,
        "y2020_error": y_hat - y_true
    })

    # store coefficients
    prms = m.params.to_dict()
    prms["state"] = st
    coefs_rows.append(prms)

# Fix the column set up front so the tables are well-formed even when every
# state was skipped (sorting / indexing by "state" would otherwise raise KeyError).
state_results_baseline = pd.DataFrame(
    rows,
    columns=["state","n_obs","r2","r2_adj","rmse","y2020_actual","y2020_forecast","y2020_error"],
).sort_values("state")
if coefs_rows:
    state_coefs_baseline = pd.DataFrame(coefs_rows).set_index("state").sort_index()
else:
    state_coefs_baseline = pd.DataFrame(index=pd.Index([], name="state"))

if skipped_states_baseline:
    print("Skipped (no usable training data or target-year row):", ", ".join(skipped_states_baseline))

print("=== State baseline results (≤2019 + dummy → 2020) ===")
display(state_results_baseline)
print("\n=== State baseline coefficients ===")
display(state_coefs_baseline)


## 5) National model adding `gex_JA_min`

In [None]:
# National augmented tests: same design as the baseline tests, with gex_JA_min
# added to the regressors. Variable and column names keep their "2020" suffix so
# later cells that read them keep working.

# A') ≤2013 without dummy, + gex_JA_min
train_Ap = df_us[(df_us["year"] >= 1988) & (df_us["year"] <= 2013)].dropna(
    subset=["yield_bu_acre","trend","jun_shortfall","temp_JA","prec_JA","prec_JA_sq","gex_JA_min"]
)
row_2020_p = ensure_year(df_us, "US", TARGET_YEAR_FORECAST).dropna(
    subset=["trend","jun_shortfall","temp_JA","prec_JA","prec_JA_sq","gex_JA_min"]
)
# ensure_year only guarantees the row exists; dropna can still remove it when a
# predictor (including gex_JA_min) is missing.
if row_2020_p.empty:
    raise ValueError(f"US {TARGET_YEAR_FORECAST} row has missing predictors; cannot forecast.")

mAp, predAp = train_and_predict(AUGMENTED_FORM_NO_D, train_Ap, row_2020_p, "forecast_2020_Ap")
metAp = metrics_from_model(mAp)

print("=== National A': ≤2013, + gex_JA_min, no dummy ===")
display(pd.Series(metAp))
print(mAp.summary())

# Extract the scalar explicitly: float() on a one-element Series is deprecated in pandas >= 2.1
act_2020 = float(row_2020_p["yield_bu_acre"].iloc[0])
fcst_2020_Ap = float(predAp["forecast_2020_Ap"].iloc[0])
print(f"Actual {TARGET_YEAR_FORECAST} = {act_2020:.2f}  |  Forecast A' = {fcst_2020_Ap:.2f}  |  Error = {fcst_2020_Ap - act_2020:.2f}")

# B') ≤2019 with dummy, + gex_JA_min
need_cols_aug = ["yield_bu_acre","trend","jun_shortfall","temp_JA","prec_JA","prec_JA_sq","dummy_2003","gex_JA_min"]
train_Bp = df_us[(df_us["year"] >= 1988) & (df_us["year"] <= 2019)].dropna(subset=need_cols_aug)
mBp, predBp = train_and_predict(AUGMENTED_FORM_WITH_D, train_Bp, row_2020_p, "forecast_2020_Bp")
metBp = metrics_from_model(mBp)

print("\n=== National B': ≤2019 + dummy + gex_JA_min ===")
display(pd.Series(metBp))
print(mBp.summary())

fcst_2020_Bp = float(predBp["forecast_2020_Bp"].iloc[0])
print(f"Actual {TARGET_YEAR_FORECAST} = {act_2020:.2f}  |  Forecast B' = {fcst_2020_Bp:.2f}  |  Error = {fcst_2020_Bp - act_2020:.2f}")


## 6) State-level models with `gex_JA_min` (≤2019 + dummy) — predict 2020

In [None]:
# Per-state augmented models (dummy_2003 + gex_JA_min): train on 1988-2019,
# forecast TARGET_YEAR_FORECAST, and collect fit metrics, forecast error and coefficients.
rows_aug = []
coefs_rows_aug = []
skipped_states_aug = []  # states with no usable training rows or target-year row

# Columns that must be non-missing for training rows / for the row being predicted
need = ["yield_bu_acre","trend","jun_shortfall","temp_JA","prec_JA","prec_JA_sq","dummy_2003","gex_JA_min"]
need_pred_aug = ["trend","jun_shortfall","temp_JA","prec_JA","prec_JA_sq","gex_JA_min"]

for st in STATES:
    d = df_full[df_full['state']==st].copy()
    train = d[(d["year"]>=1988)&(d["year"]<=2019)].dropna(subset=need)
    try:
        row20 = ensure_year(d, st, TARGET_YEAR_FORECAST).dropna(subset=need_pred_aug)
    except ValueError:
        # No target-year row at all for this state
        skipped_states_aug.append(st)
        continue

    if train.empty or row20.empty:
        skipped_states_aug.append(st)
        continue

    m, pred = train_and_predict(AUGMENTED_FORM_WITH_D, train, row20, "forecast_2020_aug")
    met = metrics_from_model(m)
    # Take the scalar explicitly (float() on a one-element Series is deprecated
    # in pandas >= 2.1); a missing actual stays NaN.
    y_true = float(row20["yield_bu_acre"].iloc[0])
    y_hat = float(pred["forecast_2020_aug"].iloc[0])

    rows_aug.append({
        "state": st,
        "n_obs": met["n_obs"],
        "r2": met["r2"],
        "r2_adj": met["r2_adj"],
        "rmse": met["rmse"],
        "y2020_actual": y_true,
        "y2020_forecast": y_hat,
        "y2020_error": y_hat - y_true
    })

    # store coefficients
    prms = m.params.to_dict()
    prms["state"] = st
    coefs_rows_aug.append(prms)

# Fix the column set up front so the tables are well-formed even when every
# state was skipped (sorting / indexing by "state" would otherwise raise KeyError).
state_results_aug = pd.DataFrame(
    rows_aug,
    columns=["state","n_obs","r2","r2_adj","rmse","y2020_actual","y2020_forecast","y2020_error"],
).sort_values("state")
if coefs_rows_aug:
    state_coefs_aug = pd.DataFrame(coefs_rows_aug).set_index("state").sort_index()
else:
    state_coefs_aug = pd.DataFrame(index=pd.Index([], name="state"))

if skipped_states_aug:
    print("Skipped (no usable training data or target-year row):", ", ".join(skipped_states_aug))

print("=== State augmented results (≤2019 + dummy + gex_JA_min → 2020) ===")
display(state_results_aug)
print("\n=== State augmented coefficients ===")
display(state_coefs_aug)


## 7) Optional: save outputs to disk

In [None]:
OUT_DIR = "data/processed"
os.makedirs(OUT_DIR, exist_ok=True)

# National summaries: one small CSV of fit metrics per model variant
for _tag, _metrics in (("A", metA), ("B", metB), ("Ap", metAp), ("Bp", metBp)):
    pd.Series(_metrics).to_csv(f"{OUT_DIR}/national_{_tag}_metrics.csv")

# State tables: results (without the index) and coefficients (indexed by state)
for _variant, _results, _coefs in (
    ("baseline", state_results_baseline, state_coefs_baseline),
    ("aug", state_results_aug, state_coefs_aug),
):
    _results.to_csv(f"{OUT_DIR}/state_{_variant}_results_2019_train_2020_pred.csv", index=False)
    _coefs.to_csv(f"{OUT_DIR}/state_{_variant}_coefs_2019.csv")

print("Saved outputs to", OUT_DIR)
