In [4]:
# fit_waob_upto_2013.py
import pandas as pd
import numpy as np
import statsmodels.api as sm
from pathlib import Path

# === 1) Charger les données nationales
csv_path = "data/processed/waob_features_national.csv" # adapte le chemin si besoin
df = pd.read_csv(csv_path)

# === 2) Filtrer : on s'arrête en 2013
df = df[(df["year"] >= 1988) & (df["year"] <= 2019)]
df["trend"] = df["trend"] - df["trend"].mean()


# === 3) Préparer X (features) et y (target)
feature_cols = ["trend", "jun_shortfall", "temp_JA", "prec_JA", "prec_JA_sq", "dummy_2003"]
#feature_cols = ["trend", "jun_shortfall", "temp_JA", "prec_JA", "prec_JA_sq"]
target_col = "yield_bu_acre"

# Sanity check colonnes
missing_cols = [c for c in feature_cols + [target_col] if c not in df.columns]
if missing_cols:
    raise ValueError(f"Colonnes manquantes dans le CSV: {missing_cols}")

# Drop lignes incomplètes
keep = [target_col] + feature_cols
df_model = df[keep].dropna().copy()

y = df_model[target_col]
X = df_model[feature_cols]

# === 4) Ajouter l'intercept et préparer matrices ===
X = sm.add_constant(X, has_constant="add")   # colonne 'const' en 1ère position
cols = X.columns.tolist()                    # ['const','trend','jun_shortfall',...]
X_np = X.values.astype(float)
y_np = y.values.astype(float)

# --- Option A: OLS (référence) ---
model_ols = sm.OLS(y_np, X_np).fit()

# --- Option B: Moindres carrés sous contrainte (const >= 0) ---
from scipy.optimize import lsq_linear

n_params = X_np.shape[1]
lower = np.full(n_params, -np.inf)
upper = np.full(n_params,  np.inf)
lower[0] = 59.173 # contrainte: intercept >= 0

res_c = lsq_linear(X_np, y_np, bounds=(lower, upper))  # solve constrained LS
beta_c = res_c.x
fitted_c = X_np @ beta_c
resid_c = y_np - fitted_c

# métriques (compatibles avec celles de statsmodels)
n = len(y_np)
p = n_params
rss = float(np.sum(resid_c**2))
tss = float(np.sum((y_np - y_np.mean())**2))
r2_c = 1.0 - rss / tss
r2_adj_c = 1.0 - (1 - r2_c) * (n - 1) / (n - p)
rmse_c = float(np.sqrt(rss / (n - p)))

# === 5) Résultats ===
print("\n=== OLS (sans contrainte) ===")
print(model_ols.summary())

print("\n=== RÉGRESSION SOUS CONTRAINTE (const >= 0) ===")
print("Coefficients (ordre des colonnes):", cols)
print(np.round(beta_c, 6))
print(f"n_obs={n}  R2={r2_c:.4f}  R2_adj={r2_adj_c:.4f}  RMSE={rmse_c:.4f}")
print(f"Intercept contraint: {beta_c[0]:.6f}  (borne active ? {'oui' if np.isclose(beta_c[0], 50.0) else 'non'})")

# === 6) Sauvegarde coeffs et métriques (constrained) ===
out_c = pd.DataFrame({"coef": beta_c}, index=cols)
out_c.to_csv("waob_constrained_params.csv")

metrics_c = {"n_obs": n, "r2": r2_c, "r2_adj": r2_adj_c, "rmse": rmse_c}
pd.Series(metrics_c).to_csv("waob_constrained_metrics.csv")

print("\n=== Métriques sauvegardées (constrained) ===")
print(metrics_c)

# === 7) Fitted vs Actual (constrained) ===
df_out_c = df.loc[df_model.index, ["year", target_col]].copy()
df_out_c["fitted_constrained"] = fitted_c
df_out_c.sort_values("year", inplace=True)
df_out_c.to_csv("waob_fitted_vs_actual_constrained.csv", index=False)

print("\nExtrait 'fitted vs actual' (constrained, fin) :")
print(df_out_c.tail(10).to_string(index=False))


=== OLS (sans contrainte) ===
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.903
Model:                            OLS   Adj. R-squared:                  0.879
Method:                 Least Squares   F-statistic:                     38.59
Date:                Tue, 07 Oct 2025   Prob (F-statistic):           1.86e-11
Time:                        20:27:04   Log-Likelihood:                -63.700
No. Observations:                  32   AIC:                             141.4
Df Residuals:                      25   BIC:                             151.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          6.6870

In [5]:
# === 8) Prédiction 2020 avec modèle contraint ===
YEAR_TO_PRED = 2020
csv_path = "data/processed/waob_features_national.csv" # adapte le chemin si besoin
df = pd.read_csv(csv_path)
# On récupère la ligne 2020 depuis ton dataset national (avec toutes les features)
row20 = df[df["year"] == YEAR_TO_PRED]
if row20.empty:
    raise ValueError(f"Aucune ligne {YEAR_TO_PRED} dans {csv_path}")

# Recrée le vecteur X2020 dans le même ordre que ton modèle
Xcols = X.columns.tolist()  # ['const', 'trend', 'jun_shortfall', 'temp_JA', ...]
X20 = sm.add_constant(row20[[c for c in Xcols if c != "const"]], has_constant="add")

# Prédiction (produit scalaire β'x)
yhat20_constrained = float(X20.values @ res_c.x)

print("\n=== Prédiction 2020 (modèle contraint) ===")
print(f"Intercept contraint = {res_c.x[0]:.3f}")
print(f"Trend 2020 = {row20['trend'].iloc[0]:.3f}")
print(f"jun_shortfall = {row20['jun_shortfall'].iloc[0]:.3f}")
print(f"temp_JA = {row20['temp_JA'].iloc[0]:.3f}")
print(f"prec_JA = {row20['prec_JA'].iloc[0]:.3f}")
print(f"prec_JA_sq = {row20['prec_JA_sq'].iloc[0]:.3f}")
print(f"dummy_2003 = {row20['dummy_2003'].iloc[0]}")
print(f"\n➡️ Rendement prévu pour {YEAR_TO_PRED} (contrainte intercept ≥ 50) : {yhat20_constrained:.2f} bu/ac")



=== Prédiction 2020 (modèle contraint) ===
Intercept contraint = 59.173
Trend 2020 = 33.000
jun_shortfall = 0.000
temp_JA = 74.057
prec_JA = 4.121
prec_JA_sq = 16.981
dummy_2003 = 0

➡️ Rendement prévu pour 2020 (contrainte intercept ≥ 50) : 57.60 bu/ac


  yhat20_constrained = float(X20.values @ res_c.x)
