In [10]:
# fit_waob_upto_2013.py
import pandas as pd
import numpy as np
import statsmodels.api as sm
from pathlib import Path

# Fit an OLS yield model on the national feature table and export the
# coefficients, fit metrics and fitted-vs-actual series as CSV files.

# === 1) Load the national-level data
csv_path = "data/processed/waob_features_national.csv"  # adjust the path if needed
df = pd.read_csv(csv_path)

# === 2) Estimation window (both bounds inclusive)
# NOTE(review): the script name, the output file names and the earlier
# comments all say "up to 2013", but the filter actually in force kept years
# through 2019. That effective window is preserved here so the fitted results
# do not change; set END_YEAR = 2013 if the shorter sample was the intent.
START_YEAR = 1988
END_YEAR = 2019

# === 3) Feature (X) and target (y) columns
feature_cols = ["trend", "jun_shortfall", "temp_JA", "prec_JA", "prec_JA_sq", "dummy_2003"]
# Alternative specification without the 2003 dummy:
# feature_cols = ["trend", "jun_shortfall", "temp_JA", "prec_JA", "prec_JA_sq"]
target_col = "yield_bu_acre"

# Validate the schema BEFORE touching any column, and include "year": the
# filter below would otherwise fail with a bare KeyError instead of this
# explicit message.
required_cols = ["year"] + feature_cols + [target_col]
missing_cols = [c for c in required_cols if c not in df.columns]
if missing_cols:
    raise ValueError(f"Colonnes manquantes dans le CSV: {missing_cols}")

# Restrict to the estimation window.
df = df[df["year"].between(START_YEAR, END_YEAR)]

# Drop rows with a missing value in the target or in any feature.
keep = [target_col] + feature_cols
df_model = df[keep].dropna().copy()
if df_model.empty:
    # Fail with a clear message rather than an obscure linear-algebra error.
    raise ValueError(
        f"Aucune observation complète entre {START_YEAR} et {END_YEAR}"
    )

y = df_model[target_col]
X = df_model[feature_cols]

# === 4) Add the intercept and fit OLS
# has_constant="add" appends the "const" column unconditionally, even if one
# of the features already happens to be constant over the sample.
X = sm.add_constant(X, has_constant="add")
model = sm.OLS(y, X).fit()

# === 5) Results
# The label is derived from END_YEAR so it cannot drift from the filter.
print(f"\n=== Résumé du modèle (années ≤ {END_YEAR}) ===")
print(model.summary())

print("\n=== Coefficients (params) ===")
print(model.params)

# === 6) (Optional) Save the coefficients and fit metrics
# NOTE(review): the "upto_2013" suffix in the output file names is kept as-is
# so that existing consumers of these files keep working; it does not reflect
# END_YEAR.
# to_frame(name=...) names the column explicitly; building a DataFrame from a
# Series with columns=[...] only works while the Series is unnamed.
out_coef = model.params.to_frame(name="coef")
out_coef["std_err"] = model.bse
out_coef["t"] = model.tvalues
out_coef["p_value"] = model.pvalues
out_coef.to_csv("waob_ols_params_upto_2013.csv")

metrics = {
    "n_obs": int(model.nobs),
    "r2": model.rsquared,
    "r2_adj": model.rsquared_adj,
    "rmse": np.sqrt(model.mse_resid),  # residual standard error
}
pd.Series(metrics).to_csv("waob_ols_metrics_upto_2013.csv")

print("\n=== Métriques sauvegardées ===")
print(metrics)

# === 7) (Optional) Fitted vs actual, for a visual sanity check
# df_model kept the index of df, so .loc recovers the matching "year" values
# and the fitted values align on that same index.
df_out = df.loc[df_model.index, ["year", target_col]].copy()
df_out["fitted"] = model.fittedvalues
df_out.sort_values("year", inplace=True)
df_out.to_csv("waob_fitted_vs_actual_upto_2013.csv", index=False)

print("\nExtrait 'fitted vs actual' (fin) :")
print(df_out.tail(10).to_string(index=False))



=== Résumé du modèle (années ≤ 2013) ===
                            OLS Regression Results                            
Dep. Variable:          yield_bu_acre   R-squared:                       0.903
Model:                            OLS   Adj. R-squared:                  0.879
Method:                 Least Squares   F-statistic:                     38.59
Date:                Tue, 07 Oct 2025   Prob (F-statistic):           1.86e-11
Time:                        19:24:17   Log-Likelihood:                -63.700
No. Observations:                  32   AIC:                             141.4
Df Residuals:                      25   BIC:                             151.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
cons