In [14]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [15]:
TARGET_STATE = "IA"

csv_path = "data/processed/waob_features_states.csv" # adapte le chemin si besoin
df = pd.read_csv(csv_path)

df = df[(df["year"] >= 1988) & (df["year"] <= 2012)]
df = df[df["state"] == TARGET_STATE]


feature_cols = ["trend", "jun_shortfall", "temp_JA", "prec_JA", "prec_JA_sq"]
target_col = "yield_bu_acre"

keep = [target_col] + feature_cols
df_model = df[keep].dropna().copy()

y = df_model[target_col]
X = df_model[feature_cols]

X = sm.add_constant(X, has_constant="add")  # force la constante
model = sm.OLS(y, X).fit()

print("\n=== Summary model (year ≤ 2012) ===")
print(model.summary())

metrics = {
    "n_obs": int(model.nobs),
    "r2": model.rsquared,
    "r2_adj": model.rsquared_adj,
    "rmse": np.sqrt(model.mse_resid),
}
#pd.Series(metrics).to_csv("waob_ols_metrics_upto_2013.csv")


=== Summary model (year ≤ 2012) ===
                            OLS Regression Results                            
Dep. Variable:          yield_bu_acre   R-squared:                       0.749
Model:                            OLS   Adj. R-squared:                  0.684
Method:                 Least Squares   F-statistic:                     11.37
Date:                Wed, 08 Oct 2025   Prob (F-statistic):           3.51e-05
Time:                        00:09:40   Log-Likelihood:                -61.255
No. Observations:                  25   AIC:                             134.5
Df Residuals:                      19   BIC:                             141.8
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const    

In [16]:

df = pd.read_csv(csv_path)


df = df[(df["year"] >= 1988) & (df["year"] <= 2019)]
df = df[df["state"] == TARGET_STATE]
print(df.tail())

feature_cols = ["trend", "jun_shortfall", "temp_JA", "prec_JA", "prec_JA_sq", "dummy_2003"]
target_col = "yield_bu_acre"

keep = [target_col] + feature_cols
df_model = df[keep].dropna().copy()

y = df_model[target_col]
X = df_model[feature_cols]

X = sm.add_constant(X, has_constant="add")  # force la constante
model = sm.OLS(y, X).fit()

print("\n=== Summary model (year ≤ 2019) ===")
print(model.summary())

metrics = {
    "n_obs": int(model.nobs),
    "r2": model.rsquared,
    "r2_adj": model.rsquared_adj,
    "rmse": np.sqrt(model.mse_resid),
}
#pd.Series(metrics).to_csv("waob_ols_metrics_upto_2013.csv")

   state  year  yield_bu_acre  trend  jun_shortfall  temp_JA  prec_JA  \
27    IA  2015           54.1     28            0.0    70.45    5.390   
28    IA  2016           58.4     29            0.0    72.40    6.030   
29    IA  2017           56.4     30            0.0    71.25    3.385   
30    IA  2018           58.8     31            0.0    72.45    4.905   
31    IA  2019           54.0     32            0.0    72.35    3.455   

    prec_JA_sq  dummy_2003  acres_harvested    harvest_ha  
27   29.052100           0         59200000  2.395739e+07  
28   36.360900           0         57390000  2.322491e+07  
29   11.458225           0         59690000  2.415569e+07  
30   24.059025           0         59230000  2.396953e+07  
31   11.937025           0         54670000  2.212416e+07  

=== Summary model (year ≤ 2019) ===
                            OLS Regression Results                            
Dep. Variable:          yield_bu_acre   R-squared:                       0.867
Model:

In [17]:
df_out = df.loc[df_model.index, ["year", target_col]].copy()
df_out["fitted"] = model.fittedvalues
df_out.sort_values("year", inplace=True)
#df_out.to_csv("waob_fitted_vs_actual_upto_2013.csv", index=False)

print("\nfitted vs actual :")
print(df_out.tail(10).to_string(index=False))


fitted vs actual :
 year  yield_bu_acre    fitted
 2010         51.600 51.583620
 2011         51.100 50.075541
 2012         42.800 43.594783
 2013         44.875 45.873469
 2014         51.000 54.873147
 2015         54.100 55.577902
 2016         58.400 55.406166
 2017         56.400 54.235104
 2018         58.800 56.696411
 2019         54.000 55.235423


In [18]:

YEAR_TO_PRED = 2020
Xcols = ["trend", "jun_shortfall", "temp_JA", "prec_JA", "prec_JA_sq", "dummy_2003"]

df = pd.read_csv(csv_path)
df = df[df["state"] == TARGET_STATE]

row20 = df[df["year"] == YEAR_TO_PRED]

if row20.empty:
    raise ValueError(f"Aucune ligne {YEAR_TO_PRED} dans {csv_path}.")
X20 = sm.add_constant(row20[Xcols], has_constant="add")
yhat20_ols = float(model.predict(X20))
print(f"\nForecast Y={YEAR_TO_PRED} with find coefs (2020 weather) : {yhat20_ols:.2f} bu/ac\n vs {row20["yield_bu_acre"]:.2f}")





SyntaxError: f-string: unmatched '[' (3754679799.py, line 13)