In [3]:
import os
import pandas as pd
import numpy as np
import requests


In [4]:
CLIMDIV_PRCP_URL = "https://www.ncei.noaa.gov/pub/data/cirs/climdiv/climdiv-pcpnst-v1.0.0-20250905"
CLIMDIV_TEMP_URL = "https://www.ncei.noaa.gov/pub/data/cirs/climdiv/climdiv-tmpcst-v1.0.0-20250905"

USDA_BASE_URL = "https://quickstats.nass.usda.gov/api/api_GET/"

DEFAULT_STATES = ["IA","IL","IN","OH","MO","MN","NE"]
USDA_API_KEY = "D1ABF2AD-362D-346E-A641-93A2FA6ED6D8"

MAPPING = {
        1:"AL",  2:"AZ",  3:"AR",  4:"CA",  5:"CO",  6:"CT",  7:"DE",  8:"FL",  9:"GA",
    10:"ID", 11:"IL", 12:"IN", 13:"IA", 14:"KS", 15:"KY", 16:"LA", 17:"ME", 18:"MD",
    19:"MA", 20:"MI", 21:"MN", 22:"MS", 23:"MO", 24:"MT", 25:"NE", 26:"NV", 27:"NH",
    28:"NJ", 29:"NM", 30:"NY", 31:"NC", 32:"ND", 33:"OH", 34:"OK", 35:"OR", 36:"PA",
    37:"RI", 38:"SC", 39:"SD", 40:"TN", 41:"TX", 42:"UT", 43:"VT", 44:"VA", 45:"WA",
    46:"WV", 47:"WI", 48:"WY", 49:"HI", 50:"AK", 110:"US"
    }

In [5]:
def ensure_dirs():
    for p in ["data/raw","data/interim","data/processed"]:
        os.makedirs(p, exist_ok=True)

ensure_dirs()

def month_name(m):
    return ["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"][m]


def add_shortfall(df):
    # df: colonnes ['state','year','prec_jun', ...]
    stats = (df.groupby('state')['prec_jun']
               .agg(mean_jun='mean', p10=lambda x: np.percentile(x, 10))
               .reset_index())
    df = df.merge(stats, on='state', how='left')
    # shortfall = (mean - actual) si actual est dans le bottom 10%, sinon 0
    df['jun_shortfall'] = np.where(
        df['prec_jun'] <= df['p10'],
        df['mean_jun'] - df['prec_jun'],
        0.0
    )
    return df.drop(columns=['mean_jun','p10'])

In [6]:
def load_weather(url):
    resp = requests.get(url)
    resp.raise_for_status()
    rows = []

    for line in resp.text.splitlines():
        line = line.strip()
        if not line:
            print("pas de ligne")
            continue

        state_code = int(line[0:3])
        division = int(line[3:4])
        year = int(line[6:10])

        if state_code > 50 and state_code != 110:
            continue
        
        parts = line[8:].split()
        if len(parts) < 12:
            print("moins de 12 cols")
            continue  
        values = [float(x) for x in parts[:12]]
        
        rows.append([state_code, division, year] + values)

    cols = ["state_code", "division", "year"] + [f"m{m:02d}" for m in range(1, 13)]
    df = pd.DataFrame(rows, columns=cols)

    

    return df

def climdiv_statewide_month(df):
    grp = df.groupby(["state_code","year"]).mean(numeric_only=True).reset_index()
    long = grp.melt(id_vars=["state_code","year"], value_vars=[f"m{m:02d}" for m in range(1,13)],
                    var_name="month", value_name="value")
    long["month"] = long["month"].str[1:].astype(int)
    print("ok climdiv srtatewide mothn")
    long["state"] = long["state_code"].map(MAPPING)
    return long



def build_weather_table(start, end, states):
    tdf = load_weather(CLIMDIV_TEMP_URL)
    tdf.to_csv("data/raw/tdf.csv", index=False)
    pdf = load_weather(CLIMDIV_PRCP_URL)
    pdf.to_csv("data/raw/pdf.csv", index=False)

    t_long = climdiv_statewide_month(tdf)
    p_long = climdiv_statewide_month(pdf)

    t_long = t_long.dropna(subset=["state"])
    p_long = p_long.dropna(subset=["state"])
    
    t_long = t_long[(t_long["year"].between(start, end)) & (t_long["state"].isin(states))]
    p_long = p_long[(p_long["year"].between(start, end)) & (p_long["state"].isin(states))]

    
    T = t_long.pivot_table(index=["state","year"], columns="month", values="value").reset_index()
    P = p_long.pivot_table(index=["state","year"], columns="month", values="value").reset_index()
    
    T.columns = ["state","year"] + [f"t_{month_name(m).lower()}" for m in range(1,13)]
    P.columns = ["state","year"] + [f"p_{month_name(m).lower()}" for m in range(1,13)]

    T.to_csv("data/interim/t_long.csv", index=False)
    P.to_csv("data/interim/p_long.csv", index=False)
    
    W = pd.merge(T, P, on=["state","year"], how="inner")
    W['temp_JA'] = W[['t_jul','t_aug']].mean(axis=1)
    W['prec_JA'] = W[['p_jul','p_aug']].mean(axis=1)
    W["prec_jun"] = W["p_jun"]
    W['prec_JA_sq'] = 0
    
    print("ok build weather")
    return W[["state","year","temp_JA","prec_JA","prec_JA_sq","prec_jun"]]



In [7]:
ACRE_TO_HA = 0.40468564224
# 1 bu soja = 60 lb = 27.2155422 kg ; 1 acre = 0.40468564224 ha
BUAC_TO_THA = 0.0272155422 / 0.40468564224  # ≈ 0.06725106937166737

def usda_quickstats(params):
    key = USDA_API_KEY
    if not key:
        raise RuntimeError("USDA API key not found")
    params = dict(params)
    params["key"] = key
    r = requests.get(USDA_BASE_URL, params=params, timeout=60)
    r.raise_for_status()
    return pd.DataFrame(r.json()["data"])

# --- Utilitaires ---
def _to_numeric(series):
    return pd.to_numeric(series.astype(str).str.replace(",", ""), errors="coerce")

# --- État: métriques soja (garde ton helper existant) ---
def get_soy_state_metric(start, end, states, metric, unit, out_col, agg="sum"):
    params = {
        "commodity_desc": "SOYBEANS",
        "statisticcat_desc": metric,   # "AREA HARVESTED" ou "YIELD"
        "unit_desc": unit,             # "ACRES" ou "BU / ACRE", etc.
        "agg_level_desc": "STATE",
        "sector_desc": "CROPS",
        "group_desc": "FIELD CROPS",
        "source_desc": "SURVEY",
        "year__GE": start,
        "year__LE": end
    }
    df = usda_quickstats(params)
    df = df[df["state_alpha"].isin(states)].copy()
    df["Value"] = _to_numeric(df["Value"])

    if agg == "mean":
        out = df.groupby(["state_alpha", "year"], as_index=False)["Value"].mean()
    else:
        out = df.groupby(["state_alpha", "year"], as_index=False)["Value"].sum()

    out.rename(columns={"state_alpha": "state", "Value": out_col}, inplace=True)
    return out

def get_soy_national_yield(start, end, unit="BU / ACRE"):
    """Rendement national (NASS, US), en BU/ACRE + conversion t/ha."""
    params = {
        "commodity_desc": "SOYBEANS",
        "statisticcat_desc": "YIELD",
        "unit_desc": unit,             # "BU / ACRE" (source officielle)
        "agg_level_desc": "NATIONAL",
        "sector_desc": "CROPS",
        "group_desc": "FIELD CROPS",
        "source_desc": "SURVEY",
        "year__GE": start,
        "year__LE": end
    }
    df = usda_quickstats(params).copy()
    # QuickStats renvoie "UNITED STATES" côté 'state_name' pour NATIONAL
    df["Value"] = _to_numeric(df["Value"])
    out = (df.groupby("year", as_index=False)["Value"]
             .mean()  # sécurité si plusieurs lignes
             .rename(columns={"Value": "yield_bu_acre"}))
    out["yield_t_ha"] = out["yield_bu_acre"] * BUAC_TO_THA
    return out[["year", "yield_bu_acre", "yield_t_ha"]]

In [8]:
def build_state_features(start, end, states):
    # 1) météo (ta fonction existante)
    W = build_weather_table(start, end, states)  # suppose temp_JA, prec_JA déjà dedans

    # 2) rendement ÉTAT (utile pour QC, pas utilisé dans l’agrégation finale)
    Y = get_soy_state_metric(start, end, states,
                             "YIELD", "BU / ACRE", "yield_bu_acre", "mean")
    Y.to_csv("data/raw/yield_state_bu_ac.csv", index=False)

    # 3) area harvested en ACRES -> conversion en HECTARES pour pondération
    A = get_soy_state_metric(start, end, states,
                             "AREA HARVESTED", "ACRES", "acres_harvested", "sum")
    A["harvest_ha"] = A["acres_harvested"] * ACRE_TO_HA
    A.to_csv("data/raw/harvest_state_acres_ha.csv", index=False)

    # 4) merge + shortfall + variables du modèle
    df = W.merge(Y, on=["state","year"], how="left").merge(A, on=["state","year"], how="left")

    # IMPORTANT: assure-toi que add_shortfall construit (mean_jun - prec_jun) si bottom 10%
    df = add_shortfall(df)

    df["trend"] = df["year"] - 1987
    df["dummy_2003"] = (df["year"] == 2003).astype(int)

    # si prec_JA_sq n’existe pas encore, laisse l’agrégateur le calculer après moyenne pondérée
    cols = [
        "state","year",
        "yield_bu_acre",           # QC seulement (pas utilisé pour Y national)
        "trend","jun_shortfall","temp_JA","prec_JA",
        "dummy_2003",
        "acres_harvested","harvest_ha"
    ]

    print("ok build state feature")
    return df[cols]



In [9]:
def aggregate_national(df_state, df_us_yield, method="weighted", weight_col="harvest_ha"):
    rows = []
    for year, grp in df_state.groupby("year"):
        if (method == "weighted") and (weight_col in grp.columns):
            w = grp[weight_col].astype(float)
        else:
            w = pd.Series(1.0, index=grp.index)

        # moyenne pondérée utilitaire
        def wavg(s): return np.average(s.astype(float), weights=w)

        p_ja = wavg(grp["prec_JA"])
        rows.append({
            "year": int(year),
            "trend": int(year - 1987),
            "jun_shortfall": wavg(grp["jun_shortfall"]),
            "temp_JA": wavg(grp["temp_JA"]),
            "prec_JA": p_ja,
            "prec_JA_sq": p_ja ** 2,             # carré de l’agrégat, pas moyenne des carrés
            "dummy_2003": int(year == 2003),
            "harvest_total_ha": float(w.sum()),
            "harvest_total_acres": float(grp["acres_harvested"].sum())
        })

    X = pd.DataFrame(rows).sort_values("year")

    # joindre le rendement NATIONAL (BU/AC + t/ha)
    X = X.merge(df_us_yield, on="year", how="left")

    # colonnes finales : météo pondérée + rendement national
    order = ["year", "yield_bu_acre", "yield_t_ha", "trend",
             "jun_shortfall", "temp_JA", "prec_JA", "prec_JA_sq",
             "dummy_2003", "harvest_total_acres", "harvest_total_ha"]
    return X[order]

# --- Exécution type ---
WAOB_STATES = ["IA","IL","IN","OH","MO","MN","NE"]  # les 7 États du modèle
data_states = build_state_features(1988, 2024, WAOB_STATES)
data_states.to_csv("data/processed/waob_features_states.csv", index=False)

us_yield = get_soy_national_yield(1988, 2024, unit="BU / ACRE")
us_yield.to_csv("data/raw/yield_national_bu_ac_and_t_ha.csv", index=False)

df_nat = aggregate_national(data_states, us_yield, method="weighted", weight_col="harvest_ha")
df_nat.to_csv("data/processed/waob_features_national.csv", index=False)
print(df_nat.tail())

ok climdiv srtatewide mothn
ok climdiv srtatewide mothn
ok build weather
ok build state feature
    year  yield_bu_acre  yield_t_ha  trend  jun_shortfall    temp_JA  \
32  2020          51.76    3.480915     33       0.000000  74.056834   
33  2021          51.00    3.429805     34       0.250905  73.363571   
34  2022          50.40    3.389454     35       0.000000  73.249047   
35  2023          50.22    3.377349     36       0.241538  71.791787   
36  2024          52.38    3.522611     37       0.000000  72.467190   

     prec_JA  prec_JA_sq  dummy_2003  harvest_total_acres  harvest_total_ha  
32  4.120770   16.980746           0          289280000.0      1.170675e+08  
33  4.073506   16.593451           0          299350000.0      1.211426e+08  
34  3.512567   12.338130           0          304870000.0      1.233765e+08  
35  3.269765   10.691366           0          290320000.0      1.174883e+08  
36  4.537819   20.591799           0          299120000.0      1.210496e+08  
