# Build `datas.csv` — States + US (weather, crop conditions, yield)

This notebook:
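1. Loads **weather + yield** features with `load_weather_features` (currently imported from `fetchweather`; from `src.fetchweather` once that module is moved under `src/`).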
2. Builds **annual crop-condition** features from `src.soy_conditions`.
3. Merges both on `(state, year)`.
4. Adds a **US** weighted row per year (weights = `harvest_ha`).
5. Writes a single file: `data/processed/datas.csv` (states + US).


In [None]:
import os, sys
import numpy as np
import pandas as pd

# Make project root importable
sys.path.append(os.path.abspath("."))

# Local modules
from src import SEVEN_STATES, get_soy_condition_features
# After moving the generated python file to src/, import like:
# from src.fetchweather import load_weather_features
from fetchweather import load_weather_features

# Year range handed to both feature loaders used in this notebook.
# NOTE(review): whether YEAR_TO is inclusive is decided by the loaders — confirm there.
YEAR_FROM = 1987
YEAR_TO   = 2024
# States to process, taken from the project-level constant imported from `src`.
STATES    = SEVEN_STATES  # ("IA","IL","IN","OH","MO","MN","NE")
# Destination of the final table (state rows plus the US rows).
OUT_CSV   = "data/processed/datas.csv"


## 1) Load weather + yield features

In [None]:
# Load the weather + yield feature table for the configured states and years.
# NOTE(review): load_weather_features lives outside this notebook; the columns it
# returns (the later cells need at least `state`, `year` and `harvest_ha`) should
# be confirmed against that module.
weather = load_weather_features(
    states_file="data/processed/waob_features_states.csv",
    states=STATES, year_from=YEAR_FROM, year_to=YEAR_TO
)
print("Weather+Yield shape:", weather.shape)
# Last expression of the cell: shows the first rows in the notebook output.
weather.head(3)

## 2) Build annual crop-condition features

In [None]:
# The helper returns a pair; the first element (the weekly features) is
# discarded and only the second (the annual crop-condition features) is kept.
_, cond_annual = get_soy_condition_features(YEAR_FROM, YEAR_TO, STATES)

def normalize_keys(df):
    """Return a copy of *df* with its merge keys in canonical form.

    ``state`` is converted to a stripped, upper-cased string and ``year`` to
    a nullable ``Int64`` column in which unparseable entries become ``<NA>``.
    The input frame itself is not modified.
    """
    state_key = df["state"].astype(str).str.strip().str.upper()
    year_key = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
    # assign() works on a copy and keeps both columns at their original position.
    return df.assign(state=state_key, year=year_key)

# Canonicalise the join keys first, then keep only the first row seen for
# each (state, year) pair so the later merge cannot fan out.
cond_annual = normalize_keys(cond_annual)
cond_annual = cond_annual.drop_duplicates(subset=["state", "year"], keep="first").copy()
print("Conditions shape:", cond_annual.shape)
cond_annual.head(3)

## 3) Merge on (state, year)

In [None]:
# Left-join the annual condition features onto the weather/yield rows. The
# merge indicator is printed purely as a diagnostic (how many rows found a
# match) and is dropped again straight away.
key_cols = ["state", "year"]
df = weather.merge(cond_annual, on=key_cols, how="left", indicator=True)
print(df["_merge"].value_counts())
df = df.drop(columns="_merge")

# Safety: collapse duplicate (state, year) rows — numeric columns are averaged,
# every other column keeps its first value. The key columns are deliberately
# left out of the aggregation spec: `year` is numeric, and averaging a grouping
# key is meaningless and would turn it into a float.
num_cols = [c for c in df.select_dtypes(include=[np.number]).columns
            if c not in key_cols]
other_cols = [c for c in df.columns if c not in num_cols and c not in key_cols]
agg_spec = {**{c: "mean" for c in num_cols},
            **{c: "first" for c in other_cols}}
df = df.groupby(key_cols, as_index=False).agg(agg_spec)
print("Merged df shape:", df.shape)
df.head(3)

## 4) Add US weighted row (weights = `harvest_ha`)

In [None]:
def add_us_weighted_row(df_in: pd.DataFrame) -> pd.DataFrame:
    """Append one national ("US") row per year to a state-level table.

    For every distinct ``year`` the numeric columns are averaged over the
    rows of that year, weighted by ``harvest_ha`` (a missing weight counts
    as zero).  Rows whose value is missing for a given column are skipped
    for that column and the remaining weights are renormalised, so one
    incomplete state does not blank out the national figure; the result is
    NaN only when the usable weights do not sum to a positive number.

    ``harvest_ha`` and ``acres_harvested`` are summed rather than averaged;
    ``acres_harvested`` is reported as 0.0 when the column is absent.
    Columns that are neither numeric nor ``state`` stay empty on the US rows.

    Args:
        df_in: Table with at least ``year`` and ``harvest_ha`` columns.
            It is not modified.

    Returns:
        A new frame holding the rows of ``df_in`` followed by the US rows
        (``state == "US"``), with a fresh integer index.

    Raises:
        ValueError: If ``df_in`` has no ``harvest_ha`` column.
    """
    if "harvest_ha" not in df_in.columns:
        raise ValueError("harvest_ha is required to compute US weights.")

    # `year` is the group key and the two area columns are totals, so none of
    # them belongs in the weighted average.
    not_averaged = ("year", "harvest_ha", "acres_harvested")
    avg_cols = [c for c in df_in.select_dtypes(include=[np.number]).columns
                if c not in not_averaged]
    has_acres = "acres_harvested" in df_in.columns

    def weighted_mean(values: pd.Series, weights: np.ndarray) -> float:
        """Weighted mean of *values* that ignores missing entries."""
        vals = values.astype(float).to_numpy()
        usable = ~np.isnan(vals)
        if weights[usable].sum() <= 0:
            return np.nan
        return float(np.average(vals[usable], weights=weights[usable]))

    # Explicit iteration instead of groupby.apply: it avoids pandas' deprecated
    # handling of grouping columns inside apply and keeps the row layout obvious.
    rows = []
    for year, grp in df_in.groupby("year"):
        w = grp["harvest_ha"].fillna(0.0).astype(float).to_numpy()
        row = {"year": year}
        for col in avg_cols:
            row[col] = weighted_mean(grp[col], w)
        row["harvest_ha"] = float(w.sum())
        row["acres_harvested"] = (
            float(grp["acres_harvested"].fillna(0).sum()) if has_acres else 0.0
        )
        row["state"] = "US"
        rows.append(row)

    if not rows:
        # Nothing to aggregate (no rows, or no usable year values).
        return df_in.reset_index(drop=True)

    us = pd.DataFrame(rows)
    # Keep the key dtype of the input so the combined column is not upcast.
    us["year"] = us["year"].astype(df_in["year"].dtype)
    return pd.concat([df_in, us], ignore_index=True, sort=False)

# Append the per-year US rows to the merged state table.
df_out = add_us_weighted_row(df)
print("Final rows (states + US):", df_out.shape)
# The US rows are concatenated after the state rows, so tail() shows them.
df_out.tail(3)

## 5) Write `data/processed/datas.csv`

In [None]:
import os

# Derive the directory from OUT_CSV itself so the folder that gets created
# always matches the file that gets written, even if OUT_CSV is changed.
out_dir = os.path.dirname(OUT_CSV)
if out_dir:  # a bare filename has no directory component to create
    os.makedirs(out_dir, exist_ok=True)
df_out.to_csv(OUT_CSV, index=False)
print("✅ Written:", OUT_CSV)
