# 02_clean_features (Rebuilt)

This notebook builds a clean daily features table from master hydrology/load data.

**Outputs:** `features_daily.csv`

Steps:
1. Load master file
2. Normalize column names & types
3. Fill small gaps conservatively
4. Add calendar features
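5. Add lags & rolling stats
6. Add target helper columns (only when load data is present)
7. Trim the 30-day warm-up window and save the final features file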

### Cell 1 — Setup & configuration

In [None]:

import pandas as pd
import numpy as np
from pathlib import Path

# Input master table. load_master() reads it as Excel when the suffix is
# .xlsx/.xls and as CSV otherwise.
MASTER_PATH = "master_kaligandaki_daily_withrain.csv"   # or .xlsx
# Destination CSV for the final feature table written in the last cell.
FEATURES_OUT = "features_daily.csv"

# Study window; both endpoints are inclusive where the master data is filtered.
START = pd.Timestamp("2019-01-01")
END   = pd.Timestamp("2023-12-31")

print("MASTER_PATH:", MASTER_PATH)


### Cell 2 — Load master (CSV/XLSX) & clean

In [None]:

def load_master(path: str) -> pd.DataFrame:
    """Load the master table and return one row per calendar date.

    The file is read with ``pd.read_excel`` when its suffix is ``.xlsx`` or
    ``.xls`` and with ``pd.read_csv`` otherwise. If there is no ``date``
    column, the first matching alias (``Date``, ``DATE``, ``day``, ``Day``,
    ``dt``, ``timestamp``, ``Timestamp``) is renamed to ``date``.

    Dates are parsed with ``errors="coerce"`` and truncated to midnight; rows
    whose date cannot be parsed are dropped, only the first row (in file
    order) is kept for each date, and the result is sorted by date with a
    fresh integer index.

    Args:
        path: Location of the master CSV/Excel file.

    Returns:
        The cleaned frame with a ``date`` column, sorted ascending.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If neither ``date`` nor any accepted alias is a column.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Master not found: {p.resolve()}")

    if p.suffix.lower() in (".xlsx", ".xls"):
        df = pd.read_excel(p)
    else:
        df = pd.read_csv(p)

    if "date" not in df.columns:
        aliases = ("Date", "DATE", "day", "Day", "dt", "timestamp", "Timestamp")
        found = next((cand for cand in aliases if cand in df.columns), None)
        if found is None:
            # Fail with an actionable message instead of a bare KeyError('date')
            # from the column lookup below.
            raise KeyError(
                f"No date column in {p.name}: expected 'date' or one of "
                f"{list(aliases)}, found columns {list(df.columns)}"
            )
        df = df.rename(columns={found: "date"})

    # Unparseable entries become NaT (dropped next); normalize() zeroes the
    # time-of-day so same-day rows collapse into one date.
    df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.normalize()
    df = (
        df.dropna(subset=["date"])
        .drop_duplicates(subset=["date"])
        .sort_values("date")
        .reset_index(drop=True)
    )
    return df

# Read the master table, then report its row count and date coverage.
df = load_master(MASTER_PATH)
print(
    "Loaded rows:", len(df),
    "| range:", df["date"].min().date(),
    "→", df["date"].max().date(),
)
df.head()


### Cell 3 — Standardize columns & coerce numeric

In [None]:

# Optional mapping from source headers to the standard names used below;
# left empty when the master file already uses the standard names.
rename_map = {
    # e.g. "rain_mm": "rainfall_mm",
    # "Q_cms": "discharge_m3s",
    # "res_level_m": "reservoir_m",
    # "load": "load_MW",
}
df = df.rename(columns=rename_map)

# Make sure each core series exists (all-NaN if absent) and is numeric;
# values that cannot be parsed as numbers become NaN.
for col in ("rainfall_mm", "discharge_m3s", "reservoir_m", "load_MW"):
    if col not in df.columns:
        df[col] = np.nan
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Keep only rows inside the study window (both endpoints included).
df = df[df["date"].between(START, END)].copy()
print(df.dtypes)
df.head(3)


### Cell 4 — Continuous daily index & gap filling

In [None]:

# Build an unbroken daily calendar spanning the observed range and left-join
# the data onto it, so days absent from the master appear as all-NaN rows.
full_index = pd.DataFrame(
    {"date": pd.date_range(start=df["date"].min(), end=df["date"].max(), freq="D")}
)
df = full_index.merge(df, how="left", on="date")

# Missing rainfall is treated as a dry day.
df["rainfall_mm"] = df["rainfall_mm"].fillna(value=0)

# Interpolate the hydrology series, filling at most 3 consecutive missing
# values from each direction.
# NOTE(review): `limit` caps how many NaNs are filled, not which gaps qualify,
# so the edges of gaps longer than the limit are still filled - confirm this
# matches the intended "small gaps only" policy.
for col in ("discharge_m3s", "reservoir_m"):
    df[col] = df[col].interpolate(limit=3, limit_direction="both")

print("NaNs remaining:")
print(df.isna().sum())
df.head(3)


### Cell 5 — Calendar features

In [None]:

# Derive calendar fields from the date column in a side table keyed by date.
cal = pd.DataFrame({"date": df["date"]})
dates = cal["date"].dt
cal["doy"] = dates.dayofyear      # day of year
cal["dow"] = dates.dayofweek      # Monday=0 ... Sunday=6
cal["month"] = dates.month

# Encode day-of-year on a circle so late December sits next to early January.
for fn_name, fn in (("sin", np.sin), ("cos", np.cos)):
    cal[f"doy_{fn_name}"] = fn(2*np.pi*cal["doy"]/365.25)

# Attach the calendar fields to the main table.
df = df.merge(cal, how="left", on="date")
df.head(3)


### Cell 6 — Lags & rolling stats

In [None]:

def add_lags_rolls(base: pd.DataFrame, cols, lags=(1,2,3,7,14), rolls=(3,7,14,30)) -> pd.DataFrame:
    """Return a copy of ``base`` with lag and rolling-window feature columns.

    For every name in ``cols`` that is a column of ``base`` this adds:

    * ``{col}_lag{k}`` for each ``k`` in ``lags`` - the value ``k`` rows earlier;
    * ``{col}_rmean{w}`` and ``{col}_rstd{w}`` for each ``w`` in ``rolls`` -
      mean and standard deviation over the trailing ``w`` rows, current row
      included. A window yields NaN until it holds at least
      ``max(1, int(w * 0.6))`` non-missing values.

    Lags and windows are counted in rows, so they correspond to days only when
    ``base`` has exactly one row per consecutive day, in date order.

    Args:
        base: Frame containing a ``date`` column; it is not modified.
        cols: Column name, or iterable of column names, to derive features
            from. Names missing from ``base`` are skipped.
        lags: Row offsets for the lag columns.
        rolls: Window lengths (in rows) for the rolling statistics.

    Returns:
        A new frame with ``date`` restored as a regular column followed by the
        original and the added feature columns.
    """
    # A bare string would otherwise be iterated character by character, match
    # no column, and silently produce no features.
    if isinstance(cols, str):
        cols = [cols]

    out = base.copy().set_index("date")
    for c in cols:
        if c not in out.columns:
            continue
        series = out[c]
        for lag in lags:
            out[f"{c}_lag{lag}"] = series.shift(lag)
        for window in rolls:
            # Require roughly 60% of the window before emitting a statistic.
            roll = series.rolling(window, min_periods=max(1, int(window * 0.6)))
            out[f"{c}_rmean{window}"] = roll.mean()
            out[f"{c}_rstd{window}"] = roll.std()
    return out.reset_index()

# Series that receive lag and rolling-window features.
feature_cols = [
    "rainfall_mm",
    "discharge_m3s",
    "reservoir_m",
    "load_MW",
]
df_feat = add_lags_rolls(df, cols=feature_cols)
df_feat.head(3)


### Cell 7 — Target helpers (if load exists)

In [None]:

# Target helper columns, added only when load_MW holds at least one observation.
if "load_MW" in df_feat.columns and df_feat["load_MW"].notna().any():
    # Day-over-day change in load (current row minus previous row).
    df_feat["load_MW_diff1"] = df_feat["load_MW"].diff(1)
    # add_lags_rolls() already emits load_MW_rmean7 with the same window (7)
    # and min_periods (4) when 7 is among its rolling windows; compute it here
    # only if it is absent rather than silently overwriting that column.
    if "load_MW_rmean7" not in df_feat.columns:
        df_feat["load_MW_rmean7"] = df_feat["load_MW"].rolling(7, min_periods=4).mean()
df_feat.head(3)


### Cell 8 — Final trim & save

In [None]:

# Drop the first `warmup_days` calendar days, then write the result to disk.
# NOTE(review): 30 presumably matches the longest default rolling window - confirm.
warmup_days = 30
min_keep = df_feat["date"].min() + pd.Timedelta(days=warmup_days)
df_out = df_feat.loc[df_feat["date"].ge(min_keep)].reset_index(drop=True)

df_out.to_csv(FEATURES_OUT, index=False)
n_rows, n_cols = df_out.shape
print("Saved:", FEATURES_OUT, "| rows:", n_rows, "| cols:", n_cols)
df_out.head()
