# 02: Feature Engineering
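Loads `../output/data/training_data.csv`, splits it into train/validation/test sets (roughly 70/10/20), standardizes the features with a scaler fitted on the training split only, and saves the three scaled CSVs plus the list of feature names to `../output/feature_engineering/`.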


from pathlib import Path
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Locations are relative to the working directory: the dataset is read from
# the root-level output/data folder and results go to output/feature_engineering.
input_dir = Path("..", "output", "data")
output_dir = Path("..", "output", "feature_engineering")
csv_in = input_dir.joinpath("training_data.csv")

# Stop early, pointing at the upstream step, when the dataset is absent.
if not csv_in.exists():
    raise FileNotFoundError(f"Could not find {csv_in}. Run 01_data_generation first.")

df = pd.read_csv(csv_in)

# Everything except the "target" column is a model input; "target" is the label.
feature_cols = [name for name in df.columns if name != "target"]
X = df.loc[:, feature_cols]
y = df.loc[:, "target"]

# Split fractions, expressed relative to the full dataset. Naming them keeps
# the two train_test_split calls below consistent with each other: previously
# the test fraction was written out twice and had to be edited in both places.
TEST_FRACTION = 0.2
VAL_FRACTION = 0.1
RANDOM_STATE = 42  # fixed seed so the split is reproducible

# First carve off the test set, then take the validation set from the rest.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=RANDOM_STATE
)
# The second split only sees (1 - TEST_FRACTION) of the rows, so rescale the
# validation fraction to keep it at VAL_FRACTION of the full dataset.
val_ratio = VAL_FRACTION / (1 - TEST_FRACTION)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_ratio, random_state=RANDOM_STATE
)

# Standardize features. The scaler's statistics come from the training split
# only and are then applied unchanged to all three splits.
# NOTE(review): the fitted scaler is not written to disk in the code visible
# here -- confirm that later steps do not need it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s, X_val_s, X_test_s = (
    pd.DataFrame(scaler.transform(part), columns=feature_cols, index=part.index)
    for part in (X_train, X_val, X_test)
)

# Make sure the root-level output/feature_engineering folder exists before
# anything is written into it.
output_dir.mkdir(exist_ok=True, parents=True)

def _save_df(df_out: pd.DataFrame, base: str) -> str:
    """Write ``df_out`` to ``<base>.csv`` inside ``output_dir``.

    The frame's index is not written. Returns the path of the written
    file as a string.
    """
    destination = output_dir / (base + ".csv")
    df_out.to_csv(destination, index=False)
    return str(destination)

# Re-attach each target column to its scaled features and write one CSV per split.
p_train, p_val, p_test = (
    _save_df(pd.concat([features, labels], axis=1), split_name)
    for features, labels, split_name in (
        (X_train_s, y_train, "train"),
        (X_val_s, y_val, "val"),
        (X_test_s, y_test, "test"),
    )
)

# Record the feature column order alongside the CSVs.
with open(output_dir / "feature_names.json", "w") as f:
    json.dump(feature_cols, f)

# Report where everything was written.
print("Saved artifacts:")
for saved_path in (p_train, p_val, p_test, output_dir / "feature_names.json"):
    print(" -", saved_path)