# 03: Model Training
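Loads the engineered train/validation/test datasets from `../output/feature_engineering/`, trains a random forest regressor, logs parameters, metrics, and the model to MLflow at `../output/mlruns/`, and saves artifacts to `../output/modeling/`. Inputs are CSV-only, and the code is self-contained: its helper functions are defined inline.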

from pathlib import Path
import json
import pandas as pd
import numpy as np
import os

# Local, self-contained utilities

def ensure_directory_exists(path: str) -> None:
    """Create the directory at ``path`` unless it is already there.

    Missing parent directories are created too, and an existing directory
    is left untouched.

    Args:
        path: Location of the directory to create.
    """
    directory = Path(path)
    directory.mkdir(parents=True, exist_ok=True)


def read_csv(path: "Path | str") -> pd.DataFrame:
    """Load a CSV file into a DataFrame, failing early if it is missing.

    Args:
        path: Location of the CSV file. Plain strings are accepted and
            converted to ``Path`` before the existence check.

    Returns:
        The parsed file contents, read with pandas' default options.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    # Coerce first so callers may pass a str; a bare str has no .exists().
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File {path} not found")
    return pd.read_csv(path)


def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Summarise regression quality as RMSE, MAE and R^2.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A dict with the keys ``"rmse"``, ``"mae"`` and ``"r2"``, each mapped
        to a plain Python float.
    """
    # scikit-learn is imported lazily, inside the function.
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(y_true, y_pred)
    scores = {"rmse": float(np.sqrt(mse))}
    scores["mae"] = float(mean_absolute_error(y_true, y_pred))
    scores["r2"] = float(r2_score(y_true, y_pred))
    return scores

# Read the train/validation/test splits from ../output/feature_engineering (CSV only)
input_dir = Path("..", "output", "feature_engineering")
modeling_dir = Path("..", "output", "modeling")
train, val, test = (
    read_csv(input_dir / file_name)
    for file_name in ("train.csv", "val.csv", "test.csv")
)

# Every column other than "target" is a model input; the column list taken
# from the training split is applied to all three splits.
feature_cols = [column for column in train.columns if column != "target"]

X_train = train[feature_cols]
X_val = val[feature_cols]
X_test = test[feature_cols]

y_train = train["target"].values
y_val = val["target"].values
y_test = test["target"].values

# Fit a random forest regressor on the training split (fixed seed: random_state=42)
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,  # -1 asks scikit-learn to use all available processors
).fit(X_train, y_train)

# Predict on the validation and test splits with the fitted model
val_pred = model.predict(X_val)
test_pred = model.predict(X_test)

# Score each set of predictions against its true targets
val_metrics = regression_metrics(y_val, val_pred)
test_metrics = regression_metrics(y_test, test_pred)

# Report both metric dictionaries on stdout
print("Validation:", val_metrics)
print("Test:", test_metrics)

# MLflow logging (root-level output/mlruns folder)
import mlflow

mlruns_dir = Path("..") / "output" / "mlruns"
ensure_directory_exists(str(mlruns_dir))
# Path.as_uri() produces a well-formed, percent-encoded file:// URI on every
# platform; a hand-built "file:<path>" string is not a valid URI for Windows
# drive-letter paths or for paths containing spaces/reserved characters.
mlflow.set_tracking_uri(mlruns_dir.resolve().as_uri())
mlflow.set_experiment("regression_experiment")

# Read the hyperparameters back from the fitted estimator so the logged values
# cannot drift from the ones the model was actually built with.
params = {
    "algorithm": type(model).__name__,
    "n_estimators": model.n_estimators,
    "random_state": model.random_state,
}
# Prefix each metric with its split name, e.g. "val_rmse" and "test_rmse".
metrics = {
    **{f"val_{k}": v for k, v in val_metrics.items()},
    **{f"test_{k}": v for k, v in test_metrics.items()},
}

with mlflow.start_run():
    # Log each dictionary with one batch call instead of one call per key.
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    # Store the fitted estimator under the run's "model" artifact path.
    mlflow.sklearn.log_model(model, artifact_path="model")

# Persist run artifacts locally (root-level output/modeling folder)
ensure_directory_exists(str(modeling_dir))

# Hyperparameters and the feature column order, each as its own JSON file
(modeling_dir / "model_params.json").write_text(json.dumps(params))
(modeling_dir / "feature_names.json").write_text(json.dumps(feature_cols))

# Serialise the fitted model with joblib so the API can load it
from joblib import dump as joblib_dump

model_path = modeling_dir / "regression_model.pkl"
joblib_dump(model, model_path)

print(f"Saved model and artifacts to {modeling_dir}")