In [None]:
import pandas as pd

# Load the model-ready training table, then preview two randomly chosen rows
# (as the last expression of a notebook cell, the preview is displayed).
train_csv = "data/processed/train_ready.csv"
df = pd.read_csv(train_csv)
df.sample(n=2)


In [None]:
import json
import os
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


# ====================================================
# 1) SET TARGET, SPLIT, AND COLUMN GROUPS
# ====================================================

# The regression target is the "pret_log" column; all remaining columns are
# used as features.
y_log = df["pret_log"]
X = df.drop(columns=["pret_log"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical across reruns.
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X,
    y_log,
    random_state=42,
    test_size=0.2,
)

# Object-dtype columns are routed to the categorical encoder; every other
# dtype is treated as numeric.
object_frame = X.select_dtypes(include=["object"])
non_object_frame = X.select_dtypes(exclude=["object"])
categorical_cols = object_frame.columns.tolist()
numeric_cols = non_object_frame.columns.tolist()


# ====================================================
# 2) PREPROCESSING PIPELINE
# ====================================================

# Dense one-hot encoding for the categorical columns; categories that were not
# seen during fit are ignored at transform time instead of raising.
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Numeric columns are forwarded to the model unchanged.
column_steps = [
    ("cat", one_hot_encoder, categorical_cols),
    ("num", "passthrough", numeric_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)


# ====================================================
# 3) HISTGRADIENTBOOSTING REGRESSOR (LIGHTWEIGHT)
# ====================================================

# Fixed, hand-picked hyperparameters (no search is performed here).
hgb = HistGradientBoostingRegressor(
    learning_rate=0.05,  # small step size for a smoother fit
    max_leaf_nodes=64,   # upper bound on leaves per tree
    max_depth=10,        # upper bound on tree depth
    random_state=42,
)

# Encoding and model are chained so both are fitted (and later saved) together.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("hgb", hgb),
    ]
)

print("Fitting HistGradientBoostingRegressor...")
pipeline.fit(X_train, y_train_log)


# ====================================================
# 4) METRICS
# ====================================================

# The model predicts in the log target space; expm1 maps both predictions and
# ground truth back to the price scale.
# NOTE(review): presumably pret_log = log1p(price) - confirm against the
# preprocessing step that produced train_ready.csv.
y_pred_log = pipeline.predict(X_test)
y_true = np.expm1(y_test_log)
y_pred = np.expm1(y_pred_log)

# Scores in log space.
mse_log = mean_squared_error(y_test_log, y_pred_log)
rmse_log = np.sqrt(mse_log)
r2_log = r2_score(y_test_log, y_pred_log)

# Scores on the back-transformed scale.
mse_price = mean_squared_error(y_true, y_pred)
rmse_price = np.sqrt(mse_price)
mae_price = mean_absolute_error(y_true, y_pred)
r2_price = r2_score(y_true, y_pred)
mape = 100 * mean_absolute_percentage_error(y_true, y_pred)
# "Accuracy" here is simply the complement of MAPE, in percent.
accuracy = 100 - mape

report_lines = [
    "\n=== PRICE Metrics (EUR) ===",
    f"RMSE_price: {rmse_price:,.2f} EUR",
    f"MAE_price:  {mae_price:,.2f} EUR",
    f"R2_price:   {r2_price:.4f}",
    f"  MAPE    : {mape:.2f}%",
    f"  Accuracy: {accuracy:.2f}%",
]
print("\n".join(report_lines))

# ====================================================
# 5) 20 RANDOM SAMPLE PREDICTIONS
# ====================================================
# Side-by-side table of true vs. predicted prices with absolute and relative
# error per row.
results = pd.DataFrame({"true_price": y_true, "pred_price": y_pred})

abs_error = (results["true_price"] - results["pred_price"]).abs()
results["abs_error"] = abs_error
results["pct_error"] = abs_error / results["true_price"] * 100

# Whole units for the price columns, two decimals for the percentage error.
display_decimals = {
    "true_price": 0,
    "pred_price": 0,
    "abs_error": 0,
    "pct_error": 2,
}

print("\n=== 20 random predictions (EUR) ===")
sampled_rows = results.sample(n=20, random_state=42)
print(sampled_rows.round(display_decimals))

# ====================================================
# 6) FEATURE IMPORTANCES (PERMUTATION-BASED)
# ====================================================
# HistGradientBoostingRegressor does not expose `feature_importances_`, and
# `hgb` is the bare estimator (it has no `named_steps`), so importances are
# estimated by permutation on the held-out split instead: each raw input
# column is shuffled in turn and the drop in the pipeline's default score
# (R^2 on the log target) is recorded. Scoring the whole pipeline yields one
# row per original column rather than one per one-hot-encoded column.
perm_result = permutation_importance(
    pipeline,
    X_test,
    y_test_log,
    n_repeats=5,      # average over several shuffles to reduce noise
    random_state=42,  # reproducible shuffles
)
feature_names = X_test.columns.to_numpy()

importances = pd.DataFrame({
    "feature": feature_names,
    "importance": perm_result.importances_mean,
    "importance_std": perm_result.importances_std,
}).sort_values("importance", ascending=False)

print("\nTop 20 important features:")
print(importances.head(20))


# ====================================================
# 7) SAVE MODEL
# ====================================================
# The fitted pipeline (preprocessing + regressor) is serialized as a single
# artifact; any file already at model_path is replaced.
model_path = "backend/models_storage/hist_gradient_boosting.pkl"

# Creating the metadata sub-directory also creates the storage directory.
os.makedirs("backend/models_storage/metadata", exist_ok=True)
joblib.dump(pipeline, model_path)

bytes_per_mb = 1024 * 1024
model_size_mb = os.path.getsize(model_path) / bytes_per_mb

print(f"\nModel saved to {model_path} ({model_size_mb:.2f} MB)")


# ====================================================
# 8) SAVE METADATA
# ====================================================
# Writes a JSON sidecar describing the saved model: hyperparameters,
# evaluation metrics, split sizes, and the feature column lists.

# Read the hyperparameters back from the estimator itself so the metadata
# cannot drift from the model that was actually trained.
hgb_params = hgb.get_params()
tracked_params = ("max_depth", "learning_rate", "max_leaf_nodes", "random_state")

metadata = {
    # Local wall-clock time without a timezone offset.
    "saved_at": datetime.now().isoformat(),
    "model_type": "HistGradientBoostingRegressor",
    "tuning_method": "manual_fixed_params",
    "best_parameters": {name: hgb_params[name] for name in tracked_params},
    "performance_metrics": {
        "rmse_log": float(rmse_log),
        "r2_log": float(r2_log),
        "rmse_price_eur": float(rmse_price),
        "mae_price_eur": float(mae_price),
        # R^2 on the price scale; printed with the metrics and stored here so
        # the persisted metrics match the report.
        "r2_price": float(r2_price),
        "mape_percent": float(mape),
        "accuracy_percent": float(accuracy),
    },
    "training_info": {
        "train_samples": int(len(X_train)),
        "test_samples": int(len(X_test)),
        "total_features_raw": int(X.shape[1]),
        "categorical_features": categorical_cols,
        "numeric_features": numeric_cols,
    },
    "paths": {
        "model_path": model_path,
    },
}

metadata_path = "backend/models_storage/metadata/hist_gradient_boosting_metadata.json"
# Ensure the target directory exists even if the model-saving step was skipped.
os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

print(f"Metadata saved to {metadata_path}")
