# In [None]:
# -*- coding: utf-8 -*-
import warnings, joblib, pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import lightgbm as lgb

# Suppress every warning for the remainder of the run: no category, message or
# module filter is given, so deprecation and numerical warnings are hidden too.
warnings.filterwarnings("ignore")

# --- Data Loading and Feature Selection ---
df = pd.read_csv(CSV_PATH)
# Remove non-feature columns (only those from ID_COLS that are actually present)
df = df.drop(columns=[c for c in ID_COLS if c in df.columns], errors="ignore")

y_all = df[TARGET].astype(float)
# Only numeric columns are used as features.
X_all = df.drop(columns=[TARGET]).select_dtypes(include="number")
# Turn +/-inf into NaN and drop every row that has a missing feature value.
# NaN (not pd.NA) is the placeholder so float columns are not at risk of being
# upcast to object dtype.
X_all = X_all.replace([float("inf"), -float("inf")], float("nan")).dropna()
# dropna() removed rows from X_all only; keep the target restricted to the
# surviving rows so the positional lookups below return matching X/y pairs.
y_all = y_all.loc[X_all.index]
# NOTE(review): the split files are assumed to hold row positions within this
# cleaned frame -- confirm against the code that generated them.

# --- Loading Predefined Index Splits ---
idx_tr = pd.read_csv("split_train.csv")["idx"].values
idx_va = pd.read_csv("split_val.csv")["idx"].values
idx_te = pd.read_csv("split_test.csv")["idx"].values

# Creating training, validation, and test sets based on positional indices
X_train, y_train = X_all.iloc[idx_tr], y_all.iloc[idx_tr]
X_val,   y_val   = X_all.iloc[idx_va], y_all.iloc[idx_va]
X_test,  y_test  = X_all.iloc[idx_te], y_all.iloc[idx_te]

# --- Model Configuration ---
# Fixed hyperparameters handed to lgb.LGBMRegressor below.
params = dict(
    # Ensemble size and step size
    n_estimators=1586,
    learning_rate=0.06213190804903206,
    # Tree shape and leaf constraints (max_depth=-1: no depth limit per LightGBM docs)
    num_leaves=147,
    max_depth=-1,
    min_child_samples=90,
    min_sum_hessian_in_leaf=0.276379400698885,
    # Column / row subsampling
    feature_fraction=0.833584461172753,
    bagging_fraction=0.7878386380407747,
    bagging_freq=2,
    # Regularisation
    lambda_l1=12.634781942523427,
    lambda_l2=28.826312926864468,
    min_gain_to_split=0.0008122876817837996,
    # Booster, objective and evaluation metric
    boosting_type="dart",
    objective="regression",
    metric="l2",
    # Logging, reproducibility, parallelism
    verbosity=-1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# --- Model Training ---
# Build the regressor from the hyperparameter dict and fit it on the training
# split, scoring the validation split with the l2 metric during training.
model = lgb.LGBMRegressor(**params)

# NOTE(review): params selects the "dart" booster; LightGBM reportedly disables
# early stopping in dart mode, so the first callback is likely a no-op and all
# n_estimators rounds are trained -- confirm against the LightGBM docs.
fit_callbacks = [
    lgb.early_stopping(ES, verbose=False),
    # period 0 -- presumably silences per-iteration evaluation logging
    lgb.log_evaluation(0),
]
validation_sets = [(X_val, y_val)]

model.fit(
    X_train,
    y_train,
    eval_set=validation_sets,
    eval_metric="l2",
    callbacks=fit_callbacks,
)

# --- Performance Evaluation ---
def print_metrics(set_name, y_true, y_pred):
    """Print R², MAE and MSE for one data split on a single line.

    Args:
        set_name: Label for the split (e.g. "Train"); padded to 5 characters.
        y_true: Ground-truth target values.
        y_pred: Model predictions, in the same order as ``y_true``.

    Returns:
        None. The metrics are only printed, each with six decimal places.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    # The label previously read "RÂ²": UTF-8 "R²" that had been mis-decoded.
    print(f"{set_name:5} -> R²: {r2:.6f} | MAE: {mae:.6f} | MSE: {mse:.6f}")

# Report metrics for each split in the order train, validation, test.
for split_label, features, targets in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    print_metrics(split_label, targets, model.predict(features))

# --- Model Persistence ---
# Serialise the fitted estimator with joblib into the working directory.
joblib.dump(value=model, filename="lightgbm_model.pkl")
print("Model successfully saved: lightgbm_model.pkl")