# In [None]:
# -*- coding: utf-8 -*-
import warnings, os
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error
from catboost import CatBoostRegressor, Pool

# Silence every warning for the remainder of the run.
warnings.filterwarnings("ignore")

# ======== Configuration ========
CSV_PATH = "9_makale_data_guncel.csv"        # input table read below
TARGET = "band_gap"                          # column used as regression target
ID_COLS = ["material_id", "formula_pretty"]  # dropped from the table when present
RANDOM_STATE = 42                            # seed handed to the model
MODEL_PATH = "catboost.cbm"   # Required name for stacking later

# --- Data Loading and Column Sanitization ---
# errors="ignore" makes drop() skip any identifier column the CSV lacks,
# so no explicit membership filter is needed.
df = pd.read_csv(CSV_PATH).drop(columns=ID_COLS, errors="ignore")

# Target as float; features are the remaining numeric columns only.
y_all = df[TARGET].astype(float)
X_all = (
    df.drop(columns=[TARGET])
    .select_dtypes(include="number")
    .copy()
)
# Sanitize column names: replace spaces with underscores for CatBoost compatibility
X_all.columns = [col.replace(" ", "_") for col in X_all.columns]

# --- Loading Predefined Index Splits ---
def _read_split_indices(csv_name):
    """Return the ``idx`` column of the split file *csv_name* as an array."""
    return pd.read_csv(csv_name)["idx"].values


def _select_rows(positions):
    """Return the (features, target) pair at the given positional indices."""
    return X_all.iloc[positions], y_all.iloc[positions]


train_idx = _read_split_indices("split_train.csv")
val_idx = _read_split_indices("split_val.csv")
test_idx = _read_split_indices("split_test.csv")

# The stored indices are applied positionally (``iloc``) to features and target alike.
X_train, y_train = _select_rows(train_idx)
X_val, y_val = _select_rows(val_idx)
X_test, y_test = _select_rows(test_idx)

# Create CatBoost Pools for optimized data handling
train_pool = Pool(X_train, y_train)
val_pool = Pool(X_val, y_val)
test_pool = Pool(X_test, y_test)

# --- Model Parameters (Optimized) ---
# Numeric values are reproduced at full precision; key order is unchanged.
params = dict(
    bootstrap_type="Bernoulli",
    iterations=1976,
    learning_rate=0.04603191747136697,
    depth=8,
    l2_leaf_reg=12.609828711930545,
    random_strength=1.4388521607368419,
    rsm=0.6126181450251172,
    leaf_estimation_iterations=4,
    border_count=131,
    subsample=0.6100547067944161,
    # Objective being minimised vs. metric reported on the eval set.
    loss_function="RMSE",
    eval_metric="R2",
    # Runtime / reproducibility settings.
    random_seed=RANDOM_STATE,
    thread_count=-1,
    task_type="CPU",
    allow_writing_files=False,
    verbose=False,
)

# --- Training ---
# The validation pool is supplied as eval_set with use_best_model=True.
model = CatBoostRegressor(**params)
model.fit(
    train_pool,
    eval_set=val_pool,
    use_best_model=True,
    verbose=False,
)

# --- Performance Metrics ---
def print_metrics(name, y_true, y_pred):
    """Print the R² score and mean absolute error for one data split.

    Args:
        name: Label for the split; right-aligned in a 6-character field.
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # The superscript two is written as an escape: the previous literal had
    # been mis-decoded into the mojibake "RÂ²", and an ASCII-only source line
    # cannot be corrupted that way again.
    print(f"{name:>6} -> R\u00b2={r2:.6f} | MAE={mae:.6f} eV")

# Report metrics for each split in train -> val -> test order.
for _label, _truth, _pool in (
    ("Train", y_train, train_pool),
    ("Val", y_val, val_pool),
    ("Test", y_test, test_pool),
):
    print_metrics(_label, _truth, model.predict(_pool))

# --- Model Persistence ---
# Write the fitted model to MODEL_PATH, then confirm on stdout.
model.save_model(MODEL_PATH)
print(f"Model successfully saved: {MODEL_PATH}")