In [None]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import shap
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor


In [None]:
# ---------------------------------------------------------------
# Load Training, Validation and Test Data
# ---------------------------------------------------------------

# NOTE(security): unpickling can execute arbitrary code. Only load pickle
# files that were produced by a trusted pipeline.
train_df = pd.read_pickle("Datasets/train_df.pkl")
val_df = pd.read_pickle("Datasets/val_df.pkl")
test_df = pd.read_pickle("Datasets/test_df.pkl")


def _log_price(df, split_name, price_col='Price Sold USD'):
    """Return the natural log of ``df[price_col]``.

    Args:
        df: DataFrame holding the price column.
        split_name: Label of the split ("train", "val", ...), used only in
            the error message.
        price_col: Name of the price column.

    Returns:
        ``np.log`` applied to the price column.

    Raises:
        ValueError: If any price is missing, zero or negative. ``np.log``
            would otherwise turn those rows into NaN / -inf targets.
    """
    prices = df[price_col]
    # Missing values never satisfy `> 0`, so they are counted as invalid too.
    n_invalid = len(prices) - int((prices > 0).sum())
    if n_invalid:
        raise ValueError(
            f"{n_invalid} row(s) in the {split_name} split have a missing or "
            f"non-positive '{price_col}'; cannot take the log."
        )
    return np.log(prices)


# Define target variable (log price)
y_train_log = _log_price(train_df, "train")
y_val_log = _log_price(val_df, "val")
y_test_log = _log_price(test_df, "test")

In [None]:
# ---------------------------------------------------------------
# CatBoost: Feature Lists
# ---------------------------------------------------------------

# Columns handed to CatBoost as `cat_features` when fitting.
cat_features = [
    'Artist Name', 'Paint Imputed', 'Material Imputed', 'Auction House',
    'Country', 'Birth Period', 'Alive Status',
]

# Columns that are not declared as categorical.
num_features = [
    'Log Area', 'Artist Sale Count', 'CPI_US', 'Artist Cumulative Price',
]

# Full model input: categorical columns first, then the numeric ones.
cb_features = [*cat_features, *num_features]


In [None]:
# ---------------------------------------------------------------
# CatBoost: Define Objective Function for Optuna Tuning
# ---------------------------------------------------------------

def objective(trial):
    """Optuna objective: validation MAE (log price) of one CatBoost candidate.

    Draws a hyperparameter set from ``trial``, fits a ``CatBoostRegressor``
    on the training split with the validation split as ``eval_set``
    (early stopping after 50 rounds, best model kept), and scores the fitted
    model on that same validation split.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        Mean absolute error between ``y_val_log`` and the model's
        predictions for the validation features.
    """
    # Tuned values. The suggestion order is kept unchanged so the sampler
    # sees the same sequence of calls.
    tuned = {
        'iterations': trial.suggest_int('iterations', 300, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 6, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 32, 254),
        'rsm': trial.suggest_float('rsm', 0.5, 1.0),
    }
    # Settings shared by every trial.
    fixed = {'task_type': 'CPU', 'verbose': 0, 'random_state': 42}

    X_fit = train_df[cb_features]
    X_eval = val_df[cb_features]

    candidate = CatBoostRegressor(**tuned, **fixed)
    candidate.fit(
        X_fit,
        y_train_log,
        eval_set=(X_eval, y_val_log),
        cat_features=cat_features,
        early_stopping_rounds=50,
        use_best_model=True,
    )

    return mean_absolute_error(y_val_log, candidate.predict(X_eval))

# Run Optuna Optimization
# The sampler is seeded explicitly: the models inside `objective` use a fixed
# random_state, but an unseeded sampler proposes different hyperparameters on
# every run, so the reported best trial would not be repeatable.
# TPESampler is used so the search algorithm itself stays the single-objective
# default; only the seed is pinned. Trials run sequentially (no n_jobs).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Best MAE (log):", study.best_value)
print("Best hyperparameters:", study.best_params)

In [None]:
# ---------------------------------------------------------------
# CatBoost: Train Final Model with Best Parameters
# ---------------------------------------------------------------

# Tuned values from the study plus the fixed settings; `verbose` is 100 here
# instead of the 0 used while tuning.
best_params = {
    **study.best_params,
    'task_type': 'CPU',
    'verbose': 100,
    'random_state': 42,
}

# Same fit setup as in the tuning objective: validation split as eval_set,
# early stopping after 50 rounds, best model kept.
model_cb_best = CatBoostRegressor(**best_params)
model_cb_best.fit(
    train_df[cb_features],
    y_train_log,
    eval_set=(val_df[cb_features], y_val_log),
    cat_features=cat_features,
    early_stopping_rounds=50,
    use_best_model=True,
)


In [None]:
# ---------------------------------------------------------------
# CatBoost: Validation-Set Metrics for the Optimized Model
# ---------------------------------------------------------------

# The model is fit on log prices, so predictions are on the log scale;
# exponentiate them to get USD.
val_preds_log = model_cb_best.predict(val_df[cb_features])
val_preds = np.exp(val_preds_log)

# MAE on both scales.
mae_log = mean_absolute_error(y_true=y_val_log, y_pred=val_preds_log)
mae_usd = mean_absolute_error(y_true=val_df['Price Sold USD'], y_pred=val_preds)

print(f"Validation MAE (USD): {mae_usd:,.2f}")
print(f"Validation MAE (Log): {mae_log:.4f}")

In [None]:
# ---------------------------------------------------------------
# CatBoost: Test-Set Metrics
# ---------------------------------------------------------------

# Log-scale predictions for the test split, then mapped to USD.
test_preds_log = model_cb_best.predict(test_df[cb_features])
test_preds_usd = np.exp(test_preds_log)

# MAE on the log scale and on the USD scale.
mae_test_log = mean_absolute_error(y_true=y_test_log, y_pred=test_preds_log)
mae_test_usd = mean_absolute_error(
    y_true=test_df['Price Sold USD'],
    y_pred=test_preds_usd,
)

print(f"\nTest MAE (Log): {mae_test_log:.4f}")
print(f"Test MAE (USD): ${mae_test_usd:,.2f}")

In [None]:
# ---------------------------------------------------------------
# CatBoost: Validation Residual and Actual vs. Predicted Plots
# ---------------------------------------------------------------

# Actual vs Predicted (log scale), with a dashed y = x reference line
# spanning the range of the actual values.
log_lo, log_hi = y_val_log.min(), y_val_log.max()

fig_fit, ax_fit = plt.subplots(figsize=(8, 6))
ax_fit.scatter(y_val_log, val_preds_log, alpha=0.3, edgecolor='k')
ax_fit.plot([log_lo, log_hi], [log_lo, log_hi], 'r--')
ax_fit.set_xlabel('Actual Log Price')
ax_fit.set_ylabel('Predicted Log Price')
ax_fit.set_title('Actual vs Predicted Log Prices (CatBoost)')
ax_fit.grid(True)
fig_fit.tight_layout()
plt.show()

# Residuals: actual minus predicted, on the log scale.
residuals_log = y_val_log - val_preds_log

fig_res, ax_res = plt.subplots(figsize=(8, 5))
ax_res.hist(residuals_log, bins=50, color='skyblue', edgecolor='black')
ax_res.set_title("Residuals (Log Price)")
ax_res.set_xlabel("Actual - Predicted (Log Scale)")
ax_res.set_ylabel("Frequency")
ax_res.grid(True)
fig_res.tight_layout()
plt.show()


In [None]:
# ---------------------------------------------------------------
# CatBoost: SHAP Values for the Optimized Model
# ---------------------------------------------------------------

# Explain the tuned model on the test-split feature columns.
X_shap = test_df[cb_features]

explainer = shap.Explainer(model_cb_best)
shap_values = explainer(X_shap)

# Summary plots: the bar-type summary first, then the default summary plot.
shap.summary_plot(shap_values, X_shap, plot_type="bar")
shap.summary_plot(shap_values, X_shap)