# 📊 CatBoost Segment Classification Pipeline
This notebook includes:
- Sequential feature selection (labelled "SPA" in this notebook) on numerical features
- CatBoost model training
- Optuna hyperparameter tuning
- Final predictions on the test dataset, saved as a submission file

In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
import optuna

In [None]:
# 📂 Load Data
# Parse train.csv once and derive both the feature frame and the target from
# the same in-memory table, instead of reading the file from disk twice.
_train_df = pd.read_csv("train.csv")
X_all = _train_df.drop(columns=["segment"])  # every column except the target
y_all = _train_df["segment"]                 # target labels
# Rows to score; the submission step reads this frame's "id" column.
X_final_test = pd.read_csv("test.csv")

In [None]:
# 🔧 Define numeric and categorical columns
# Partition the feature columns by name: anything containing "num_" goes to
# the numeric list, every other column goes to the categorical list. Change
# the marker below if the dataset uses a different naming scheme.
# NOTE(review): if train.csv also carries an identifier column such as "id",
# it ends up in categorical_columns — confirm that is intended.
numeric_columns = []
categorical_columns = []
for _name in X_all.columns:
    _bucket = numeric_columns if "num_" in _name else categorical_columns
    _bucket.append(_name)

# Hold out 20% of the rows for validation, stratified on the target and with
# a fixed seed.
_split = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    stratify=y_all,
    random_state=42,
)
X_train, X_val, y_train, y_val = _split

In [None]:
# 📉 Apply SPA (Sequential Feature Selection) on numeric columns
# Backward sequential feature selection over the numeric columns, scored by a
# random forest. The forest is seeded so the selected subset does not change
# from run to run (the train/validation split above is seeded as well).
_N_NUMERIC_TO_KEEP = 100

if len(numeric_columns) > _N_NUMERIC_TO_KEEP:
    selector = SequentialFeatureSelector(
        RandomForestClassifier(n_estimators=100, random_state=42),
        n_features_to_select=_N_NUMERIC_TO_KEEP,
        direction='backward',
        n_jobs=-1,
    )
    selector.fit(X_train[numeric_columns], y_train)

    # Boolean support mask -> names of the numeric columns that survived.
    selected_numeric_columns = X_train[numeric_columns].columns[selector.get_support()]
else:
    # SequentialFeatureSelector raises ValueError when asked to keep as many
    # features as (or more than) it is given, so with this few numeric
    # columns there is nothing to prune: keep them all.
    selector = None
    selected_numeric_columns = X_train[numeric_columns].columns

In [None]:
# 🧪 Rebuild training/validation/test sets
def _assemble_features(frame):
    """Return *frame* cut down to the selected numeric columns followed by the categorical ones."""
    pieces = [frame[selected_numeric_columns], frame[categorical_columns]]
    return pd.concat(pieces, axis=1)


# Same column layout for the training, validation and final-test frames.
X_train_selected = _assemble_features(X_train)
X_val_selected = _assemble_features(X_val)
X_test_selected = _assemble_features(X_final_test)

In [None]:
# ⚖️ Compute class weights
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights computed from the training labels, stored as a
# {class label: weight} mapping.
_labels = np.unique(y_train)
_balanced = compute_class_weight('balanced', classes=_labels, y=y_train)
class_weights = {label: weight for label, weight in zip(_labels, _balanced)}

In [None]:
# 🧪 Optuna Tuning
# Positional index of each categorical column inside the training frame;
# passed to CatBoost as `cat_features` further down.
cat_features_idx = list(map(X_train_selected.columns.get_loc, categorical_columns))

def objective(trial):
    """Optuna objective: fit one CatBoost candidate and score it on the hold-out split.

    Args:
        trial: Object whose ``suggest_int`` / ``suggest_float`` methods
            provide the hyperparameter values for this run.

    Returns:
        Macro-averaged F1 of the candidate's predictions for
        ``X_val_selected`` measured against ``y_val``.
    """
    # Hyperparameters drawn from the trial.
    search_space = {
        "iterations": trial.suggest_int("iterations", 300, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 10.0, log=True),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 1.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
    }
    # Settings shared by every trial.
    fixed_settings = {
        "loss_function": "MultiClass",
        "verbose": 0,
        "cat_features": cat_features_idx,
        "class_weights": class_weights,
    }

    candidate = CatBoostClassifier(**search_space, **fixed_settings)
    candidate.fit(X_train_selected, y_train)

    val_predictions = candidate.predict(X_val_selected)
    return f1_score(y_val, val_predictions, average='macro')

# Search for the hyperparameters that maximise the objective's return value.
# TPE is Optuna's default sampler; giving it a fixed seed makes the sequence
# of suggested hyperparameters repeatable across runs, in line with the
# seeded train/validation split earlier in the notebook.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Best Parameters:", study.best_params)

In [None]:
# 🚀 Final model training on all data
# Full labelled dataset with the same column layout used during tuning:
# selected numeric columns first, then the categorical ones.
X_total = pd.concat(
    [X_all[cols] for cols in (selected_numeric_columns, categorical_columns)],
    axis=1,
)

# Refit with the best hyperparameters found by the study, this time on every
# labelled row; progress is logged every 100 iterations.
final_model = CatBoostClassifier(
    verbose=100,
    class_weights=class_weights,
    cat_features=cat_features_idx,
    loss_function='MultiClass',
    **study.best_params,
)
final_model.fit(X_total, y_all)

# Score the final test frame with the refitted model.
preds_test = final_model.predict(X_test_selected)

# Pair each test id with its predicted segment (predictions flattened to 1-D)
# and write the result without the index column.
submission = X_final_test[["id"]].assign(segment=preds_test.ravel())
submission.to_csv("submission.csv", index=False)