In [3]:
import pandas as pd
from IPython.core.display_functions import display
from catboost import CatBoostClassifier
from matplotlib import pyplot as plt
from sklearn import clone
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [4]:
# Build a reproducible synthetic binary-classification dataset:
# 1000 rows, 10 features of which only 2 are informative, one cluster per class.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=2,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=42,
)

# Wrap the feature matrix in a DataFrame with generated column names and
# attach the labels as the trailing `target` column.
feature_names = ["feature_" + str(idx) for idx in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names).assign(target=y)


In [5]:
# Preview the first two rows to sanity-check the column layout.
df.head(n=2)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,target
0,1.376053,-1.607661,-0.835143,-0.319848,-0.936875,1.603393,0.568754,0.151758,0.184741,2.089539,0
1,-0.005795,1.671348,-0.176134,-0.893746,1.616256,-1.371212,-0.000382,-0.387965,-0.901376,-2.899514,1


In [6]:
# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
train, valid = train_test_split(
    df,
    test_size=0.33,
    random_state=42,
)

In [12]:
def _validation_gini(train, valid, columns, cat_features, target):
    """Fit a CatBoost classifier on ``columns`` and return its Gini on ``valid``.

    The Gini coefficient is computed as ``2 * ROC AUC - 1`` from the
    positive-class probabilities predicted for ``valid``.
    """
    X_train = train[columns]
    X_valid = valid[columns]

    # Only declare as categorical the columns that are actually in this subset.
    model = CatBoostClassifier(verbose=0, eval_metric='AUC', early_stopping_rounds=60, random_state=42,
                               cat_features=list(set(cat_features) & set(X_train.columns)))
    # NOTE(review): the same validation frame drives early stopping and the
    # reported score, so the Gini values are likely optimistic. Consider a
    # separate hold-out set if the absolute numbers matter.
    model.fit(X_train, train[target], eval_set=(X_valid, valid[target]))
    valid_preds = model.predict_proba(X_valid)[:, 1]
    return 2 * roc_auc_score(valid[target], valid_preds) - 1


def forward_selection(train, valid, features, cat_features, target):
    """Greedy forward feature selection scored by validation Gini.

    On every iteration each not-yet-picked feature is added, one at a time,
    to the current set of picked features; a CatBoost model is trained on
    ``train`` and scored on ``valid``. The candidate with the highest Gini is
    kept. The loop runs ``len(features)`` times, so every feature is
    eventually ranked.

    Besides returning the statistics, the function displays the per-iteration
    score table (scaled to percent), draws a line plot of the best Gini per
    step and prints a short summary of the best-scoring feature prefix.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Training and validation frames containing ``features`` and ``target``.
    features : iterable of str
        Candidate feature column names (any iterable, e.g. a list or Index).
    cat_features : iterable of str
        Names of categorical columns; only those present in the evaluated
        subset are passed to CatBoost.
    target : str
        Name of the target column.

    Returns
    -------
    stat_1 : pandas.DataFrame
        Columns ``names`` (feature picked at each step) and ``ginis``
        (validation Gini after adding it), in selection order.
    stat_2 : pandas.DataFrame
        One row per feature, one column per iteration, holding the Gini of
        each evaluated candidate (NaN where the feature was already picked).
        Both frames are empty when ``features`` is empty.
    """
    features = list(features)
    if not features:
        # Nothing to select from: return empty results instead of failing on
        # the empty statistics frames further down.
        return pd.DataFrame({'names': [], 'ginis': []}), pd.DataFrame()

    stats = []
    picked_features = []
    best_ginis = []

    # forward selection
    for iteration in tqdm(range(len(features))):
        print('Iteration number', iteration)
        best_gini = None
        best_feature = None
        stat = {'iteration': iteration}
        for col in features:
            if col in picked_features:
                continue
            # Log only the candidates that are actually evaluated.
            print('Feature ', col)

            valid_gini = _validation_gini(train, valid, picked_features + [col], cat_features, target)
            stat[col] = valid_gini

            if best_gini is None or best_gini < valid_gini:
                print('Best gini < val gini ', best_gini, valid_gini)
                best_gini = valid_gini
                best_feature = col

        stats.append(stat)

        # Test the feature, not the score: a best Gini of exactly 0.0 is falsy
        # and would otherwise silently drop the selected feature.
        if best_feature is not None:
            picked_features.append(best_feature)
            best_ginis.append(best_gini)
            print('*'*10)
            print('Best_features:', picked_features)
            print('*'*10)

    # Rows = features, columns = iterations. A feature picked earlier has more
    # NaNs (it is skipped in all later iterations), so sorting by NaN count in
    # descending order lists the features in the order they were picked.
    stat_2 = pd.DataFrame(stats).T
    stat_2['cnt_nans'] = stat_2.isna().sum(axis=1).values
    stat_2 = stat_2.sort_values('cnt_nans', ascending=False).drop(columns=['cnt_nans'], index=['iteration'])

    display((stat_2 * 100).round(1))

    stat_1 = pd.DataFrame({'names': picked_features, 'ginis': best_ginis})

    plt.figure(figsize=(16, 9))
    sns.lineplot(data=stat_1, x='names', y='ginis')
    plt.xticks(rotation=90)
    plt.show()

    # Keep the shortest prefix of the selection order that reaches the maximum Gini.
    num_features = stat_1['ginis'].argmax() + 1
    best_gini = stat_1['ginis'].max()

    best_features = stat_1['names'].iloc[:num_features]

    print('---' * 5, 'info', '---' * 5, sep='')
    print('Best ginis:', best_gini)
    print('Num features:', len(best_features))
    print('Best features:', ' '.join(best_features))
    print('---' * 12)

    return stat_1, stat_2


In [None]:
# Run forward selection over every column except the trailing target column.
s1, s2 = forward_selection(
    train,
    valid,
    train.columns[:-1],
    cat_features=[],
    target='target',
)

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

Iteration number 0
Feature  feature_0
Best gini < val gini  None 0.16354350382128158
Feature  feature_1
Feature  feature_2
Feature  feature_3
Feature  feature_4
Best gini < val gini  0.16354350382128158 0.9304453262786594
Feature  feature_5
Best gini < val gini  0.9304453262786594 0.9326131687242796
Feature  feature_6
Feature  feature_7
Feature  feature_8


 10%|████████▎                                                                          | 1/10 [00:02<00:18,  2.09s/it]

Feature  feature_9
**********
Best_features: ['feature_5']
**********
Iteration number 1
Feature  feature_0
Best gini < val gini  None 0.9792401528512638
Feature  feature_1
Feature  feature_2
Feature  feature_3
Feature  feature_4
Best gini < val gini  0.9792401528512638 0.9834656084656084
Feature  feature_5
Feature  feature_6
Feature  feature_7
Feature  feature_8
Feature  feature_9


 20%|████████████████▌                                                                  | 2/10 [00:05<00:20,  2.61s/it]