In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_init = pd.read_csv('data.csv', sep=';')
df_init.head()

Unnamed: 0,Id,Result,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,...,Feature_109,Feature_110,Feature_111,Feature_112,Feature_113,Feature_114,Feature_115,Feature_116,Feature_117,Feature_118
0,1,2,2,56,12,1,7.0,4,0,1.0,...,1.0,0.0,1,0,1,1,1,1,1,0
1,2,2,2,69,19,1,6.0,4,0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,1
2,3,2,1,66,8,1,4.0,4,0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,0
3,4,2,2,62,16,1,,3,0,1.0,...,1.0,0.0,1,0,1,1,0,0,1,1
4,5,2,2,67,30,1,,4,0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,0


In [18]:
def train(df, trash_features = [], drop_susp_features = True):

    df_ = df.copy()

    target_feature = 'Result'
    golden_features = ['Feature_3', 'Feature_4', 'Feature_35']
    susp_features = ['Feature_29', 'Feature_30', 'Feature_32', 'Feature_101'] if drop_susp_features else []
    all_features = df_.columns.drop([target_feature] + golden_features + susp_features + trash_features + ['Id']).tolist()

    df_ = df_[[target_feature] + all_features]

    num_features = ['Feature_39', 'Feature_40', 'Feature_41', 'Feature_42', 'Feature_43', 'Feature_44', 'Feature_45', 'Feature_46',
                'Feature_47', 'Feature_48', 'Feature_49', 'Feature_50', 'Feature_51', 'Feature_53', 'Feature_55', 'Feature_57', 
                'Feature_58', 'Feature_59', 'Feature_64', 'Feature_70', 'Feature_71', 'Feature_72', 'Feature_73']
    cat_features = []

    for col in df_.drop(target_feature, axis=1).columns:
        if col in num_features:
            df_[col].fillna(df_[col].median(), inplace=True)
            df_[col] = df_[col].astype('float64')
        else:
            cat_features.append(col)

            df_[col].fillna(-1, inplace=True)
            df_[col] = df_[col].astype('int64')
    
    # Result=0 - alive, Result=1 died
    df_.loc[df_[target_feature] == 1, target_feature] = 1
    df_.loc[df_[target_feature] == 2, target_feature] = 0

    X = df_[all_features]
    y = df_[target_feature]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        shuffle=True,
        test_size=0.1,
        random_state=0,
        stratify=y,
    )

    train_pool = Pool(
        X_train,
        y_train,
        cat_features=cat_features
    )

    params_frozen = {
        'eval_metric': 'Accuracy',
        'early_stopping_rounds': 50,
    }

    params_grid = {
        'depth': [4, 6, 8, 10],
        'learning_rate': [0.03, 0.1, 0.3, 0.5],
        'iterations': [100, 200, 400, 600, 800, 1000, 1200],
        'l2_leaf_reg': [2, 2.5, 3, 3.5, 4],
        'bagging_temperature': [1, 1.5, 2, 2.5],
    }

    clf = CatBoostClassifier(**params_frozen)
    rs = clf.randomized_search(
        params_grid,
        train_pool,
        n_iter=5,
        shuffle=True,
        stratified=True,
        partition_random_seed=0,
        cv=5,
        calc_cv_statistics=True,
        search_by_train_test_split=False,
        refit=True,
    )

    return clf, df_, X_train, X_test, y_train, y_test, all_features, cat_features, target_feature


Epoch 1 from 3
Stopped by overfitting detector  (50 iterations wait)
0:	loss: 0.8052308	best: 0.8052308 (0)	total: 57.8s	remaining: 3m 51s
Stopped by overfitting detector  (50 iterations wait)
1:	loss: 0.8283077	best: 0.8283077 (1)	total: 2m 4s	remaining: 3m 6s
Stopped by overfitting detector  (50 iterations wait)
2:	loss: 0.8440000	best: 0.8440000 (2)	total: 2m 37s	remaining: 1m 44s
Stopped by overfitting detector  (50 iterations wait)
3:	loss: 0.8123077	best: 0.8440000 (2)	total: 3m 50s	remaining: 57.6s
Stopped by overfitting detector  (50 iterations wait)
4:	loss: 0.8129231	best: 0.8440000 (2)	total: 5m 21s	remaining: 0us
Accuracy: 0.93
F1 Score: 0.89

Classification report: 
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        10
           1       1.00      0.80      0.89         5

    accuracy                           0.93        15
   macro avg       0.95      0.90      0.92        15
weighted avg       0.94      0.93      

In [13]:
trash_features = []
epochs = 3
for epoch in range(1, epochs+1):

    print(f"\nEpoch {epoch} from {epochs}\n{'=' * 50}")

    clf, df, X_train, X_test, y_train, y_test, all_features, cat_features, target_feature = train(
        df_init,
        trash_features=trash_features,
        drop_susp_features=True,
    )

    test_pool = Pool(
        X_test,
        y_test,
        cat_features=cat_features
    )
    y_test_pred = clf.predict(test_pool)

    print(f'Accuracy: {round(accuracy_score(y_test, y_test_pred), 2)}')
    print(f'F1 Score: {round(f1_score(y_test, y_test_pred), 2)}')
    print(f'\nClassification report: \n{classification_report(y_test, y_test_pred)}')

    feature_importances = pd.DataFrame(
        zip(X_train.columns, clf.get_feature_importance()),
        columns=['feature_name', 'importance']
    )

    feature_importances.sort_values(by='importance', ascending=False, inplace=True)
    features = feature_importances[feature_importances['importance'] < 1 ]['feature_name'].tolist()
    if len(trash_features) > 0:
        for feature in features:
            if feature not in trash_features:
                trash_features.append(feature)
    else:
        trash_features=features

    print('Fetures to drop')
    print(trash_features)

In [21]:
trash_features = ['Feature_90', 'Feature_100', 'Feature_65', 'Feature_116', 'Feature_93', 'Feature_40', 'Feature_46',
                'Feature_68', 'Feature_79', 'Feature_44', 'Feature_80', 'Feature_53', 'Feature_39', 'Feature_50', 'Feature_8',
                'Feature_67', 'Feature_60', 'Feature_96', 'Feature_22', 'Feature_75', 'Feature_77', 'Feature_84', 'Feature_45',
                'Feature_73', 'Feature_43', 'Feature_83', 'Feature_114', 'Feature_14', 'Feature_89', 'Feature_52', 'Feature_71',
                'Feature_10', 'Feature_51', 'Feature_28', 'Feature_11', 'Feature_112', 'Feature_106', 'Feature_42', 'Feature_87',
                'Feature_82', 'Feature_21', 'Feature_118', 'Feature_48', 'Feature_74', 'Feature_12', 'Feature_98', 'Feature_72',
                'Feature_117', 'Feature_107', 'Feature_38', 'Feature_111', 'Feature_55', 'Feature_92', 'Feature_24', 'Feature_113',
                'Feature_91', 'Feature_86', 'Feature_95', 'Feature_7', 'Feature_31', 'Feature_15', 'Feature_63', 'Feature_76',
                'Feature_78', 'Feature_62', 'Feature_97', 'Feature_13', 'Feature_61', 'Feature_115', 'Feature_110', 'Feature_26', 
                'Feature_16', 'Feature_19', 'Feature_20', 'Feature_25', 'Feature_27', 'Feature_36', 'Feature_37', 'Feature_2', 
                'Feature_66', 'Feature_1', 'Feature_85', 'Feature_54', 'Feature_64', 'Feature_58', 'Feature_34', 'Feature_88', 'Feature_81']

clf, df, X_train, X_test, y_train, y_test, all_features, cat_features, target_feature = train(
    df_init,
    trash_features=trash_features,
    drop_susp_features=True,
)

test_pool = Pool(
    X_test,
    y_test,
    cat_features=cat_features
)
y_test_pred = clf.predict(test_pool)

print(f'Accuracy: {round(accuracy_score(y_test, y_test_pred), 2)}')
print(f'F1 Score: {round(f1_score(y_test, y_test_pred), 2)}')
print(f'\nClassification report: \n{classification_report(y_test, y_test_pred)}')

Stopped by overfitting detector  (50 iterations wait)
0:	loss: 0.8516923	best: 0.8516923 (0)	total: 41.2s	remaining: 2m 44s
Stopped by overfitting detector  (50 iterations wait)
1:	loss: 0.8443077	best: 0.8516923 (0)	total: 1m 8s	remaining: 1m 43s
Stopped by overfitting detector  (50 iterations wait)
2:	loss: 0.8363077	best: 0.8516923 (0)	total: 1m 24s	remaining: 56.2s
Stopped by overfitting detector  (50 iterations wait)
3:	loss: 0.8363077	best: 0.8516923 (0)	total: 1m 56s	remaining: 29.2s
Stopped by overfitting detector  (50 iterations wait)
4:	loss: 0.8440000	best: 0.8516923 (0)	total: 2m 28s	remaining: 0us
Accuracy: 0.87
F1 Score: 0.8

Classification report: 
              precision    recall  f1-score   support

           0       0.90      0.90      0.90        10
           1       0.80      0.80      0.80         5

    accuracy                           0.87        15
   macro avg       0.85      0.85      0.85        15
weighted avg       0.87      0.87      0.87        15



In [22]:
feature_importances = pd.DataFrame(
    zip(X_train.columns, clf.get_feature_importance()),
    columns=['feature_name', 'importance']
)

feature_importances.sort_values(by='importance', ascending=False, inplace=True)
feature_importances.head(50)

Unnamed: 0,feature_name,importance
4,Feature_18,17.177368
17,Feature_108,9.77505
7,Feature_41,8.82636
2,Feature_9,8.349722
5,Feature_23,8.048525
10,Feature_56,7.610637
14,Feature_70,5.624118
1,Feature_6,5.487453
8,Feature_47,5.226363
16,Feature_99,4.559015
