In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [7]:
# Load the raw dataset (fields are separated by ';') and preview the first rows.
df_init = pd.read_csv(filepath_or_buffer='data.csv', sep=';')
df_init.head()

Unnamed: 0,Id,Result,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,...,Feature_109,Feature_110,Feature_111,Feature_112,Feature_113,Feature_114,Feature_115,Feature_116,Feature_117,Feature_118
0,1,2,2,56,12,1,7.0,4,0,1.0,...,1.0,0.0,1,0,1,1,1,1,1,0
1,2,2,2,69,19,1,6.0,4,0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,1
2,3,2,1,66,8,1,4.0,4,0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,0
3,4,2,2,62,16,1,,3,0,1.0,...,1.0,0.0,1,0,1,1,0,0,1,1
4,5,2,2,67,30,1,,4,0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,0


In [17]:
def train(df, trash_features=None, drop_susp_features=True):
    """Preprocess ``df``, tune a CatBoost classifier by randomized search and return it with the data splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with an ``Id`` column, the ``Result`` target and ``Feature_*`` columns.
        The frame is not modified; all preprocessing happens on a copy.
    trash_features : iterable of str, optional
        Additional column names to exclude from the feature set.
        ``None`` (the default) means no additional exclusions.
    drop_susp_features : bool, default True
        When true, the hard-coded "suspicious" features are excluded as well.

    Returns
    -------
    tuple
        ``(clf, df_, X_train, X_test, y_train, y_test, all_features, cat_features, target_feature)``
        where ``clf`` is the classifier after ``randomized_search`` (called with ``refit=True``),
        ``df_`` is the preprocessed frame (target + kept features), and ``all_features`` /
        ``cat_features`` are lists of column names.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    trash_features = [] if trash_features is None else list(trash_features)

    target_feature = 'Result'
    golden_features = ['Feature_3', 'Feature_4', 'Feature_35', 'Feature_17', 'Feature_18', 'Feature_22', 'Feature_23', 'Feature_54', 'Feature_94', 'Feature_108']
    susp_features = ['Feature_29', 'Feature_30', 'Feature_32', 'Feature_101'] if drop_susp_features else []

    # Everything listed here is removed from the model inputs (the target is re-added below).
    excluded = [target_feature] + golden_features + susp_features + trash_features + ['Id']
    all_features = df.columns.drop(excluded).tolist()

    # Explicit copy: the column assignments below must never write into the caller's frame.
    df_ = df[[target_feature] + all_features].copy()

    num_features = ['Feature_39', 'Feature_40', 'Feature_41', 'Feature_42', 'Feature_43', 'Feature_44', 'Feature_45', 'Feature_46',
                    'Feature_47', 'Feature_48', 'Feature_49', 'Feature_50', 'Feature_51', 'Feature_53', 'Feature_55', 'Feature_57',
                    'Feature_58', 'Feature_59', 'Feature_64', 'Feature_70', 'Feature_71', 'Feature_72', 'Feature_73']

    special_features = ['Feature_17', 'Feature_18', 'Feature_23', 'Feature_74', 'Feature_75', 'Feature_76', 'Feature_77', 'Feature_78',
                        'Feature_79', 'Feature_80', 'Feature_81', 'Feature_82', 'Feature_83', 'Feature_84', 'Feature_85', 'Feature_86', 'Feature_87',
                        'Feature_88', 'Feature_89', 'Feature_90', 'Feature_91', 'Feature_92', 'Feature_93', 'Feature_94', 'Feature_95', 'Feature_96',
                        'Feature_97', 'Feature_98', 'Feature_99', 'Feature_100']

    # Missing values in the "special" columns become 0. Some of these columns may already
    # have been excluded above, hence the membership check.
    # Assign the result back instead of `df_[col].fillna(..., inplace=True)`: the in-place
    # form operates on a temporary Series and does not update the frame under Copy-on-Write.
    for col in special_features:
        if col in df_.columns:
            df_[col] = df_[col].fillna(0)

    # Numeric columns: median imputation, stored as float64.
    # Every other kept column is treated as categorical: missing -> -1, stored as int64.
    # NOTE(review): the median is computed over all rows, i.e. before the train/test split,
    # so held-out rows influence the imputation value. Consider fitting it on the training
    # split only.
    cat_features = []
    for col in all_features:
        if col in num_features:
            df_[col] = df_[col].fillna(df_[col].median()).astype('float64')
        else:
            cat_features.append(col)
            df_[col] = df_[col].fillna(-1).astype('int64')

    # Result=0 - alive, Result=1 died
    # Order matters: map 1 -> 0 first, then 2 -> 1, so remapped rows are not remapped twice.
    df_.loc[df_[target_feature] == 1, target_feature] = 0
    df_.loc[df_[target_feature] == 2, target_feature] = 1

    X = df_[all_features]
    y = df_[target_feature]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        shuffle=True,
        test_size=0.25,
        random_state=0,
        stratify=y,
    )

    train_pool = Pool(
        X_train,
        y_train,
        cat_features=cat_features
    )

    # Settings shared by every candidate in the search.
    params_frozen = {
        'eval_metric': 'Accuracy',
        'early_stopping_rounds': 50,
    }

    # Search space sampled by the randomized search.
    params_grid = {
        'depth': [4, 6, 8, 10],
        'learning_rate': [0.03, 0.1, 0.3, 0.5],
        'iterations': [100, 200, 400, 600, 800, 1000, 1200],
        'l2_leaf_reg': [2, 2.5, 3, 3.5, 4],
        'bagging_temperature': [1, 1.5, 2, 2.5],
    }

    clf = CatBoostClassifier(**params_frozen)
    # The search result dict is not used by callers, so it is not kept.
    clf.randomized_search(
        params_grid,
        train_pool,
        n_iter=5,
        shuffle=True,
        stratified=True,
        partition_random_seed=0,
        cv=5,
        calc_cv_statistics=True,
        search_by_train_test_split=False,
        refit=True,
    )

    return clf, df_, X_train, X_test, y_train, y_test, all_features, cat_features, target_feature

In [21]:
# Iterative feature pruning: retrain several times, each time excluding every feature
# whose importance in the previous model was below 1.
trash_features = []
epochs = 3
for epoch in range(1, epochs + 1):

    print(f"\nEpoch {epoch} from {epochs}\n{'=' * 50}")

    (
        clf, df, X_train, X_test, y_train, y_test,
        all_features, cat_features, target_feature,
    ) = train(df_init, trash_features=trash_features, drop_susp_features=True)

    # Score the refitted model on the held-out split.
    test_pool = Pool(X_test, y_test, cat_features=cat_features)
    y_test_pred = clf.predict(test_pool)

    print(f'Accuracy: {round(accuracy_score(y_test, y_test_pred), 2)}')
    print(f'F1 Score: {round(f1_score(y_test, y_test_pred), 2)}')
    print(f'\nClassification report: \n{classification_report(y_test, y_test_pred)}')

    # One row per training column, most important first.
    feature_importances = pd.DataFrame(
        {
            'feature_name': X_train.columns.tolist(),
            'importance': clf.get_feature_importance(),
        }
    )
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)

    # Names of the features that fall below the importance threshold of 1.
    low_importance = feature_importances['importance'] < 1
    features = feature_importances.loc[low_importance, 'feature_name'].tolist()

    if not trash_features:
        # Nothing collected so far: the current list becomes the exclusion list.
        trash_features = features
    else:
        # Append only names that are not excluded yet, keeping the existing order.
        for feature in features:
            if feature in trash_features:
                continue
            trash_features.append(feature)

    print('Fetures to drop')
    print(trash_features)


Epoch 1 from 3
Stopped by overfitting detector  (50 iterations wait)
0:	loss: 0.7551082	best: 0.7551082 (0)	total: 34.6s	remaining: 2m 18s
Stopped by overfitting detector  (50 iterations wait)
1:	loss: 0.8291775	best: 0.8291775 (1)	total: 1m 27s	remaining: 2m 11s
Stopped by overfitting detector  (50 iterations wait)
2:	loss: 0.7646320	best: 0.8291775 (1)	total: 2m 17s	remaining: 1m 31s
Stopped by overfitting detector  (50 iterations wait)
3:	loss: 0.7464935	best: 0.8291775 (1)	total: 3m 49s	remaining: 57.5s
Stopped by overfitting detector  (50 iterations wait)
4:	loss: 0.7456277	best: 0.8291775 (1)	total: 5m 25s	remaining: 0us
Accuracy: 0.89
F1 Score: 0.92

Classification report: 
              precision    recall  f1-score   support

           0       0.89      0.73      0.80        11
           1       0.89      0.96      0.92        25

    accuracy                           0.89        36
   macro avg       0.89      0.84      0.86        36
weighted avg       0.89      0.89    

In [22]:
# Fixed exclusion list used for the final retraining run.
trash_features = [
    'Feature_58', 'Feature_40', 'Feature_51', 'Feature_55', 'Feature_106', 'Feature_60', 'Feature_14', 'Feature_78', 'Feature_38',
    'Feature_48', 'Feature_77', 'Feature_67', 'Feature_64', 'Feature_86', 'Feature_65', 'Feature_80', 'Feature_39', 'Feature_66', 'Feature_31',
    'Feature_53', 'Feature_62', 'Feature_109', 'Feature_24', 'Feature_59', 'Feature_81', 'Feature_73', 'Feature_43', 'Feature_10', 'Feature_115',
    'Feature_72', 'Feature_114', 'Feature_117', 'Feature_79', 'Feature_61', 'Feature_82', 'Feature_92', 'Feature_96', 'Feature_69', 'Feature_84',
    'Feature_26', 'Feature_111', 'Feature_7', 'Feature_95', 'Feature_45', 'Feature_74', 'Feature_113', 'Feature_25', 'Feature_112', 'Feature_1',
    'Feature_8', 'Feature_91', 'Feature_110', 'Feature_118', 'Feature_107', 'Feature_75', 'Feature_88', 'Feature_27', 'Feature_87', 'Feature_83',
    'Feature_50', 'Feature_93', 'Feature_89', 'Feature_85', 'Feature_11', 'Feature_12', 'Feature_13', 'Feature_15', 'Feature_16', 'Feature_19',
    'Feature_20', 'Feature_90', 'Feature_21', 'Feature_100', 'Feature_2', 'Feature_76', 'Feature_97', 'Feature_36', 'Feature_98',
]

# Retrain with the exclusion list above and keep every returned artefact in the notebook namespace.
(
    clf, df, X_train, X_test, y_train, y_test,
    all_features, cat_features, target_feature,
) = train(df_init, trash_features=trash_features, drop_susp_features=True)

# Score the refitted model on the held-out split.
test_pool = Pool(X_test, y_test, cat_features=cat_features)
y_test_pred = clf.predict(test_pool)

print(f'Accuracy: {round(accuracy_score(y_test, y_test_pred), 2)}')
print(f'F1 Score: {round(f1_score(y_test, y_test_pred), 2)}')
print(f'\nClassification report: \n{classification_report(y_test, y_test_pred)}')

Stopped by overfitting detector  (50 iterations wait)
0:	loss: 0.8214286	best: 0.8214286 (0)	total: 19.9s	remaining: 1m 19s
Stopped by overfitting detector  (50 iterations wait)
1:	loss: 0.8114286	best: 0.8214286 (0)	total: 37.5s	remaining: 56.2s
Stopped by overfitting detector  (50 iterations wait)
2:	loss: 0.7841991	best: 0.8214286 (0)	total: 55s	remaining: 36.7s
Stopped by overfitting detector  (50 iterations wait)
3:	loss: 0.8128139	best: 0.8214286 (0)	total: 1m 32s	remaining: 23.1s
Stopped by overfitting detector  (50 iterations wait)
4:	loss: 0.7923810	best: 0.8214286 (0)	total: 2m 18s	remaining: 0us
Accuracy: 0.94
F1 Score: 0.96

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        11
           1       0.93      1.00      0.96        25

    accuracy                           0.94        36
   macro avg       0.96      0.91      0.93        36
weighted avg       0.95      0.94      0.94        36



In [23]:
# Importance of every column the current model was trained on, highest first.
feature_importances = pd.DataFrame(
    {
        'feature_name': X_train.columns.tolist(),
        'importance': clf.get_feature_importance(),
    }
)

feature_importances = feature_importances.sort_values(by='importance', ascending=False)
feature_importances.head(50)

Unnamed: 0,feature_name,importance
5,Feature_34,10.617119
7,Feature_41,9.879993
14,Feature_56,8.034348
18,Feature_70,6.981418
0,Feature_5,5.418372
6,Feature_37,5.186738
1,Feature_6,5.087225
20,Feature_99,4.954018
3,Feature_28,4.933184
19,Feature_71,4.268971
