In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_init = pd.read_csv('data-last.csv', sep=';')
df_init.head()

Unnamed: 0,Id,Result,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,...,Feature_109,Feature_110,Feature_111,Feature_112,Feature_113,Feature_114,Feature_115,Feature_116,Feature_117,Feature_118
0,1,2,2,56,12,1,7.0,4.0,0,1.0,...,1.0,0.0,1,0,1,1,1,1,1,0
1,2,2,2,69,19,1,6.0,4.0,0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,1
2,3,2,1,66,8,1,4.0,4.0,0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,0
3,4,2,2,62,16,1,,3.0,0,1.0,...,1.0,0.0,1,0,1,1,0,0,1,1
4,5,2,2,67,30,1,,4.0,0,1.0,...,1.0,0.0,1,0,1,1,1,0,1,0


In [3]:
df = df_init.copy()

target_feature = 'Result'
golden_features = ['Feature_3', 'Feature_4', 'Feature_30', 'Feature_32', 'Feature_35', 'Feature_101']
trash_features = [] #feature_importances[feature_importances['importance'] < 0.007870]['feature_name'].tolist()
all_features = df.columns.drop([target_feature] + golden_features + trash_features + ['Id']).tolist()

df = df[[target_feature] + all_features]

cat_features = df.select_dtypes(include='int64').columns.drop([target_feature]).tolist()

for col in df.select_dtypes(include='float64').columns:
    df.fillna(df[col].mean(), inplace=True)

for col in df.select_dtypes(include='int64').columns:
    df.fillna(-99, inplace=True)

# Result=0 - alive, Result=1 died
df.loc[df[target_feature] == 1, target_feature] = 1
df.loc[df[target_feature] == 2, target_feature] = 0

In [4]:
X = df[all_features]
y = df[target_feature]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    test_size=0.1,
    random_state=0,
    stratify=y,
)

params_frozen = {
    'eval_metric': 'Accuracy',
    'early_stopping_rounds': 50,
}

params_grid = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.03, 0.1, 0.3, 0.5],
    'iterations': [100, 200, 400, 600, 800, 1000, 1200],
    'l2_leaf_reg': [2, 2.5, 3, 3.5, 4],
    'bagging_temperature': [1, 1.5, 2, 2.5],
}

clf = CatBoostClassifier(**params_frozen)
rs = clf.randomized_search(
    params_grid,
    X_train,
    y_train,
    n_iter=10,
    shuffle=True,
    stratified=True,
    partition_random_seed=0,
    cv=5,
    calc_cv_statistics=True,
    search_by_train_test_split=False,
    refit=False,
)

Stopped by overfitting detector  (50 iterations wait)
0:	loss: 0.8280000	best: 0.8280000 (0)	total: 6.84s	remaining: 1m 1s
Stopped by overfitting detector  (50 iterations wait)
1:	loss: 0.8516923	best: 0.8516923 (1)	total: 19.8s	remaining: 1m 19s
Stopped by overfitting detector  (50 iterations wait)
2:	loss: 0.8203077	best: 0.8516923 (1)	total: 25s	remaining: 58.4s
Stopped by overfitting detector  (50 iterations wait)
3:	loss: 0.8283077	best: 0.8516923 (1)	total: 32.9s	remaining: 49.4s
Stopped by overfitting detector  (50 iterations wait)
4:	loss: 0.8280000	best: 0.8516923 (1)	total: 50.1s	remaining: 50.1s
Stopped by overfitting detector  (50 iterations wait)
5:	loss: 0.8209231	best: 0.8516923 (1)	total: 1m 10s	remaining: 46.9s
Stopped by overfitting detector  (50 iterations wait)
6:	loss: 0.8280000	best: 0.8516923 (1)	total: 1m 17s	remaining: 33.2s
Stopped by overfitting detector  (50 iterations wait)
7:	loss: 0.8440000	best: 0.8516923 (1)	total: 1m 23s	remaining: 20.9s
Stopped by ove

In [5]:
clf_final=CatBoostClassifier(**params_frozen, **rs['params'])
clf_final.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    silent=True,
)

<catboost.core.CatBoostClassifier at 0x201b2880788>

In [6]:
y_test_pred = clf_final.predict(X_test)

print(f'Accuracy: {round(accuracy_score(y_test, y_test_pred), 2)}')
print(f'\nClassification report: \n{classification_report(y_test, y_test_pred)}')
print(f'\nConfusion matrix:\n {confusion_matrix(y_test, y_test_pred)}')

Accuracy: 0.87

Classification report: 
              precision    recall  f1-score   support

           0       0.90      0.90      0.90        10
           1       0.80      0.80      0.80         5

    accuracy                           0.87        15
   macro avg       0.85      0.85      0.85        15
weighted avg       0.87      0.87      0.87        15


Confusion matrix:
 [[9 1]
 [1 4]]


In [7]:
feature_importances = pd.DataFrame(
    zip(X_train.columns, clf_final.get_feature_importance()),
    columns=['feature_name', 'importance']
)

feature_importances.sort_values(by='importance', ascending=False, inplace=True)
feature_importances.head(50)

Unnamed: 0,feature_name,importance
14,Feature_17,10.858437
64,Feature_70,7.673702
35,Feature_41,6.236787
97,Feature_108,6.023141
50,Feature_56,5.893131
28,Feature_33,5.257477
15,Feature_18,4.480803
29,Feature_34,4.113586
48,Feature_54,3.876616
20,Feature_23,3.572644
