## Library Imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
pd.options.display.float_format = '{:.2f}'.format

from sklearn.model_selection import train_test_split, GridSearchCV

from typing import List, Tuple

from catboost import CatBoostClassifier

In [2]:
# Load the train and test feature tables from the local data/ directory.
# NOTE(review): the file names suggest missing values were handled upstream — confirm.
df_train, df_test = (
    pd.read_csv(filepath_or_buffer='data/df_train_missing_clean.csv'),
    pd.read_csv(filepath_or_buffer='data/df_test_missing_clean.csv'),
)

In [3]:
# test.csv is read separately from the preprocessed tables; the submission cell
# further down builds its output frame from this one.
test_data = pd.read_csv(filepath_or_buffer='data/test.csv')

### Baseline Model

In [4]:
# Separate the label column from the model inputs.
target = df_train['TARGET']
features = df_train.drop('TARGET', axis=1)

# Column groups by dtype: the numeric sub-frame and the names of non-numeric columns.
features_numeric = features.select_dtypes(include='number')
features_cat = features.select_dtypes(exclude='number').columns.tolist()

In [5]:
# Stage 1: hold out 20% of the rows, stratified on the label.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=1234,
    stratify=target,
)

# Stage 2: carve 20% of that hold-out into a final test set; the rest is validation.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.2,
    random_state=1234,
    stratify=y_holdout,
)

# Report the size of every partition, one line each.
print(
    "x_train.shape = {} rows, {} cols".format(*X_train.shape),
    "x_valid.shape = {} rows, {} cols".format(*X_valid.shape),
    "x_test.shape = {} rows, {} cols".format(*X_test.shape),
    sep="\n",
)

x_train.shape = 88074 rows, 104 cols
x_valid.shape = 17615 rows, 104 cols
x_test.shape = 4404 rows, 104 cols


In [6]:
# CatBoost settings for the baseline model; unpacked verbatim into the constructor.
cb_hyperparameters = dict(
    # boosting schedule
    n_estimators=2000,
    learning_rate=0.01,
    # objective that is optimised vs. metric that is reported on the eval sets
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="CPU",
    max_bin=25,
    # a log line is printed every 100 iterations
    verbose=100,
    # tree size and regularisation
    max_depth=6,
    l2_leaf_reg=10,
    early_stopping_rounds=150,
    # execution / reproducibility
    thread_count=6,
    random_state=1234,
)

model = CatBoostClassifier(**cb_hyperparameters)

In [7]:
# Train on the training split while reporting AUC on both the training and the
# validation data; in the log below the `best` column follows the second pair (test1).
model.fit(
    X=X_train,
    y=y_train,
    eval_set=[
        (X_train, y_train),
        (X_valid, y_valid),
    ],
)

0:	test: 0.6242060	test1: 0.6213521	best: 0.6213521 (0)	total: 74.8ms	remaining: 2m 29s
100:	test: 0.7091907	test1: 0.6953180	best: 0.6955438 (98)	total: 1.55s	remaining: 29.1s
200:	test: 0.7169483	test1: 0.7038345	best: 0.7038345 (200)	total: 2.98s	remaining: 26.7s
300:	test: 0.7233868	test1: 0.7101012	best: 0.7101012 (300)	total: 4.44s	remaining: 25s
400:	test: 0.7290869	test1: 0.7155094	best: 0.7155207 (399)	total: 5.9s	remaining: 23.5s
500:	test: 0.7334891	test1: 0.7188643	best: 0.7188703 (498)	total: 7.36s	remaining: 22s
600:	test: 0.7366770	test1: 0.7212519	best: 0.7212519 (600)	total: 8.81s	remaining: 20.5s
700:	test: 0.7394301	test1: 0.7224718	best: 0.7224782 (698)	total: 10.3s	remaining: 19s
800:	test: 0.7421371	test1: 0.7231220	best: 0.7231518 (795)	total: 11.7s	remaining: 17.5s
900:	test: 0.7445014	test1: 0.7236823	best: 0.7236823 (900)	total: 13.1s	remaining: 16s
1000:	test: 0.7466344	test1: 0.7243832	best: 0.7243895 (996)	total: 14.6s	remaining: 14.5s
1100:	test: 0.7486636

<catboost.core.CatBoostClassifier at 0x7fc0126b4790>

In [8]:
from sklearn.metrics import mean_absolute_error, roc_auc_score, r2_score

In [9]:
# Class-probability predictions for each labelled partition.
pred_train, pred_valid, pred_test = (
    model.predict_proba(part) for part in (X_train, X_valid, X_test)
)

# ROC AUC computed from column 1 of each probability matrix.
train_score, valid_score, test_score = (
    roc_auc_score(labels, proba[:, 1])
    for labels, proba in (
        (y_train, pred_train),
        (y_valid, pred_valid),
        (y_test, pred_test),
    )
)

# Number of rows in the unlabelled test table that the model labels as 1.
test_count = (model.predict(df_test) == 1).sum()

print(f"Train-score: {round(train_score, 3)}, Valid-score: {round(valid_score, 3)}, Test-score: {round(test_score, 3)}, Target count: {test_count}")

Train-score: 0.762, Valid-score: 0.727, Test-score: 0.711, Target count: 223


In [11]:
# Build the submission frame: test.csv without NAME_CONTRACT_TYPE, plus the model's
# predicted class label for each row of the preprocessed test features.
# `drop` already returns a new frame, so no extra `.copy()` is needed before adding
# the column; `assign` appends TARGET as the last column.
# NOTE(review): assumes df_test rows are in the same order as test_data — confirm.
# NOTE(review): the model is scored with ROC AUC above, yet hard 0/1 labels are
# written here — confirm whether `model.predict_proba(df_test)[:, 1]` is expected.
df_test_to_submit = (
    test_data
    .drop(columns=['NAME_CONTRACT_TYPE'])
    .assign(TARGET=model.predict(df_test))
)

# `index` is documented as a bool; False states "no index column" explicitly
# instead of relying on None being falsy.
df_test_to_submit.to_csv('data/data_to_submit.csv', index=False)

In [None]:
#df_test_to_submit.shape

In [None]:
# Predict labels for the preprocessed test table and count the rows labelled 1.
# NOTE(review): the name says `lg_`, but `model` is the CatBoost classifier fitted
# above; the name is kept unchanged in case other cells reference it.
lg_model_pred = model.predict(df_test)
np.equal(lg_model_pred, 1).sum()