In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

from catboost import CatBoostClassifier

import os
# List the entries under ../input so the available dataset directories show up
# in the cell output before any file is read.
print(os.listdir("../input"))

['santander-customer-transaction-prediction', 'santander-baseline-with-magic']


In [2]:
# Read the prepared train/test tables from the "santander-baseline-with-magic"
# input directory. Both frames are expected to carry an 'ID_code' column; the
# train frame additionally carries 'target' (see run_catboost).
train_csv_path = '../input/santander-baseline-with-magic/df_train.csv'
test_csv_path = '../input/santander-baseline-with-magic/df_test.csv'
df_train, df_test = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

In [3]:
# Seed shared by NumPy's global RNG and by the StratifiedKFold splitter inside
# run_catboost (which reads this module-level name).
# Note: CatBoost itself is seeded separately through cat_params["random_seed"].
random_state = 416
np.random.seed(seed=random_state)

In [4]:
# Keyword arguments forwarded verbatim to CatBoostClassifier(**cat_params).
# "boosting_type" is intentionally not set, so CatBoost's default applies.
cat_params = dict(
    objective="Logloss",
    bootstrap_type='Poisson',
    iterations=100000,               # upper bound on the number of boosting rounds
    learning_rate=0.03,
    max_depth=5,
    eval_metric='AUC',               # metric reported on the eval_set during fit
    random_seed=1225,                # CatBoost's own seed (distinct from random_state)
    subsample=0.9992666140793471,
    # NOTE(review): CatBoost documents bagging_temperature for the Bayesian
    # bootstrap; confirm it has any effect together with bootstrap_type='Poisson'.
    bagging_temperature=1.8964010156765452,
    random_strength=1,
    l2_leaf_reg=91,
    od_type='Iter',                  # overfitting detector type
    metric_period=1000,              # training log prints every 1000 iterations
    task_type="GPU",                 # training runs on GPU
    od_wait=1000,                    # overfitting detector wait setting (iterations)
    border_count=32,
    max_ctr_complexity=5,
)

In [5]:
def run_catboost(df_train, df_test, cat_params, features, n_splits=5):
    """Train one CatBoost model per stratified CV fold and collect predictions.

    Args:
        df_train: DataFrame containing 'ID_code', 'target' and every column
            named in ``features``.
        df_test: DataFrame containing 'ID_code' and every column named in
            ``features``.
        cat_params: dict of keyword arguments for ``CatBoostClassifier``.
        features: list of column names used as model inputs.
        n_splits: number of stratified folds (default 5, the original value).

    Returns:
        A tuple ``(oof, predictions, val_aucs)`` where
        * ``oof`` is a copy of ``df_train[['ID_code', 'target']]`` with an
          extra float column 'predict' holding the out-of-fold probability
          of class 1 for every training row,
        * ``predictions`` is a copy of ``df_test[['ID_code']]`` with one
          column per fold ('fold1', 'fold2', ...) holding that fold model's
          class-1 probability for every test row,
        * ``val_aucs`` is the list of per-fold validation ROC AUC scores.

    Notes:
        The fold shuffling seed is read from the module-level ``random_state``.
        Neither ``df_train`` nor ``df_test`` is modified.
    """
    X_test = df_test[features].values
    y_all = df_train['target']

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Work on explicit copies: the originals were column slices of the input
    # frames, so adding columns to them raised SettingWithCopyWarning and relied
    # on undefined view/copy behaviour.
    oof = df_train[['ID_code', 'target']].copy()
    predictions = df_test[['ID_code']].copy()

    # Out-of-fold predictions are collected positionally in a float array.
    # The previous chained assignment `oof['predict'][val_idx] = ...` indexed by
    # label (wrong for a non-default index), wrote floats into an int column and
    # is silently a no-op under pandas copy-on-write.
    oof_predict = np.zeros(len(df_train), dtype=float)
    val_aucs = []

    for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train, y_all)):
        X_train, y_train = df_train.iloc[trn_idx][features], y_all.iloc[trn_idx]
        X_valid, y_valid = df_train.iloc[val_idx][features], y_all.iloc[val_idx]

        cat_model = CatBoostClassifier(**cat_params)
        cat_model.fit(X_train, y_train, eval_set=(X_valid, y_valid),
                      use_best_model=True)

        # Column 1 of predict_proba is the probability of the positive class.
        valid_proba = cat_model.predict_proba(X_valid)[:, 1]
        oof_predict[val_idx] = valid_proba
        predictions['fold{}'.format(fold + 1)] = cat_model.predict_proba(X_test)[:, 1]
        val_aucs.append(roc_auc_score(y_valid, valid_proba))

    # Positional assignment of a NumPy array: no index alignment involved.
    oof['predict'] = oof_predict

    return oof, predictions, val_aucs

In [6]:
# Every column of df_train except the label and the row identifier is used as
# a model input; column order follows df_train.columns.
non_feature_cols = ('target', 'ID_code')
features = [name for name in df_train.columns if name not in non_feature_cols]

# Run the cross-validated training defined in run_catboost.
oof, predictions, val_aucs = run_catboost(df_train, df_test, cat_params, features)

0:	learn: 0.5973849	test: 0.6022441	best: 0.6022441 (0)	total: 21.9ms	remaining: 36m 25s
1000:	learn: 0.9023330	test: 0.8854340	best: 0.8854340 (1000)	total: 10.7s	remaining: 17m 37s
2000:	learn: 0.9245825	test: 0.9006061	best: 0.9006061 (2000)	total: 22.9s	remaining: 18m 43s
3000:	learn: 0.9360135	test: 0.9055064	best: 0.9055079 (2998)	total: 34.8s	remaining: 18m 44s
4000:	learn: 0.9430415	test: 0.9073380	best: 0.9073420 (3998)	total: 45.8s	remaining: 18m 19s
5000:	learn: 0.9478399	test: 0.9080983	best: 0.9080983 (5000)	total: 56.7s	remaining: 17m 57s
6000:	learn: 0.9516714	test: 0.9086111	best: 0.9086166 (5981)	total: 1m 7s	remaining: 17m 41s
7000:	learn: 0.9549382	test: 0.9089089	best: 0.9089089 (7000)	total: 1m 18s	remaining: 17m 28s
8000:	learn: 0.9578491	test: 0.9091311	best: 0.9091383 (7984)	total: 1m 29s	remaining: 17m 12s
9000:	learn: 0.9604452	test: 0.9091954	best: 0.9092002 (8924)	total: 1m 40s	remaining: 16m 59s
10000:	learn: 0.9627847	test: 0.9094078	best: 0.9094247 (9910)

In [7]:
# Summarise cross-validation quality: mean and standard deviation of the
# per-fold AUCs, plus a single AUC computed over all out-of-fold predictions.
fold_scores = np.asarray(val_aucs)
mean_auc, std_auc = fold_scores.mean(), fold_scores.std()
all_auc = roc_auc_score(oof['target'], oof['predict'])
print("Mean auc: %.9f, std: %.9f. All auc: %.9f." % (mean_auc, std_auc, all_auc))

Mean auc: 0.912579550, std: 0.001383812. All auc: 0.912500751.


In [8]:
# submission
# Average the per-fold test probabilities. 'ID_code' and any existing 'target'
# column are excluded, so re-running this cell yields the same result.
fold_cols = [col for col in predictions.columns if col not in ['ID_code', 'target']]
predictions['target'] = predictions[fold_cols].values.mean(axis=1)
predictions.to_csv('cat_all_predictions.csv', index=False)

# Build the submission by position (.values) rather than by index alignment:
# assigning a Series to a fresh frame aligns on index labels and would yield
# NaN targets if the test frame's index were not 0..n-1.
sub_df = pd.DataFrame({
    "ID_code": df_test["ID_code"].values,
    "target": predictions['target'].values,
})
sub_df.to_csv("cat_submission.csv", index=False)

# Out-of-fold predictions are saved alongside the submission.
oof.to_csv('cat_oof.csv', index=False)

sub_df.head()

Unnamed: 0,ID_code,target
0,test_0,0.074889
1,test_1,0.128429
2,test_2,0.124374
3,test_3,0.190123
4,test_4,0.038806
