In [1]:
import os
import math
import time

from contextlib import contextmanager
import numpy as np
import pandas as pd
import catboost as cat
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

In [3]:
categorical_features = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
                        'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
                        'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 
                        'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
                        'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6',
                        'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_11', 
                        'FLAG_DOCUMENT_18', 'CODE_GENDER', 'NAME_CONTRACT_TYPE',
                        'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'EMERGENCYSTATE_MODE',
                        'HOUSETYPE_MODE', 'FONDKAPREMONT_MODE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
                        'NAME_HOUSING_TYPE', 'NAME_TYPE_SUITE', 'WALLSMATERIAL_MODE','WEEKDAY_APPR_PROCESS_START',
                        'NAME_INCOME_TYPE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE']

In [4]:
def catboost(cat_df, categorical_features, num_folds, stratified = False):
    cat_train = cat_df[cat_df['TARGET'].notnull()]
    cat_test = cat_df[cat_df['TARGET'].isnull()]
    
    del cat_df
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1000)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1000)
    # Create arrays and dataframes to store results
    cat_val_preds = np.zeros(cat_train.shape[0])
    cat_preds = np.zeros(cat_test.shape[0])
    
    feats = [f for f in cat_train.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    print("Starting CatBoost. Train shape: {}, test shape: {}".format(cat_train.shape, cat_test.shape))
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(cat_train[feats], cat_train['TARGET'])):
        
        data_train, data_valid = cat_train[feats].iloc[train_idx], cat_train[feats].iloc[valid_idx]
        label_train, label_valid = cat_train['TARGET'].iloc[train_idx], cat_train['TARGET'].iloc[valid_idx]
        
        cat_feat = [feature for feature in categorical_features if feature in cat_train[feats].columns] 
        cat_feat_idx = [(cat_train[feats].columns.get_loc(feature)) for feature in cat_feat]
       
        for feat in cat_feat:
            if data_train[feat].isnull().values.any():
                data_train[feat] = str(data_train[feat])
            if data_valid[feat].isnull().values.any():
                data_valid[feat] = str(data_valid[feat])
            if cat_test[feat].isnull().values.any():
                cat_test[feat] = str(cat_test[feat])
            
        
        dtrain_cat = cat.Pool(data=data_train, label=label_train, cat_features = cat_feat_idx)
        dvalid_cat = cat.Pool(data=data_valid, label=label_valid, cat_features = cat_feat_idx) 
        dtest_cat = cat.Pool(cat_test[feats], cat_features = cat_feat_idx)
        
        print(dtrain_cat.get_features())
        print(dtrain_cat.get_label())
        
        # CatBoost parameters
        params_cat = {
            'colsample_bylevel': 0.9497036,
            'reg_lambda': 0.0735294,
            'loss_function': 'Logloss',
            'boosting_type': 'Ordered',
            'sampling_frequency': 'PerTree',
            'max_depth': 8,
#             'subsample': 0.8715623,
            'random_seed': 0,
            'learning_rate': 0.02,  # 02,
            'num_boost_round': 1000,
            'eval_metric': 'AUC',
        }
        
        cat_clf = cat.CatBoost(params = params_cat)
        
        cat_clf.fit(
            dtrain_cat,
            eval_set = [dtrain_cat, dvalid_cat],
            verbose = 100, 
            early_stopping_rounds=100
        )
        
        cat_val_preds[valid_idx] = cat_clf.predict(dvalid_cat.get_features(), prediction_type = 'Probability')
        cat_preds += cat_clf.predict(dtest_cat, prediction_type = 'Probability') / folds.n_splits
        
        print('CAT fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(dvalid_cat.get_label(), cat_val_preds[valid_idx])))
        del cat_clf, data_train, data_valid, label_train, label_valid, cat_feat, cat_feat_idx, dtrain_cat
        del dvalid_cat
 
    print('CAT Full AUC score %.6f' % roc_auc_score(cat_train['TARGET'], cat_val_preds))
    # Write submission file
    pred_df = cat_test[['SK_ID_CURR']].copy()
    pred_df['TARGET'] = cat_preds
    pred_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)

In [None]:
def main(categorical_features):
    cat_df = pd.read_csv('../data/processed_data_2.2_no_encode.csv')

    float_columns = [column for column in cat_df.columns if cat_df[column].dtype is float and column is not 'TARGET']
    
    cat_df.to_string(columns = float_columns)
    
    with timer("Ran CatBoost with kfold"):
        catboost(cat_df, categorical_features, num_folds= 5, stratified = True) 

if __name__ == "__main__":
    submission_file_name = "../predictions/cat_pred.csv"
    with timer("Full model run"):
        main(categorical_features)

Starting CatBoost. Train shape: (307511, 667), test shape: (48744, 667)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
