In [1]:
# Name of this stacked model; the export cell interpolates it into the file
# names of the prediction CSVs written under csv/.
model_name = "xgb_on_l1_and_basemixtures_neptune"

In [2]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Wall-clock checkpoints; the first entry is the moment this cell was executed.
timesheet = [time()]

def timer(statement):
    """Print the seconds elapsed since the previous checkpoint.

    Appends the current time to the module-level ``timesheet`` list and
    reports the difference between the two most recent entries.

    Parameters
    ----------
    statement : str
        Label printed in front of the elapsed time.
    """
    # ``timesheet`` is only mutated in place, so no ``global`` declaration is needed.
    timesheet.append(time())
    elapsed = timesheet[-1] - timesheet[-2]
    # Format the whole line first: a single pre-built string prints identically
    # whether ``print`` is the Python 2 statement or the Python 3 function.
    print("{} : {} seconds".format(statement, elapsed))

def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``object`` columns are replaced by indicator columns.
    nan_as_category : bool, default True
        Forwarded to ``pandas.get_dummies`` as ``dummy_na``.

    Returns
    -------
    tuple
        ``(encoded_frame, new_columns)``; ``new_columns`` lists the columns of
        the encoded frame that were not present in the input frame.
    """
    columns_before = list(df.columns)
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

def load_meta(directory, prefix, modeltype):
    """Load the train and test prediction CSVs of one base model as one frame.

    Reads ``<directory><prefix>_train.csv`` and ``<directory><prefix>_test.csv``,
    stacks them (train rows first) under a fresh 0..n-1 index and appends
    ``_<prefix>_<modeltype>`` to every column name except ``SK_ID_CURR``.

    Parameters
    ----------
    directory : str
        Path prefix; it is concatenated as-is, so it must end with a separator.
    prefix : str
        File-name stem, also used in the column suffix.
    modeltype : str
        Second component of the column suffix.

    Returns
    -------
    pandas.DataFrame
    """
    frames = [pd.read_csv(directory + prefix + suffix) for suffix in ("_train.csv", "_test.csv")]
    stacked = pd.concat(frames, axis=0).reset_index(drop=True)
    renamed = []
    for column in stacked.columns:
        if column == "SK_ID_CURR":
            # The join key keeps its name so the frame can be merged later.
            renamed.append(column)
        else:
            renamed.append("{}_{}_{}".format(column, prefix, modeltype))
    stacked.columns = renamed
    return stacked

def join_features(data, features):
    """Left-join every frame in ``features`` onto ``data`` by ``SK_ID_CURR``.

    Parameters
    ----------
    data : pandas.DataFrame
        Base table containing an ``SK_ID_CURR`` column.
    features : iterable of pandas.DataFrame
        Frames to attach, each containing an ``SK_ID_CURR`` column.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself when ``features`` is empty, otherwise the result of
        the successive left merges.
    """
    merged = data
    for extra in features:
        merged = pd.merge(merged, extra, how="left", on="SK_ID_CURR")
    return merged

def postprocess(df):
    """Return a copy of ``df`` in which +inf and -inf are replaced by NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; it is not modified.

    Returns
    -------
    pandas.DataFrame
    """
    # An unused list of non-key columns was built here before; it is gone
    # because the replacement has always been applied to the whole frame.
    return df.replace([np.inf, -np.inf], np.nan)

def load_neptune(directory, modelname):
    """Load one model's out-of-fold LightGBM predictions for train and test.

    Reads ``lightGBM_out_of_fold_train_predictions.csv`` and
    ``lightGBM_out_of_fold_test_predictions.csv`` from ``directory``, averages
    the test predictions per ``SK_ID_CURR`` and stacks train on top of test.

    Parameters
    ----------
    directory : str
        Path prefix; it is concatenated as-is, so it must end with a separator.
    modelname : str
        Used to name the prediction column ``neptune_<modelname>``.

    Returns
    -------
    pandas.DataFrame
        Columns ``SK_ID_CURR`` and ``neptune_<modelname>``, in that order.
    """
    wanted = ["SK_ID_CURR", "lightGBM_prediction"]
    # read_csv(usecols=...) keeps the column order of the file, so select by
    # name afterwards; otherwise a file that stores the prediction first would
    # end up with its two columns mislabelled.
    m_tr = pd.read_csv(directory+"lightGBM_out_of_fold_train_predictions.csv", usecols = wanted)[wanted]
    m_te = pd.read_csv(directory+"lightGBM_out_of_fold_test_predictions.csv", usecols = wanted)[wanted]
    # The test file can hold several rows per id; collapse them to their mean.
    m_te = m_te.groupby("SK_ID_CURR")["lightGBM_prediction"].mean().reset_index()

    data = pd.concat([m_tr, m_te], axis=0).reset_index(drop=True)
    # Rename by name rather than by position.
    return data.rename(columns={"lightGBM_prediction": "neptune_{}".format(modelname)})

# Loading Meta Files

In [4]:
# Each list below holds one prediction frame per base model; the frames are
# later left-joined onto the main feature table by SK_ID_CURR.

# (directory, file prefix, model type) for every level-1 model.
_l1_sources = [
    ("../l1/csv/", "l1_gnb", "l1"),
    ("../l1/csv/", "l1_lr", "l1"),
    ("../l1/csv/", "l1_tree_with_flags", "l1"),
    ("../l1/csv/", "l1_tree_without_flags", "l1"),
    ("../l1/csv/", "tree_with_flags_without_meta", "l1"),
    ("../l1/csv/", "tree_without_flags_without_meta", "l1"),
    ("../l1/csv/", "tree_on_core_features_with_meta", "l1"),
]
l1_features = [load_meta(_dir, _prefix, _kind) for _dir, _prefix, _kind in _l1_sources]

# (directory, model name) for every neptune LightGBM run.
_neptune_sources = [
    ("../base neptune/m1/csv/", "m1"),
    ("../base neptune/m2/csv/", "m2"),
    ("../base neptune/m3/csv/", "m3"),
    ("../base neptune/m4/csv/", "m4"),
]
neptune_features = [load_neptune(_dir, _name) for _dir, _name in _neptune_sources]

# (directory, file prefix, model type) for every base mixture model.
_mixture_sources = [
    ("../base mixtures/csv/", "knn_on_selected_pca", "mixtures"),
    ("../base mixtures/csv/", "lgbm_on_core_features", "mixtures"),
    ("../base mixtures/csv/", "lgbm_on_gp_features", "mixtures"),
    ("../base mixtures/csv/", "lr_on_core_features", "mixtures"),
    ("../base mixtures/csv/", "nn", "mixtures"),
    ("../base mixtures/csv/", "using_lags_bureau_data", "mixtures"),
]
mixture_models = [load_meta(_dir, _prefix, _kind) for _dir, _prefix, _kind in _mixture_sources]

## Get Application Train/Test Data

In [5]:
def application_train_test(num_rows = None, nan_as_category = False):
    """Load application_train/test, stack them and add engineered features.

    Parameters
    ----------
    num_rows : int or None
        Passed to ``pandas.read_csv`` as ``nrows`` for both files; ``None``
        reads everything.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, without the rows whose CODE_GENDER
        is 'XNA'. It carries an ``index`` column with the row labels the rows
        had before stacking, the ``NEW_*`` engineered columns, integer-coded
        CODE_GENDER / FLAG_OWN_CAR / FLAG_OWN_REALTY, and one-hot encoded
        object columns. FLAG_DOCUMENT_2 and FLAG_DOCUMENT_4..21 are removed.
    """
    # Read data and merge
    df = pd.read_csv('../data/application_train.csv', nrows= num_rows)
    test_df = pd.read_csv('../data/application_test.csv', nrows= num_rows)

    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    # reset_index() (without drop) deliberately keeps the old labels as an
    # 'index' column: the training cell drops that column by name.
    df = pd.concat([df, test_df]).reset_index()
    # Optional: Remove 4 applications with XNA CODE_GENDER (train set).
    # The explicit copy makes the column assignments below act on an
    # independent frame instead of on a slice of the concatenated one.
    df = df[df['CODE_GENDER'] != 'XNA'].copy()

    docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
    live = [_f for _f in df.columns if ('FLAG_' in _f) and ('FLAG_DOC' not in _f) and ('_FLAG_' not in _f)]

    # Treat the value 365243 in DAYS_EMPLOYED as missing. Assigning the result
    # back (instead of a chained inplace replace) guarantees the frame itself
    # is updated.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    # Median income per organization type, mapped back onto each row below.
    inc_by_org = df[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']

    df['NEW_CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY'].astype("float32")
    df['NEW_CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE'].astype("float32")
    df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
    df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1)
    df['NEW_INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN']).astype("float32")
    df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)
    df['NEW_EMPLOY_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH'].astype("float32")
    df['NEW_ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL']).astype("float32")
    df['NEW_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['NEW_EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    df['NEW_SCORES_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    # Rows without a computable std get the column mean.
    df['NEW_SCORES_STD'] = df['NEW_SCORES_STD'].fillna(df['NEW_SCORES_STD'].mean())
    df['NEW_CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH'].astype("float32")
    df['NEW_CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED'].astype("float32")
    df['NEW_PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH'].astype("float32")
    df['NEW_PHONE_TO_BIRTH_RATIO_EMPLOYER'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED'].astype("float32")
    df['NEW_CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL'].astype("float32")

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        # factorize returns (codes, uniques); only the integer codes are kept.
        df[bin_feature] = pd.factorize(df[bin_feature])[0]
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)
    # The document flags were already summarised by NEW_DOC_IND_KURT above;
    # every one except FLAG_DOCUMENT_3 is dropped here.
    dropcolum=['FLAG_DOCUMENT_2','FLAG_DOCUMENT_4',
    'FLAG_DOCUMENT_5','FLAG_DOCUMENT_6','FLAG_DOCUMENT_7',
    'FLAG_DOCUMENT_8','FLAG_DOCUMENT_9','FLAG_DOCUMENT_10',
    'FLAG_DOCUMENT_11','FLAG_DOCUMENT_12','FLAG_DOCUMENT_13',
    'FLAG_DOCUMENT_14','FLAG_DOCUMENT_15','FLAG_DOCUMENT_16',
    'FLAG_DOCUMENT_17','FLAG_DOCUMENT_18','FLAG_DOCUMENT_19',
    'FLAG_DOCUMENT_20','FLAG_DOCUMENT_21']
    df= df.drop(dropcolum,axis=1)
    del test_df
    gc.collect()
    return df

## Get Bureau & Bureau Balance Data

In [6]:
def bureau_and_balance(num_rows = None, nan_as_category = True):
    """Aggregate bureau.csv and bureau_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed to ``pandas.read_csv`` as ``nrows`` for both files.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder`` for both tables.

    Returns
    -------
    pandas.DataFrame
        ``SK_ID_CURR`` as a regular column plus ``BURO_*`` aggregates over all
        credits and ``ACTIVE_*`` / ``CLOSED_*`` numeric aggregates over the
        credits flagged active / closed.
    """
    def _flatten(prefix, frame):
        # Collapse the (column, statistic) pairs produced by agg() into
        # single names of the form PREFIX + COLUMN + "_" + STATISTIC.
        frame.columns = pd.Index([prefix + pair[0] + "_" + pair[1].upper() for pair in frame.columns.tolist()])
        return frame

    bureau = pd.read_csv('../data/bureau.csv', nrows = num_rows)
    bb = pd.read_csv('../data/bureau_balance.csv', nrows = num_rows)
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Bureau balance: summarise the monthly rows per bureau credit, then
    # attach the summary to bureau.csv.
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for indicator in bb_cat:
        bb_aggregations[indicator] = ['mean']
    bb_agg = _flatten("", bb.groupby('SK_ID_BUREAU').agg(bb_aggregations))
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
    del bb, bb_agg
    gc.collect()

    # Numeric statistics taken over bureau and the joined balance summary.
    num_aggregations = {
        'DAYS_CREDIT': [ 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': [ 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': [ 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': [ 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Indicator columns: bureau's own dummies, then the per-credit means of
    # the balance dummies (named <dummy>_MEAN after the join above).
    cat_aggregations = {}
    for indicator in bureau_cat:
        cat_aggregations[indicator] = ['mean']
    for indicator in bb_cat:
        cat_aggregations[indicator + "_MEAN"] = ['mean']

    num_cat = num_aggregations.copy()
    num_cat.update(cat_aggregations)

    # All credits.
    bureau_agg = _flatten('BURO_', bureau.groupby('SK_ID_CURR').agg(num_cat))
    bureau_agg = bureau_agg.reset_index()

    # Active credits - numeric aggregations only.
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = _flatten('ACTIVE_', active.groupby('SK_ID_CURR').agg(num_aggregations))
    active_agg = active_agg.reset_index()
    bureau_agg = bureau_agg.merge(active_agg, how='left', on='SK_ID_CURR')
    del active, active_agg
    gc.collect()

    # Closed credits - numeric aggregations only. closed_agg stays indexed by
    # SK_ID_CURR, hence join(on=...) rather than merge.
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = _flatten('CLOSED_', closed.groupby('SK_ID_CURR').agg(num_aggregations))
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg

## Get Previous Applications Data

In [7]:
def previous_applications(num_rows = None, nan_as_category = True):
    """Aggregate previous_application.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed to ``pandas.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. It used to be ignored in favour of a
        hard-coded ``True``; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        ``SK_ID_CURR`` as a regular column plus ``PREV_*`` aggregates over all
        previous applications and ``APPROVED_*`` / ``REFUSED_*`` numeric
        aggregates over the approved / refused ones.
    """
    prev = pd.read_csv('../data/previous_application.csv', nrows = num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category)
    # Treat the value 365243 in the DAYS_* columns as missing. The result is
    # assigned back instead of using a chained inplace replace, so the frame
    # itself is guaranteed to be updated.
    for day_col in ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                    'DAYS_LAST_DUE', 'DAYS_TERMINATION']:
        prev[day_col] = prev[day_col].replace(365243, np.nan)
    # Add feature: value ask / value received percentage
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': [ 'max', 'mean'],
        'AMT_APPLICATION': [ 'max','mean'],
        'AMT_CREDIT': [ 'max', 'mean'],
        'APP_CREDIT_PERC': [ 'max', 'mean'],
        'AMT_DOWN_PAYMENT': [ 'max', 'mean'],
        'AMT_GOODS_PRICE': [ 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': [ 'max', 'mean'],
        'RATE_DOWN_PAYMENT': [ 'max', 'mean'],
        'DAYS_DECISION': [ 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']

    num_cat = num_aggregations.copy()
    num_cat.update(cat_aggregations)

    prev_agg = prev.groupby('SK_ID_CURR').agg(num_cat)
    # Flatten the (column, statistic) pairs into PREV_<COLUMN>_<STATISTIC>.
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    prev_agg = prev_agg.reset_index()
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    approved_agg = approved_agg.reset_index()
    prev_agg = prev_agg.merge(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features.
    # refused_agg stays indexed by SK_ID_CURR, hence join(on=...).
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg

## Get POS Cash Data

In [8]:
def pos_cash(num_rows = None, nan_as_category = True):
    """Aggregate POS_CASH_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed to ``pandas.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. It used to be ignored in favour of a
        hard-coded ``True``; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``, with ``POS_*`` aggregate columns and a
        ``POS_COUNT`` column holding the number of rows per id.
    """
    pos = pd.read_csv('../data/POS_CASH_balance.csv', nrows = num_rows)
    pos, cat_cols = one_hot_encoder(pos, nan_as_category)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    # Flatten the (column, statistic) pairs into POS_<COLUMN>_<STATISTIC>.
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts (the assignment aligns on the SK_ID_CURR index)
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg

## Get Installments Payments Data

In [9]:
def installments_payments(num_rows = None, nan_as_category = True):
    """Aggregate installments_payments.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed to ``pandas.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. It used to be ignored in favour of a
        hard-coded ``True``; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``, with ``INSTAL_*`` aggregate columns and an
        ``INSTAL_COUNT`` column holding the number of rows per id.
    """
    ins = pd.read_csv('../data/installments_payments.csv', nrows = num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due and days before due (no negative values).
    # Series.where keeps values where the condition holds and writes 0
    # elsewhere; NaN > 0 is False, so missing differences become 0 exactly as
    # the previous row-by-row lambda did, without a Python call per row.
    days_past_due = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    days_before_due = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = days_past_due.where(days_past_due > 0, 0)
    ins['DBD'] = days_before_due.where(days_before_due > 0, 0)
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum','min','std' ],
        'DBD': ['max', 'mean', 'sum','min','std'],
        'PAYMENT_PERC': [ 'max','mean',  'var','min','std'],
        'PAYMENT_DIFF': [ 'max','mean', 'var','min','std'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum','min','std'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum','std'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum','std']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    # Flatten the (column, statistic) pairs into INSTAL_<COLUMN>_<STATISTIC>.
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts (the assignment aligns on the SK_ID_CURR index)
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg

## Get Credit Card Balance Data

In [10]:
def credit_card_balance(num_rows = None, nan_as_category = True):
    """Aggregate credit_card_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed to ``pandas.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. It used to be ignored in favour of a
        hard-coded ``True``; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR``, with ``CC_<COLUMN>_<MAX|MEAN|SUM|VAR>``
        columns for every remaining column and a ``CC_COUNT`` column holding
        the number of rows per id.
    """
    cc = pd.read_csv('../data/credit_card_balance.csv', nrows = num_rows)
    cc, cat_cols = one_hot_encoder(cc, nan_as_category)
    # General aggregations; SK_ID_PREV is dropped first so that it is not
    # aggregated along with the other columns.
    cc = cc.drop(['SK_ID_PREV'], axis= 1)
    cc_agg = cc.groupby('SK_ID_CURR').agg([ 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines (the assignment aligns on the SK_ID_CURR index)
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg

# Generate Data For Training

In [11]:
# Row limit forwarded to every loader as read_csv's nrows; None reads all rows.
num_rows = None
timer("Init.. ")

data = application_train_test(num_rows)
timer("Done with Application Train and Test")

# bureau_and_balance and previous_applications return SK_ID_CURR as a regular
# column, so they are attached with merge.
data = data.merge(bureau_and_balance(num_rows),  how='left', on='SK_ID_CURR')
timer("Done with Bureau and Bureau Balance")

data = data.merge(previous_applications(num_rows),  how='left', on='SK_ID_CURR')
timer("Done with Previous Applications")

# The next three loaders return frames indexed by SK_ID_CURR, so they are
# attached with join(on=...).
data = data.join(pos_cash(num_rows),  how='left', on='SK_ID_CURR')
timer("Done with POS Cash")

data = data.join(installments_payments(num_rows),  how='left', on='SK_ID_CURR')
timer("Done with Installment Payments")

data = data.join(credit_card_balance(num_rows),  how='left', on='SK_ID_CURR')
timer("Done with Credit Card balance")

# Replace +/-inf produced by the ratio features with NaN.
data = postprocess(data)

data = join_features(data, l1_features)
data = join_features(data, neptune_features)
data = join_features(data, mixture_models)
timer("Done with Meta Features")

# Rows with a TARGET value form the training set; the rest is the test set.
train = data[data['TARGET'].notnull()]
test = data[data['TARGET'].isnull()]

target = train.TARGET
# Independent copies: a TARGET column is added to these id frames after
# training, and writing into a slice of ``data`` would raise pandas'
# SettingWithCopyWarning.
train_id = train[["SK_ID_CURR"]].copy()
test_id = test[["SK_ID_CURR"]].copy()
test_id_rank = test[["SK_ID_CURR"]].copy()

# 'index' is the leftover column created by reset_index() in
# application_train_test; it is not a feature.
train = train.drop(['TARGET','SK_ID_CURR','index'], axis=1)
test = test.drop(['TARGET','SK_ID_CURR','index'], axis=1)

Init..  : 5.42397379875 seconds
Train samples: 307511, test samples: 48744
Done with Application Train and Test : 17.7580339909 seconds
Done with Bureau and Bureau Balance : 41.0392742157 seconds
Done with Previous Applications : 34.5574519634 seconds
Done with POS Cash : 23.4558558464 seconds
Done with Installment Payments : 66.7657830715 seconds
Done with Credit Card balance : 27.0700159073 seconds
Done with Meta Features : 57.8450059891 seconds


# Defining Model

In [12]:
import xgboost as xgb
def model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id):
    """Fit one XGBoost model for a CV fold and record its predictions.

    Parameters
    ----------
    x_train, y_train : pandas objects
        Features and labels the booster is fitted on (their ``.values`` are used).
    x_test, y_test : pandas objects
        Held-out fold, used as the ``eval`` set and for the fold AUC.
    test : pandas.DataFrame
        Test features scored with the fitted booster.
    meta_train : numpy.ndarray
        Out-of-fold prediction buffer; positions ``test_index`` are overwritten.
    meta_test : list
        Receives this fold's test-set prediction array.
    train_index, fold_id
        Not used inside the function.
    test_index : array-like
        Positions of the held-out rows inside ``meta_train``.

    Side effects
    ------------
    Appends the held-out AUC of this fold to the module-level ``fold_roc``.
    Returns ``None``.
    """
    global fold_roc

    booster_params = {
        "objective": "binary:logistic",
        "booster":"gbtree",
        "eval_metric" :"auc",
        "nthread":8,
        "eta":0.05,
        "max_depth":6,
        "min_child_weight":30,
        "gamma":0,
        "subsample": 0.85,
        "colsample_bytree": 0.7,
        "colsample_bylevel": 0.632,
        "alpha": 0,
        "lambda" : 0,
        "seed":1343,
        "tree_method":"gpu_exact",
        "gpu_id":1,
    }

    fit_matrix = xgb.DMatrix(x_train.values,label=y_train.values)
    holdout_matrix = xgb.DMatrix(x_test.values, label=y_test.values)
    # Both sets are monitored; the training log shows that 'eval-auc' (the
    # held-out set) is the metric driving early stopping.
    watchlist = [(fit_matrix,'train'),(holdout_matrix,'eval')]
    booster = xgb.train(booster_params, fit_matrix, 5000,evals=watchlist, early_stopping_rounds=20, verbose_eval=1)

    # Predict with the tree limit recorded for the best iteration.
    best_rounds = booster.best_ntree_limit
    meta_train[test_index] = booster.predict(holdout_matrix, ntree_limit=best_rounds)
    meta_test.append(booster.predict(xgb.DMatrix(test.values), ntree_limit=best_rounds))

    fold_roc.append(roc_auc_score(y_test, meta_train[test_index]))
 

# Training Classifier

In [16]:
# Out-of-fold predictions for the training rows, filled fold by fold.
meta_train = np.zeros(train.shape[0])
# One test-set prediction array per fold.
meta_test = []
feature_importance = pd.DataFrame(columns = ["feature","split","gain"])
# Held-out AUC of every fold, appended by model_tree.
fold_roc = []

kf = StratifiedKFold(n_splits= 10, shuffle=True, random_state=12323)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    # kf.split yields row positions, so the labels are selected positionally
    # too; plain target[...] would look the numbers up as index labels.
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

# Rows = test samples, columns = folds.
fold_test_predictions = np.array(meta_test).T
test_id["TARGET"] = fold_test_predictions.mean(axis=1)
# Average of the per-fold percentile ranks. The result is a Series indexed
# 0..n-1 while test_id_rank keeps the row labels of the test rows, so assign
# the raw values; assigning the Series would align on the index and leave
# non-matching rows as NaN.
test_id_rank["TARGET"] = pd.DataFrame(fold_test_predictions).rank(pct = True).mean(axis=1).values
train_id["TARGET"] = meta_train

# A single pre-formatted string prints identically on Python 2 and Python 3.
print("Overall ROC: {},  Mean ROC: {}, STD AUC: {}".format(roc_auc_score(target, meta_train), np.mean(fold_roc), np.std(fold_roc)))

[0]	train-auc:0.799938	eval-auc:0.794194
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 20 rounds.
[1]	train-auc:0.802287	eval-auc:0.795115
[2]	train-auc:0.802852	eval-auc:0.795078
[3]	train-auc:0.803745	eval-auc:0.795673
[4]	train-auc:0.804235	eval-auc:0.795979
[5]	train-auc:0.804505	eval-auc:0.796135
[6]	train-auc:0.804909	eval-auc:0.796328
[7]	train-auc:0.805151	eval-auc:0.796214
[8]	train-auc:0.805272	eval-auc:0.79604
[9]	train-auc:0.805401	eval-auc:0.795997
[10]	train-auc:0.805544	eval-auc:0.796004
[11]	train-auc:0.805595	eval-auc:0.796019
[12]	train-auc:0.805676	eval-auc:0.796018
[13]	train-auc:0.805747	eval-auc:0.796085
[14]	train-auc:0.805817	eval-auc:0.796036
[15]	train-auc:0.805924	eval-auc:0.796093
[16]	train-auc:0.806114	eval-auc:0.796097
[17]	train-auc:0.80626	eval-auc:0.796125
[18]	train-auc:0.806466	eval-auc:0.796125
[19]	train-auc:0.806694	eval-auc:0.796114
[20]	train-auc:0.806851	eval-au

[60]	train-auc:0.814281	eval-auc:0.794432
[61]	train-auc:0.814494	eval-auc:0.794592
[62]	train-auc:0.814732	eval-auc:0.79465
[63]	train-auc:0.814938	eval-auc:0.794644
[64]	train-auc:0.815148	eval-auc:0.794643
[65]	train-auc:0.815282	eval-auc:0.794571
[66]	train-auc:0.815529	eval-auc:0.794607
[67]	train-auc:0.815678	eval-auc:0.79457
[68]	train-auc:0.815828	eval-auc:0.79457
[69]	train-auc:0.816039	eval-auc:0.794554
[70]	train-auc:0.816186	eval-auc:0.794582
[71]	train-auc:0.816438	eval-auc:0.794645
[72]	train-auc:0.816628	eval-auc:0.794636
[73]	train-auc:0.816768	eval-auc:0.794647
[74]	train-auc:0.81697	eval-auc:0.794683
[75]	train-auc:0.817159	eval-auc:0.794767
[76]	train-auc:0.817308	eval-auc:0.794814
[77]	train-auc:0.817525	eval-auc:0.794798
[78]	train-auc:0.817646	eval-auc:0.794872
[79]	train-auc:0.817894	eval-auc:0.794861
[80]	train-auc:0.818133	eval-auc:0.794847
[81]	train-auc:0.818309	eval-auc:0.794793
[82]	train-auc:0.818458	eval-auc:0.794836
[83]	train-auc:0.81868	eval-auc:0.7948

[96]	train-auc:0.819664	eval-auc:0.805545
[97]	train-auc:0.819756	eval-auc:0.80557
[98]	train-auc:0.819896	eval-auc:0.805543
[99]	train-auc:0.820153	eval-auc:0.805542
[100]	train-auc:0.820359	eval-auc:0.805575
[101]	train-auc:0.820546	eval-auc:0.805595
[102]	train-auc:0.820716	eval-auc:0.805594
[103]	train-auc:0.820851	eval-auc:0.805644
[104]	train-auc:0.821079	eval-auc:0.805602
[105]	train-auc:0.821225	eval-auc:0.805622
[106]	train-auc:0.821437	eval-auc:0.805617
[107]	train-auc:0.821641	eval-auc:0.805629
[108]	train-auc:0.821723	eval-auc:0.805649
[109]	train-auc:0.821901	eval-auc:0.805621
[110]	train-auc:0.822131	eval-auc:0.805532
[111]	train-auc:0.822399	eval-auc:0.80552
[112]	train-auc:0.822529	eval-auc:0.805483
[113]	train-auc:0.822682	eval-auc:0.805469
[114]	train-auc:0.822852	eval-auc:0.805452
[115]	train-auc:0.823069	eval-auc:0.805485
[116]	train-auc:0.823316	eval-auc:0.805489
[117]	train-auc:0.823523	eval-auc:0.805486
[118]	train-auc:0.823692	eval-auc:0.805449
[119]	train-auc:0

[94]	train-auc:0.819792	eval-auc:0.806656
[95]	train-auc:0.819972	eval-auc:0.806701
[96]	train-auc:0.820187	eval-auc:0.806688
[97]	train-auc:0.820452	eval-auc:0.806732
[98]	train-auc:0.820658	eval-auc:0.8067
[99]	train-auc:0.820875	eval-auc:0.806768
[100]	train-auc:0.821078	eval-auc:0.806748
[101]	train-auc:0.821285	eval-auc:0.806687
[102]	train-auc:0.82146	eval-auc:0.806646
[103]	train-auc:0.821677	eval-auc:0.806679
[104]	train-auc:0.821868	eval-auc:0.806674
[105]	train-auc:0.822072	eval-auc:0.806737
[106]	train-auc:0.822201	eval-auc:0.806747
[107]	train-auc:0.822344	eval-auc:0.806725
[108]	train-auc:0.822478	eval-auc:0.806759
[109]	train-auc:0.822583	eval-auc:0.806735
[110]	train-auc:0.82277	eval-auc:0.806743
[111]	train-auc:0.822851	eval-auc:0.806749
[112]	train-auc:0.822991	eval-auc:0.806779
[113]	train-auc:0.823159	eval-auc:0.806765
[114]	train-auc:0.823307	eval-auc:0.8067
[115]	train-auc:0.823432	eval-auc:0.806713
[116]	train-auc:0.823601	eval-auc:0.806707
[117]	train-auc:0.82377

[98]	train-auc:0.82036	eval-auc:0.803296
[99]	train-auc:0.82057	eval-auc:0.803292
[100]	train-auc:0.820802	eval-auc:0.803259
[101]	train-auc:0.821033	eval-auc:0.803234
[102]	train-auc:0.821268	eval-auc:0.80326
[103]	train-auc:0.821408	eval-auc:0.803189
[104]	train-auc:0.82158	eval-auc:0.803179
[105]	train-auc:0.821706	eval-auc:0.803177
[106]	train-auc:0.821871	eval-auc:0.803217
[107]	train-auc:0.822067	eval-auc:0.8032
Stopping. Best iteration:
[87]	train-auc:0.818147	eval-auc:0.803583

[0]	train-auc:0.799344	eval-auc:0.796287
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 20 rounds.
[1]	train-auc:0.801495	eval-auc:0.798101
[2]	train-auc:0.802086	eval-auc:0.798786
[3]	train-auc:0.80292	eval-auc:0.799217
[4]	train-auc:0.803478	eval-auc:0.7996
[5]	train-auc:0.803824	eval-auc:0.799921
[6]	train-auc:0.804206	eval-auc:0.800141
[7]	train-auc:0.804305	eval-auc:0.800236
[8]	train-auc:0.804542	eval-auc:0.800378
[9

[15]	train-auc:0.80606	eval-auc:0.79806
[16]	train-auc:0.806279	eval-auc:0.798101
[17]	train-auc:0.806457	eval-auc:0.79802
[18]	train-auc:0.806603	eval-auc:0.797902
[19]	train-auc:0.806727	eval-auc:0.798001
[20]	train-auc:0.806834	eval-auc:0.797948
[21]	train-auc:0.80709	eval-auc:0.79801
[22]	train-auc:0.807314	eval-auc:0.797946
[23]	train-auc:0.807488	eval-auc:0.797884
[24]	train-auc:0.807637	eval-auc:0.797952
[25]	train-auc:0.807782	eval-auc:0.79794
[26]	train-auc:0.807904	eval-auc:0.798048
[27]	train-auc:0.808092	eval-auc:0.798094
[28]	train-auc:0.808253	eval-auc:0.798026
[29]	train-auc:0.808464	eval-auc:0.798132
[30]	train-auc:0.80864	eval-auc:0.798186
[31]	train-auc:0.808828	eval-auc:0.79825
[32]	train-auc:0.808975	eval-auc:0.798386
[33]	train-auc:0.809115	eval-auc:0.798326
[34]	train-auc:0.809262	eval-auc:0.798311
[35]	train-auc:0.809424	eval-auc:0.798339
[36]	train-auc:0.809587	eval-auc:0.798333
[37]	train-auc:0.809742	eval-auc:0.798446
[38]	train-auc:0.809896	eval-auc:0.798425


In [17]:
# Write the out-of-fold train predictions, the averaged test predictions and
# the rank-averaged test predictions, each named after the model.
for _frame, _template in (
    (train_id, "csv/{}_train.csv"),
    (test_id, "csv/{}_test.csv"),
    (test_id_rank, "csv/{}_rank_test.csv"),
):
    _frame.to_csv(_template.format(model_name), index=False)

In [15]:
# Display the held-out AUC of each CV fold, as appended by model_tree.
fold_roc

[0.7968592955141408,
 0.8046223321379965,
 0.799132238460245,
 0.8053018912644471,
 0.7948497568950683,
 0.7907930092277814,
 0.8056664771411471,
 0.7949826339854964,
 0.7977866562555906,
 0.7988201266751856]