In [1]:
model_name = "with_top3_cb"

In [2]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [3]:
timesheet = [time()]
def timer(statement):
    global timesheet
    timesheet.append(time())
    print statement+" :", (timesheet[-1]-timesheet[-2]),"seconds"
timer("Init...")

Init... : 0.00019097328186 seconds


In [4]:
def load_meta(directory, prefix, modeltype):
    m_tr = pd.read_csv(directory+prefix+"_train.csv")
    m_te = pd.read_csv(directory+prefix+"_test.csv")
    data = pd.concat([m_tr, m_te], axis=0).reset_index(drop=True)
    data.columns = ["{}_{}_{}".format(c, prefix, modeltype) if c!="SK_ID_CURR" else c for c in data.columns]
    return data

def load_neptune(directory, modelname):
    m_tr = pd.read_csv(directory+"lightGBM_out_of_fold_train_predictions.csv", usecols = ["SK_ID_CURR","lightGBM_prediction"])
    m_te = pd.read_csv(directory+"lightGBM_out_of_fold_test_predictions.csv", usecols = ["SK_ID_CURR","lightGBM_prediction"])
    m_te = m_te.groupby("SK_ID_CURR")["lightGBM_prediction"].mean().reset_index()
    
    data = pd.concat([m_tr, m_te], axis=0).reset_index(drop=True)
    data.columns = ["SK_ID_CURR", "neptune_{}".format(modelname)]
    return data

def join_features(data, features):
    for item in features:
        data = data.merge(item, how = "left", on = "SK_ID_CURR")
    return data

def load_data(datafile):
    global important_columns
    colnames = [c.replace(" ","_") for c in pd.read_csv(datafile, nrows= 1).columns]
    intersection = list(set(colnames).intersection(set(important_columns)))+["SK_ID_CURR"]

    df =  pd.read_csv(datafile, names = colnames, usecols = intersection, skiprows=1).dropna(axis=1, how="all")
    timer("Loaded {} with shape {} in ".format(datafile.split("/")[-1], df.shape))
    return df

## Getting Feature Importance File

In [5]:
important_columns = pd.read_csv("../feature selector/importance/shap_importances.csv")
important_columns["feature"] = important_columns["feature"].apply(lambda x: x.replace(" ","_"))
important_columns = important_columns.loc[important_columns.shapely_mean > 0.0].reset_index(drop=True)
important_columns = important_columns.sort_values(by = "shapely_mean", ascending = False).reset_index(drop=True)
important_columns = important_columns.loc[0:3].feature.values.tolist()

## Loading Features

In [6]:
applications = load_data("../extractor/csv/application_features_V2.csv")
bureau_balance_bb = load_data("../extractor/csv/bureau_features_V1.csv")
credit_card_balance = load_data("../extractor/csv/credit_card_features_V1.csv")
installment_features = load_data("../extractor/csv/installment_features_V1.csv")
pos_cash_balance = load_data("../extractor/csv/pos_cash_features_V1.csv")
previous_apps = load_data("../extractor/csv/previous_application_features_V1.csv")

credit_card_balance_v3 = load_data("../extractor/csv/credit_card_features_V3.csv")
installment_features_v3 = load_data("../extractor/csv/installment_features_V3.csv")
pos_cash_balance_v3 = load_data("../extractor/csv/pos_cash_features_V3.csv")
previous_apps_v3 = load_data("../extractor/csv/previous_application_features_V3.csv")

base_features = [applications, bureau_balance_bb, credit_card_balance, installment_features, pos_cash_balance,previous_apps, credit_card_balance_v3,installment_features_v3,pos_cash_balance_v3, previous_apps_v3]

Loaded application_features_V2.csv with shape (356255, 5) in  : 5.94688200951 seconds
Loaded bureau_features_V1.csv with shape (305811, 1) in  : 5.52722811699 seconds
Loaded credit_card_features_V1.csv with shape (103558, 1) in  : 10.1691598892 seconds
Loaded installment_features_V1.csv with shape (339587, 1) in  : 24.5087480545 seconds
Loaded pos_cash_features_V1.csv with shape (337252, 1) in  : 17.2392170429 seconds
Loaded previous_application_features_V1.csv with shape (338857, 1) in  : 15.2878460884 seconds
Loaded credit_card_features_V3.csv with shape (103558, 1) in  : 16.7036290169 seconds
Loaded installment_features_V3.csv with shape (339587, 1) in  : 21.8799698353 seconds
Loaded pos_cash_features_V3.csv with shape (337252, 1) in  : 6.37836503983 seconds
Loaded previous_application_features_V3.csv with shape (338857, 1) in  : 8.11299610138 seconds


In [7]:
tree_meta_features = [
    load_meta("../base trees/csv/", "application", "trees"),
    load_meta("../base trees/csv/", "bureau_balance_bb", "trees"),
    load_meta("../base trees/csv/", "credit_card_balance", "trees"),
    load_meta("../base trees/csv/", "installment", "trees"),
    load_meta("../base trees/csv/", "pos_cash_balance", "trees"),
    load_meta("../base trees/csv/", "previous_apps", "trees"),
]

lr_meta_features = [
    load_meta("../base lr/csv/", "application", "lr"),
    load_meta("../base lr/csv/", "bureau_balance_bb", "lr"),
    load_meta("../base lr/csv/", "credit_card_balance", "lr"),
    load_meta("../base lr/csv/", "installment", "lr"),
    load_meta("../base lr/csv/", "pos_cash_balance", "lr"),
    load_meta("../base lr/csv/", "previous_apps", "lr"),
]

nb_meta_features = [
    load_meta("../base nb/csv/", "application", "nb"),
    load_meta("../base nb/csv/", "bureau_balance_bb", "nb"),
    load_meta("../base nb/csv/", "credit_card_balance", "nb"),
    load_meta("../base nb/csv/", "installment", "nb"),
    load_meta("../base nb/csv/", "pos_cash_balance", "nb"),
    load_meta("../base nb/csv/", "previous_apps", "nb"),
]

neptune_features = [
    load_neptune("../base neptune/m1/csv/", "m1"),
    load_neptune("../base neptune/m2/csv/", "m2"),
    load_neptune("../base neptune/m3/csv/", "m3"),
    load_neptune("../base neptune/m4/csv/", "m4"),
]

mixture_models = [
    load_meta("../base mixtures/csv/", "knn_on_selected_pca", "mixtures"),
    load_meta("../base mixtures/csv/", "lgbm_on_core_features", "mixtures"),
    load_meta("../base mixtures/csv/", "lgbm_on_gp_features", "mixtures"),
    load_meta("../base mixtures/csv/", "lr_on_core_features", "mixtures"),
    load_meta("../base mixtures/csv/", "nn", "mixtures"),
    load_meta("../base mixtures/csv/", "using_lags_bureau_data", "mixtures"),

]

l1_features = [
    load_meta("../l1/csv/", "l1_gnb", "l1"),
    load_meta("../l1/csv/", "l1_lr", "l1"),
    load_meta("../l1/csv/", "l1_tree_with_flags", "l1"),
    load_meta("../l1/csv/", "l1_tree_without_flags", "l1"),
    load_meta("../l1/csv/", "tree_with_flags_without_meta", "l1"),
    load_meta("../l1/csv/", "tree_without_flags_without_meta", "l1"),
    load_meta("../l1/csv/", "tree_on_core_features_with_meta", "l1"),
]

l2_features = [
    load_meta("../l2/csv/", "tree_on_l1_and_basemixtures_neptune_1", "l2"),
    load_meta("../l2/csv/", "tree_on_l1_and_basemixtures_neptune_2", "l2"),
]

final_models = [
    load_meta("../final_models/csv/", "my500_neptuneselected_meta", "final_models"),
    load_meta("../final_models/csv/", "my500_neptuneselected_nometa", "final_models"),
]

## Joining All Files

In [8]:
train = pd.read_csv("../data/application_train.csv", usecols = ["SK_ID_CURR","TARGET"])
test = pd.read_csv("../data/application_test.csv", usecols = ["SK_ID_CURR"])

data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = join_features(data, base_features)
data = join_features(data, tree_meta_features)
data = join_features(data, lr_meta_features)
data = join_features(data, nb_meta_features)
data = join_features(data, neptune_features)
data = join_features(data, mixture_models)
data = join_features(data, l1_features)
data = join_features(data, l2_features)
data = join_features(data, final_models)

del base_features,applications, bureau_balance_bb, credit_card_balance, installment_features, pos_cash_balance,previous_apps, credit_card_balance_v3,installment_features_v3,pos_cash_balance_v3, previous_apps_v3
gc.collect()

train = data.loc[data.TARGET.notnull()].reset_index(drop=True)
test = data.loc[data.TARGET.isnull()].reset_index(drop=True)

train_id = train[["SK_ID_CURR"]]
test_id = test[["SK_ID_CURR"]]
test_id_rank = test[["SK_ID_CURR"]]
target =train.TARGET

train.drop(["SK_ID_CURR", "TARGET"], axis=1, inplace=True)
test.drop(["SK_ID_CURR","TARGET"], axis=1, inplace = True)

# Defining Model

In [9]:
from catboost import CatBoostClassifier
def model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id):
    model = CatBoostClassifier(thread_count = 16,iterations=1000,learning_rate=0.05,depth=5,l2_leaf_reg=40,bootstrap_type='Bernoulli',subsample=0.7,scale_pos_weight=5,eval_metric='AUC',metric_period=50,od_type='Iter',od_wait=45, random_seed=17, allow_writing_files=False)
    model.fit(x_train, y_train, eval_set=(x_test, y_test),use_best_model=True,verbose=True)
    
    meta_train[test_index] = model.predict_proba(x_test)[:,1]
    meta_test.append(model.predict_proba(test)[:,1])
        
    global fold_roc
    fold_roc.append(roc_auc_score(y_test, meta_train[test_index]))
 

# Training Model

In [10]:
meta_train = np.zeros(train.shape[0])
meta_test = []
feature_importance = pd.DataFrame(columns = ["feature","split","gain"])
fold_roc = []

kf = StratifiedKFold(n_splits= 10, shuffle=True, random_state=12323)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

test_id["TARGET"] = np.array(meta_test).T.mean(axis=1)
test_id_rank["TARGET"] = pd.DataFrame(np.array(meta_test).T).rank(pct = True).mean(axis=1)
train_id["TARGET"] = meta_train

print "Overall ROC: {},  Mean ROC: {}, STD AUC: {}".format(roc_auc_score(target, meta_train), np.mean(fold_roc), np.std(fold_roc))

0:	test: 0.7870240	best: 0.7870240 (0)	total: 170ms	remaining: 2m 50s
50:	test: 0.7987612	best: 0.7987612 (50)	total: 4.47s	remaining: 1m 23s
100:	test: 0.7998072	best: 0.7998072 (100)	total: 7.83s	remaining: 1m 9s
150:	test: 0.8009001	best: 0.8009001 (150)	total: 11.1s	remaining: 1m 2s
200:	test: 0.8016155	best: 0.8016155 (200)	total: 14.8s	remaining: 58.7s
250:	test: 0.8021512	best: 0.8021512 (250)	total: 18.2s	remaining: 54.3s
300:	test: 0.8024553	best: 0.8024553 (300)	total: 21.5s	remaining: 49.8s
350:	test: 0.8027949	best: 0.8027949 (350)	total: 24.7s	remaining: 45.6s
400:	test: 0.8031205	best: 0.8031205 (400)	total: 28s	remaining: 41.8s
450:	test: 0.8033179	best: 0.8033179 (450)	total: 31.3s	remaining: 38.1s
500:	test: 0.8034070	best: 0.8034070 (500)	total: 34.7s	remaining: 34.5s
550:	test: 0.8036061	best: 0.8036061 (550)	total: 37.9s	remaining: 30.9s
600:	test: 0.8035977	best: 0.8036061 (550)	total: 41.4s	remaining: 27.5s
Stopped by overfitting detector  (45 iterations wait)

be

350:	test: 0.8072393	best: 0.8072393 (350)	total: 37.1s	remaining: 1m 8s
400:	test: 0.8075784	best: 0.8075784 (400)	total: 40.7s	remaining: 1m
450:	test: 0.8078699	best: 0.8078699 (450)	total: 44.2s	remaining: 53.8s
500:	test: 0.8080631	best: 0.8080631 (500)	total: 47.6s	remaining: 47.4s
550:	test: 0.8081712	best: 0.8081712 (550)	total: 51.1s	remaining: 41.6s
600:	test: 0.8084095	best: 0.8084095 (600)	total: 54.6s	remaining: 36.3s
650:	test: 0.8086285	best: 0.8086285 (650)	total: 58.1s	remaining: 31.1s
700:	test: 0.8088712	best: 0.8088712 (700)	total: 1m 1s	remaining: 26.3s
750:	test: 0.8089742	best: 0.8089742 (750)	total: 1m 5s	remaining: 21.6s
800:	test: 0.8090414	best: 0.8090414 (800)	total: 1m 9s	remaining: 17.3s
850:	test: 0.8090484	best: 0.8090484 (850)	total: 1m 13s	remaining: 12.8s
Stopped by overfitting detector  (45 iterations wait)

bestTest = 0.8090484259
bestIteration = 850

Shrink model to first 851 iterations.
0:	test: 0.7820835	best: 0.7820835 (0)	total: 65.5ms	remainin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Overall ROC: 0.805216718597,  Mean ROC: 0.805452528497, STD AUC: 0.00494115815854


In [13]:
train_id.to_csv("csv/{}_train.csv".format(model_name), index=False)
test_id.to_csv("csv/{}_test.csv".format(model_name), index=False)
test_id_rank.to_csv("csv/{}_rank_test.csv".format(model_name), index=False)

In [14]:
fold_roc

[0.8036061469006258,
 0.8003078758615646,
 0.8132977315183267,
 0.7966039883760421,
 0.8125804695460206,
 0.8024476859516076,
 0.8063682470202609,
 0.8090484258701708,
 0.8045758811112287,
 0.8056888328097578]

In [None]:
-