In [1]:
model_name = "with_top20_xgb"

In [2]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [3]:
timesheet = [time()]
def timer(statement):
    global timesheet
    timesheet.append(time())
    print statement+" :", (timesheet[-1]-timesheet[-2]),"seconds"
timer("Init...")

Init... : 0.000195026397705 seconds


In [4]:
def load_meta(directory, prefix, modeltype):
    m_tr = pd.read_csv(directory+prefix+"_train.csv")
    m_te = pd.read_csv(directory+prefix+"_test.csv")
    data = pd.concat([m_tr, m_te], axis=0).reset_index(drop=True)
    data.columns = ["{}_{}_{}".format(c, prefix, modeltype) if c!="SK_ID_CURR" else c for c in data.columns]
    return data

def load_neptune(directory, modelname):
    m_tr = pd.read_csv(directory+"lightGBM_out_of_fold_train_predictions.csv", usecols = ["SK_ID_CURR","lightGBM_prediction"])
    m_te = pd.read_csv(directory+"lightGBM_out_of_fold_test_predictions.csv", usecols = ["SK_ID_CURR","lightGBM_prediction"])
    m_te = m_te.groupby("SK_ID_CURR")["lightGBM_prediction"].mean().reset_index()
    
    data = pd.concat([m_tr, m_te], axis=0).reset_index(drop=True)
    data.columns = ["SK_ID_CURR", "neptune_{}".format(modelname)]
    return data

def join_features(data, features):
    for item in features:
        data = data.merge(item, how = "left", on = "SK_ID_CURR")
    return data

def load_data(datafile):
    global important_columns
    colnames = [c.replace(" ","_") for c in pd.read_csv(datafile, nrows= 1).columns]
    intersection = list(set(colnames).intersection(set(important_columns)))+["SK_ID_CURR"]

    df =  pd.read_csv(datafile, names = colnames, usecols = intersection, skiprows=1).dropna(axis=1, how="all")
    timer("Loaded {} with shape {} in ".format(datafile.split("/")[-1], df.shape))
    return df

## Getting Feature Importance File

In [5]:
important_columns = pd.read_csv("../feature selector/importance/shap_importances.csv")
important_columns["feature"] = important_columns["feature"].apply(lambda x: x.replace(" ","_"))
important_columns = important_columns.loc[important_columns.shapely_mean > 0.0].reset_index(drop=True)
important_columns = important_columns.sort_values(by = "shapely_mean", ascending = False).reset_index(drop=True)
important_columns = important_columns.loc[0:20].feature.values.tolist()

## Loading Features

In [6]:
applications = load_data("../extractor/csv/application_features_V2.csv")
bureau_balance_bb = load_data("../extractor/csv/bureau_features_V1.csv")
credit_card_balance = load_data("../extractor/csv/credit_card_features_V1.csv")
installment_features = load_data("../extractor/csv/installment_features_V1.csv")
pos_cash_balance = load_data("../extractor/csv/pos_cash_features_V1.csv")
previous_apps = load_data("../extractor/csv/previous_application_features_V1.csv")

credit_card_balance_v3 = load_data("../extractor/csv/credit_card_features_V3.csv")
installment_features_v3 = load_data("../extractor/csv/installment_features_V3.csv")
pos_cash_balance_v3 = load_data("../extractor/csv/pos_cash_features_V3.csv")
previous_apps_v3 = load_data("../extractor/csv/previous_application_features_V3.csv")

base_features = [applications, bureau_balance_bb, credit_card_balance, installment_features, pos_cash_balance,previous_apps, credit_card_balance_v3,installment_features_v3,pos_cash_balance_v3, previous_apps_v3]

Loaded application_features_V2.csv with shape (356255, 19) in  : 4.71745109558 seconds
Loaded bureau_features_V1.csv with shape (305811, 4) in  : 5.60352396965 seconds
Loaded credit_card_features_V1.csv with shape (103558, 1) in  : 10.0072329044 seconds
Loaded installment_features_V1.csv with shape (339587, 1) in  : 24.2160470486 seconds
Loaded pos_cash_features_V1.csv with shape (337252, 1) in  : 17.2374010086 seconds
Loaded previous_application_features_V1.csv with shape (338857, 1) in  : 15.2979490757 seconds
Loaded credit_card_features_V3.csv with shape (103558, 1) in  : 17.0072720051 seconds
Loaded installment_features_V3.csv with shape (339587, 1) in  : 22.0882189274 seconds
Loaded pos_cash_features_V3.csv with shape (337252, 1) in  : 6.40458488464 seconds
Loaded previous_application_features_V3.csv with shape (338857, 1) in  : 8.19625616074 seconds


In [7]:
tree_meta_features = [
    load_meta("../base trees/csv/", "application", "trees"),
    load_meta("../base trees/csv/", "bureau_balance_bb", "trees"),
    load_meta("../base trees/csv/", "credit_card_balance", "trees"),
    load_meta("../base trees/csv/", "installment", "trees"),
    load_meta("../base trees/csv/", "pos_cash_balance", "trees"),
    load_meta("../base trees/csv/", "previous_apps", "trees"),
]

lr_meta_features = [
    load_meta("../base lr/csv/", "application", "lr"),
    load_meta("../base lr/csv/", "bureau_balance_bb", "lr"),
    load_meta("../base lr/csv/", "credit_card_balance", "lr"),
    load_meta("../base lr/csv/", "installment", "lr"),
    load_meta("../base lr/csv/", "pos_cash_balance", "lr"),
    load_meta("../base lr/csv/", "previous_apps", "lr"),
]

nb_meta_features = [
    load_meta("../base nb/csv/", "application", "nb"),
    load_meta("../base nb/csv/", "bureau_balance_bb", "nb"),
    load_meta("../base nb/csv/", "credit_card_balance", "nb"),
    load_meta("../base nb/csv/", "installment", "nb"),
    load_meta("../base nb/csv/", "pos_cash_balance", "nb"),
    load_meta("../base nb/csv/", "previous_apps", "nb"),
]

neptune_features = [
    load_neptune("../base neptune/m1/csv/", "m1"),
    load_neptune("../base neptune/m2/csv/", "m2"),
    load_neptune("../base neptune/m3/csv/", "m3"),
    load_neptune("../base neptune/m4/csv/", "m4"),
]

mixture_models = [
    load_meta("../base mixtures/csv/", "knn_on_selected_pca", "mixtures"),
    load_meta("../base mixtures/csv/", "lgbm_on_core_features", "mixtures"),
    load_meta("../base mixtures/csv/", "lgbm_on_gp_features", "mixtures"),
    load_meta("../base mixtures/csv/", "lr_on_core_features", "mixtures"),
    load_meta("../base mixtures/csv/", "nn", "mixtures"),
    load_meta("../base mixtures/csv/", "using_lags_bureau_data", "mixtures"),

]

l1_features = [
    load_meta("../l1/csv/", "l1_gnb", "l1"),
    load_meta("../l1/csv/", "l1_lr", "l1"),
    load_meta("../l1/csv/", "l1_tree_with_flags", "l1"),
    load_meta("../l1/csv/", "l1_tree_without_flags", "l1"),
    load_meta("../l1/csv/", "tree_with_flags_without_meta", "l1"),
    load_meta("../l1/csv/", "tree_without_flags_without_meta", "l1"),
    load_meta("../l1/csv/", "tree_on_core_features_with_meta", "l1"),
]

l2_features = [
    load_meta("../l2/csv/", "tree_on_l1_and_basemixtures_neptune_1", "l2"),
    load_meta("../l2/csv/", "tree_on_l1_and_basemixtures_neptune_2", "l2"),
]

final_models = [
    load_meta("../final_models/csv/", "my500_neptuneselected_meta", "final_models"),
    load_meta("../final_models/csv/", "my500_neptuneselected_nometa", "final_models"),
]

## Joining All Files

In [8]:
train = pd.read_csv("../data/application_train.csv", usecols = ["SK_ID_CURR","TARGET"])
test = pd.read_csv("../data/application_test.csv", usecols = ["SK_ID_CURR"])

data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = join_features(data, base_features)
data = join_features(data, tree_meta_features)
data = join_features(data, lr_meta_features)
data = join_features(data, nb_meta_features)
data = join_features(data, neptune_features)
data = join_features(data, mixture_models)
data = join_features(data, l1_features)
data = join_features(data, l2_features)
data = join_features(data, final_models)

del base_features,applications, bureau_balance_bb, credit_card_balance, installment_features, pos_cash_balance,previous_apps, credit_card_balance_v3,installment_features_v3,pos_cash_balance_v3, previous_apps_v3
gc.collect()

train = data.loc[data.TARGET.notnull()].reset_index(drop=True)
test = data.loc[data.TARGET.isnull()].reset_index(drop=True)

train_id = train[["SK_ID_CURR"]]
test_id = test[["SK_ID_CURR"]]
test_id_rank = test[["SK_ID_CURR"]]
target =train.TARGET

train.drop(["SK_ID_CURR", "TARGET"], axis=1, inplace=True)
test.drop(["SK_ID_CURR","TARGET"], axis=1, inplace = True)

# Defining Model

In [19]:
import xgboost as xgb
def model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id):
    dTrain = xgb.DMatrix(x_train, y_train)
    dVal = xgb.DMatrix(x_test,y_test)
    params = {
        "objective": "binary:logistic",
        "booster": "gbtree",
        "eval_metric": "auc",
        "eta": 0.2,
        "max_depth" : 5,
        "min_child_weight":30,
        "gamma":0,
        "subsample": 0.85,
        "colsample_bytree":0.7,
        "colsample_bylevel": 0.6,
        "alpha":0,
        "lambda":0,
        "seed": 98798,
        "tree_method":"gpu_exact"
        
    }
    
    model = xgb.train(params, dTrain, 5000, evals = [(dTrain, "train"),(dVal,"val")], early_stopping_rounds = 30)
    
    meta_train[test_index] = model.predict(dVal, ntree_limit=model.best_ntree_limit)
    meta_test.append(model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit))
        
    global fold_roc
    fold_roc.append(roc_auc_score(y_test, meta_train[test_index]))
 

# Training Model

In [20]:
meta_train = np.zeros(train.shape[0])
meta_test = []
feature_importance = pd.DataFrame(columns = ["feature","split","gain"])
fold_roc = []

kf = StratifiedKFold(n_splits= 10, shuffle=True, random_state=12323)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

test_id["TARGET"] = np.array(meta_test).T.mean(axis=1)
test_id_rank["TARGET"] = pd.DataFrame(np.array(meta_test).T).rank(pct = True).mean(axis=1)
train_id["TARGET"] = meta_train

print "Overall ROC: {},  Mean ROC: {}, STD AUC: {}".format(roc_auc_score(target, meta_train), np.mean(fold_roc), np.std(fold_roc))

[0]	train-auc:0.799521	val-auc:0.794395
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[1]	train-auc:0.80167	val-auc:0.796515
[2]	train-auc:0.802301	val-auc:0.797133
[3]	train-auc:0.802825	val-auc:0.797561
[4]	train-auc:0.803098	val-auc:0.797598
[5]	train-auc:0.803705	val-auc:0.797885
[6]	train-auc:0.804018	val-auc:0.797916
[7]	train-auc:0.804418	val-auc:0.797759
[8]	train-auc:0.804715	val-auc:0.797959
[9]	train-auc:0.80502	val-auc:0.797929
[10]	train-auc:0.805193	val-auc:0.79796
[11]	train-auc:0.805529	val-auc:0.797943
[12]	train-auc:0.80583	val-auc:0.798422
[13]	train-auc:0.806151	val-auc:0.798538
[14]	train-auc:0.806494	val-auc:0.798753
[15]	train-auc:0.806811	val-auc:0.798655
[16]	train-auc:0.80719	val-auc:0.798439
[17]	train-auc:0.807572	val-auc:0.798532
[18]	train-auc:0.807985	val-auc:0.798584
[19]	train-auc:0.808398	val-auc:0.798632
[20]	train-auc:0.808802	val-auc:0.79863
[21]	train-auc:0

[42]	train-auc:0.814216	val-auc:0.807351
[43]	train-auc:0.814564	val-auc:0.807216
[44]	train-auc:0.8148	val-auc:0.807146
[45]	train-auc:0.81487	val-auc:0.807167
[46]	train-auc:0.815261	val-auc:0.807282
[47]	train-auc:0.815454	val-auc:0.807261
[48]	train-auc:0.81569	val-auc:0.80721
[49]	train-auc:0.815991	val-auc:0.80716
[50]	train-auc:0.816451	val-auc:0.807194
[51]	train-auc:0.816792	val-auc:0.807491
[52]	train-auc:0.817094	val-auc:0.807452
[53]	train-auc:0.817103	val-auc:0.807457
[54]	train-auc:0.817834	val-auc:0.807931
[55]	train-auc:0.817922	val-auc:0.807946
[56]	train-auc:0.81853	val-auc:0.807918
[57]	train-auc:0.818886	val-auc:0.808026
[58]	train-auc:0.819371	val-auc:0.807911
[59]	train-auc:0.819551	val-auc:0.807963
[60]	train-auc:0.819569	val-auc:0.807973
[61]	train-auc:0.820227	val-auc:0.807722
[62]	train-auc:0.820504	val-auc:0.807769
[63]	train-auc:0.820514	val-auc:0.807758
[64]	train-auc:0.820588	val-auc:0.807771
[65]	train-auc:0.820816	val-auc:0.807811
[66]	train-auc:0.821108

[22]	train-auc:0.809335	val-auc:0.807331
[23]	train-auc:0.809636	val-auc:0.807108
[24]	train-auc:0.810039	val-auc:0.807078
[25]	train-auc:0.810371	val-auc:0.807069
[26]	train-auc:0.810758	val-auc:0.807128
[27]	train-auc:0.811304	val-auc:0.807232
[28]	train-auc:0.811732	val-auc:0.807359
[29]	train-auc:0.812103	val-auc:0.807396
[30]	train-auc:0.812844	val-auc:0.807428
[31]	train-auc:0.813133	val-auc:0.807423
[32]	train-auc:0.813232	val-auc:0.807375
[33]	train-auc:0.813439	val-auc:0.807382
[34]	train-auc:0.813586	val-auc:0.807312
[35]	train-auc:0.814026	val-auc:0.807493
[36]	train-auc:0.814271	val-auc:0.807415
[37]	train-auc:0.81441	val-auc:0.807418
[38]	train-auc:0.814469	val-auc:0.807391
[39]	train-auc:0.814854	val-auc:0.807333
[40]	train-auc:0.815952	val-auc:0.807841
[41]	train-auc:0.816053	val-auc:0.807859
[42]	train-auc:0.816493	val-auc:0.807688
[43]	train-auc:0.81669	val-auc:0.807672
[44]	train-auc:0.816956	val-auc:0.807799
[45]	train-auc:0.817173	val-auc:0.807796
[46]	train-auc:0.8

[72]	train-auc:0.821442	val-auc:0.805216
[73]	train-auc:0.821756	val-auc:0.805288
[74]	train-auc:0.821856	val-auc:0.805261
[75]	train-auc:0.821896	val-auc:0.805282
[76]	train-auc:0.822121	val-auc:0.805195
[77]	train-auc:0.822325	val-auc:0.805296
[78]	train-auc:0.822845	val-auc:0.805199
[79]	train-auc:0.823195	val-auc:0.8051
[80]	train-auc:0.823536	val-auc:0.805114
[81]	train-auc:0.823913	val-auc:0.805082
[82]	train-auc:0.825625	val-auc:0.806145
[83]	train-auc:0.825838	val-auc:0.806002
[84]	train-auc:0.826314	val-auc:0.805907
[85]	train-auc:0.826487	val-auc:0.805809
[86]	train-auc:0.826753	val-auc:0.805888
[87]	train-auc:0.827148	val-auc:0.805841
[88]	train-auc:0.827537	val-auc:0.805942
[89]	train-auc:0.827817	val-auc:0.805803
[90]	train-auc:0.828213	val-auc:0.805629
[91]	train-auc:0.828404	val-auc:0.805725
[92]	train-auc:0.828461	val-auc:0.805727
[93]	train-auc:0.828737	val-auc:0.805638
[94]	train-auc:0.828819	val-auc:0.805664
[95]	train-auc:0.829317	val-auc:0.805627
[96]	train-auc:0.8

[7]	train-auc:0.80436	val-auc:0.798722
[8]	train-auc:0.804616	val-auc:0.798497
[9]	train-auc:0.804872	val-auc:0.798477
[10]	train-auc:0.805336	val-auc:0.798668
[11]	train-auc:0.805635	val-auc:0.798995
[12]	train-auc:0.806043	val-auc:0.798923
[13]	train-auc:0.806307	val-auc:0.798895
[14]	train-auc:0.806627	val-auc:0.799001
[15]	train-auc:0.807054	val-auc:0.798936
[16]	train-auc:0.807507	val-auc:0.79945
[17]	train-auc:0.807864	val-auc:0.799598
[18]	train-auc:0.80826	val-auc:0.799721
[19]	train-auc:0.808598	val-auc:0.79964
[20]	train-auc:0.809017	val-auc:0.799719
[21]	train-auc:0.809455	val-auc:0.79983
[22]	train-auc:0.809824	val-auc:0.799815
[23]	train-auc:0.810325	val-auc:0.800043
[24]	train-auc:0.810831	val-auc:0.800097
[25]	train-auc:0.811141	val-auc:0.80011
[26]	train-auc:0.811438	val-auc:0.80006
[27]	train-auc:0.811734	val-auc:0.800235
[28]	train-auc:0.812064	val-auc:0.800237
[29]	train-auc:0.812498	val-auc:0.800195
[30]	train-auc:0.812913	val-auc:0.800283
[31]	train-auc:0.813184	va

[73]	train-auc:0.824487	val-auc:0.802206
[74]	train-auc:0.824615	val-auc:0.802133
[75]	train-auc:0.824884	val-auc:0.802104
[76]	train-auc:0.825192	val-auc:0.802121
[77]	train-auc:0.825293	val-auc:0.802121
[78]	train-auc:0.825461	val-auc:0.802084
[79]	train-auc:0.825688	val-auc:0.802089
[80]	train-auc:0.82582	val-auc:0.802095
[81]	train-auc:0.82585	val-auc:0.802081
[82]	train-auc:0.826623	val-auc:0.802653
[83]	train-auc:0.826939	val-auc:0.802734
[84]	train-auc:0.827489	val-auc:0.802656
[85]	train-auc:0.827769	val-auc:0.802786
[86]	train-auc:0.827841	val-auc:0.802822
[87]	train-auc:0.828192	val-auc:0.802706
[88]	train-auc:0.828658	val-auc:0.802809
[89]	train-auc:0.829427	val-auc:0.803077
[90]	train-auc:0.829733	val-auc:0.802934
[91]	train-auc:0.83012	val-auc:0.803011
[92]	train-auc:0.830131	val-auc:0.803018
[93]	train-auc:0.830345	val-auc:0.803146
[94]	train-auc:0.830562	val-auc:0.803056
[95]	train-auc:0.830603	val-auc:0.803046
[96]	train-auc:0.831143	val-auc:0.803422
[97]	train-auc:0.83

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Overall ROC: 0.801500139872,  Mean ROC: 0.802409315254, STD AUC: 0.00526507053218


In [21]:
train_id.to_csv("csv/{}_train.csv".format(model_name), index=False)
test_id.to_csv("csv/{}_test.csv".format(model_name), index=False)
test_id_rank.to_csv("csv/{}_rank_test.csv".format(model_name), index=False)

In [22]:
fold_roc

[0.8007267701882583,
 0.7956552681621064,
 0.808026491422582,
 0.7929348199829306,
 0.809982051069776,
 0.7976715310367655,
 0.8063523051552044,
 0.8053605244587121,
 0.8039198487779053,
 0.8034635422872616]

In [None]:
-