In [1]:
model_name = "l1_nn"

In [2]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [3]:
timesheet = [time()]
def timer(statement):
    global timesheet
    timesheet.append(time())
    print statement+" :", (timesheet[-1]-timesheet[-2]),"seconds"

In [4]:
def load_meta(directory, prefix, modeltype):
    m_tr = pd.read_csv(directory+prefix+"_train.csv")
    m_te = pd.read_csv(directory+prefix+"_test.csv")
    data = pd.concat([m_tr, m_te], axis=0).reset_index(drop=True)
    data.columns = ["{}_{}_{}".format(c, prefix, modeltype) if c!="SK_ID_CURR" else c for c in data.columns]
    return data

def join_features(data, features):
    for item in features:
        data = data.merge(item, how = "left", on = "SK_ID_CURR")
    return data

def postprocess(df):
    cols = [c for c in df.columns if c not in ["SK_ID_CURR","SK_ID_PREV","TARGET"]]
    df = df.replace([np.inf, -np.inf], np.nan)
    df.loc[:, cols] = MinMaxScaler().fit_transform(df.loc[:, cols].fillna(df.loc[:, cols].mean()).clip(-1e11,1e11))
    return df

def processColNames(df):
    df.columns = [c.replace(" ","_") for c in df.columns]
    return df


## Loading Features

Loading All Features File 

In [5]:
applications = processColNames(pd.read_csv("../extractor/csv/application_features_normalized_V1.csv").dropna(axis=1, how="all")).drop(["TARGET"], axis=1)
bureau_balance_bb = processColNames(pd.read_csv("../extractor/csv/bureau_features_normalized_V1.csv").dropna(axis=1, how="all"))
credit_card_balance = processColNames(pd.read_csv("../extractor/csv/credit_card_features_normalized_V1.csv").dropna(axis=1, how="all"))
installment_features = processColNames(pd.read_csv("../extractor/csv/installment_features_normalized_V1.csv").dropna(axis=1, how="all"))
pos_cash_balance = processColNames(pd.read_csv("../extractor/csv/pos_cash_features_normalized_V1.csv").dropna(axis=1, how="all"))
previous_apps = processColNames(pd.read_csv("../extractor/csv/previous_application_features_normalized_V1.csv").dropna(axis=1, how="all"))

base_features = [applications, bureau_balance_bb, credit_card_balance, installment_features, pos_cash_balance,previous_apps ]

## Loading Meta Features

In [6]:
tree_meta_features = [
    load_meta("../base trees/csv/", "application", "trees"),
    load_meta("../base trees/csv/", "bureau_balance_bb", "trees"),
    load_meta("../base trees/csv/", "credit_card_balance", "trees"),
    load_meta("../base trees/csv/", "installment", "trees"),
    load_meta("../base trees/csv/", "pos_cash_balance", "trees"),
    load_meta("../base trees/csv/", "previous_apps", "trees"),
]

lr_meta_features = [
    load_meta("../base lr/csv/", "application", "lr"),
    load_meta("../base lr/csv/", "bureau_balance_bb", "lr"),
    load_meta("../base lr/csv/", "credit_card_balance", "lr"),
    load_meta("../base lr/csv/", "installment", "lr"),
    load_meta("../base lr/csv/", "pos_cash_balance", "lr"),
    load_meta("../base lr/csv/", "previous_apps", "lr"),
]

nb_meta_features = [
    load_meta("../base nb/csv/", "application", "nb"),
    load_meta("../base nb/csv/", "bureau_balance_bb", "nb"),
    load_meta("../base nb/csv/", "credit_card_balance", "nb"),
    load_meta("../base nb/csv/", "installment", "nb"),
    load_meta("../base nb/csv/", "pos_cash_balance", "nb"),
    load_meta("../base nb/csv/", "previous_apps", "nb"),
]

nn_meta_features = [
    load_meta("../base nn/csv/", "application", "nb"),
    load_meta("../base nn/csv/", "bureau_balance_bb", "nb"),
    load_meta("../base nn/csv/", "credit_card_balance", "nb"),
    load_meta("../base nn/csv/", "installment", "nb"),
    load_meta("../base nn/csv/", "pos_cash_balance", "nb"),
    load_meta("../base nn/csv/", "previous_apps", "nb"),
]

In [7]:
train = pd.read_csv("../data/application_train.csv", usecols = ["SK_ID_CURR","TARGET"])
test = pd.read_csv("../data/application_test.csv", usecols = ["SK_ID_CURR"])

data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = join_features(data, base_features)
data = join_features(data, tree_meta_features)
data = join_features(data, lr_meta_features)
data = join_features(data, nb_meta_features)
data = join_features(data, nn_meta_features)

data = postprocess(data)
train = data.loc[data.TARGET.notnull()].reset_index(drop=True)
test = data.loc[data.TARGET.isnull()].reset_index(drop=True)

train_id = train[["SK_ID_CURR"]]
test_id = test[["SK_ID_CURR"]]
target =train.TARGET

train.drop(["SK_ID_CURR", "TARGET"], axis=1, inplace=True)
test.drop(["SK_ID_CURR","TARGET"], axis=1, inplace = True)

# Defining Model

In [14]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Activation,InputLayer, Dropout, PReLU, BatchNormalization
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras import metrics
from keras import optimizers
from keras import backend as K

K.tf.set_random_seed(0)
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True 
K.set_session(K.tf.Session(config=cfg))
np.random.seed(0)

def model_nn(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id):
    model = Sequential()
    model.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = x_train.shape[1]))
    model.add(PReLU())
    model.add(Dropout(.4))
    model.add(BatchNormalization())
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss="binary_crossentropy", optimizer="rmsprop")
    
    checkpoint = ModelCheckpoint("weights/{}.hdf5".format(model_name), monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
    early_stop = EarlyStopping(monitor='val_loss', patience=5, mode='auto') 

    model.fit(x_train, y_train, epochs=100, batch_size=30000, validation_data =(x_test, y_test), callbacks=[checkpoint, early_stop], class_weight={1:np.mean(y_train), 0: 1-np.mean(y_train)})

    model.load_weights("weights/{}.hdf5".format(model_name))
    meta_train[test_index] = model.predict(x_test).flatten()
    meta_test.append(model.predict(test).flatten())
    print "ROC_AUC_SCORE: {}".format(roc_auc_score(y_test, meta_train[test_index]))

    del model
    gc.collect()
    K.clear_session()
    K.tf.set_random_seed(0)

# Training Model

In [15]:
meta_train = np.zeros(train.shape[0])
meta_test = []

kf = StratifiedKFold(n_splits= 10, shuffle=True, random_state=47)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index].as_matrix(), train.iloc[test_index].as_matrix()
    y_train, y_test = target[train_index], target[test_index]

    model_nn(x_train, x_test, y_train, y_test, test.as_matrix(), meta_train, meta_test,train_index, test_index,fold_id)

test_id["TARGET"] = np.array(meta_test).T.mean(axis=1)
train_id["TARGET"] = meta_train
print "Overall ROC AUC SCORE: ",roc_auc_score(target,meta_train)

Train on 276759 samples, validate on 30752 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.64768, saving model to weights/l1_nn.hdf5
Epoch 2/100
Epoch 00002: val_loss did not improve
Epoch 3/100
Epoch 00003: val_loss did not improve
Epoch 4/100
Epoch 00004: val_loss improved from 0.64768 to 0.62927, saving model to weights/l1_nn.hdf5
Epoch 5/100
Epoch 00005: val_loss improved from 0.62927 to 0.58114, saving model to weights/l1_nn.hdf5
Epoch 6/100
Epoch 00006: val_loss did not improve
Epoch 7/100
Epoch 00007: val_loss improved from 0.58114 to 0.57002, saving model to weights/l1_nn.hdf5
Epoch 8/100
Epoch 00008: val_loss improved from 0.57002 to 0.56191, saving model to weights/l1_nn.hdf5
Epoch 9/100
Epoch 00009: val_loss improved from 0.56191 to 0.52601, saving model to weights/l1_nn.hdf5
Epoch 10/100
Epoch 00010: val_loss did not improve
Epoch 11/100
Epoch 00011: val_loss did not improve
Epoch 12/100
Epoch 00012: val_loss improved from 0.52601 to 0.49656, saving model t

Epoch 31/100
Epoch 00031: val_loss did not improve
Epoch 32/100
Epoch 00032: val_loss did not improve
Epoch 33/100
Epoch 00033: val_loss did not improve
ROC_AUC_SCORE: 0.574396461576
Train on 276759 samples, validate on 30752 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.68939, saving model to weights/l1_nn.hdf5
Epoch 2/100
Epoch 00002: val_loss did not improve
Epoch 3/100
Epoch 00003: val_loss improved from 0.68939 to 0.65332, saving model to weights/l1_nn.hdf5
Epoch 4/100
Epoch 00004: val_loss did not improve
Epoch 5/100
Epoch 00005: val_loss improved from 0.65332 to 0.62191, saving model to weights/l1_nn.hdf5
Epoch 6/100
Epoch 00006: val_loss improved from 0.62191 to 0.58375, saving model to weights/l1_nn.hdf5
Epoch 7/100
Epoch 00007: val_loss improved from 0.58375 to 0.58073, saving model to weights/l1_nn.hdf5
Epoch 8/100
Epoch 00008: val_loss improved from 0.58073 to 0.54745, saving model to weights/l1_nn.hdf5
Epoch 9/100
Epoch 00009: val_loss improved from 0.54

Epoch 29/100
Epoch 00029: val_loss did not improve
Epoch 30/100
Epoch 00030: val_loss did not improve
Epoch 31/100
Epoch 00031: val_loss did not improve
Epoch 32/100
Epoch 00032: val_loss did not improve
ROC_AUC_SCORE: 0.62643642053
Train on 276759 samples, validate on 30752 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.66838, saving model to weights/l1_nn.hdf5
Epoch 2/100
Epoch 00002: val_loss improved from 0.66838 to 0.66097, saving model to weights/l1_nn.hdf5
Epoch 3/100
Epoch 00003: val_loss improved from 0.66097 to 0.63227, saving model to weights/l1_nn.hdf5
Epoch 4/100
Epoch 00004: val_loss did not improve
Epoch 5/100
Epoch 00005: val_loss did not improve
Epoch 6/100
Epoch 00006: val_loss improved from 0.63227 to 0.61928, saving model to weights/l1_nn.hdf5
Epoch 7/100
Epoch 00007: val_loss did not improve
Epoch 8/100
Epoch 00008: val_loss improved from 0.61928 to 0.55075, saving model to weights/l1_nn.hdf5
Epoch 9/100
Epoch 00009: val_loss did not improve
Epoch

Epoch 28/100
Epoch 00028: val_loss did not improve
Epoch 29/100
Epoch 00029: val_loss did not improve
Epoch 30/100
Epoch 00030: val_loss did not improve
Epoch 31/100
Epoch 00031: val_loss did not improve
Epoch 32/100
Epoch 00032: val_loss did not improve
ROC_AUC_SCORE: 0.570511563246
Train on 276759 samples, validate on 30752 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.68365, saving model to weights/l1_nn.hdf5
Epoch 2/100
Epoch 00002: val_loss improved from 0.68365 to 0.64989, saving model to weights/l1_nn.hdf5
Epoch 3/100
Epoch 00003: val_loss did not improve
Epoch 4/100
Epoch 00004: val_loss improved from 0.64989 to 0.63239, saving model to weights/l1_nn.hdf5
Epoch 5/100
Epoch 00005: val_loss improved from 0.63239 to 0.60526, saving model to weights/l1_nn.hdf5
Epoch 6/100
Epoch 00006: val_loss improved from 0.60526 to 0.58797, saving model to weights/l1_nn.hdf5
Epoch 7/100
Epoch 00007: val_loss did not improve
Epoch 8/100
Epoch 00008: val_loss improved from 0.587

Epoch 27/100
Epoch 00027: val_loss improved from 0.28188 to 0.28063, saving model to weights/l1_nn.hdf5
Epoch 28/100
Epoch 00028: val_loss did not improve
Epoch 29/100
Epoch 00029: val_loss did not improve
Epoch 30/100
Epoch 00030: val_loss did not improve
Epoch 31/100
Epoch 00031: val_loss did not improve
Epoch 32/100
Epoch 00032: val_loss did not improve
ROC_AUC_SCORE: 0.590824098617
Train on 276760 samples, validate on 30751 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.75395, saving model to weights/l1_nn.hdf5
Epoch 2/100
Epoch 00002: val_loss improved from 0.75395 to 0.70338, saving model to weights/l1_nn.hdf5
Epoch 3/100
Epoch 00003: val_loss did not improve
Epoch 4/100
Epoch 00004: val_loss did not improve
Epoch 5/100
Epoch 00005: val_loss improved from 0.70338 to 0.66760, saving model to weights/l1_nn.hdf5
Epoch 6/100
Epoch 00006: val_loss improved from 0.66760 to 0.60103, saving model to weights/l1_nn.hdf5
Epoch 7/100
Epoch 00007: val_loss did not improve
Ep

Epoch 00024: val_loss improved from 0.29496 to 0.28873, saving model to weights/l1_nn.hdf5
Epoch 25/100
Epoch 00025: val_loss improved from 0.28873 to 0.28428, saving model to weights/l1_nn.hdf5
Epoch 26/100
Epoch 00026: val_loss improved from 0.28428 to 0.28157, saving model to weights/l1_nn.hdf5
Epoch 27/100
Epoch 00027: val_loss improved from 0.28157 to 0.28058, saving model to weights/l1_nn.hdf5
Epoch 28/100
Epoch 00028: val_loss did not improve
Epoch 29/100
Epoch 00029: val_loss did not improve
Epoch 30/100
Epoch 00030: val_loss did not improve
Epoch 31/100
Epoch 00031: val_loss did not improve
Epoch 32/100
Epoch 00032: val_loss did not improve
ROC_AUC_SCORE: 0.58197166621
Train on 276761 samples, validate on 30750 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.69155, saving model to weights/l1_nn.hdf5
Epoch 2/100
Epoch 00002: val_loss improved from 0.69155 to 0.67161, saving model to weights/l1_nn.hdf5
Epoch 3/100
Epoch 00003: val_loss improved from 0.67161 to 0

Epoch 23/100
Epoch 00023: val_loss improved from 0.30364 to 0.29802, saving model to weights/l1_nn.hdf5
Epoch 24/100
Epoch 00024: val_loss improved from 0.29802 to 0.29043, saving model to weights/l1_nn.hdf5
Epoch 25/100
Epoch 00025: val_loss improved from 0.29043 to 0.28426, saving model to weights/l1_nn.hdf5
Epoch 26/100
Epoch 00026: val_loss improved from 0.28426 to 0.28207, saving model to weights/l1_nn.hdf5
Epoch 27/100
Epoch 00027: val_loss improved from 0.28207 to 0.28041, saving model to weights/l1_nn.hdf5
Epoch 28/100
Epoch 00028: val_loss did not improve
Epoch 29/100
Epoch 00029: val_loss did not improve
Epoch 30/100
Epoch 00030: val_loss did not improve
Epoch 31/100
Epoch 00031: val_loss did not improve
Epoch 32/100
Epoch 00032: val_loss did not improve
ROC_AUC_SCORE: 0.601878786923
Train on 276761 samples, validate on 30750 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.70341, saving model to weights/l1_nn.hdf5
Epoch 2/100
Epoch 00002: val_loss improved fr

Epoch 23/100
Epoch 00023: val_loss improved from 0.30320 to 0.29142, saving model to weights/l1_nn.hdf5
Epoch 24/100
Epoch 00024: val_loss improved from 0.29142 to 0.28583, saving model to weights/l1_nn.hdf5
Epoch 25/100
Epoch 00025: val_loss improved from 0.28583 to 0.28217, saving model to weights/l1_nn.hdf5
Epoch 26/100
Epoch 00026: val_loss improved from 0.28217 to 0.28184, saving model to weights/l1_nn.hdf5
Epoch 27/100
Epoch 00027: val_loss improved from 0.28184 to 0.27996, saving model to weights/l1_nn.hdf5
Epoch 28/100
Epoch 00028: val_loss did not improve
Epoch 29/100
Epoch 00029: val_loss did not improve
Epoch 30/100
Epoch 00030: val_loss did not improve
Epoch 31/100
Epoch 00031: val_loss did not improve
Epoch 32/100
Epoch 00032: val_loss did not improve
ROC_AUC_SCORE: 0.597116509279
Train on 276761 samples, validate on 30750 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.70612, saving model to weights/l1_nn.hdf5
Epoch 2/100
Epoch 00002: val_loss improved fr

Epoch 00022: val_loss improved from 0.30206 to 0.29588, saving model to weights/l1_nn.hdf5
Epoch 23/100
Epoch 00023: val_loss improved from 0.29588 to 0.29128, saving model to weights/l1_nn.hdf5
Epoch 24/100
Epoch 00024: val_loss improved from 0.29128 to 0.28570, saving model to weights/l1_nn.hdf5
Epoch 25/100
Epoch 00025: val_loss improved from 0.28570 to 0.28236, saving model to weights/l1_nn.hdf5
Epoch 26/100
Epoch 00026: val_loss improved from 0.28236 to 0.28150, saving model to weights/l1_nn.hdf5
Epoch 27/100
Epoch 00027: val_loss improved from 0.28150 to 0.28024, saving model to weights/l1_nn.hdf5
Epoch 28/100
Epoch 00028: val_loss did not improve
Epoch 29/100
Epoch 00029: val_loss did not improve
Epoch 30/100
Epoch 00030: val_loss did not improve
Epoch 31/100
Epoch 00031: val_loss did not improve
Epoch 32/100
Epoch 00032: val_loss did not improve
ROC_AUC_SCORE: 0.580120813539


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Overall ROC AUC SCORE:  0.520250874507322


In [16]:
train_id.to_csv("csv/{}_train.csv".format(model_name), index=False)
test_id.to_csv("csv/{}_test.csv".format(model_name), index=False)