In [15]:
# Label for this experiment run; it is not referenced by any of the cells visible here.
model_name = "testing_consistency"

In [16]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [17]:
# Wall-clock checkpoints; the first entry is the moment this cell was executed.
timesheet = [time()]
def timer(statement):
    """Print the seconds elapsed since the previous checkpoint and record a new one.

    Parameters
    ----------
    statement : str
        Label printed in front of the elapsed time.

    Side effects
    ------------
    Appends the current time to the module-level ``timesheet`` list, so the
    next call measures from this point.
    """
    # ``timesheet`` is only mutated, never rebound, so no ``global`` is required.
    timesheet.append(time())
    elapsed = timesheet[-1] - timesheet[-2]
    # Function-call form of print: same text as the old print statement, but it
    # also parses on Python 3 and matches the print(...) style used elsewhere.
    print("{} : {} seconds".format(statement, elapsed))

def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool
        Forwarded to ``pd.get_dummies`` as ``dummy_na`` (adds a NaN indicator
        column per encoded column when True).

    Returns
    -------
    (pandas.DataFrame, list)
        The encoded frame and the names of the columns that were not present
        in the input.
    """
    columns_before = list(df.columns)

    # Only columns stored with dtype ``object`` are treated as categorical.
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

def load_meta(directory, prefix, modeltype):
    """Load a model's train and test CSVs and stack them into one frame.

    Reads ``directory + prefix + "_train.csv"`` and the matching ``_test.csv``,
    concatenates them (train rows first) with a fresh 0..n-1 index, and renames
    every column except ``SK_ID_CURR`` to ``<column>_<prefix>_<modeltype>``.
    """
    def tag(column):
        # The join key keeps its name; everything else gets the model suffix.
        if column == "SK_ID_CURR":
            return column
        return "{}_{}_{}".format(column, prefix, modeltype)

    train_part = pd.read_csv(directory+prefix+"_train.csv")
    test_part = pd.read_csv(directory+prefix+"_test.csv")

    stacked = pd.concat([train_part, test_part], axis=0)
    stacked = stacked.reset_index(drop=True)
    stacked.columns = [tag(column) for column in stacked.columns]
    return stacked

def join_features(data, features):
    """Left-join each frame in ``features`` onto ``data`` by ``SK_ID_CURR``.

    Frames are merged in the order given; rows of ``data`` are always kept.
    With an empty ``features`` the input object itself is returned.
    """
    merged = data
    for feature_frame in features:
        merged = merged.merge(feature_frame, on="SK_ID_CURR", how="left")
    return merged

def postprocess(df):
    """Return a new frame with every +inf / -inf value replaced by NaN.

    The replacement is applied to the whole frame; ``df`` itself is left
    untouched because ``DataFrame.replace`` returns a new object by default.

    The previous version also built a list of non-id columns that was never
    used; that dead local has been removed.
    """
    return df.replace([np.inf, -np.inf], np.nan)

def load_neptune(directory, modelname):
    """Load the LightGBM out-of-fold prediction CSVs found in ``directory``.

    Only ``SK_ID_CURR`` and ``lightGBM_prediction`` are read. Test predictions
    are averaged per ``SK_ID_CURR``; train predictions are used as they are.
    The stacked result (train rows first) has the columns ``SK_ID_CURR`` and
    ``neptune_<modelname>``.
    """
    wanted = ["SK_ID_CURR", "lightGBM_prediction"]

    oof_train = pd.read_csv(directory+"lightGBM_out_of_fold_train_predictions.csv", usecols=wanted)
    oof_test = pd.read_csv(directory+"lightGBM_out_of_fold_test_predictions.csv", usecols=wanted)

    # The test file can hold several rows per id; collapse them to their mean.
    test_mean = oof_test.groupby("SK_ID_CURR")["lightGBM_prediction"].mean()
    oof_test = test_mean.reset_index()

    stacked = pd.concat([oof_train, oof_test], axis=0)
    stacked = stacked.reset_index(drop=True)
    stacked.columns = ["SK_ID_CURR", "neptune_{}".format(modelname)]
    return stacked

# Loading Meta Files

In [60]:
# Each list holds one prediction frame per model, in the order listed.
# Comment out an entry to drop that model from the stack.
# A generator inside list(...) is used so no loop variable leaks into the
# notebook namespace on Python 2.
l1_features = list(
    load_meta("../l1/csv/", l1_prefix, "l1")
    for l1_prefix in (
        "l1_gnb",
        "l1_lr",
        "l1_tree_with_flags",
        "l1_tree_without_flags",
        "tree_with_flags_without_meta",
        "tree_without_flags_without_meta",
        "tree_on_core_features_with_meta",
    )
)

neptune_features = list(
    load_neptune("../base neptune/{}/csv/".format(neptune_run), neptune_run)
    for neptune_run in (
        "m1",
        "m2",
        "m3",
        "m4",
    )
)

extra_models = list(
    load_meta("../base mixtures/csv/", extra_prefix, "extras")
    for extra_prefix in (
        "knn_on_selected_pca",
        "lgbm_on_core_features",
        "lgbm_on_gp_features",
        "lr_on_core_features",
        "nn",
        "using_lags_bureau_data",
    )
)

In [61]:
def application_train_test(num_rows = None, nan_as_category = False, data_dir = '../data/'):
    """Load the application ids (and TARGET for train) as one stacked frame.

    Parameters
    ----------
    num_rows : int or None
        Passed to ``pd.read_csv`` as ``nrows`` for both files; None reads all rows.
    nan_as_category : bool
        Not used by this function; kept so existing callers keep working.
    data_dir : str
        Directory prefix (including the trailing separator) that contains
        ``application_train.csv`` and ``application_test.csv``.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows with the columns ``SK_ID_CURR`` and
        ``TARGET`` (NaN for test rows), plus an ``index`` column holding each
        row's position inside its original file.

    Notes
    -----
    Only ``SK_ID_CURR`` and ``TARGET`` are read from disk. The feature
    engineering that used to sit here as commented-out code referenced columns
    that are never loaded, so it has been removed.
    """
    # Read data and merge
    df = pd.read_csv(data_dir + 'application_train.csv', nrows= num_rows, usecols=["TARGET","SK_ID_CURR"])
    test_df = pd.read_csv(data_dir + 'application_test.csv', nrows= num_rows, usecols=["SK_ID_CURR"])

    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # reset_index() deliberately keeps the old row labels as an 'index' column:
    # the data-preparation cell drops that column by name.
    df = pd.concat([df, test_df]).reset_index()
    return df

# Generate Data For Training

In [62]:
# None loads every row; set an integer to work on a small sample.
num_rows = None
timer("Init.. ")

data = application_train_test(num_rows)
timer("Done with Application Train and Test")

data = postprocess(data)

# Attach every stacked-model prediction frame by SK_ID_CURR (left joins).
data = join_features(data, l1_features)
data = join_features(data, neptune_features)
data = join_features(data, extra_models)
timer("Done with Meta Features")

# Rows with a TARGET are the training set; rows without one are the test set.
train = data[data['TARGET'].notnull()]
test = data[data['TARGET'].isnull()]

target = train.TARGET
# .copy() makes the id frames independent objects: a TARGET column is added to
# them after training, and writing into a slice of `train` / `test` would
# raise pandas' SettingWithCopyWarning.
train_id = train[["SK_ID_CURR"]].copy()
test_id = test[["SK_ID_CURR"]].copy()

# 'index' is the helper column produced by reset_index() in application_train_test.
train = train.drop(['TARGET','SK_ID_CURR','index'], axis=1)
test = test.drop(['TARGET','SK_ID_CURR','index'], axis=1)

Init..  : 577.667269945 seconds
Train samples: 307511, test samples: 48744
Done with Application Train and Test : 2.33161091805 seconds
Done with Meta Features : 0.841617107391 seconds


# Defining Model

In [63]:
import lightgbm as lgb
def model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id):
    """Train one LightGBM fold and record its predictions, AUC and importances.

    Parameters
    ----------
    x_train, y_train : features / labels used to fit the fold model.
    x_test, y_test : held-out fold used for early stopping and scoring.
    test : feature frame that is scored with the fitted fold model.
    meta_train : numpy array, modified in place; positions ``test_index``
        receive the percentile rank of the held-out predictions.
    meta_test : list, modified in place; one Series of percentile-ranked
        predictions for ``test`` is appended per call.
    train_index, fold_id : accepted for the caller's convenience, not used here.
    test_index : positions of the held-out rows inside ``meta_train``.

    Returns
    -------
    None. Besides the two arguments above, the module-level ``roc_scores``
    list gets the fold AUC appended and the module-level ``feature_importance``
    frame is rebound with this fold's rows added.
    """
    # `feature_importance` is rebound below, so it must be declared global;
    # `roc_scores` is only mutated and needs no declaration.
    global feature_importance

    # Single source of truth for the round cap and the prediction fallback.
    num_boost_round = 5000

    dtrain = lgb.Dataset(x_train, label=y_train)
    dval = lgb.Dataset(x_test, label=y_test)
    # NOTE(review): the values are kept exactly as they were so results stay
    # comparable. Several keys look like scikit-learn wrapper arguments
    # ('silent', 'class_weight', 'n_jobs' next to 'nthread'), and
    # 'n_estimators' may act as an alias that overrides num_boost_round --
    # confirm against the installed LightGBM version before relying on either.
    params = {
        'num_leaves': 8, 
        'reg_alpha': 0.6, 
        'n_jobs': -1, 
        'colsample_bytree': 0.6, 
        'silent': -1, 
        'subsample_for_bin': 200000, 
        'subsample_freq': 1, 
        'learning_rate': 0.02, 
        'boosting_type': 'gbdt', 
        'nthread': 8, 
        'min_child_weight': 20, 
        'min_split_gain': 0.0222415, 
        'n_estimators': 10000, 
        'subsample': 0.6, 
        'reg_lambda': 0.2, 
        'objective': "binary",
        'verbose': -1, 
        'min_child_samples': 20, 
        'max_depth': 3, 
        'class_weight': None,
        "bagging_seed" : 3143,
        "seed":1343,
        "metric":"auc"
    }
    model = lgb.train(params, dtrain, num_boost_round=num_boost_round, valid_sets=[dtrain, dval], early_stopping_rounds=200, verbose_eval=100)

    # Fall back to the round cap when the booster reports no best iteration.
    best_iteration = model.best_iteration or num_boost_round
    p_valid = model.predict(x_test, num_iteration=best_iteration)
    p_test = model.predict(test, num_iteration=best_iteration)

    # Predictions are stored as within-fold percentile ranks, not raw scores.
    meta_train[test_index] = pd.Series(p_valid).rank(pct=True).values
    meta_test.append(pd.Series(p_test).rank(pct=True))

    roc_scores.append(roc_auc_score(y_test, meta_train[test_index]))

    # Calculate Feature Importance ('gain' is expressed as a percentage of the fold total)
    gain = model.feature_importance('gain')
    fold_feature_importance = pd.DataFrame(
        {
            'feature': model.feature_name(),
            'split': model.feature_importance('split'),
            'gain': 100 * gain / gain.sum(),
        },
        # Same column order as the accumulator so the frames line up exactly.
        columns=["feature", "split", "gain"],
    )
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    feature_importance = pd.concat([feature_importance, fold_feature_importance], ignore_index=True)

# Training Classifier

In [64]:
# Accumulators filled by model_tree(): out-of-fold predictions for the train
# rows, one prediction Series per fold for the test rows, per-fold feature
# importances and per-fold AUC scores.
meta_train = np.zeros(train.shape[0])
meta_test = []
feature_importance = pd.DataFrame(columns = ["feature","split","gain"])
roc_scores = []
kf = StratifiedKFold(n_splits= 10, shuffle=True, random_state=47)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    # kf.split yields positions, so index the labels positionally as well;
    # plain target[...] is label-based and only works while the index happens
    # to be 0..n-1.
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

# Test prediction = mean of the per-fold ranked predictions for each row.
# .assign builds new frames instead of writing into what may be a slice of
# another frame, which avoids pandas' SettingWithCopyWarning.
test_id = test_id.assign(TARGET=np.array(meta_test).T.mean(axis=1))
train_id = train_id.assign(TARGET=meta_train)

Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.797609	valid_1's auc: 0.790266
[200]	training's auc: 0.79774	valid_1's auc: 0.790294
Early stopping, best iteration is:
[77]	training's auc: 0.797436	valid_1's auc: 0.790454
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.79639	valid_1's auc: 0.800933
[200]	training's auc: 0.796583	valid_1's auc: 0.801004
[300]	training's auc: 0.796822	valid_1's auc: 0.801032
[400]	training's auc: 0.797107	valid_1's auc: 0.801089
[500]	training's auc: 0.797379	valid_1's auc: 0.801107
[600]	training's auc: 0.797631	valid_1's auc: 0.801121
[700]	training's auc: 0.79787	valid_1's auc: 0.80111
Early stopping, best iteration is:
[555]	training's auc: 0.797511	valid_1's auc: 0.801155
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.796417	valid_1's auc: 0.800361
[200]	training's auc: 0.796603	valid_1's auc: 0.800512
[300]	training's auc: 0.796874	

In [65]:
# Overall out-of-fold AUC, mean of the per-fold AUCs, and their standard deviation.
# str() on each value reproduces what the old print statement emitted, and the
# function-call form of print also parses on Python 3.
print(" ".join(map(str, [
    roc_auc_score(target, meta_train),
    np.mean(roc_scores),
    np.std(roc_scores),
])))

0.7964162223585018 0.7963935960078921 0.005805475029734116


In [66]:
# Ablation log. Each line: feature set, overall OOF AUC, mean fold AUC, std of fold AUC
# All 0.7994254463684968 0.7993540322311966 0.005739664094703111
# Removing GNB 0.7994209223662303 0.7994101785078225 0.005698645371701253
# Removing NN 0.7995554382613302 0.7995178641615923 0.005721003196940527
# Removing GNB and NN 0.7995550394828136 0.7995350363356779 0.005760394888299419
# Removing LR and NN 0.7995640485856014 0.7995391210455766 0.005786565411124571
# Removing KNN, GP Feats and NN 0.7992251925082449 0.7991880177095584 0.005715337388051275
# Using Only Neptune Feats 0.7964162223585018 0.7963935960078921 0.005805475029734116