In [19]:
model_name = "top500_myfeatures_with_meta_neptune_parameters_relevant_features"

In [20]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [21]:
timesheet = [time()]
def timer(statement):
    global timesheet
    timesheet.append(time())
    print statement+" :", (timesheet[-1]-timesheet[-2]),"seconds"
timer("Init...")

Init... : 0.000316143035889 seconds


In [22]:
def load_meta(directory, prefix, modeltype):
    m_tr = pd.read_csv(directory+prefix+"_train.csv")
    m_te = pd.read_csv(directory+prefix+"_test.csv")
    data = pd.concat([m_tr, m_te], axis=0).reset_index(drop=True)
    data.columns = ["{}_{}_{}".format(c, prefix, modeltype) if c!="SK_ID_CURR" else c for c in data.columns]
    return data

def join_features(data, features):
    for item in features:
        data = data.merge(item, how = "left", on = "SK_ID_CURR")
    return data

def load_data(datafile):
    global important_columns
    colnames = [c.replace(" ","_") for c in pd.read_csv(datafile, nrows= 1).columns]
    intersection = list(set(colnames).intersection(set(important_columns)))+["SK_ID_CURR"]

    df =  pd.read_csv(datafile, names = colnames, usecols = intersection, skiprows=1).dropna(axis=1, how="all")
    timer("Loaded {} with shape {} in ".format(datafile.split("/")[-1], df.shape))
    return df

def load_neptune(directory, modelname):
    m_tr = pd.read_csv(directory+"lightGBM_out_of_fold_train_predictions.csv", usecols = ["SK_ID_CURR","lightGBM_prediction"])
    m_te = pd.read_csv(directory+"lightGBM_out_of_fold_test_predictions.csv", usecols = ["SK_ID_CURR","lightGBM_prediction"])
    m_te = m_te.groupby("SK_ID_CURR")["lightGBM_prediction"].mean().reset_index()
    
    data = pd.concat([m_tr, m_te], axis=0).reset_index(drop=True)
    data.columns = ["SK_ID_CURR", "neptune_{}".format(modelname)]
    return data

## Getting Feature Importance File

In [23]:
important_columns = pd.read_csv("../feature selector/importance/shap_importances.csv")
important_columns["feature"] = important_columns["feature"].apply(lambda x: x.replace(" ","_"))
important_columns = important_columns.loc[important_columns.shapely_mean > 0.0].reset_index(drop=True)
important_columns = important_columns.sort_values(by = "shapely_mean", ascending = False).reset_index(drop=True)
important_columns = important_columns.loc[0:500].feature.values.tolist()


important_columns_neptune = pd.read_csv("../feature selector/importance/shap_importances_neptune_features.csv")
important_columns_neptune["feature"] = important_columns_neptune["feature"].apply(lambda x: x.replace(" ","_"))
important_columns_neptune = important_columns_neptune.loc[important_columns_neptune.shapely_mean > 0.0].reset_index(drop=True)
important_columns_neptune = important_columns_neptune.sort_values(by = "shapely_mean", ascending = False).reset_index(drop=True)
important_columns_neptune = important_columns_neptune.feature.values.tolist()

## Loading Features

In [24]:
applications = load_data("../extractor/csv/application_features_V2.csv")
bureau_balance_bb = load_data("../extractor/csv/bureau_features_V1.csv")
credit_card_balance = load_data("../extractor/csv/credit_card_features_V1.csv")
installment_features = load_data("../extractor/csv/installment_features_V1.csv")
pos_cash_balance = load_data("../extractor/csv/pos_cash_features_V1.csv")
previous_apps = load_data("../extractor/csv/previous_application_features_V1.csv")

credit_card_balance_v3 = load_data("../extractor/csv/credit_card_features_V3.csv")
installment_features_v3 = load_data("../extractor/csv/installment_features_V3.csv")
pos_cash_balance_v3 = load_data("../extractor/csv/pos_cash_features_V3.csv")
previous_apps_v3 = load_data("../extractor/csv/previous_application_features_V3.csv")

base_features = [applications, bureau_balance_bb, credit_card_balance, installment_features, pos_cash_balance,previous_apps, credit_card_balance_v3,installment_features_v3,pos_cash_balance_v3, previous_apps_v3]

Loaded application_features_V2.csv with shape (356255, 75) in  : 32.8808329105 seconds
Loaded bureau_features_V1.csv with shape (305811, 84) in  : 10.5154500008 seconds
Loaded credit_card_features_V1.csv with shape (103558, 16) in  : 10.457201004 seconds
Loaded installment_features_V1.csv with shape (339587, 100) in  : 33.572437048 seconds
Loaded pos_cash_features_V1.csv with shape (337252, 48) in  : 21.3200819492 seconds
Loaded previous_application_features_V1.csv with shape (338857, 93) in  : 22.2427499294 seconds
Loaded credit_card_features_V3.csv with shape (103558, 6) in  : 17.0387401581 seconds
Loaded installment_features_V3.csv with shape (339587, 69) in  : 27.8438420296 seconds
Loaded pos_cash_features_V3.csv with shape (337252, 11) in  : 7.02572488785 seconds
Loaded previous_application_features_V3.csv with shape (338857, 9) in  : 8.48745703697 seconds


In [25]:
l1_features = [
    load_meta("../l1/csv/", "l1_gnb", "l1"),
    load_meta("../l1/csv/", "l1_lr", "l1"),
    load_meta("../l1/csv/", "l1_tree_with_flags", "l1"),
    load_meta("../l1/csv/", "l1_tree_without_flags", "l1"),
    load_meta("../l1/csv/", "tree_with_flags_without_meta", "l1"),
    load_meta("../l1/csv/", "tree_without_flags_without_meta", "l1"),
    load_meta("../l1/csv/", "tree_on_core_features_with_meta", "l1"),
]

neptune_features = [
    load_neptune("../base neptune/m1/csv/", "m1"),
    load_neptune("../base neptune/m2/csv/", "m2"),
    load_neptune("../base neptune/m3/csv/", "m3"),
    load_neptune("../base neptune/m4/csv/", "m4"),
]

extra_models = [
    load_meta("../some extra models/csv/", "knn_on_selected_pca", "extras"),
    load_meta("../some extra models/csv/", "lgbm_on_core_features", "extras"),
    load_meta("../some extra models/csv/", "lgbm_on_gp_features", "extras"),
    load_meta("../some extra models/csv/", "lr_on_core_features", "extras"),
#     load_meta("../some extra models/csv/", "nn", "extras"),
    load_meta("../some extra models/csv/", "using_lags_bureau_data", "extras"),

]

## Joining All Files

In [26]:
train = pd.read_csv("../data/application_train.csv", usecols = ["SK_ID_CURR","TARGET"])
test = pd.read_csv("../data/application_test.csv", usecols = ["SK_ID_CURR"])

neptune_train = pd.read_csv("../neptune extractor/data/train.csv", usecols = important_columns_neptune)
neptune_test = pd.read_csv("../neptune extractor/data/test.csv", usecols = important_columns_neptune)

train = pd.concat([train, neptune_train], axis=1)
test = pd.concat([test, neptune_test], axis=1)

del neptune_train, neptune_test
gc.collect()

data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = join_features(data, base_features)
data = join_features(data, l1_features)
data = join_features(data, neptune_features)
data = join_features(data, extra_models)
del base_features,applications, bureau_balance_bb, credit_card_balance, installment_features, pos_cash_balance,previous_apps, credit_card_balance_v3,installment_features_v3,pos_cash_balance_v3, previous_apps_v3
gc.collect()

train = data.loc[data.TARGET.notnull()].reset_index(drop=True)
test = data.loc[data.TARGET.isnull()].reset_index(drop=True)

train_id = train[["SK_ID_CURR"]]
test_id = test[["SK_ID_CURR"]]
test_id_rank = test[["SK_ID_CURR"]]
target =train.TARGET

train.drop(["SK_ID_CURR", "TARGET"], axis=1, inplace=True)
test.drop(["SK_ID_CURR","TARGET"], axis=1, inplace = True)

# Defining Model

In [27]:
import lightgbm as lgb
def model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id):
    dtrain = lgb.Dataset(x_train, label=y_train)
    dval = lgb.Dataset(x_test, label=y_test)
    params = {
        'num_leaves': 30, #32
        'reg_alpha': 0.0, 
        'colsample_bytree': 0.05, 
        'subsample_freq': 1, 
        'learning_rate': 0.02, # 0.02
        'boosting_type': 'gbdt', 
        'nthread': 16, 
        'min_split_gain': 0.5, 
        'n_estimators': 10000, 
        'subsample': 1, 
        'reg_lambda': 100, 
        'objective': "binary", 
        'min_child_samples': 70, 
        'max_depth': 3, #-1
        'class_weight': None,
        "bagging_seed" : 3143,
        "seed":1343,
        "metric":"auc",
        "is_unbalance": False,
        "scale_pos_weight": 1,
        "max_bin":300
    }
    model = lgb.train(params, dtrain, num_boost_round=5000,valid_sets=[dtrain, dval], early_stopping_rounds=100, verbose_eval=100)
    meta_train[test_index] = model.predict(x_test, num_iteration=model.best_iteration or 5000)
    meta_test.append(model.predict(test, num_iteration=model.best_iteration or 5000))
    
    global fold_roc
    fold_roc.append(roc_auc_score(y_test, meta_train[test_index]))
    # Calculate Feature Importance
    global feature_importance
    gain = model.feature_importance('gain')
    fold_feature_importance = pd.DataFrame({'feature':model.feature_name(), 'split':model.feature_importance('split'), 'gain':100 * gain / gain.sum()})
    feature_importance = feature_importance.append(fold_feature_importance, ignore_index=True) 

# Training Model

In [29]:
meta_train = np.zeros(train.shape[0])
meta_test = []
feature_importance = pd.DataFrame(columns = ["feature","split","gain"])
fold_roc = []

kf = StratifiedKFold(n_splits= 10, shuffle=True, random_state=47)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

test_id["TARGET"] = np.array(meta_test).T.mean(axis=1)
test_id_rank["TARGET"] = pd.DataFrame(np.array(meta_test).T).rank(pct = True).mean(axis=1)
train_id["TARGET"] = meta_train

print "Overall ROC: {},  Mean ROC: {}, STD AUC: {}".format(roc_auc_score(target, meta_train), np.mean(fold_roc), np.std(fold_roc))

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.790975	valid_1's auc: 0.780485
[200]	training's auc: 0.795982	valid_1's auc: 0.78533
[300]	training's auc: 0.79972	valid_1's auc: 0.788352
[400]	training's auc: 0.801902	valid_1's auc: 0.789652
[500]	training's auc: 0.80352	valid_1's auc: 0.790499
[600]	training's auc: 0.80485	valid_1's auc: 0.791022
[700]	training's auc: 0.805981	valid_1's auc: 0.791262
[800]	training's auc: 0.806959	valid_1's auc: 0.791443
[900]	training's auc: 0.808007	valid_1's auc: 0.791659
[1000]	training's auc: 0.80886	valid_1's auc: 0.791711
[1100]	training's auc: 0.809726	valid_1's auc: 0.791763
[1200]	training's auc: 0.810552	valid_1's auc: 0.791841
[1300]	training's auc: 0.811321	valid_1's auc: 0.791839
[1400]	training's auc: 0.812104	valid_1's auc: 0.791898
[1500]	training's auc: 0.812892	valid_1's auc: 0.791943
[1600]	training's auc: 0.813659	valid_1's auc: 0.791997
[1700]	training's auc: 0.81436	valid_1's auc: 0.791955


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Overall ROC: 0.759046430295,  Mean ROC: 0.798449379328, STD AUC: 0.0054548874113


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [30]:
# [0.7920392753998619,
#  0.805254456114305,
#  0.8008227783801973,
#  0.8041775089035524,
#  0.7886149143618753,
#  0.7890053280859445,
#  0.8052696693681416,
#  0.7933513828217474,
#  0.8278584369794485,
#  0.8007763438856839]

In [31]:
train_id.to_csv("csv/{}_train.csv".format(model_name), index=False)
test_id.to_csv("csv/{}_test.csv".format(model_name), index=False)
test_id_rank.to_csv("csv/{}_rank_test.csv".format(model_name), index=False)

# Get Feature Importance

In [32]:
# Print Feature Importance
feature_importance.to_csv("csv/{}_all_fi.csv".format(model_name), index = False)
feature_importance = feature_importance.groupby("feature")[["gain","split"]].mean().sort_values('gain', ascending=False).reset_index()

plt.figure()
feature_importance[['feature','gain']].head(60).plot(kind='barh', x='feature', y='gain', legend=False, figsize=(30, 100))
plt.gcf().savefig('csv/{}.png'.format(model_name))


In [None]:
-