In [1]:
model_name = "using_lags_bureau_data"

In [2]:
import numpy as np
import pandas as pd
import gc
import time
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Activation,InputLayer
from keras import metrics
from keras import optimizers
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Reading Files & Preparing

In [3]:
train = pd.read_csv('../data/application_train.csv', usecols=["SK_ID_CURR", "TARGET"])
test = pd.read_csv('../data/application_test.csv', usecols=["SK_ID_CURR"])
bureau = pd.read_csv("../data/bureau.csv").sort_values(by =["SK_ID_CURR","DAYS_CREDIT"]).reset_index(drop=True)
bureau_balance = pd.read_csv("../data/bureau_balance.csv")

bureau_balance["STATUS"] = LabelEncoder().fit_transform(bureau_balance["STATUS"].fillna("NAN"))

# Extracting Features from Bureau


In [4]:
# Generating Integer Columns
bureau["credit_is_active"] = (bureau["CREDIT_ACTIVE"] == "Active").astype(int)
bureau["credit_is_closed"] = (bureau["CREDIT_ACTIVE"] == "Closed").astype(int)
bureau["credit_is_sold"] = (bureau["CREDIT_ACTIVE"] == "Sold").astype(int)

# Various Ratio Features
r2annuity_cols = ["AMT_CREDIT_SUM", "AMT_CREDIT_MAX_OVERDUE","AMT_CREDIT_SUM_DEBT","AMT_CREDIT_SUM_LIMIT", "AMT_CREDIT_SUM_OVERDUE"]
for col in r2annuity_cols:
    bureau[col+"_to_annuity_ratio"] = bureau[col]/bureau["AMT_ANNUITY"].astype("float32")

r2creditsum_cols = ["AMT_CREDIT_SUM_DEBT","AMT_CREDIT_SUM_LIMIT"]
for col in r2creditsum_cols:
    bureau[col+"_to_amt_credit_sum"] = bureau[col]/bureau["AMT_CREDIT_SUM"].astype("float32")

bureau["debt_to_limit_ratio"] = bureau.AMT_CREDIT_SUM_DEBT / bureau.AMT_CREDIT_SUM_LIMIT.astype("float32")
bureau["overdue_to_debt_ratio"] = bureau.AMT_CREDIT_SUM_OVERDUE / bureau.AMT_CREDIT_SUM_DEBT.astype("float32")

# Generating a Groupby
bureau_groupby = bureau.groupby("SK_ID_CURR")

# Aggregate Features
aggregates_df = pd.DataFrame()
aggregates_df["total_counts"] = bureau_groupby["CREDIT_ACTIVE"].count()
aggregates_df["active_counts"] = bureau_groupby["credit_is_active"].sum()
aggregates_df["close_counts"] = bureau_groupby["credit_is_closed"].sum()
aggregates_df["sold_counts"] = bureau_groupby["credit_is_sold"].sum()

aggregates_df["currency_counts"] = bureau_groupby["CREDIT_CURRENCY"].nunique()
aggregates_df["credit_type_counts"] = bureau_groupby["CREDIT_TYPE"].nunique()
aggregates_df["min_days_credit"] = bureau_groupby["DAYS_CREDIT"].min()

aggregates_df["active_ratio"] = aggregates_df["active_counts"]/aggregates_df["total_counts"].astype("float32")
aggregates_df["close_ratio"] = aggregates_df["close_counts"]/aggregates_df["total_counts"].astype("float32")
aggregates_df["sold_ratio"] = aggregates_df["sold_counts"]/aggregates_df["total_counts"].astype("float32")
aggregates_df["loan_type_diversification"] = aggregates_df["credit_type_counts"]/aggregates_df["total_counts"].astype("float32")

# Various Difference Rate Features
p_change_cols =["DAYS_CREDIT_ENDDATE","CREDIT_DAY_OVERDUE"]
p_change_cols+=["AMT_CREDIT_SUM", "AMT_CREDIT_MAX_OVERDUE","AMT_CREDIT_SUM_DEBT","AMT_CREDIT_SUM_LIMIT","AMT_CREDIT_SUM_OVERDUE"]
p_change_cols+=[col+"_to_annuity_ratio" for col in r2annuity_cols]
p_change_cols+=[col+"_to_amt_credit_sum" for col in r2creditsum_cols]
p_change_cols+=["debt_to_limit_ratio","overdue_to_debt_ratio"]

p_changes = bureau_groupby[p_change_cols].diff().bfill()/bureau_groupby[p_change_cols].shift(1).bfill().astype("float32")
p_changes.columns = [i+"_diff" for i in p_change_cols]
bureau = bureau.merge(p_changes, how="left", left_on = "SK_ID_CURR", right_index = True)
del p_changes
gc.collect()

# Other Generated Features
bureau["DAYS_CREDIT_diff_mean"] = bureau_groupby["DAYS_CREDIT"].diff().bfill().mean()
bureau["DAYS_CREDIT_diff_std"] = bureau_groupby["DAYS_CREDIT"].diff().bfill().std()
bureau["ratio_days_credit"] = bureau["DAYS_CREDIT"]/bureau["SK_ID_CURR"].map(aggregates_df["min_days_credit"]).astype("float32")

del bureau_groupby
gc.collect()

for col in [c for c in bureau.columns if bureau[c].dtype == "object"]:
        bureau[col] = LabelEncoder().fit_transform(bureau[col])
print "TRAIN SHAPE: {}, TEST SHAPE: {}, BUREAU SHAPE: {}, BUREAU BALANCE SHAPE: {}".format(train.shape[0], test.shape[0], bureau.shape[0], bureau_balance.shape[0])

TRAIN SHAPE: 307511, TEST SHAPE: 48744, BUREAU SHAPE: 1716428, BUREAU BALANCE SHAPE: 27299925


# Extracting features from Bureau Balance

In [5]:
bb_means = pd.get_dummies(bureau_balance[["SK_ID_BUREAU","STATUS"]], columns= ["STATUS"], dummy_na= True)
bb_means = bb_means.groupby("SK_ID_BUREAU")[bb_means.columns.drop(["SK_ID_BUREAU"])].apply(lambda x: np.mean(x, axis=0))
bureau = bureau.merge(bb_means, how = "left", left_on = "SK_ID_BUREAU", right_index = True)
print "BUREAU SHAPE: {}".format(bureau.shape)
del bb_means
gc.collect()

bureau_balance  = bureau_balance.pivot(index = "SK_ID_BUREAU",columns = "MONTHS_BALANCE", values = "STATUS")
bureau_balance.columns = ["BB_PIVOT_{}".format(i) for i in bureau_balance.columns]
bureau = bureau.merge(bureau_balance, how = "left", left_on = "SK_ID_BUREAU", right_index=True).drop(["SK_ID_BUREAU"], axis=1)
print "BUREAU SHAPE: {}".format(bureau.shape)
del bureau_balance
gc.collect()

BUREAU SHAPE: (1716428, 57)
BUREAU SHAPE: (1716428, 153)


99

# Combining all features to make dataset 


In [6]:
# Aggregating Bureau Dataset by SK_ID_CURR
bureau_feats = bureau.columns.drop(["SK_ID_CURR"])
bureau_groups = bureau.groupby("SK_ID_CURR")
group_keys = bureau_groups.groups.keys()

data = pd.DataFrame(bureau_groups[bureau_feats].apply(lambda x: list(x[-24:].values.ravel())).values.tolist())
data.columns = sum([["{}_{}".format(i,c) for c in bureau_feats] for i in range(int(data.shape[1]/len(bureau_feats)))],[])
data["SK_ID_CURR"] = group_keys
del bureau
gc.collect()

# Adding Aggregated Features to Data
data = data.merge(aggregates_df, how="left",left_on = "SK_ID_CURR", right_index = True)
del aggregates_df
gc.collect()

# Merging to Train
data = data.merge(train, how = "left", on = "SK_ID_CURR")

print "DATA SHAPE: {}".format(data.shape)

DATA SHAPE: (305811, 3661)


# PREPARING FOR TRAINING


In [7]:
train_b = data.loc[data.TARGET.isin([0,1])].reset_index(drop=True)
test_b = data.loc[~data.TARGET.isin([0,1])].reset_index(drop=True)
del data
gc.collect()

target = train_b["TARGET"]
train_b_index = train_b[["SK_ID_CURR"]]
test_b_index = test_b[["SK_ID_CURR"]]

train_b.drop(["SK_ID_CURR", "TARGET"], axis = 1, inplace = True)
test_b.drop(["SK_ID_CURR","TARGET"], axis = 1, inplace = True)

train_b = train_b
test_b = test_b
print "TRAIN_B SHAPE: {}, TEST_B SHAPE: {}".format(train_b.shape[0], test_b.shape[0])

TRAIN_B SHAPE: 263491, TEST_B SHAPE: 42320


# Running Model

In [8]:
import lightgbm as lgb
def model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id):
    dtrain = lgb.Dataset(x_train, label=y_train)
    dval = lgb.Dataset(x_test, label=y_test)
    params = {
        "nthread":14,
        "metric":"auc",
        "objective": "binary",
        "n_estimators":10000,
        "learning_rate":0.02,
        "num_leaves":60,
        "colsample_bytree":0.9497036,
        "subsample":0.8715623,
        "max_depth":8,
        "reg_alpha":0.04,
        "reg_lambda":0.073,
        "min_split_gain":0.0222415,
        "min_child_weight":40,
        "silent":-1,
        "verbose":-1,
        "bagging_seed" : 42,
        "seed":98
    }
    model = lgb.train(params, dtrain, num_boost_round=5000,valid_sets=[dtrain, dval], early_stopping_rounds=200, verbose_eval=100)
    meta_train[test_index] = model.predict(x_test, num_iteration=model.best_iteration or 5000)
    meta_test.append(model.predict(test, num_iteration=model.best_iteration or 5000))

    # Calculate Feature Importance
    global feature_importance
    gain = model.feature_importance('gain')
    fold_feature_importance = pd.DataFrame({'feature':model.feature_name(), 'split':model.feature_importance('split'), 'gain':100 * gain / gain.sum()})
    feature_importance = feature_importance.append(fold_feature_importance, ignore_index=True) 

# Training Model

In [9]:
meta_train = np.zeros(train_b.shape[0])
meta_test = []
feature_importance = pd.DataFrame(columns = ["feature","split","gain"])

kf = StratifiedKFold(n_splits= 10, shuffle=True, random_state=47)
for fold_id, (train_index, test_index) in enumerate(kf.split(train_b, target)):
    x_train, x_test = train_b.iloc[train_index], train_b.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    model_tree(x_train, x_test, y_train, y_test, test_b, meta_train, meta_test,train_index, test_index,fold_id)

test_b_index["TARGET"] = np.array(meta_test).T.mean(axis=1)
train_b_index["TARGET"] = meta_train

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.683764	valid_1's auc: 0.652722
[200]	training's auc: 0.71128	valid_1's auc: 0.667089
[300]	training's auc: 0.728414	valid_1's auc: 0.673956
[400]	training's auc: 0.739566	valid_1's auc: 0.677509
[500]	training's auc: 0.74845	valid_1's auc: 0.678563
[600]	training's auc: 0.756144	valid_1's auc: 0.679651
[700]	training's auc: 0.763024	valid_1's auc: 0.680052
[800]	training's auc: 0.770257	valid_1's auc: 0.679954
[900]	training's auc: 0.776944	valid_1's auc: 0.680159
[1000]	training's auc: 0.782605	valid_1's auc: 0.679775
Early stopping, best iteration is:
[839]	training's auc: 0.772769	valid_1's auc: 0.680251
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.68368	valid_1's auc: 0.657988
[200]	training's auc: 0.711173	valid_1's auc: 0.674617
[300]	training's auc: 0.726526	valid_1's auc: 0.681482
[400]	training's auc: 0.738045	valid_1's auc: 0.684142
[500]	training's

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [14]:
test = test[["SK_ID_CURR"]].merge(test_b_index, how = "left", on = "SK_ID_CURR").fillna(0.5)
train = train[["SK_ID_CURR"]].merge(train_b_index, how = "left", on = "SK_ID_CURR").fillna(0.5)

print "TRAIN SHAPE: {}, TEST SHAPE: {}".format(train.shape[0],test.shape[0])

train.to_csv("csv/{}_train.csv".format(model_name), index=False)
test.to_csv("csv/{}_test.csv".format(model_name), index=False)

TRAIN SHAPE: 307511, TEST SHAPE: 48744


# Generating Feature Importance

In [15]:
# Print Feature Importance
feature_importance = feature_importance.groupby("feature")["gain"].mean().reset_index().sort_values('gain', ascending=False).reset_index(drop=True)
plt.figure()
feature_importance[['feature','gain']].head(60).plot(kind='barh', x='feature', y='gain', legend=False, figsize=(30, 100))
plt.gcf().savefig('csv/{}.png'.format(model_name))
