In [5]:
model_name = "bureau_balance_bb"

In [6]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [7]:
timesheet = [time()]
def timer(statement):
    global timesheet
    timesheet.append(time())
    print statement+" :", (timesheet[-1]-timesheet[-2]),"seconds"

# Loading Data

In [8]:
bureau_balance_bb = pd.read_csv("../extractor/csv/bureau_features_V1.csv").dropna(axis=1, how="all")

train_id = pd.read_csv("../data/application_train.csv", usecols = ["SK_ID_CURR","TARGET"])
test_id = pd.read_csv("../data/application_test.csv", usecols = ["SK_ID_CURR"])

bureau_balance_bb = bureau_balance_bb.merge(train_id, how = "left", on = "SK_ID_CURR")

train = bureau_balance_bb.loc[bureau_balance_bb.TARGET.notnull()].reset_index(drop=True)
test = bureau_balance_bb.loc[bureau_balance_bb.TARGET.isnull()].reset_index(drop=True)

partial_train_id = train[["SK_ID_CURR"]]
partial_test_id = test[["SK_ID_CURR"]]
target =train.TARGET

train.drop(["SK_ID_CURR", "TARGET"], axis=1, inplace=True)
test.drop(["SK_ID_CURR"], axis=1, inplace = True)

# Defining Model

In [9]:
import lightgbm as lgb
def model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id):
    dtrain = lgb.Dataset(x_train, label=y_train)
    dval = lgb.Dataset(x_test, label=y_test)
    params = {
        'num_leaves': 32, #32
        'reg_alpha': 0.04, 
        'n_jobs': -1, 
        'colsample_bytree': 0.9497036, 
        'silent': -1, 
        'subsample_for_bin': 200000, 
        'subsample_freq': 1, 
        'learning_rate': 0.02, # 0.02
        'boosting_type': 'gbdt', 
        'nthread': 8, 
        'min_child_weight': 40, 
        'min_split_gain': 0.0222415, 
        'n_estimators': 10000, 
        'subsample': 0.8715623, 
        'reg_lambda': 10, 
        'objective': "binary", 
        'verbose': -1, 
        'min_child_samples': 20, 
        'max_depth': 8, #8
        'class_weight': None,
        "bagging_seed" : 3143,
        "seed":1343,
        "metric":"auc"
    }
    model = lgb.train(params, dtrain, num_boost_round=5000,valid_sets=[dtrain, dval], early_stopping_rounds=200, verbose_eval=100)
    meta_train[test_index] = model.predict(x_test, num_iteration=model.best_iteration or 5000)
    meta_test.append(model.predict(test, num_iteration=model.best_iteration or 5000))

    # Calculate Feature Importance
    global feature_importance
    gain = model.feature_importance('gain')
    fold_feature_importance = pd.DataFrame({'feature':model.feature_name(), 'split':model.feature_importance('split'), 'gain':100 * gain / gain.sum()})
    feature_importance = feature_importance.append(fold_feature_importance, ignore_index=True) 

# Training Model

In [10]:
meta_train = np.zeros(train.shape[0])
meta_test = []
feature_importance = pd.DataFrame(columns = ["feature","split","gain"])

kf = StratifiedKFold(n_splits= 10, shuffle=True, random_state=47)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

partial_test_id["TARGET"] = np.array(meta_test).T.mean(axis=1)
partial_train_id["TARGET"] = meta_train

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.666089	valid_1's auc: 0.652104
[200]	training's auc: 0.686227	valid_1's auc: 0.660826
[300]	training's auc: 0.701515	valid_1's auc: 0.665061
[400]	training's auc: 0.713261	valid_1's auc: 0.66677
[500]	training's auc: 0.722913	valid_1's auc: 0.667132
[600]	training's auc: 0.73186	valid_1's auc: 0.667231
[700]	training's auc: 0.740355	valid_1's auc: 0.667604
[800]	training's auc: 0.748088	valid_1's auc: 0.667263
Early stopping, best iteration is:
[699]	training's auc: 0.740285	valid_1's auc: 0.66762
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.665232	valid_1's auc: 0.658992
[200]	training's auc: 0.685855	valid_1's auc: 0.667842
[300]	training's auc: 0.701078	valid_1's auc: 0.671627
[400]	training's auc: 0.712981	valid_1's auc: 0.67284
[500]	training's auc: 0.723017	valid_1's auc: 0.672972
[600]	training's auc: 0.732216	valid_1's auc: 0.673081
[700]	training's a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [11]:
train_id = train_id[["SK_ID_CURR"]].merge(partial_train_id, how="left", on="SK_ID_CURR")
test_id = test_id[["SK_ID_CURR"]].merge(partial_test_id, how="left", on="SK_ID_CURR")

train_id.to_csv("csv/{}_train.csv".format(model_name), index=False)
test_id.to_csv("csv/{}_test.csv".format(model_name), index=False)

# Get Feature Importance

In [12]:
# Print Feature Importance
feature_importance.to_csv("csv/{}_all_fi.csv".format(model_name), index = False)
feature_importance = feature_importance.groupby("feature")[["gain","split"]].mean().sort_values('gain', ascending=False).reset_index()

plt.figure()
feature_importance[['feature','gain']].head(60).plot(kind='barh', x='feature', y='gain', legend=False, figsize=(30, 100))
plt.gcf().savefig('csv/{}.png'.format(model_name))
