In [1]:
model_name = "application"

In [2]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [3]:
timesheet = [time()]
def timer(statement):
    global timesheet
    timesheet.append(time())
    print statement+" :", (timesheet[-1]-timesheet[-2]),"seconds"

# Loading Data

In [4]:
applications = pd.read_csv("../extractor/csv/application_features_V1.csv").dropna(axis=1, how="all").drop(["TARGET"], axis=1)

train_id = pd.read_csv("../data/application_train.csv", usecols = ["SK_ID_CURR","TARGET"])
test_id = pd.read_csv("../data/application_test.csv", usecols = ["SK_ID_CURR"])

applications = applications.merge(train_id, how = "left", on = "SK_ID_CURR")

train = applications.loc[applications.TARGET.notnull()].reset_index(drop=True)
test = applications.loc[applications.TARGET.isnull()].reset_index(drop=True)

partial_train_id = train[["SK_ID_CURR"]]
partial_test_id = test[["SK_ID_CURR"]]
target =train.TARGET

train.drop(["SK_ID_CURR", "TARGET"], axis=1, inplace=True)
test.drop(["SK_ID_CURR"], axis=1, inplace = True)

# Defining Model

In [5]:
import lightgbm as lgb
def model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id):
    dtrain = lgb.Dataset(x_train, label=y_train)
    dval = lgb.Dataset(x_test, label=y_test)
    params = {
        'num_leaves': 32, #32
        'reg_alpha': 0.04, 
        'n_jobs': -1, 
        'colsample_bytree': 0.9497036, 
        'silent': -1, 
        'subsample_for_bin': 200000, 
        'subsample_freq': 1, 
        'learning_rate': 0.02, # 0.02
        'boosting_type': 'gbdt', 
        'nthread': 8, 
        'min_child_weight': 40, 
        'min_split_gain': 0.0222415, 
        'n_estimators': 10000, 
        'subsample': 0.8715623, 
        'reg_lambda': 10, 
        'objective': "binary", 
        'verbose': -1, 
        'min_child_samples': 20, 
        'max_depth': 8, #8
        'class_weight': None,
        "bagging_seed" : 3143,
        "seed":1343,
        "metric":"auc"
    }
    model = lgb.train(params, dtrain, num_boost_round=5000,valid_sets=[dtrain, dval], early_stopping_rounds=200, verbose_eval=100)
    meta_train[test_index] = model.predict(x_test, num_iteration=model.best_iteration or 5000)
    meta_test.append(model.predict(test, num_iteration=model.best_iteration or 5000))

    # Calculate Feature Importance
    global feature_importance
    gain = model.feature_importance('gain')
    fold_feature_importance = pd.DataFrame({'feature':model.feature_name(), 'split':model.feature_importance('split'), 'gain':100 * gain / gain.sum()})
    feature_importance = feature_importance.append(fold_feature_importance, ignore_index=True) 

# Training Model

In [6]:
meta_train = np.zeros(train.shape[0])
meta_test = []
feature_importance = pd.DataFrame(columns = ["feature","split","gain"])

kf = StratifiedKFold(n_splits= 10, shuffle=True, random_state=47)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

partial_test_id["TARGET"] = np.array(meta_test).T.mean(axis=1)
partial_train_id["TARGET"] = meta_train

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.745846	valid_1's auc: 0.731993
[200]	training's auc: 0.763468	valid_1's auc: 0.743486
[300]	training's auc: 0.776466	valid_1's auc: 0.750482
[400]	training's auc: 0.785717	valid_1's auc: 0.754042
[500]	training's auc: 0.793194	valid_1's auc: 0.75627
[600]	training's auc: 0.799616	valid_1's auc: 0.757453
[700]	training's auc: 0.805382	valid_1's auc: 0.758589
[800]	training's auc: 0.810787	valid_1's auc: 0.759052
[900]	training's auc: 0.815511	valid_1's auc: 0.759252
[1000]	training's auc: 0.820109	valid_1's auc: 0.759118
[1100]	training's auc: 0.824549	valid_1's auc: 0.759201
Early stopping, best iteration is:
[916]	training's auc: 0.816231	valid_1's auc: 0.759388
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.74418	valid_1's auc: 0.745794
[200]	training's auc: 0.761744	valid_1's auc: 0.759834
[300]	training's auc: 0.774843	valid_1's auc: 0.767878
[400]	training

[1300]	training's auc: 0.83195	valid_1's auc: 0.769091
[1400]	training's auc: 0.835914	valid_1's auc: 0.769271
[1500]	training's auc: 0.839608	valid_1's auc: 0.769249
[1600]	training's auc: 0.84328	valid_1's auc: 0.769519
[1700]	training's auc: 0.846952	valid_1's auc: 0.769458
[1800]	training's auc: 0.850383	valid_1's auc: 0.7694
Early stopping, best iteration is:
[1614]	training's auc: 0.843783	valid_1's auc: 0.769577
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.7452	valid_1's auc: 0.735521
[200]	training's auc: 0.762514	valid_1's auc: 0.750776
[300]	training's auc: 0.775595	valid_1's auc: 0.7603
[400]	training's auc: 0.784786	valid_1's auc: 0.764773
[500]	training's auc: 0.792262	valid_1's auc: 0.767336
[600]	training's auc: 0.798893	valid_1's auc: 0.769232
[700]	training's auc: 0.804484	valid_1's auc: 0.769851
[800]	training's auc: 0.809449	valid_1's auc: 0.770244
[900]	training's auc: 0.814638	valid_1's auc: 0.770528
[1000]	training's auc: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [7]:
train_id = train_id[["SK_ID_CURR"]].merge(partial_train_id, how="left", on="SK_ID_CURR")
test_id = test_id[["SK_ID_CURR"]].merge(partial_test_id, how="left", on="SK_ID_CURR")

train_id.to_csv("csv/{}_train.csv".format(model_name), index=False)
test_id.to_csv("csv/{}_test.csv".format(model_name), index=False)

# Get Feature Importance

In [8]:
# Print Feature Importance
feature_importance.to_csv("csv/{}_all_fi.csv".format(model_name), index = False)
feature_importance = feature_importance.groupby("feature")[["gain","split"]].mean().sort_values('gain', ascending=False).reset_index()

plt.figure()
feature_importance[['feature','gain']].head(60).plot(kind='barh', x='feature', y='gain', legend=False, figsize=(30, 100))
plt.gcf().savefig('csv/{}.png'.format(model_name))
