In [1]:
# Identifier for this run; later cells format it into the output file names
# written under csv/ (predictions, feature importances, importance plot).
model_name = "tree_without_flags_without_meta"

In [2]:
import numpy as np
import pandas as pd
import gc
from time import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [3]:
# Checkpoint timestamps; seeded with the time this cell was executed.
timesheet = [time()]
def timer(statement):
    """Print how many seconds have passed since the previous checkpoint.

    Appends the current time to the module-level ``timesheet`` list and prints
    ``"<statement> : <elapsed> seconds"``, where ``elapsed`` is the difference
    between the two most recent entries.

    Parameters
    ----------
    statement : str
        Label printed in front of the elapsed time.
    """
    # The list is only mutated, never rebound, so no ``global`` declaration is needed.
    timesheet.append(time())
    elapsed = timesheet[-1] - timesheet[-2]
    # Single-argument print(...) produces the same text on Python 2 and Python 3.
    print("{} : {} seconds".format(statement, elapsed))

In [4]:
def load_meta(directory, prefix, modeltype):
    """Read a model's train/test prediction CSVs and stack them into one frame.

    The files ``<directory><prefix>_train.csv`` and ``<directory><prefix>_test.csv``
    are read and concatenated row-wise (train first) with a fresh 0..n-1 index.
    Every column except ``SK_ID_CURR`` is renamed to
    ``<column>_<prefix>_<modeltype>``.

    Note that ``directory`` is concatenated as-is, so it must already end with
    a path separator.
    """
    frames = [pd.read_csv(directory + prefix + suffix) for suffix in ("_train.csv", "_test.csv")]
    stacked = pd.concat(frames, axis=0).reset_index(drop=True)

    renamed = []
    for col in stacked.columns:
        if col == "SK_ID_CURR":
            # The join key keeps its name so the frame can be merged later.
            renamed.append(col)
        else:
            renamed.append("{}_{}_{}".format(col, prefix, modeltype))
    stacked.columns = renamed
    return stacked

def join_features(data, features):
    """Left-join each frame in ``features`` onto ``data`` by ``SK_ID_CURR``.

    Frames are merged one after another in the order given. With an empty
    ``features`` the input ``data`` is returned unchanged.
    """
    merged = data
    for feature_frame in features:
        merged = merged.merge(feature_frame, on="SK_ID_CURR", how="left")
    return merged


def processColNames(df):
    """Replace every space in the column names of ``df`` with an underscore.

    The frame is modified in place and also returned, so the call can be
    wrapped around an expression.
    """
    renamed = []
    for name in df.columns:
        renamed.append("_".join(name.split(" ")))
    df.columns = renamed
    return df


## Loading Features

Load all of the extracted feature files.

In [5]:
def _read_feature_table(path, drop=None):
    """Read one feature CSV, discard all-NaN columns, optionally drop columns, tidy names."""
    table = pd.read_csv(path).dropna(axis=1, how="all")
    if drop is not None:
        table = table.drop(drop, axis=1)
    return processColNames(table)


# TARGET is removed from the application features; the label is read separately
# from application_train.csv further down.
applications = _read_feature_table("../extractor/csv/application_features_V1.csv", drop=["TARGET"])
bureau_balance_bb = _read_feature_table("../extractor/csv/bureau_features_V1.csv")
credit_card_balance = _read_feature_table("../extractor/csv/credit_card_features_V1.csv")
installment_features = _read_feature_table("../extractor/csv/installment_features_V1.csv")
pos_cash_balance = _read_feature_table("../extractor/csv/pos_cash_features_V1.csv")
previous_apps = _read_feature_table("../extractor/csv/previous_application_features_V1.csv")

# Order matters only for the column order of the merged frame.
base_features = [
    applications,
    bureau_balance_bb,
    credit_card_balance,
    installment_features,
    pos_cash_balance,
    previous_apps,
]

## Building Train and Test Sets

In [6]:
# Only the ID and label columns are read here; every model feature comes from
# the feature tables merged in below.
train = pd.read_csv("../data/application_train.csv", usecols = ["SK_ID_CURR","TARGET"])
test = pd.read_csv("../data/application_test.csv", usecols = ["SK_ID_CURR"])

# Stack train on top of test so the feature tables are merged once.
# Test rows have no TARGET column in their CSV, so they carry NaN after the concat.
data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = join_features(data, base_features)

# Split back into train / test on whether a label is present.
has_target = data.TARGET.notnull()
train = data.loc[has_target].reset_index(drop=True)
test = data.loc[~has_target].reset_index(drop=True)

# Take independent copies: a TARGET column is assigned to these frames later,
# and assigning to a bare column selection raises SettingWithCopyWarning.
train_id = train[["SK_ID_CURR"]].copy()
test_id = test[["SK_ID_CURR"]].copy()
target = train.TARGET

# Keep only the feature columns for modelling (reassigned rather than mutated in place).
train = train.drop(["SK_ID_CURR", "TARGET"], axis=1)
test = test.drop(["SK_ID_CURR", "TARGET"], axis=1)

# Defining Model

In [7]:
import lightgbm as lgb
def model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id):
    """Train one LightGBM fold and record its predictions, AUC and feature importance.

    Parameters
    ----------
    x_train, y_train
        Features and labels the booster is fitted on.
    x_test, y_test
        Held-out features and labels, used as the early-stopping validation
        set and for the fold AUC.
    test
        Features of the rows to score with this fold's model.
    meta_train
        Array written in place: positions ``test_index`` receive the
        predictions for ``x_test``.
    meta_test : list
        The prediction vector for ``test`` is appended to it.
    train_index, fold_id
        Not used inside the function; kept so existing calls keep working.
    test_index
        Positions in ``meta_train`` that correspond to ``x_test``.

    Side effects
    ------------
    Appends the fold AUC to the module-level ``fold_roc`` list and rebinds the
    module-level ``feature_importance`` frame with this fold's rows added.
    Returns ``None``.
    """
    global fold_roc
    global feature_importance

    dtrain = lgb.Dataset(x_train, label=y_train)
    dval = lgb.Dataset(x_test, label=y_test)
    # NOTE(review): several keys below look like scikit-learn wrapper arguments
    # ('silent', 'class_weight', 'n_estimators') or duplicate each other
    # ('n_jobs' vs 'nthread'); 'n_estimators' may also take precedence over
    # num_boost_round=5000 depending on the LightGBM version -- confirm against
    # the LightGBM parameter docs. Values are left untouched so results do not change.
    params = {
        'num_leaves': 32, #32
        'reg_alpha': 0.04, 
        'n_jobs': -1, 
        'colsample_bytree': 0.9497036, 
        'silent': -1, 
        'subsample_for_bin': 200000, 
        'subsample_freq': 1, 
        'learning_rate': 0.02, # 0.02
        'boosting_type': 'gbdt', 
        'nthread': 8, 
        'min_child_weight': 40, 
        'min_split_gain': 0.0222415, 
        'n_estimators': 10000, 
        'subsample': 0.8715623, 
        'reg_lambda': 10, 
        'objective': "binary", 
        'verbose': -1, 
        'min_child_samples': 20, 
        'max_depth': 8, #8
        'class_weight': None,
        "bagging_seed" : 3143,
        "seed":1343,
        "metric":"auc"
    }
    model = lgb.train(params, dtrain, num_boost_round=5000,valid_sets=[dtrain, dval], early_stopping_rounds=200, verbose_eval=100)

    # Use the early-stopped round when there is one, otherwise fall back to 5000.
    best_round = model.best_iteration or 5000
    meta_train[test_index] = model.predict(x_test, num_iteration=best_round)
    meta_test.append(model.predict(test, num_iteration=best_round))

    fold_roc.append(roc_auc_score(y_test, meta_train[test_index]))

    # Per-fold feature importance; gain is expressed as a percentage of the fold total.
    gain = model.feature_importance('gain')
    fold_feature_importance = pd.DataFrame({'feature':model.feature_name(), 'split':model.feature_importance('split'), 'gain':100 * gain / gain.sum()})
    # pd.concat replaces DataFrame.append, which was deprecated and later removed from pandas.
    feature_importance = pd.concat([feature_importance, fold_feature_importance], ignore_index=True)

# Training Model

In [8]:
# Out-of-fold predictions for the training rows, filled in one fold at a time.
meta_train = np.zeros(train.shape[0])
# One prediction vector for the test rows per fold.
meta_test = []
feature_importance = pd.DataFrame(columns = ["feature","split","gain"])
fold_roc = []

kf = StratifiedKFold(n_splits= 10, shuffle=True, random_state=47)
for fold_id, (train_index, test_index) in enumerate(kf.split(train, target)):
    x_train, x_test = train.iloc[train_index], train.iloc[test_index]
    # kf.split yields row positions, so index the labels positionally as well.
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    model_tree(x_train, x_test, y_train, y_test, test, meta_train, meta_test,train_index, test_index,fold_id)

# Test prediction = mean over the fold models. .assign builds a new frame, which
# avoids SettingWithCopyWarning when the ID frames are column selections.
test_id = test_id.assign(TARGET=np.array(meta_test).T.mean(axis=1))
train_id = train_id.assign(TARGET=meta_train)

# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print("Overall ROC: {},  Mean ROC: {}, STD AUC: {}".format(roc_auc_score(target, meta_train), np.mean(fold_roc), np.std(fold_roc)))

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.760068	valid_1's auc: 0.745276
[200]	training's auc: 0.783459	valid_1's auc: 0.76155
[300]	training's auc: 0.801637	valid_1's auc: 0.772806
[400]	training's auc: 0.813669	valid_1's auc: 0.77837
[500]	training's auc: 0.823265	valid_1's auc: 0.781329
[600]	training's auc: 0.831228	valid_1's auc: 0.783213
[700]	training's auc: 0.838738	valid_1's auc: 0.784663
[800]	training's auc: 0.84515	valid_1's auc: 0.7855
[900]	training's auc: 0.851118	valid_1's auc: 0.786227
[1000]	training's auc: 0.856717	valid_1's auc: 0.786372
[1100]	training's auc: 0.861826	valid_1's auc: 0.78678
[1200]	training's auc: 0.86696	valid_1's auc: 0.786838
[1300]	training's auc: 0.871932	valid_1's auc: 0.786908
[1400]	training's auc: 0.876524	valid_1's auc: 0.78737
[1500]	training's auc: 0.880923	valid_1's auc: 0.787337
[1600]	training's auc: 0.885115	valid_1's auc: 0.787481
[1700]	training's auc: 0.88911	valid_1's auc: 0.787471
[18

Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.760007	valid_1's auc: 0.741711
[200]	training's auc: 0.783028	valid_1's auc: 0.758917
[300]	training's auc: 0.801121	valid_1's auc: 0.771187
[400]	training's auc: 0.813281	valid_1's auc: 0.777165
[500]	training's auc: 0.822874	valid_1's auc: 0.780815
[600]	training's auc: 0.830932	valid_1's auc: 0.782837
[700]	training's auc: 0.838297	valid_1's auc: 0.784429
[800]	training's auc: 0.845009	valid_1's auc: 0.785453
[900]	training's auc: 0.851273	valid_1's auc: 0.786208
[1000]	training's auc: 0.856934	valid_1's auc: 0.786873
[1100]	training's auc: 0.862056	valid_1's auc: 0.787351
[1200]	training's auc: 0.867009	valid_1's auc: 0.787445
[1300]	training's auc: 0.871835	valid_1's auc: 0.787757
[1400]	training's auc: 0.876412	valid_1's auc: 0.78785
[1500]	training's auc: 0.880703	valid_1's auc: 0.788158
[1600]	training's auc: 0.884898	valid_1's auc: 0.788307
[1700]	training's auc: 0.888863	valid_1's auc: 0.78

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
# Write the out-of-fold (train) and fold-averaged (test) predictions, named after the run.
train_csv_path = "csv/{}_train.csv".format(model_name)
test_csv_path = "csv/{}_test.csv".format(model_name)

train_id.to_csv(train_csv_path, index=False)
test_id.to_csv(test_csv_path, index=False)

# Get Feature Importance

In [10]:
# Save the raw importance rows before they are aggregated below.
# NOTE(review): `feature_importance` is rebound to the aggregated frame on the
# next statement, so re-running this cell would overwrite the CSV with the
# aggregated values -- re-run the training cell first.
feature_importance.to_csv("csv/{}_all_fi.csv".format(model_name), index = False)
# Average gain / split per feature and rank by mean gain.
feature_importance = feature_importance.groupby("feature")[["gain","split"]].mean().sort_values('gain', ascending=False).reset_index()

# Draw on an explicit Axes: a bare plt.figure() followed by DataFrame.plot(figsize=...)
# leaves an extra, empty figure behind because pandas opens its own.
fig, ax = plt.subplots(figsize=(30, 100))
feature_importance[['feature','gain']].head(60).plot(kind='barh', x='feature', y='gain', legend=False, ax=ax)
fig.savefig('csv/{}.png'.format(model_name))
