In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline  

In [2]:
app_train = pd.read_csv("data/application_train.csv")
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
target = app_train.TARGET
app_train.drop(['SK_ID_CURR','TARGET'],inplace=True,axis=1)
app_train.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,...,0,0,0,0,,,,,,
4,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
app_test = pd.read_csv("data/application_test.csv")
app_test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


In [5]:
sk_id_curr = app_test['SK_ID_CURR']
app_test.drop(['SK_ID_CURR'],inplace=True,axis=1)

In [6]:
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [7]:
app_train['type'] = 0
app_test['type'] = 1
df = pd.concat([app_train, app_test], axis=0)

In [8]:
df, cat_cols = one_hot_encoder(df)

In [9]:
app_train = df.loc[df['type'] == 0]
app_test = df.loc[df['type'] == 1]
app_train.drop(['type'],inplace=True,axis=1)
app_test.drop(['type'],inplace=True,axis=1)
print(app_train.shape)
print(app_test.shape)

(307511, 260)
(48744, 260)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [10]:
def gini_normalized(y_actual, y_pred):
    """Simple normalized Gini based on Scikit-Learn's roc_auc_score"""
    gini = lambda a, p: 2 * roc_auc_score(a, p) - 1
    return gini(y_actual, y_pred) / gini(y_actual, y_actual)

In [14]:
seed_val = 2018
num_folds = 5
pred_val = np.zeros(app_test.shape[0])
# Cross validation model
folds = StratifiedShuffleSplit(n_splits= num_folds, random_state=seed_val)
        
    # Create arrays and dataframes to store results
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(app_train, target)):
    cat_X_train, cat_y_train = app_train.values[train_idx], target[train_idx]
    cat_X_valid, cat_y_valid = app_train.values[valid_idx], target[valid_idx]
    
    dtrain = lgb.Dataset(data=app_train.values[train_idx], 
                              label=target[train_idx], 
                              free_raw_data=False, silent=True)
    dvalid = lgb.Dataset(data=app_train.values[valid_idx], 
                              label=target[valid_idx], 
                              free_raw_data=False, silent=True)
        
    print("Running fold ", n_fold+1)
    params = {
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'nthread': 4,
            'learning_rate': 0.02,  # 02,
            'num_leaves': 10,
            'colsample_bytree': 0.9497036,
            'subsample': 0.8715623,
            'subsample_freq': 1,
            'max_depth': 8,
            'reg_alpha': 0.041545473,
            'reg_lambda': 0.0735294,
            'min_split_gain': 0.0222415,
            'min_child_weight': 60, # 39.3259775,
            'seed': seed_val,
            'verbose': -1,
            'metric': 'auc',
        }
    
    model = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=4000,
            valid_sets=[dtrain, dvalid],
            early_stopping_rounds=200,
            verbose_eval=100
        )
    
    pred_val += model.predict(app_test)
    print('fold:', n_fold+1, '/', num_folds,
                 ' -> oof gini score:', gini_normalized(cat_y_valid, model.predict(cat_X_valid)))

pred_val = pred_val/num_folds

Running fold  1
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.740167	valid_1's auc: 0.732279
[200]	training's auc: 0.753367	valid_1's auc: 0.744032
[300]	training's auc: 0.759587	valid_1's auc: 0.749389
[400]	training's auc: 0.763613	valid_1's auc: 0.752327
[500]	training's auc: 0.766483	valid_1's auc: 0.753948
[600]	training's auc: 0.768822	valid_1's auc: 0.754881
[700]	training's auc: 0.770998	valid_1's auc: 0.755702
[800]	training's auc: 0.773031	valid_1's auc: 0.756355
[900]	training's auc: 0.774972	valid_1's auc: 0.756861
[1000]	training's auc: 0.776707	valid_1's auc: 0.757213
[1100]	training's auc: 0.778509	valid_1's auc: 0.757595
[1200]	training's auc: 0.780119	valid_1's auc: 0.757837
[1300]	training's auc: 0.78168	valid_1's auc: 0.758222
[1400]	training's auc: 0.783244	valid_1's auc: 0.758486
[1500]	training's auc: 0.78479	valid_1's auc: 0.758517
[1600]	training's auc: 0.786268	valid_1's auc: 0.758851
[1700]	training's auc: 0.787753	vali

In [16]:
sub_fm = pd.DataFrame({"SK_ID_CURR":sk_id_curr, "TARGET":pred_val})
sub_fm.to_csv("./submit/lightgbm.csv",index=False)