In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

In [2]:
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 100)

import gc
import time
from tqdm import tqdm

In [3]:
%%time
PATH="../input/"
train_df = pd.read_csv(PATH+"train.csv")
test_df = pd.read_csv(PATH+"test.csv")
subm_df = pd.read_csv(PATH+"sample_submission.csv")

CPU times: user 1.9 s, sys: 188 ms, total: 2.09 s
Wall time: 2.12 s


In [4]:
targetcol = 'loan_default'
train_df.shape, test_df.shape

((233154, 41), (112392, 40))

In [5]:
train_df.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,DisbursalDate,State_ID,Employee_code_ID,MobileNo_Avl_Flag,Aadhar_flag,PAN_flag,VoterID_flag,Driving_flag,Passport_flag,PERFORM_CNS.SCORE,PERFORM_CNS.SCORE.DESCRIPTION,PRI.NO.OF.ACCTS,PRI.ACTIVE.ACCTS,PRI.OVERDUE.ACCTS,PRI.CURRENT.BALANCE,PRI.SANCTIONED.AMOUNT,PRI.DISBURSED.AMOUNT,SEC.NO.OF.ACCTS,SEC.ACTIVE.ACCTS,SEC.OVERDUE.ACCTS,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default
0,420825,50578,58400,89.55,67,22807,45,1441,01-01-84,Salaried,03-08-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,31-07-85,Self employed,26-09-18,6,1998,1,1,0,0,0,0,598,I-Medium Risk,1,1,1,27600,50200,50200,0,0,0,0,0,0,1991,0,0,1,1yrs 11mon,1yrs 11mon,0,1
2,417566,53278,61360,89.63,67,22807,45,1497,24-08-85,Self employed,01-08-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,30-12-93,Self employed,26-10-18,6,1998,1,1,0,0,0,0,305,L-Very High Risk,3,0,0,0,0,0,0,0,0,0,0,0,31,0,0,0,0yrs 8mon,1yrs 3mon,1,1
4,539055,52378,60300,88.39,67,22807,45,1495,09-12-77,Self employed,26-09-18,6,1998,1,1,0,0,0,0,0,No Bureau History Available,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,1,1


Preprocessing 

In [6]:
def preproc(data):
    if 'date_of_birth' not in list(data.columns):
        data['date_of_birth']=pd.to_datetime(data['Date.of.Birth'],format='%d-%m-%y')
    data['age']=(pd.to_datetime('today') - data['date_of_birth']).dt.days / 365.25

    print('age completed')
    
    if 'disbursal_date' not in list(data.columns):
        data['disbursal_date']=pd.to_datetime(data['DisbursalDate'],format='%d-%m-%y')
    data['disbursal_month']=data['disbursal_date'].dt.month
    data['disbursal_year']=data['disbursal_date'].dt.year
    data['disbursal_day']=data['disbursal_date'].dt.day
    data['disbursal_dayofweek']=data['disbursal_date'].dt.dayofweek
    
    print('disbursal_date completed')

    data['PERFORM_CNS.SCORE.CATEGORY'],indexer=pd.factorize(data['PERFORM_CNS.SCORE.DESCRIPTION'])
    data['Employment.Type.Category'],indexer=pd.factorize(data['Employment.Type'])
    print('factorize completed')
    
    data['PERFORM_CNS.SCORE.CATEGORY'] = data['PERFORM_CNS.SCORE.CATEGORY'].astype('category')
    data['Employment.Type.Category'] = data['Employment.Type.Category'].astype('category')

    col = 'AVERAGE.ACCT.AGE'
    newcol = col + '_MONTHS'
    data[newcol]=convertyearmonthstring(col,data)
    print('completed:',col)
    col = 'CREDIT.HISTORY.LENGTH'
    newcol = col + '_MONTHS'
    data[newcol]=convertyearmonthstring(col,data)
    print('completed:',col)

    return data

def convertyearmonthstring(col,data):
    temp_df = data[col].str.split(" ", n = 1, expand = True) 
    temp_df[0] = temp_df[0].str.extract('(\d+)').astype('int') * 12
    temp_df[1] = temp_df[1].str.extract('(\d+)').astype('int')

    newseries = temp_df[0] + temp_df[1] 
    return newseries

In [7]:
train_df['istrain']=1
test_df['istrain']=0
combined = pd.concat([train_df,test_df])

In [8]:
combined = preproc(combined)
newcols = ['AVERAGE.ACCT.AGE','AVERAGE.ACCT.AGE_MONTHS','CREDIT.HISTORY.LENGTH','CREDIT.HISTORY.LENGTH_MONTHS',
           'DisbursalDate','disbursal_month','Date.of.Birth','age']
print(combined[newcols].head())
print(combined[newcols].tail())

age completed
disbursal_date completed
factorize completed
completed: AVERAGE.ACCT.AGE
completed: CREDIT.HISTORY.LENGTH
  AVERAGE.ACCT.AGE  AVERAGE.ACCT.AGE_MONTHS CREDIT.HISTORY.LENGTH  \
0        0yrs 0mon                        0             0yrs 0mon   
1       1yrs 11mon                       23            1yrs 11mon   
2        0yrs 0mon                        0             0yrs 0mon   
3        0yrs 8mon                        8             1yrs 3mon   
4        0yrs 0mon                        0             0yrs 0mon   

   CREDIT.HISTORY.LENGTH_MONTHS DisbursalDate  disbursal_month Date.of.Birth  \
0                             0      03-08-18                8      01-01-84   
1                            23      26-09-18                9      31-07-85   
2                             0      01-08-18                8      24-08-85   
3                            15      26-10-18               10      30-12-93   
4                             0      26-09-18                9   

In [9]:
mask =combined['istrain']==1
train_df = combined[mask]
test_df = combined[~mask]
del train_df['istrain'],test_df['istrain'], combined

In [10]:
train_df.to_csv('train_preproc.csv')
test_df.to_csv('test_preproc.csv')

Permute Features

In [11]:
def permutation_feature_selection(model, X_val, y_val, score_function,subset_feats=None,pred_proba=False, rep=3, max_delta_score=0.0001):

    # to do: predict_proba from Booster

    """""
    Perform permutation feature importance calculation for trained LightGBM model. 
    Scorer - ROC AUC. 
    The lower score with permuted feature - the more important feature is. 
    
    Parameters
    ----------
    model : lightGBM.Booster
        Trained model to perform feature importance calculation 
        
    X_val : pandas.DataFrame
        Validation dataset
            
    y_val : pandas.Series 
        Targets for validation dataset
       
    rep : integer (default = 3)
        Number of permutations. More permutations lead to more robust results, but requires more calculation time.
        It is recommended to use values in the range [3; 10].
        
    min_delta_score : float (default = 0.0)
        Minimum delta ins score to keep feature 
        It is recommended to use values in the range [-0.0001; 0].
        
    Output
    ----------
    selected_features : list
        Important features 
        
    importance_df : pandas.DataFrame 
        Records of calculations 
    """""

    print('Permutation feature importance is calculating...')

    columns = []
    scores = []
    stds = []
    score_max = []
    score_min = []

    # calculate the score of model with no permuted features, this is our baseline
    if pred_proba:
        y_hat_nopert = model.predict_proba(X_val)[:,1]
    else:
        y_hat_nopert = model.predict(X_val,model.best_iteration)
    score_init = score_function(y_val, y_hat_nopert)
#     score_init = np.sqrt(mean_squared_error(y_val, y_hat_nopert))
#     print('score init:',score_init)
    col_iter = 1
    
    if subset_feats!=None:
        cols = subset_feats
    else:
        cols = X_val.columns
    
    for cc in tqdm(cols):
#         if col_iter > 3:
#             break
        scores_cc = []
        for seed in range(rep):
            # shuffle single column of dataset
            data_temp = X_val.copy()
            if isinstance(data_temp[cc].dtype, pd.api.types.CategoricalDtype):
#             if data_temp[cc].dtype.name == 'category':
                data_temp[cc] = 0
                data_temp[cc] = data_temp[cc].astype('category')
            else:
                data_temp[cc] = np.nan
#             data_temp[cc] = data_temp[cc].sample(n=X_val.shape[0], random_state=seed).reset_index(drop=True)

            # make prediction on 'shuffled' dataset and score it
            if pred_proba:
                # calibration classifier fit
#                 model.fit(data_temp, y_val)
                y_hat = model.predict_proba(data_temp)[:,1]
            else:
                y_hat = model.predict(data_temp,model.best_iteration)
            score = score_function(y_val, y_hat)
#             print('col:',cc)
#             print('score new:',score)
#             score = np.sqrt(mean_squared_error(y_val, y_hat))
            scores_cc.append(score)

        columns.append(cc)
        scores.append(np.mean(scores_cc) - score_init)
        stds.append(np.std(scores_cc))
        score_max.append(np.max(scores_cc) - score_init)
        score_min.append(np.min(scores_cc) - score_init)
        
        col_iter+=1

    importance_df = pd.DataFrame({'delta_score_mean': scores,
                                  'delta_score_std': stds,
                                  'delta_score_max': score_max,
                                  'delta_score_min': score_min,
                                  'feature': columns
                                  })
    importance_df = importance_df.sort_values(by='delta_score_mean')
    selected_features = list(importance_df.loc[importance_df['delta_score_mean'] >= max_delta_score, 'feature'])

    print('Permutation feature importance calculation is done. Overall number of features: ', importance_df.shape[0],
          'Number of selected features:', len(selected_features))

    return selected_features, importance_df


In [12]:
def get_auc_score_complement(labels,preds):
     return 1 - roc_auc_score(labels, preds)

In [13]:
import time
def runlgb(ispermutefeats,train,test,param,cur_features,score_function=None):

    overall_sel_feats =[]
    overall_imp_df = pd.DataFrame()
    overall_imp_df['feature']= np.array(cur_features)
    overall_imp_df['overall_score_mean'] =0 
    overall_imp_df['overall_score_max'] =-9999 
    overall_imp_df['overall_score_min'] =9999 
    
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    start = time.time()
    valid_scores =[]
    fold_importance_df = pd.DataFrame()
    

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4590)
    indices = folds.split(train.values, target.values)
        
    for fold_, (trn_idx, val_idx) in enumerate(indices):
        print()
        print("fold n°{}".format(fold_))

        tr = train.iloc[trn_idx]
        val = train.iloc[val_idx]
        y_val = target.iloc[val_idx]
        y_tr = target.iloc[trn_idx]
        
#         val_index_ser = pd.Series(np.array(val.index))
#         print('val shape:',val.shape)
#         print('val index head:',val_index_ser.head(20))
#         print('val index tail:',val_index_ser.tail(20))
        
        trn_data = lgb.Dataset(tr[cur_features], label=y_tr)#,, categorical_feature=categorical_feats)
        val_data = lgb.Dataset(val[cur_features], label=y_val)#,, categorical_feature=categorical_feats)
        
        clf = lgb.train(param, trn_data, num_round, valid_sets = [val_data], verbose_eval=500, 
                        early_stopping_rounds = 300)

        #Prediction based on current fold selected features
        if ispermutefeats:
            
            selected_features, importance_df = permutation_feature_selection(clf, val[cur_features], 
                                                                             y_val,score_function,
                                                                             rep=4,max_delta_score=max_delta_score)
            overall_sel_feats += [selected_features]
            print(selected_features)

#             print('overal imp shape:{0} importance_df shape:{1}'.format(overall_imp_df.shape,importance_df.shape))
            
            overall_imp_df['fold_'+str(fold_)+'score_mean'] = importance_df['delta_score_mean']
            overall_imp_df['fold_'+str(fold_)+'score_max'] = importance_df['delta_score_max']
            overall_imp_df['fold_'+str(fold_)+'score_min'] = importance_df['delta_score_min']
        else:
            oof[val_idx] = clf.predict(val[cur_features], num_iteration=clf.best_iteration)

            fold_importance_df["feature"] = cur_features
            if fold_==0:
                fold_importance_df["importance"] =0
            fold_importance_df["importance"] += clf.feature_importance() / n_splits
            valid_scores+=[clf.best_score['valid_0'][param['metric']]]
            predictions += clf.predict(test[cur_features], num_iteration=clf.best_iteration) / folds.n_splits

    if ispermutefeats:
        fold_mean_cols = [col for col in overall_imp_df.columns if ('score_mean' in col) and ('fold_' in col) ]
        fold_max_cols = [col for col in overall_imp_df.columns if ('score_max' in col) and ('fold_' in col) ]
        fold_min_cols = [col for col in overall_imp_df.columns if ('score_min' in col) and ('fold_' in col) ]
        overall_imp_df['overall_score_mean'] = overall_imp_df[fold_mean_cols].mean(axis=1)
        overall_imp_df['overall_score_max'] = overall_imp_df[fold_max_cols].max(axis=1)
        overall_imp_df['overall_score_min'] = overall_imp_df[fold_min_cols].min(axis=1)
    else:
        print('valid scores:',valid_scores)
        print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

    return fold_importance_df,predictions,oof,overall_imp_df,overall_sel_feats

Model

In [14]:
exclude_cols =['Date.of.Birth','Employment.Type','DisbursalDate',
               'PERFORM_CNS.SCORE.DESCRIPTION','AVERAGE.ACCT.AGE','CREDIT.HISTORY.LENGTH',
               'MobileNo_Avl_Flag','disbursal_year','disbursal_day','disbursal_dayofweek',
               'date_of_birth', 'disbursal_date',
               'UniqueID',targetcol]
features = [c for c in train_df.columns if c not in exclude_cols]
print(features)
target = train_df[targetcol]

['Aadhar_flag', 'Current_pincode_ID', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'Driving_flag', 'Employee_code_ID', 'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'NO.OF_INQUIRIES', 'PAN_flag', 'PERFORM_CNS.SCORE', 'PRI.ACTIVE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.DISBURSED.AMOUNT', 'PRI.NO.OF.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.SANCTIONED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'Passport_flag', 'SEC.ACTIVE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.DISBURSED.AMOUNT', 'SEC.INSTAL.AMT', 'SEC.NO.OF.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.SANCTIONED.AMOUNT', 'State_ID', 'VoterID_flag', 'asset_cost', 'branch_id', 'disbursed_amount', 'ltv', 'manufacturer_id', 'supplier_id', 'age', 'disbursal_month', 'PERFORM_CNS.SCORE.CATEGORY', 'Employment.Type.Category', 'AVERAGE.ACCT.AGE_MONTHS', 'CREDIT.HISTORY.LENGTH_MONTHS']


In [15]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'auc',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         'n_estimators' : 10000,
         "random_state": 4590}

In [16]:
n_splits=5
num_round = 10
max_delta_score =0.0001

In [17]:
# #permute feature importances
# features = ['Employment.Type.Category','PERFORM_CNS.SCORE.CATEGORY',
#                   'PRI.NO.OF.ACCTS', 'PRI.OVERDUE.ACCTS',
#                   'PRI.SANCTIONED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'Passport_flag', 'SEC.ACTIVE.ACCTS']
# fold_importance_df,predictions,oof,overall_imp_df,overall_sel_feats = \
#         runlgb(True,train_df,test_df,param,features,score_function=get_auc_score_complement)

In [18]:
# selected_features = list(overall_imp_df.loc[overall_imp_df['overall_score_mean'] >= max_delta_score, 'feature'])
# selected_features.sort()
# df = pd.DataFrame( np.array(selected_features))
# df.to_csv("overall_selected_features.csv")
# selected_features

In [19]:
# overall_sel_feats[0].sort()
# df = pd.DataFrame(overall_sel_feats[0])
# df.to_csv("selected_feats_fold_0.csv")
# overall_sel_feats[0]

In [20]:
# overall_sel_feats[1].sort()
# df = pd.DataFrame(overall_sel_feats[0])
# df.to_csv("selected_feats_fold_1.csv")
# overall_sel_feats[1]

In [21]:
# overall_sel_feats[2].sort()
# df = pd.DataFrame(overall_sel_feats[0])
# df.to_csv("selected_feats_fold_2.csv")
# overall_sel_feats[2]

In [22]:
# overall_sel_feats[3].sort()
# df = pd.DataFrame(overall_sel_feats[0])
# df.to_csv("selected_feats_fold_3.csv")
# overall_sel_feats[3]

In [23]:
# overall_sel_feats[4].sort()
# df = pd.DataFrame(overall_sel_feats[0])
# df.to_csv("selected_feats_fold_4.csv")
# overall_sel_feats[4]

In [24]:
# overall_imp_df.sort_values(by='overall_score_mean',ascending=False,inplace=True)
# overall_imp_df.to_csv('overall_feats_allfolds.csv')
# overall_imp_df

In [25]:
selected_features = ['AVERAGE.ACCT.AGE_MONTHS',
 'Aadhar_flag',
 'CREDIT.HISTORY.LENGTH_MONTHS',
 'Current_pincode_ID',
 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
 'Employee_code_ID',
 'Employment.Type.Category',
 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
 'NO.OF_INQUIRIES',
 'PAN_flag',
 'PERFORM_CNS.SCORE',
 'PERFORM_CNS.SCORE.CATEGORY',
 'PRI.ACTIVE.ACCTS',
 'PRI.CURRENT.BALANCE',
 'PRI.DISBURSED.AMOUNT',
 'PRI.NO.OF.ACCTS',
 'PRI.OVERDUE.ACCTS',
 'PRI.SANCTIONED.AMOUNT',
 'PRIMARY.INSTAL.AMT',
 'State_ID',
 'VoterID_flag',
 'age',
 'asset_cost',
 'branch_id',
 'disbursal_month',
 'disbursed_amount',
 'ltv',
 'manufacturer_id',
 'supplier_id']

In [26]:
n_splits=10

fold_importance_df,predictions_1,oof_1,overall_imp_df,overall_sel_feats = \
        runlgb(False,train_df,test_df,param,features,score_function=get_auc_score_complement)

n_splits=5


fold n°0
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.668308
[1000]	valid_0's auc: 0.673034
[1500]	valid_0's auc: 0.674765
[2000]	valid_0's auc: 0.675585
[2500]	valid_0's auc: 0.676273
[3000]	valid_0's auc: 0.67698
[3500]	valid_0's auc: 0.677364
[4000]	valid_0's auc: 0.677561
[4500]	valid_0's auc: 0.677566
Early stopping, best iteration is:
[4291]	valid_0's auc: 0.6777

fold n°1
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.662681
[1000]	valid_0's auc: 0.66748
[1500]	valid_0's auc: 0.669627
[2000]	valid_0's auc: 0.670703
[2500]	valid_0's auc: 0.671327
[3000]	valid_0's auc: 0.671813
[3500]	valid_0's auc: 0.67211
Early stopping, best iteration is:
[3593]	valid_0's auc: 0.672165

fold n°2
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.665361
[1000]	valid_0's auc: 0.66887
[1500]	valid_0's auc: 0.669929
[2000]	valid_0's auc: 0.670581
[2500]	valid_0's auc: 0.670839
Early s

In [27]:
sub_df = pd.DataFrame({"UniqueID":test_df["UniqueID"].values})
sub_df[targetcol] = predictions_1
sub_df.to_csv("submission_10folds.csv", index=False)

In [28]:
param = {'colsample_bytree': 0.7196484570790651,
   'min_child_samples': 235,
   'num_leaves': 36,
   'reg_alpha': 0.6474702076362333,
   'reg_lambda': 0.021458900986429996,
   'subsample': 0.8873887256306612,
   'subsample_for_bin': 110000,
   'learning_rate': 0.01,
   'boosting': 'gbdt',
   'bagging_seed': 2018,
   'bagging_freq': 2,
   'min_data_in_bin': 100,
   'n_estimators': 10000,
   'objective': 'binary',
   'metric': 'auc',
   'random_state': 2333,
   'max_depth': 15,
   'scale_pos_weight': 1}

In [29]:
%%time

train_df = pd.read_csv('train_preproc.csv',index_col=0)
test_df = pd.read_csv('test_preproc.csv',index_col=0)

CPU times: user 2.4 s, sys: 104 ms, total: 2.51 s
Wall time: 2.51 s


In [30]:
fold_importance_df,predictions_2,oof_2,overall_imp_df,overall_sel_feats = \
        runlgb(False,train_df,test_df,param,features,score_function=get_auc_score_complement)


fold n°0
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.667691
[1000]	valid_0's auc: 0.672585
[1500]	valid_0's auc: 0.674725
[2000]	valid_0's auc: 0.675616
[2500]	valid_0's auc: 0.676097
[3000]	valid_0's auc: 0.676247
Early stopping, best iteration is:
[3007]	valid_0's auc: 0.676268

fold n°1
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.663716
[1000]	valid_0's auc: 0.668114
[1500]	valid_0's auc: 0.669879
[2000]	valid_0's auc: 0.67104
[2500]	valid_0's auc: 0.671472
[3000]	valid_0's auc: 0.671552
Early stopping, best iteration is:
[3179]	valid_0's auc: 0.671742

fold n°2
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's auc: 0.663908
[1000]	valid_0's auc: 0.66928
[1500]	valid_0's auc: 0.67081
[2000]	valid_0's auc: 0.671479
[2500]	valid_0's auc: 0.672008
[3000]	valid_0's auc: 0.672211
[3500]	valid_0's auc: 0.672309
Early stopping, best iteration is:
[3563]	valid_0's auc: 0.67236



In [31]:
# folds = StratifiedKFold(n_splits=5, shuffle=False, random_state=888)
# oof_1 = np.zeros(len(train_df))
# predictions = np.zeros(len(test_df))
# feature_importance_df = pd.DataFrame()

# for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
#     print("Fold {}".format(fold_))
#     trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
#     val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])

#     num_round = 1000000
#     clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, 
#                     early_stopping_rounds = 300)
#     oof_1[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
#     fold_importance_df = pd.DataFrame()
#     fold_importance_df["Feature"] = features
#     fold_importance_df["importance"] = clf.feature_importance()
#     fold_importance_df["fold"] = fold_ + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
#     predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

# print("CV score: {:<8.5f}".format(roc_auc_score(target, oof_1)))

In [32]:
sub_df = pd.DataFrame({"UniqueID":test_df["UniqueID"].values})
sub_df[targetcol] = predictions_2
sub_df.to_csv("submission_bayesian.csv", index=False)

In [33]:
oof = oof_1 + oof_2
print('Ens AUC:',roc_auc_score(target,oof))
predictions = predictions_1 + predictions_2

Ens AUC: 0.6732370853711219


In [34]:
sub_df = pd.DataFrame({"UniqueID":test_df["UniqueID"].values})
sub_df[targetcol] = predictions
sub_df.to_csv("submission_ens_10folds_bayesian.csv", index=False)

In [35]:
# cols = (feature_importance_df[["Feature", "importance"]]
#         .groupby("Feature")
#         .mean()
#         .sort_values(by="importance", ascending=False)[:150].index)
# best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]

# plt.figure(figsize=(14,28))
# sns.barplot(x="importance", y="Feature", data=best_features.sort_values(by="importance",ascending=False))
# plt.title('Features importance (averaged/folds)')
# plt.tight_layout()
# plt.savefig('FI.png')