Baseline для соревнования Home Credit Default Risk
https://www.kaggle.com/c/home-credit-default-risk

In [2]:
import pandas as pd
import numpy as np
import sklearn
import os
import pyprind
import gc
import re
import math
import seaborn as sns
import warnings
import copy
import time
import matplotlib.pyplot as plt

In [3]:
import lightgbm
from lightgbm import LGBMClassifier

from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split,StratifiedKFold,KFold
from sklearn.metrics import roc_auc_score,classification_report

from scipy.stats import kendalltau, pearsonr, linregress
from pyprind import ProgBar
from collections import OrderedDict

In [5]:
pd.options.display.max_columns = 150
pd.options.display.max_rows = 150
%matplotlib inline
warnings.simplefilter('ignore')
PATH_TO_DATA = 'D:\home_bank_data'
stat_list = ['mean','median','std','max','min']

In [6]:
def reduce_caregorical(columns,count_lim):
    placeholder = 'rare_val' if columns.dtype==object else -1
    vals_count = columns.value_counts()
    vals_count = set(vals_count[vals_count.map(lambda x : x > count_lim )].index)
    return columns.map(lambda x : x if x in vals_count else placeholder )

generate_name = lambda names,prefix : [ '{}_{}'.format(prefix,name) for name in names ]

def calc_stat_fea(groups,columns,stat_list):
    
    result = pd.DataFrame()
    for fea in columns:
        if fea not in ['SK_ID_PREV','SK_ID_CURR']:
            stats = groups[fea].aggregate(stat_list)
            stats.columns = generate_name(stats,fea)
            result = pd.concat( (result, stats),axis=1 )
    return result

def get_last_fea(df,bar,last_name,order_col):
    bar.update()
    return df.loc[df[order_col].idxmax()][last_name]

def extracrt_index_and_diff(df,bar,fea_name,period_count,order_name):
    df = df.sort_values(order_name)
    placeholder = pd.Series( index = [ 'index_{}_{}'.format(fea_name,val) for val in range(period_count) ] \
                            + ['diff_{}_{}'.format(fea_name,val) for val in range(period_count)] )
    if len(df)<period_count+1:
        period_count = len(df)-1
    fea_col = df[fea_name]
    for i in range(0,period_count):
        vals_1,vals_2 = fea_col.iloc[-(i+1)],fea_col.iloc[-(i+2)]
        placeholder['index_{}_{}'.format(fea_name,i)] = vals_1  / vals_2
        placeholder['diff_{}_{}'.format(fea_name,i)] = vals_1 -vals_2
    bar.update()
    return placeholder

In [63]:
def validate(data,model_type, param,nfolds, fea_list, validate_size = 0.1, stratify=False, seed = 42,
             validate_seed = 42, is_debug = True,use_early_stop = False , early_stop_verbose = 10, 
             early_stopping_rounds = 10):
    
    #Tracer()()
    labeled_data = data[data.TARGET.notnull()]
    if not is_debug:
        submit_data = data[data.TARGET.isnull()][fea_list]
    train_data,validate_data = train_test_split(labeled_data, test_size=validate_size, random_state=validate_seed)
    validate_set,validate_target = validate_data[fea_list], validate_data['TARGET']
    del validate_data,labeled_data
    
    splits = KFold(n_splits=nfolds,shuffle=True,random_state=seed)
    validate_preditction = np.zeros(len(validate_set))
    if not is_debug:
        submit_prediction = np.zeros(len(submit_data))
    else:
        fea_imp = np.zeros((len(fea_list),nfolds))

    param = copy.deepcopy(param)
    param['random_state'] = seed
    oof_score = np.zeros(nfolds)
    for i,(train_idx,test_idx) in enumerate(splits.split(train_data)):
        
        train_set,test_set = train_data.iloc[train_idx][fea_list],train_data.iloc[test_idx][fea_list]
        train_target,test_target = train_data.iloc[train_idx]['TARGET'],train_data.iloc[test_idx]['TARGET']
        
        model = model_type(**param)
        if use_early_stop:
            model.fit(train_set,train_target,eval_set = [(test_set,test_target)], verbose = early_stop_verbose,
                      early_stopping_rounds=early_stopping_rounds, eval_metric='auc')
        else:
            model.fit(train_set,train_target)
        
        test_prediction = model.predict_proba(test_set)[:,1]
        train_prediction = model.predict_proba(train_set)[:,1]
        validate_preditction += model.predict_proba(validate_set)[:,1]
        test_auc = roc_auc_score(y_score=test_prediction,y_true=test_target)
        oof_score[i] = test_auc 
        print('fold = {} oof score = {}, train score = {} , validate_score = {} '.format( i+1,
            test_auc,
            roc_auc_score(y_score=train_prediction,y_true=train_target),
            roc_auc_score(y_score=model.predict_proba(validate_set)[:,1],y_true=validate_target)))
                        
        if not is_debug:
            submit_prediction += model.predict_proba(submit_data)[:,1]
        else:
            fea_imp[:,i] = model.feature_importances_
        
        del train_set,train_target,test_set,test_target,model
        gc.collect()
    
    validate_preditction/= nfolds 
    print('with seed {} mean oof = {} , std oof = {}'.format(seed,np.mean(oof_score),np.std(oof_score)))
    print('with validation seed {} validate score = {}'.format( validate_seed, 
                                                    roc_auc_score(y_score=validate_preditction,y_true=validate_target)))
    
    if not is_debug:
        submit_prediction/= nfolds
        return pd.Series(submit_prediction,index = submit_data.index)
    else:
        return fea_imp
    

In [52]:
def get_app_data(train_df,test_df):
    
    train_df = train_df.copy(deep=True)
    train_df.drop(axis=0,index=train_df.query(
    '(NAME_FAMILY_STATUS=="Unknown")|(CODE_GENDER=="XNA")|(NAME_INCOME_TYPE=="Maternity leave")').index,inplace=True)
    drop_list = [ fea for fea in test_df if test_df[fea].nunique()==1 ]
    app_data = pd.concat((train_df,test_df))
    app_data.drop(drop_list,axis=1,inplace=True)

    bin_fea = [fea for fea in app_data if app_data[fea].nunique()==2 and app_data[fea].dtype==object]
    for fea in bin_fea:
        app_data.loc[:,fea],_ = pd.factorize(app_data[fea])

    app_data.loc[:,'over_sum_amount'] = app_data.AMT_CREDIT - app_data.AMT_GOODS_PRICE
    app_data.loc[:,'over_sum_amount'] = app_data['over_sum_amount'].fillna(0)
    app_data.loc[:,'relation_amount'] = app_data['over_sum_amount'] / app_data.AMT_CREDIT
    app_data.loc[:,'part_of_income'] =  app_data['AMT_ANNUITY'] / app_data['AMT_INCOME_TOTAL']
    app_data.loc[:,'part_of_income_per_person'] = (app_data['AMT_INCOME_TOTAL']-app_data['AMT_ANNUITY'])\
                                                /(app_data['CNT_CHILDREN']+1)
    app_data.loc[:,'payment_rate'] = app_data['AMT_ANNUITY'] / app_data['AMT_CREDIT']
    app_data.loc[:,'rel_size'] = app_data['AMT_CREDIT'] / app_data['AMT_INCOME_TOTAL']
    app_data.loc[:,'credit_rate'] = app_data['AMT_CREDIT']  / app_data['AMT_ANNUITY']
    app_data.loc[:,'work_rate'] = app_data['DAYS_EMPLOYED'] / app_data['DAYS_BIRTH'] 
    app_data.loc[:,'price_rate'] = app_data['AMT_CREDIT'] / app_data['AMT_GOODS_PRICE']
    app_data.loc[:,'EXT_SOURCE_2 / DAYS_BIRTH'] = app_data['EXT_SOURCE_2'] / app_data['DAYS_BIRTH']
    app_data.loc[:,'EXT_SOURCE_1*EXT_SOURCE_2'] = app_data['EXT_SOURCE_1']*app_data['EXT_SOURCE_2']
    app_data.loc[:,'DAYS_EMPLOYED-DAYS_BIRTH'] = app_data['DAYS_EMPLOYED'] - app_data['DAYS_BIRTH']
    ext_name = ['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']

    app_data.loc[:,'ext_means'] = app_data[ext_name].mean(axis=1)
    app_data.loc[:,'ext_std'] = app_data[ext_name].std(axis=1)
    app_data.loc[:,'delta_work_both'] = app_data['DAYS_EMPLOYED'] - app_data['DAYS_BIRTH']

    dummy_col = [ fea for fea in app_data if app_data[fea].dtype==object ]
    app_data = pd.get_dummies(app_data,columns=dummy_col)
    app_data.set_index('SK_ID_CURR',inplace=True)
    return app_data

In [44]:
def bureau_and_balance_data(df_bureau,balance_data):
    
    df_bureau = df_bureau.copy(deep=True)
    df_bureau.loc[:,'credit_period_waited'] = df_bureau.DAYS_CREDIT_ENDDATE - df_bureau.DAYS_CREDIT
    df_bureau.loc[:,'credit_period_fact'] = df_bureau.DAYS_ENDDATE_FACT - df_bureau.DAYS_CREDIT
    df_bureau.loc[:,'debt'] =  (df_bureau.AMT_CREDIT_SUM - df_bureau.AMT_CREDIT_SUM_DEBT)/ \
                            df_bureau.AMT_CREDIT_SUM
    df_bureau.loc[:,'annuity'] = df_bureau.AMT_CREDIT_SUM / (df_bureau.credit_period_waited/30.4)
    df_bureau.loc[:,'payment_rate'] = df_bureau.annuity / df_bureau.AMT_CREDIT_SUM
    df_bureau.loc[:,'diffs_period'] = df_bureau.credit_period_waited - df_bureau.credit_period_fact
    df_bureau.loc[:,'rate_period'] = df_bureau.credit_period_waited / df_bureau.credit_period_fact
    
    df_bureau = pd.get_dummies(df_bureau,columns=['CREDIT_TYPE','CREDIT_ACTIVE'])
    dummy_name = [ fea for fea in df_bureau if re.match('CREDIT_TYPE',fea) or re.match('CREDIT_ACTIVE',fea) ] 
    
    group = df_bureau.groupby(by='SK_ID_CURR')
    cat_sum = group[dummy_name].sum()
    cat_sum.columns = generate_name(cat_sum.columns,'sum')
    cat_mean = group[dummy_name].mean()
    cat_mean.columns = generate_name(cat_mean.columns,'mean')
    last_update = group['DAYS_CREDIT_UPDATE'].max()
    last_update.name= 'last_update'
    ltv = group['DAYS_CREDIT'].min() - group['DAYS_CREDIT_UPDATE'].max()
    ltv.name = 'ltv'
    annuity_diff = group['annuity'].max() - group['annuity'].min()
    annuity_diff.name = 'annuity_diff'
    payment_rate_diff = group['payment_rate'].max() - group['payment_rate'].min()
    payment_rate_diff.name = 'payment_rate_diff'
    sum_overdue = group['AMT_CREDIT_SUM_OVERDUE'].sum()
    sum_overdue.name = 'sum_overdue'

    data_without_card = df_bureau[df_bureau['CREDIT_TYPE_Credit card']==0]
    group = data_without_card.groupby(by='SK_ID_CURR')
    credit_count  = group['SK_ID_BUREAU'].size()
    credit_count.name = 'credit_count'
    non_card_stat_fea_name = ['credit_period_waited','credit_period_fact','debt','annuity','payment_rate',
                          'AMT_CREDIT_SUM','AMT_CREDIT_SUM_DEBT','DAYS_CREDIT_UPDATE','diffs_period','rate_period']
    stat_name = ['min','max','median','mean','std']
    non_card_fea = calc_stat_fea(group,non_card_stat_fea_name,stat_name)
    non_card_fea.columns = generate_name(non_card_fea.columns,'non_card_fea')

    bar = ProgBar(len(group),title='extract non-card last fea')
    non_card_last = group.apply(get_last_fea,bar,non_card_stat_fea_name,'DAYS_CREDIT_UPDATE')
    non_card_last.columns = generate_name(non_card_last.columns,'last_non_card_fea')

    card_data = df_bureau[df_bureau['CREDIT_TYPE_Credit card']==1]
    group = card_data.groupby(by='SK_ID_CURR')
    card_count = group.size()
    card_count.name = 'card_count'
    card_fea_name = ['debt','AMT_CREDIT_SUM','AMT_CREDIT_SUM_DEBT']
    card_fea = calc_stat_fea(group,card_fea_name,stat_name)
    card_fea.columns = generate_name(card_fea.columns,'card_fea')

    bar = ProgBar(len(group),title='extract card last fea')
    card_last = card_data.groupby(by='SK_ID_CURR').apply(get_last_fea,bar,card_fea_name,'DAYS_CREDIT_UPDATE')
    card_last.columns = generate_name(card_last.columns,'last')
    
    balance_data = balance_data.merge(right=df_bureau[['SK_ID_BUREAU','SK_ID_CURR']],
                          right_on='SK_ID_BUREAU',left_on = 'SK_ID_BUREAU',how='left')
    balance_data = pd.get_dummies(balance_data,columns=['STATUS'])
    bureau_balance_dummy_name = [ fea for fea in balance_data if re.match('STATUS',fea)]
    group = balance_data.groupby(by='SK_ID_CURR')
    sum_status = group[bureau_balance_dummy_name].sum()
    sum_status.columns = generate_name(sum_status.columns,'sum_status')
    mean_status = group[bureau_balance_dummy_name].mean()
    mean_status.columns = generate_name(mean_status.columns,'mean_status')
    
    bureau_df = pd.concat( (cat_sum,cat_mean, last_update, ltv,credit_count, non_card_fea, annuity_diff,
                            payment_rate_diff, sum_overdue, card_count, card_fea, card_last,non_card_last,
                        sum_status,mean_status, ),axis=1 )
    bureau_df.columns = generate_name(bureau_df.columns,'bureau')
    return bureau_df

In [35]:
def get_previous_application_features(df_prev):

    df_prev = df_prev.copy(deep=True)
    df_prev.loc[:,'diff_amt_credit_amt_good_price'] = df_prev.AMT_CREDIT - df_prev.AMT_GOODS_PRICE
    df_prev.loc[:,'diff_amt_credit_amt_application']= df_prev.AMT_CREDIT - df_prev.AMT_APPLICATION
    df_prev.loc[:,'diff_amt_application_amt_good_price'] = df_prev.AMT_APPLICATION - df_prev.AMT_GOODS_PRICE
    df_prev.loc[:,'pure_sum'] = df_prev.AMT_GOODS_PRICE - df_prev.AMT_DOWN_PAYMENT.fillna(0)
    df_prev.loc[:,'diff_credit_pure'] = df_prev.AMT_CREDIT - df_prev.pure_sum
    df_prev.loc[:,'payment_rate'] = df_prev.AMT_ANNUITY  / df_prev.AMT_CREDIT
    df_prev.loc[:,'total_sum'] = df_prev.CNT_PAYMENT * df_prev.AMT_ANNUITY
    df_prev.loc[:,'diff_total_sum_good_price'] = df_prev.total_sum - df_prev.AMT_GOODS_PRICE
    df_prev.loc[:,'diff_total_sum_amt_credit'] = df_prev.total_sum - df_prev.AMT_CREDIT
    df_prev.loc[:,'diff_total_sum_amt_application'] = df_prev.total_sum  - df_prev.AMT_APPLICATION
    df_prev.loc[:,'diff_total_sum_pure_sum'] = df_prev.total_sum -df_prev.pure_sum
    df_prev.loc[:,'concat_STATUS_TYPE'] = df_prev['NAME_CONTRACT_STATUS'] + df_prev['NAME_CONTRACT_TYPE']
    df_prev.loc[:,'interest_rate'] = (df_prev.total_sum - df_prev.pure_sum)\
                    / (df_prev.CNT_PAYMENT*df_prev.total_sum)
    df_prev.loc[:,'delta_days_waited'] = df_prev.DAYS_LAST_DUE_1ST_VERSION - df_prev.DAYS_FIRST_DUE
    df_prev.loc[:,'delta_days_fact'] = df_prev.DAYS_LAST_DUE - df_prev.DAYS_FIRST_DRAWING
    df_prev.loc[:,'diff_waited_fact'] = df_prev.DAYS_LAST_DUE_1ST_VERSION - df_prev.DAYS_LAST_DUE
    df_prev.loc[:,'rate_fact_waited'] = df_prev.DAYS_LAST_DUE_1ST_VERSION / df_prev.DAYS_LAST_DUE

    approved_data = df_prev.query('(NAME_CONTRACT_TYPE!="Revolving loans")&(NAME_CONTRACT_STATUS=="Approved")')
    app_group = approved_data.groupby(by='SK_ID_CURR')

    bar = ProgBar(approved_data.SK_ID_CURR.nunique(),title='extract payment index')
    index_payment_rate = app_group.apply(extracrt_index_and_diff,bar,'payment_rate',4,'DAYS_DECISION')

    bar = ProgBar(approved_data.SK_ID_CURR.nunique(),title = 'extract interest rate')
    index_interest_rate = app_group.apply(extracrt_index_and_diff,bar,'interest_rate',4,'DAYS_DECISION')

    stat_agg_name = ['diff_amt_application_amt_good_price','diff_credit_pure','pure_sum','payment_rate',
                     'diff_total_sum_amt_credit','diff_total_sum_amt_application','total_sum',
                     'diff_total_sum_pure_sum', 'AMT_ANNUITY','interest_rate','delta_days_waited','delta_days_fact',
                     'diff_waited_fact','rate_fact_waited','CNT_PAYMENT']

    group = df_prev.groupby(by='SK_ID_CURR')
    aggregate_data = calc_stat_fea(group,stat_agg_name,stat_list=stat_list)

    ltv = group['DAYS_DECISION'].min() - group['DAYS_TERMINATION'].max()
    ltv.name = 'ltv'

    last_name = ['diff_total_sum_pure_sum','interest_rate','diff_total_sum_amt_application',
                 'diff_total_sum_amt_credit','diff_total_sum_good_price',
                 'total_sum','payment_rate','pure_sum','diff_amt_application_amt_good_price',
                 'diff_amt_credit_amt_application','diff_amt_credit_amt_good_price','DAYS_TERMINATION',
                 'HOUR_APPR_PROCESS_START','DAYS_DECISION','rate_fact_waited','diff_waited_fact','delta_days_fact',
                 'AMT_ANNUITY']
    bar = ProgBar(len(app_group),title = 'extract last fea')
    last_fea = app_group.apply(get_last_fea,bar,last_name,order_col='DAYS_DECISION')
    last_fea.columns = generate_name(last_fea.columns,'last')

    revolve_name = ['AMT_ANNUITY','AMT_CREDIT','payment_rate']
    revolve_data = df_prev.query('(NAME_CONTRACT_STATUS=="Approved")&(NAME_CONTRACT_TYPE=="Revolving loans")')
    bar = ProgBar(revolve_data.SK_ID_CURR.nunique(),'extract revolve fea')
    revolv_fea = revolve_data.groupby(by='SK_ID_CURR').apply(get_last_fea,bar,revolve_name,'DAYS_DECISION')
    revolv_fea.columns = generate_name(revolv_fea.columns,'revolve')
    
    dummy_col = ['NAME_CONTRACT_STATUS','NAME_CONTRACT_TYPE','concat_STATUS_TYPE']
    dummy_data = pd.get_dummies(df_prev[dummy_col+['SK_ID_CURR']],columns=dummy_col)
    dummy_group = dummy_data.groupby('SK_ID_CURR')
    dummy_sum = dummy_group.sum()
    dummy_sum.columns = generate_name(dummy_sum.columns,'sum')
    dummy_mean = dummy_group.mean()
    dummy_mean.columns = generate_name(dummy_mean.columns,'mean')
    
    result = pd.concat( (index_payment_rate,index_interest_rate,aggregate_data,ltv,last_fea,revolv_fea, 
                         dummy_mean,dummy_sum) ,axis=1)
    result.columns = generate_name(result.columns,'prev')
    return result

In [29]:
fea_names = ['payment_diff','count_over_instalment','amt_over','amt_incomplete',
             'over_rate','incompete_rate','total_instalment','over_count_rate']

def extract_payment_fea(df,bar):                           
    amt_instalment = df.groupby(by=['SK_ID_PREV','DAYS_INSTALMENT'])['AMT_INSTALMENT'].last()
    total_instalment = amt_instalment.sum()    
    total_payment = df['AMT_PAYMENT'].sum()
    payment_diffs = total_instalment - total_payment
    bad_payment_sum = df.query('delta_days<0')['AMT_PAYMENT'].sum()
    over_payment_fea = df.query( '(delta_days>0) & (AMT_PAYMENT>AMT_INSTALMENT)' )['AMT_PAYMENT']
    over_payment_fea.name = 'AMT_OVER'
    add_fea = []
    for col in [amt_instalment,df['AMT_PAYMENT'],over_payment_fea,]:
        stats = col.aggregate(stat_list)
        stats.index = [ '{}_{}'.format(name,col.name) for name in stat_list ]
        add_fea.append(stats)
    over_payment_count = len(over_payment_fea)
    over_payment_sum = over_payment_fea.sum()
    over_rate = over_payment_sum / total_instalment
    incompete_rate = bad_payment_sum / total_instalment
    over_count_rate = over_payment_count / len(amt_instalment)
    bar.update()
    add_fea.append(  pd.Series( 
        [payment_diffs,over_payment_count, over_payment_sum,bad_payment_sum,over_rate,
         incompete_rate,total_instalment,over_count_rate],
        index=fea_names))
    return pd.concat(add_fea)

def get_payment_features(df_payment):
    df_payment.loc[:,'delta_days'] =  df_payment['DAYS_INSTALMENT'] - df_payment['DAYS_ENTRY_PAYMENT']
    df_payment.loc[:,'bad_payment'] = df_payment.delta_days <0

    stat_name = ['mean','median','std','max','min']
    payment_group = df_payment.groupby(by='SK_ID_CURR')
    days = calc_stat_fea(payment_group,['delta_days'],stat_list)
    bad_payment_count = payment_group['bad_payment'].sum()
    bad_payment_rate = payment_group['bad_payment'].mean()
    bad_payment_count.name = 'bad_payment_count'
    bad_payment_rate.name = 'bad_payment_rate'
    num_unique_inst = payment_group['NUM_INSTALMENT_VERSION'].nunique()
    ltv = (payment_group['DAYS_ENTRY_PAYMENT'].max() -  payment_group['DAYS_INSTALMENT'].min()).map(abs)
    ltv.name = 'life_time_value'
    last_day = payment_group['DAYS_ENTRY_PAYMENT'].max()
    last_day.name = 'last_entry_payment'
    last_delta_day = payment_group['DAYS_ENTRY_PAYMENT'].max() -  payment_group['DAYS_INSTALMENT'].max()
    last_delta_day.name = 'last_delta_day'

    bar = ProgBar(payment_data.SK_ID_CURR.nunique())
    correlation = payment_group.apply(extract_corr,bar)
    
    non_card_payment = df_payment.query('(NUM_INSTALMENT_VERSION!=0) & (AMT_INSTALMENT!=0)')
    bar = ProgBar(non_card_payment.SK_ID_CURR.nunique())
    non_card_payment_fea = non_card_payment.groupby(by = 'SK_ID_CURR').apply(extract_payment_fea,bar)
    non_card_payment_fea.columns = generate_name(non_card_payment_fea.columns,'non_card')

    card_payment = df_payment.query('(NUM_INSTALMENT_VERSION==0) & (AMT_INSTALMENT!=0)')
    bar = ProgBar(card_payment.SK_ID_CURR.nunique())
    card_payment_fea = card_payment.groupby(by = 'SK_ID_CURR').apply(extract_payment_fea,bar)
    card_payment_fea.columns = generate_name(card_payment_fea.columns,'card')

    payment_agg_data = df_payment.query('(AMT_INSTALMENT!=0)')
    bar = ProgBar(payment_agg_data.SK_ID_CURR.nunique())
    total_payment_fea = payment_agg_data.groupby(by='SK_ID_CURR').apply(extract_payment_fea,bar)
    total_payment_fea.columns = generate_name(total_payment_fea.columns,'total')
    last_payment_dict = OrderedDict()
    for i in [-10,-30,-50,-100,-200,-300,-400,-500,]:
        last_data = payment_data.query('DAYS_ENTRY_PAYMENT>={}'.format(i)).groupby('SK_ID_CURR')
        count_last_payments = last_data.size()
        count_bad_payment = last_data['bad_payment'].sum()
        bad_rate = count_bad_payment / count_last_payments
        last_payment_dict['last_payment_{}'.format(i)] = count_last_payments
        last_payment_dict['last_bad_payment_{}'.format(i)] = count_bad_payment
        last_payment_dict['last_bad_rate_{}'.format(i)] = bad_rate
    last_payment_df = pd.DataFrame(last_payment_dict)
    
    result = pd.concat( (days,bad_payment_count,bad_payment_rate,bad_payment_count,num_unique_inst,
                       ltv,last_day,last_delta_day,
                         correlation,
                         non_card_payment_fea,
                         card_payment_fea,
                         total_payment_fea,
                        last_payment_df),axis=1  )
    result.columns = generate_name(result.columns,'payment_data')
    return result

In [30]:
def get_pos_features(df_pos):
    df_pos = pd.get_dummies(df_pos,columns=['NAME_CONTRACT_STATUS'],dummy_na=True)
    dummy_col = [fea for fea in df_pos if re.match('NAME_CONTRACT_STATUS',fea)]
    pos_group = df_pos.groupby(by='SK_ID_CURR')
    state_sum = pos_group[dummy_col].sum()
    state_sum.columns = generate_name(state_sum.columns,'sum')
    state_mean = pos_group[dummy_col].mean()
    state_mean.columns = generate_name(state_mean.columns,'mean')
    size = pos_group.size()
    size.name = 'record_count'
    dpd_days = calc_stat_fea(pos_group,['SK_DPD','SK_DPD_DEF'],stat_list=stat_list)
    count_zero_instalment = pos_group['CNT_INSTALMENT_FUTURE'].apply( lambda df: np.sum(df==0) )
    count_zero_instalment.name = 'count_zero_instalment'
    mean_zero_instalment = count_zero_instalment / size
    mean_zero_instalment.name = 'mean_zero_instalment'
    pos_ltv = pos_group['MONTHS_BALANCE'].min() - pos_group['MONTHS_BALANCE'].max()
    pos_ltv.name = 'ltv'
    pos_last_date = pos_group['MONTHS_BALANCE'].max()
    pos_last_date.name = 'last_date'
    post_last_CNT_INSTALMENT = pos_group[['CNT_INSTALMENT_FUTURE','MONTHS_BALANCE']].apply( lambda df : 
                                        df.loc[df['MONTHS_BALANCE'].idxmax()]['CNT_INSTALMENT_FUTURE'] )
    post_last_CNT_INSTALMENT.name = 'last_CNT_INSTALMENT'
    pos_date = calc_stat_fea(pos_group,['MONTHS_BALANCE'],stat_list)
    future_payment_fea = df_pos.query('(MONTHS_BALANCE==-1)&(CNT_INSTALMENT_FUTURE!=0)').groupby(
        by='SK_ID_CURR')['CNT_INSTALMENT_FUTURE'].size()
    future_payment_fea.name = 'count_open_credit'
    pos_count_credit = pos_group['SK_ID_PREV'].nunique()
    pos_count_credit.name = 'count_credit'
    cnt_instalment_stats = calc_stat_fea(pos_group,['CNT_INSTALMENT_FUTURE'],stat_list)
    cnt_instalment = calc_stat_fea(pos_group,['CNT_INSTALMENT'],stat_list)
    pos_features = pd.concat(  (
        state_mean,
        state_sum,
        size,
        mean_zero_instalment,
        count_zero_instalment,
        dpd_days,
        pos_ltv,
        pos_date,
        post_last_CNT_INSTALMENT,
        cnt_instalment,
        future_payment_fea,
        pos_count_credit,
        cnt_instalment_stats,
    ),axis=1)
    pos_features.columns = generate_name(pos_features.columns,'pos')
    return pos_features

In [38]:
reg_name = ['slope','intercept']
def extract_trend_fea(df,bar,last,fea_name):
    names = [ '{}_{}'.format(fea_name,name) for name in reg_name ] 
    if last!=-1:
        names = ['last_{}_{}'.format(last,name) for name in names ]
    df = df.sort_values('MONTHS_BALANCE').groupby(by='MONTHS_BALANCE').sum()[fea_name].reset_index()
    if last!=-1 and last>len(df):
        last = len(df)
        df = df[-last:]
    vals = StandardScaler().fit_transform(df.values)
    regres = linregress(vals)
    bar.update()
    return pd.Series( [regres.slope,regres.intercept] ,
                     index=names )

def extract_corr(df,bar):
    col_1,col_2 = df['DAYS_INSTALMENT'],df['DAYS_ENTRY_PAYMENT']
    corr_k_days = kendalltau(col_1,col_2)[0]
    corr_p_days = pearsonr(col_1,col_2)[0]
    col_1,col_2 = df['AMT_PAYMENT'],df['AMT_INSTALMENT']
    corr_k_amt = kendalltau(col_1,col_2)[0]
    corr_p_amt = pearsonr(col_1,col_2)[0]
    bar.update()
    return pd.Series( [corr_k_days,corr_p_days,corr_k_amt,corr_p_amt],
                      index = ['days_corr_kend','days_corr_pear','amt_corr_kend','amt_corr_kend'])

def balance_change_extract(df,bar):
    df = df.sort_values(by='MONTHS_BALANCE')
    data = df.groupby(by='MONTHS_BALANCE')[['AMT_BALANCE','AMT_RECEIVABLE_PRINCIPAL','AMT_RECIVABLE']].sum()
    fea_dict = OrderedDict()
    temp = data.diff(1)
    fea_dict['corr_balance_principal'] = kendalltau(data['AMT_BALANCE'],data['AMT_RECEIVABLE_PRINCIPAL'])[0]
    fea_dict['corr_balance_RECEIVABLE'] = kendalltau(data['AMT_BALANCE'],data['AMT_RECIVABLE'])[0]
    for fea in temp:
        for func in [np.mean,np.max,np.std,np.min]:
            fea_dict['diff_{}_{}'.format(fea,func.__name__)] = func(data[fea])
    bar.update()
    return pd.Series(fea_dict)

def pos_atm_correlation(df,bar):
    atm_correlation = pearsonr(df['AMT_DRAWINGS_CURRENT'],df['AMT_DRAWINGS_ATM_CURRENT'])[0]
    pos_correlation = pearsonr(df['AMT_DRAWINGS_CURRENT'],df['AMT_DRAWINGS_POS_CURRENT'])[0]
    other_correlation = pearsonr(df['AMT_DRAWINGS_CURRENT'],df['AMT_DRAWINGS_POS_CURRENT'])[0]
    bar.update()
    return pd.Series([atm_correlation,pos_correlation,other_correlation],
                    index = ['atm_correlation','pos_correlation','other_correlation'])

card_last_name = ['last_AMT_CREDIT_LIMIT_ACTUAL','last_AMT_BALANCE','last_index_AMT_BALANCE',
                  'last_diff_AMT_BALANCE']
def card_last_balance_data(df,bar):
    df = df.sort_values(by='MONTHS_BALANCE')
    last_record = df.iloc[-1]
    last_debt = last_record['AMT_BALANCE']
    last_limit = last_record['AMT_CREDIT_LIMIT_ACTUAL']
    last_index = np.nan
    last_diff = np.nan
    if len(df)>1:
        balance = df.iloc[-2]['AMT_BALANCE']
        last_index =  last_debt/ balance
        last_diff = last_debt - balance
    bar.update()
    return pd.Series( [last_limit,last_debt,last_index,last_diff] ,index = card_last_name )

def get_credit_card_data(credit_card_balance):
    
    credit_card_balance = pd.get_dummies(credit_card_balance,columns=['NAME_CONTRACT_STATUS'])
    dummy_name = [fea for fea in credit_card_balance if re.match('NAME_CONTRACT_STATUS',fea)]
    credit_card_balance.loc[:,'atm_diff'] = credit_card_balance.AMT_DRAWINGS_CURRENT - \
                                            credit_card_balance.AMT_DRAWINGS_ATM_CURRENT
    credit_card_balance.loc[:,'pos_diff'] = credit_card_balance.AMT_DRAWINGS_CURRENT - \
                                            credit_card_balance.AMT_DRAWINGS_POS_CURRENT
    credit_card_balance.loc[:,'other_diff'] = credit_card_balance.AMT_DRAWINGS_CURRENT - \
                                              credit_card_balance.AMT_DRAWINGS_OTHER_CURRENT
    credit_card_balance.loc[:,'payment_diff'] = credit_card_balance.AMT_PAYMENT_TOTAL_CURRENT - \
                                                credit_card_balance.AMT_PAYMENT_CURRENT
    credit_card_balance.loc[:,'amt_debt'] = credit_card_balance.AMT_BALANCE - credit_card_balance.AMT_RECIVABLE
    credit_card_balance.loc[:,'amt_debt_PRINCIPAL']  = credit_card_balance.AMT_BALANCE - \
                                                       credit_card_balance.AMT_RECEIVABLE_PRINCIPAL
    credit_card_balance.loc[:,'amt_debt_TOTAL'] = credit_card_balance.AMT_BALANCE - \
                                                          credit_card_balance.AMT_TOTAL_RECEIVABLE
    credit_card_balance.loc[:,'diff_pincipal_receivable'] = credit_card_balance.AMT_RECEIVABLE_PRINCIPAL -\
                                                            credit_card_balance.AMT_RECIVABLE    
    credit_card_balance.loc[:,'card_payment_rate'] = credit_card_balance.AMT_BALANCE -\
                                                        credit_card_balance.AMT_PAYMENT_CURRENT
    
    bar = ProgBar(credit_card_balance.SK_ID_CURR.nunique())
    corr_data = credit_card_balance.groupby(by='SK_ID_CURR')[['AMT_DRAWINGS_CURRENT',
                   'AMT_DRAWINGS_ATM_CURRENT','AMT_DRAWINGS_POS_CURRENT' ]].apply(pos_atm_correlation,bar)
    
    groups = credit_card_balance.groupby(by='SK_ID_CURR')
    mean_status = groups[dummy_name].mean()
    mean_status.columns = generate_name(mean_status.columns,'mean')
    sum_status = groups[dummy_name].sum()
    sum_status.columns = generate_name(sum_status.columns,'sum')
    card_ltv = groups.MONTHS_BALANCE.min() - groups.MONTHS_BALANCE.max()
    card_ltv.name = 'card_life_time_value'
    total_card_count = groups['SK_ID_PREV'].nunique()
    total_card_count.name = 'total_card_count'
    total_record_count = groups.size()
    total_record_count.name = 'card_total_record_count'
    cnt_instalment_sum = groups['CNT_INSTALMENT_MATURE_CUM'].max()
    cnt_instalment_sum.name = 'cnt_instalment_sum'
    
    diff_name = ['atm_diff','pos_diff','other_diff','amt_debt','amt_debt_PRINCIPAL','amt_debt_TOTAL',
                 'diff_pincipal_receivable','AMT_BALANCE','SK_DPD','AMT_CREDIT_LIMIT_ACTUAL',
                 'CNT_DRAWINGS_CURRENT']
    stat_diff_fea = calc_stat_fea(groups,diff_name,stat_list)
    
    bar = ProgBar(credit_card_balance.SK_ID_CURR.nunique())
    actual_credit_linmit_data = groups[['MONTHS_BALANCE','AMT_CREDIT_LIMIT_ACTUAL']].apply(
        extract_trend_fea,bar,-1,'AMT_CREDIT_LIMIT_ACTUAL',)
    
    bar = ProgBar(credit_card_balance.SK_ID_CURR.nunique())
    last_credit_card_data = groups[['MONTHS_BALANCE','AMT_BALANCE',
                                    'AMT_CREDIT_LIMIT_ACTUAL']].apply( card_last_balance_data, bar )
    
    drawing_sum = groups.AMT_DRAWINGS_CURRENT.sum()
    rate_fea_dict = OrderedDict()
    for name,fea_name in zip(['pos_rate','atm_rate','other_rate'],['AMT_DRAWINGS_POS_CURRENT',
                                            'AMT_DRAWINGS_ATM_CURRENT','AMT_DRAWINGS_OTHER_CURRENT']):
        fea = groups[fea_name].sum() / drawing_sum
        fea.name = name
        rate_fea_dict[name] = fea
    
    rates_diffs = rate_fea_dict['pos_rate']  - rate_fea_dict['atm_rate']
    rates_diffs.name = 'diffs_pos_atm_rate'
    
    rate_fea_dict['diffs_pos_atm_rate'] = rates_diffs
    rate_frame = pd.DataFrame(rate_fea_dict)
    
    bar = ProgBar(credit_card_balance.SK_ID_CURR.nunique())
    credit_balancer_fea = groups.apply(balance_change_extract,bar)
    
    df_balance_credit = pd.concat((
        mean_status,sum_status,
        card_ltv,
        stat_diff_fea,
        total_record_count,
        total_card_count,
        rate_frame,
        cnt_instalment_sum,
        credit_balancer_fea,
        corr_data,
        actual_credit_linmit_data,
        last_credit_card_data,
        ),axis=1)
    df_balance_credit.columns = generate_name(df_balance_credit.columns,'crc')
    return df_balance_credit

In [23]:
def load_df_and_get_features(func_name,*args):
    datataset_list = []
    for name in args:
        datataset_list.append(pd.read_csv(os.path.join(PATH_TO_DATA,name)))
    features = func_name(*datataset_list)
    del datataset_list
    gc.collect()
    return features

In [37]:
app_features =   load_df_and_get_features(get_app_data,'application_train.csv','application_test.csv')
bureau_balance_feature = load_df_and_get_features(bureau_and_balance_data,'bureau.csv','bureau_balance.csv')
pos_features = load_df_and_get_features(get_pos_features,'POS_CASH_balance.csv')
prev_app_features = load_df_and_get_features(get_previous_application_features,'previous_application.csv')
credit_balance_features = load_df_and_get_features(get_credit_card_data,'credit_card_balance.csv')
payment_features = load_df_and_get_features(get_payment_features,'installments_payments.csv')

extract non-card last fea
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:04:18

Total time elapsed: 00:04:18
extract card last fea
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:51

Total time elapsed: 00:02:51
extract payment index
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:59

Total time elapsed: 00:03:59
extract interest rate
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:53

Total time elapsed: 00:03:53
extract last fea
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:05:00

Total time elapsed: 00:05:00
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:15

Total time elapsed: 00:01:15
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:56

Total time elapsed: 00:00:56
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00

In [55]:
df_features = pd.concat( (app_features,bureau_balance_feature,
                          pos_features,prev_app_features,credit_balance_features,payment_features),
                        axis=1 )
del app_features,bureau_balance_feature,pos_features,prev_app_features,credit_balance_features,payment_features

In [60]:
drop_list = []
for fea in df_features:
    if df_features[fea].nunique()==1:
        drop_list.append(fea)
df_features.drop(labels=drop_list,axis=1,inplace=True)

In [65]:
def reduce_mem_usage(data, verbose = True):
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))
    
    for col in data.columns:
        col_type = data[col].dtype
        
        if col_type != object:
            c_min = data[col].min()
            c_max = data[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    data[col] = data[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    data[col] = data[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    data[col] = data[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    data[col] = data[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return data

In [66]:
df_features = reduce_mem_usage(df_features)

Memory usage of dataframe: 2152.65 MB
Memory usage after optimization: 756.28 MB
Decreased by 64.9%


In [64]:
res = validate(data=df_features,nfolds=5,fea_list=[fea for fea in df_features if fea!='TARGET'],
               model_type=LGBMClassifier,
               param={'n_estimators':300,'max_depth':5,'learning_rate':0.03}) 

fold = 1 oof score = 0.7853181100678557, train score = 0.8247367398778617 , validate_score = 0.7763126177270324 
fold = 2 oof score = 0.789840605183388, train score = 0.8233965558693972 , validate_score = 0.7751277793370306 
fold = 3 oof score = 0.7838577366149491, train score = 0.8234448025173577 , validate_score = 0.7761707938602898 
fold = 4 oof score = 0.7845047649577565, train score = 0.8246360299841166 , validate_score = 0.7753619392344516 
fold = 5 oof score = 0.7761322732588521, train score = 0.8251881630057356 , validate_score = 0.7750261168608406 
with seed 42 mean oof = 0.7839306980165602 , std oof = 0.0044272388650565325
with validation seed 42 validate score = 0.7771905837055095
