In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from statsmodels.discrete.discrete_model import Logit
warnings.simplefilter(action='ignore')

In [3]:
FEATURES_EXCLUDED = []
data = pd.read_csv('data/clean_data_feats.csv')
target = data['bad']
data.drop(['bad',], axis=1, inplace=True)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.15, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(15158, 44) (15158,)
(2675, 44) (2675,)


In [8]:
X_train.columns

Index(['mob', 'MOB_term', 'Credit_TermApr', 'maxdelay_one', 'delays_one',
       'MA_AGE', 'MA_Gender', 'MANUMBEROFCHILD', 'MA_MONTH_AT_CURR_ADDRESS',
       'MA_Real_Estate_Owner', 'MA_REG_Same_Fact_Addr',
       'MA_MONTH_AT_CURR_PASSP', 'MA_Exp_IND', 'MA_MONTH_AT_CURR_JOB',
       'MA_Time_Previous_Job', 'MA_Proposed_Amount', 'TOT_INCOME',
       'PARTWH_INCOME', 'PAYMD2TOTPAYM', 'PTI', 'ratio_curr_cap_share',
       'larger_diff_term', 'ratio_inst_amount', 'freq_nnkd', 'EQ_PP',
       'diff_white_pti', 'ratio_time_job', 'ratio_amount_income',
       'MA_Education_0', 'MA_Education_1', 'MA_Education_2', 'MA_Education_3',
       'MA_Education_4', 'MA_Marital_Status_1', 'MA_Marital_Status_2',
       'MA_Marital_Status_3', 'MA_Marital_Status_4', 'MA_Marital_Status_5',
       'MA_Marital_Status_6', 'MA_Residential_Status_1',
       'MA_Residential_Status_2', 'MA_Residential_Status_3',
       'MA_Residential_Status_4', 'MA_Residential_Status_5'],
      dtype='object')

In [9]:
feats_to_ohe = ['MA_Education', 'MA_Marital_Status', 'MA_Residential_Status']

other_feats = ['MA_AGE', 'MA_Gender','MANUMBEROFCHILD', 'MA_MONTH_AT_CURR_ADDRESS', 'MA_Real_Estate_Owner', 'MA_REG_Same_Fact_Addr',
              'MA_MONTH_AT_CURR_PASSP', 'MA_Exp_IND', 'MA_MONTH_AT_CURR_JOB', 'MA_Time_Previous_Job', 'MA_Education_0', 'MA_Education_1', 'MA_Education_2', 'MA_Education_3',
       'MA_Education_4', 'MA_Marital_Status_1', 'MA_Marital_Status_2',
       'MA_Marital_Status_3', 'MA_Marital_Status_4', 'MA_Marital_Status_5',
       'MA_Marital_Status_6', 'MA_Residential_Status_1',
       'MA_Residential_Status_2', 'MA_Residential_Status_3',
       'MA_Residential_Status_4', 'MA_Residential_Status_5']

feat_encodes = ['mob', 'MOB_term', 'Credit_TermApr', 'maxdelay_one', 'delays_one', 'TOT_INCOME', 'MA_Proposed_Amount', 
                'PAYMD2TOTPAYM', 'PTI', 'ratio_inst_amount', 'diff_white_pti', 'ratio_time_job', 'ratio_amount_income']

In [6]:
def lgb_other(params, train, test, target, y_test, metrics_f=roc_auc_score, num_folds=5, rs=42):
    folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=rs)
    
    oof_preds = np.zeros(len(train))
    sub_preds = np.zeros(len(test))
    valid_roc = 0
    feats = [f for f in train.columns if f not in FEATURES_EXCLUDED]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train[feats], target)):
        train_x, train_y = train[feats].iloc[train_idx], target.iloc[train_idx]
        valid_x, valid_y = train[feats].iloc[valid_idx], target.iloc[valid_idx]

            # set data structure
        lgb_train = lgb.Dataset(train_x,
                                    label=train_y,
                                    free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                                   label=valid_y,
                                   free_raw_data=False)
        reg = lgb.train(
                params,
                lgb_train,
                valid_sets=[lgb_train, lgb_test],
                valid_names=['train', 'test'],
                num_boost_round=750,
                early_stopping_rounds=200,
                verbose_eval=False,

        )

        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        sub_preds += reg.predict(test[feats], num_iteration=reg.best_iteration) / folds.n_splits
        valid_roc += metrics_f(valid_y, oof_preds[valid_idx])
        print("CV score in {} fold: {:<8.7f}\n".format(n_fold + 1, metrics_f(valid_y, oof_preds[valid_idx])))
        
    valid_roc /= num_folds
    print("CV score on valid/test: {:<8.7f}/{:<8.7f}\n".format(valid_roc, metrics_f(y_test, sub_preds)))
        
    return sub_preds, oof_preds

In [10]:
param_other = {
    'learning_rate': 0.01,
    'num_leaves': 31,
    'reg_alpha': 2,
    'metric':'auc',
    'boost_from_average':'false',
    'feature_fraction': 0.8,
    'max_depth': 5,
    'objective': 'binary',
    'max_bin': 256,
    'verbosity': -10}

pred_other, oof_other = lgb_other(param_other, X_train[other_feats], X_test[other_feats], y_train, y_test)

CV score in 1 fold: 0.5964378

CV score in 2 fold: 0.6455612

CV score in 3 fold: 0.6197472

CV score in 4 fold: 0.6270807

CV score in 5 fold: 0.6221792

CV score on valid/test: 0.6222012/0.6524415



In [None]:
CV score on valid/test: 0.5733181/0.5920946
CV score on valid/test: 0.6253259/0.6548070

## Encoding part

In [11]:
def encode_FE(df_train, df_test, cols):
    train, test = df_train.copy(), df_test.copy()
    
    for col in cols:
        cv = train[col].value_counts()
        nm = col+'_FE'
        train[nm] = train[col].map(cv)

        test[nm] = test[col].map(cv)
        test[nm].fillna(0,inplace=True)

        if cv.max()<=255:
            train[nm] = train[nm].astype('uint8')
            test[nm] = test[nm].astype('uint8')
        else:
            train[nm] = train[nm].astype('uint16')
            test[nm] = test[nm].astype('uint16')  
        
        
    return train, test

In [12]:
train_enc, test_enc = encode_FE(X_train, X_test, feat_encodes)

In [13]:
param = {
    'learning_rate': 0.04,
    'num_leaves': 31,
    'reg_alpha': 1,
    'metric':'auc',
    'boost_from_average':'false',
    'feature_fraction': 1.0,
    'max_depth': -1,
    'objective': 'binary',
    'max_bin': 256,
    'verbosity': -10}

param_list = {'reg_alpha': [0.1, 0.5, 0.75, 1],
              'learning_rate': [0.05, 0.1, 0.12],
              'num_leaves': [3, 4, 5],
              'max_bin': [256, 512, 1024]
                 }

In [161]:
def lgb_feats_search_params(params, params_list,  train, test, target, y_test, metrics_f=roc_auc_score, num_folds=5, rs=42):
    param_dict = {'reg_alpha': np.zeros(len(feat_encodes)),
                 'learning_rate': np.zeros(len(feat_encodes)),
                 'num_leaves': np.zeros(len(feat_encodes)),
                 'max_bin': np.zeros(len(feat_encodes)),
                  'roc': np.zeros(len(feat_encodes))
                 }
    
    for j, feat in enumerate(feat_encodes):
        feats = [feat, feat+'_FE']
        valid_roc_max = 0
        
        for a in params_list['reg_alpha']:
            for lr in params_list['learning_rate']:
                for nl in params_list['num_leaves']:
                    for mb in params_list['max_bin']:
                        
                                params['max_bin'] = mb
                                params['learning_rate'] = lr
                                params['reg_alpha'] = a
                                params['num_leaves'] = nl

        
                                oof_preds = np.zeros(len(train))
                                valid_roc = 0
                                

                                folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=rs)

                                for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train[feats], target)):
                                    train_x, train_y = train[feats].iloc[train_idx], target.iloc[train_idx]
                                    valid_x, valid_y = train[feats].iloc[valid_idx], target.iloc[valid_idx]

                                    # set data structure
                                    lgb_train = lgb.Dataset(train_x,
                                                            label=train_y,
                                                            free_raw_data=False)
                                    lgb_test = lgb.Dataset(valid_x,
                                                           label=valid_y,
                                                           free_raw_data=False)
                                    reg = lgb.train(
                                        params,
                                        lgb_train,
                                        valid_sets=[lgb_train, lgb_test],
                                        valid_names=['train', 'test'],
                                        num_boost_round=750,
                                        early_stopping_rounds=200,
                                        verbose_eval=False,

                                    )

                                    oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
                                    valid_roc += metrics_f(valid_y, oof_preds[valid_idx])
                                    
                                valid_roc /= num_folds
                                
                                if valid_roc_max < valid_roc:
                                    param_dict['reg_alpha'][j] = params['reg_alpha']
                                    param_dict['learning_rate'][j] = params['learning_rate']
                                    param_dict['num_leaves'][j] = params['num_leaves']
                                    param_dict['max_bin'][j] = params['max_bin']
                                    param_dict['roc'][j] = valid_roc
                                    valid_roc_max = valid_roc
                
        print("feat {}, lr: {}, reg: {}, num_leaves: {}, max_bin: {}, roc: {:<8.7f}\n".format(feat, param_dict['learning_rate'][j], param_dict['reg_alpha'][j], param_dict['num_leaves'][j],
                                                                                              param_dict['max_bin'][j], param_dict['roc'][j]))
        
    return param_dict

In [162]:
%%time
final_param_dict_new = lgb_feats_search_params(param, param_list, train, test, y_train, y_test)

feat mob, lr: 0.05, reg: 0.5, num_leaves: 3.0, max_bin: 256.0, roc: 0.5264604

feat MOB_term, lr: 0.1, reg: 0.1, num_leaves: 5.0, max_bin: 512.0, roc: 0.5316353

feat Credit_TermApr, lr: 0.12, reg: 1.0, num_leaves: 5.0, max_bin: 256.0, roc: 0.5297721

feat maxdelay_one, lr: 0.12, reg: 1.0, num_leaves: 4.0, max_bin: 256.0, roc: 0.7123746

feat delays_one, lr: 0.05, reg: 0.1, num_leaves: 3.0, max_bin: 256.0, roc: 0.5781363

feat TOT_INCOME, lr: 0.12, reg: 0.5, num_leaves: 5.0, max_bin: 1024.0, roc: 0.5878517

feat MA_Proposed_Amount, lr: 0.12, reg: 0.75, num_leaves: 5.0, max_bin: 256.0, roc: 0.5547185

feat PAYMD2TOTPAYM, lr: 0.12, reg: 0.1, num_leaves: 5.0, max_bin: 512.0, roc: 0.6612326

feat PTI, lr: 0.12, reg: 0.5, num_leaves: 5.0, max_bin: 1024.0, roc: 0.6265637

feat ratio_inst_amount, lr: 0.12, reg: 0.1, num_leaves: 5.0, max_bin: 1024.0, roc: 0.6175470

feat diff_white_pti, lr: 0.12, reg: 0.1, num_leaves: 5.0, max_bin: 1024.0, roc: 0.6366125

feat ratio_time_job, lr: 0.12, reg: 0.

In [165]:
final_param_dict_new['num_leaves'] = final_param_dict_new['num_leaves'].astype(int)
final_param_dict_new['max_bin'] = final_param_dict_new['max_bin'].astype(int)

In [88]:
%%time
final_param_dict = lgb_feats_search_params(param, param_list, train, test, y_train, y_test)

feat mob, lr: 0.04, reg: 1.0, num_leaves: 3.0, max_bin: 256.0, roc: 0.5260609

feat MOB_term, lr: 0.12, reg: 1.0, num_leaves: 5.0, max_bin: 512.0, roc: 0.5299255

feat Credit_TermApr, lr: 0.08, reg: 1.0, num_leaves: 5.0, max_bin: 256.0, roc: 0.5298061

feat maxdelay_one, lr: 0.12, reg: 1.0, num_leaves: 4.0, max_bin: 256.0, roc: 0.7123746

feat delays_one, lr: 0.04, reg: 0.75, num_leaves: 3.0, max_bin: 256.0, roc: 0.5781363

feat TOT_INCOME, lr: 0.12, reg: 0.75, num_leaves: 5.0, max_bin: 1024.0, roc: 0.5874339

feat PAYMD2TOTPAYM, lr: 0.08, reg: 1.0, num_leaves: 5.0, max_bin: 512.0, roc: 0.6609248

feat PTI, lr: 0.12, reg: 0.75, num_leaves: 5.0, max_bin: 1024.0, roc: 0.6255393

feat ratio_inst_amount, lr: 0.12, reg: 0.75, num_leaves: 5.0, max_bin: 1024.0, roc: 0.6168904

feat diff_white_pti, lr: 0.12, reg: 0.75, num_leaves: 5.0, max_bin: 1024.0, roc: 0.6361368

feat ratio_time_job, lr: 0.12, reg: 1.0, num_leaves: 5.0, max_bin: 1024.0, roc: 0.5860101

feat ratio_amount_income, lr: 0.12, 

In [14]:
def lgb_feats(params, param_dict, train, test, target, y_test, metrics_f=roc_auc_score, num_folds=5, rs=42):
    all_oof_feats = np.zeros((len(train),len(feat_encodes)+1))
    all_oof_feats[:,0] = np.ones(len(train))

    all_preds_feats = np.zeros((len(test),len(feat_encodes)+1))
    all_preds_feats[:,0] = np.ones(len(test))
    
    for j, feat in enumerate(feat_encodes):
        params['reg_alpha'] = param_dict['reg_alpha'][j]
        params['learning_rate'] = param_dict['learning_rate'][j]
        params['num_leaves'] = param_dict['num_leaves'][j]
        params['max_bin'] = param_dict['max_bin'][j]
        
        feats = [feat, feat+'_FE']
        oof_preds = np.zeros(len(train))
        sub_preds = np.zeros(len(test))
        valid_roc = 0

        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=rs)

        for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train[feats], target)):
            train_x, train_y = train[feats].iloc[train_idx], target.iloc[train_idx]
            valid_x, valid_y = train[feats].iloc[valid_idx], target.iloc[valid_idx]

            # set data structure
            lgb_train = lgb.Dataset(train_x,
                                    label=train_y,
                                    free_raw_data=False)
            lgb_test = lgb.Dataset(valid_x,
                                   label=valid_y,
                                   free_raw_data=False)
            reg = lgb.train(
                params,
                lgb_train,
                valid_sets=[lgb_train, lgb_test],
                valid_names=['train', 'test'],
                num_boost_round=750,
                early_stopping_rounds=200,
                verbose_eval=False,

            )

            oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
            sub_preds += reg.predict(test[feats], num_iteration=reg.best_iteration) / folds.n_splits
            valid_roc += metrics_f(valid_y, oof_preds[valid_idx])
        valid_roc /= num_folds
        print("feat {}, CV score on valid/test: {:<8.7f}/{:<8.7f}\n".format(feat, valid_roc, metrics_f(y_test, sub_preds)))
        
        all_oof_feats[:, j+1] = oof_preds
        all_preds_feats[:, j+1] = sub_preds
        
    return all_oof_feats, all_preds_feats

In [89]:
# первый прогон
all_oof, all_pred = lgb_feats(param, train, test, y_train, y_test)

feat mob, CV score on valid/test: 0.5006486/0.5245397

feat MOB_term, CV score on valid/test: 0.5262788/0.5203474

feat Credit_TermApr, CV score on valid/test: 0.5181776/0.5061834

feat maxdelay_one, CV score on valid/test: 0.7116005/0.7067010

feat delays_one, CV score on valid/test: 0.5781363/0.5856864

feat TOT_INCOME, CV score on valid/test: 0.5788832/0.5577320

feat PAYMD2TOTPAYM, CV score on valid/test: 0.6598520/0.5570885

feat PTI, CV score on valid/test: 0.6131345/0.5778046

feat ratio_inst_amount, CV score on valid/test: 0.6090152/0.5860251

feat diff_white_pti, CV score on valid/test: 0.6242453/0.6000559

feat ratio_time_job, CV score on valid/test: 0.5823891/0.5637585

feat ratio_amount_income, CV score on valid/test: 0.5965900/0.5894216



In [104]:
#прогон с подобранными гиперпараметрами
all_oof, all_pred = lgb_feats(param, final_param_dict, train, test, y_train, y_test)

feat mob, CV score on valid/test: 0.5260609/0.5218823

feat MOB_term, CV score on valid/test: 0.5299255/0.5217813

feat Credit_TermApr, CV score on valid/test: 0.5298061/0.4987082

feat maxdelay_one, CV score on valid/test: 0.7123746/0.7069441

feat delays_one, CV score on valid/test: 0.5781363/0.5856864

feat TOT_INCOME, CV score on valid/test: 0.5874339/0.5694732

feat PAYMD2TOTPAYM, CV score on valid/test: 0.6609248/0.5587995

feat PTI, CV score on valid/test: 0.6255393/0.6030623

feat ratio_inst_amount, CV score on valid/test: 0.6168904/0.5993003

feat diff_white_pti, CV score on valid/test: 0.6361368/0.6192809

feat ratio_time_job, CV score on valid/test: 0.5860101/0.5756191

feat ratio_amount_income, CV score on valid/test: 0.6025186/0.6085486



In [168]:
# прогон с новыми гиперпараметрами
all_oof_new, all_pred_new = lgb_feats(param, final_param_dict_new, train_enc, test_enc, y_train, y_test)

feat mob, CV score on valid/test: 0.5264604/0.5208367

feat MOB_term, CV score on valid/test: 0.5316353/0.5230164

feat Credit_TermApr, CV score on valid/test: 0.5297721/0.4995786

feat maxdelay_one, CV score on valid/test: 0.7123746/0.7069441

feat delays_one, CV score on valid/test: 0.5781363/0.5856864

feat TOT_INCOME, CV score on valid/test: 0.5878517/0.5703271

feat MA_Proposed_Amount, CV score on valid/test: 0.5547185/0.5527707

feat PAYMD2TOTPAYM, CV score on valid/test: 0.6612326/0.5564133

feat PTI, CV score on valid/test: 0.6265637/0.6046977

feat ratio_inst_amount, CV score on valid/test: 0.6175470/0.5987597

feat diff_white_pti, CV score on valid/test: 0.6366125/0.6211505

feat ratio_time_job, CV score on valid/test: 0.5883688/0.5919449

feat ratio_amount_income, CV score on valid/test: 0.6025186/0.6085486



In [None]:
# прогон с новыми гиперпараметрами
all_oof_new, all_pred_new = lgb_feats(param, final_param_dict_new, train_enc, test_enc, y_train, y_test)

In [None]:
pred_other, oof_other

In [211]:
all_oof_new[:,:len(feat_encodes)+1].shape

(15158, 14)

In [228]:
all_oof_new[:,:]

array([[1.        , 0.46685109, 0.14067393, ..., 0.55943425, 0.19350627,
        0.24200303],
       [1.        , 0.47037265, 0.21406765, ..., 0.08386685, 0.12161691,
        0.12060977],
       [1.        , 0.27309641, 0.43765967, ..., 0.16667227, 0.20448327,
        0.29369688],
       ...,
       [1.        , 0.25249828, 0.18157046, ..., 0.14429967, 0.18519256,
        0.15873457],
       [1.        , 0.21661611, 0.3087727 , ..., 0.3378521 , 0.24964109,
        0.23957489],
       [1.        , 0.30172209, 0.44102588, ..., 0.28155164, 0.22258197,
        0.08545286]])

In [230]:
new_all_oof = np.concatenate((all_oof_new[:,:], oof_other.reshape(len(oof_other), 1)), axis=1)
new_all_pred = np.concatenate((all_pred_new[:,:], pred_other.reshape(len(pred_other), 1)), axis=1)

In [16]:
all_oof_new = pd.read_csv('data/all_oof.csv')
all_pred_new = pd.read_csv('data/all_pred.csv')

In [None]:
# последние данные
logr = Logit(y_train, all_oof_new)
logr = logr.fit(disp=0)
ensemble_preds = logr.predict(all_oof_new)

print('valid:  ', roc_auc_score(y_train, ensemble_preds))

ensemble_pred_test = logr.predict(all_pred_new)
print('test:  ', roc_auc_score(y_test, ensemble_pred_test)) 

In [170]:
logr = Logit(y_train, all_oof_new[:,:len(feat_encodes)+1])
logr = logr.fit(disp=0)
ensemble_preds = logr.predict(all_oof_new[:,:len(feat_encodes)+1])

print('valid:  ', roc_auc_score(y_train, ensemble_preds))

ensemble_pred_test = logr.predict(all_pred_new[:,:len(feat_encodes)+1])
print('test:  ', roc_auc_score(y_test, ensemble_pred_test)) 

valid:   0.8082907564471435
test:   0.7583899035178969


In [171]:
pd.DataFrame(all_oof_new).to_csv('data/all_oof.csv', index=False)
pd.DataFrame(all_pred_new).to_csv('data/all_pred.csv', index=False)

In [105]:
logr = Logit(y_train, all_oof[:,:len(feat_encodes)+1])
logr = logr.fit(disp=0)
ensemble_preds = logr.predict(all_oof[:,:len(feat_encodes)+1])

ensemble_auc = roc_auc_score(y_train, ensemble_preds) 

In [106]:
ensemble_auc

0.8077369476178142

In [107]:
ensemble_pred_test = logr.predict(all_pred[:,:len(feat_encodes)+1])
roc_auc_score(y_test, ensemble_pred_test) 

0.7550323334798975