In [1]:
import datetime
import gc
import os
import time
import numpy as np
import pandas as pd
import pickle
import json

from colorama import Fore, Style
from contextlib import contextmanager
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, plot_importance

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn import neighbors

import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline

In [2]:
def highlight_print(hightlight, message):
    """Print ``message`` prefixed with ``hightlight`` and followed by a style reset.

    hightlight: string emitted before the message (callers in this notebook
        pass colorama ``Fore.*`` values); the parameter name is kept as spelled.
    message: text to print.
    """
    decorated = "".join((hightlight, message, Style.RESET_ALL))
    print(decorated)


@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    On normal exit it prints ``[Done] <title> in M:SS (<current datetime>)``
    through ``highlight_print``. If the wrapped block raises, the code after
    ``yield`` does not run, so nothing is printed.

    title: label included in the printed message.
    """
    t0 = time.time()
    yield
    # Whole seconds only; divmod splits them into minutes and leftover seconds.
    minutes, seconds = divmod(int(time.time() - t0), 60)
    # Seconds are zero-padded so that 4 s reads "0:04" rather than "0:4".
    highlight_print(
        Fore.LIGHTGREEN_EX,
        "[Done] {} in {}:{:02d} ({})".format(title, minutes, seconds, datetime.datetime.now()),
    )


# One-hot encode every object-dtype column with pandas.get_dummies
def one_hot_encoder(df, nan_as_category=True):
    """Return ``(encoded_df, new_columns)`` for the object-dtype columns of ``df``.

    df: input frame; columns whose dtype is ``object`` are replaced by dummies.
    nan_as_category: forwarded to ``get_dummies`` as ``dummy_na``.

    ``new_columns`` lists, in frame order, the columns of the result that were
    not present in the input.
    """
    columns_before = list(df.columns)
    object_columns = []
    for column in df.columns:
        if df[column].dtype == 'object':
            object_columns.append(column)
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [column for column in encoded.columns if column not in columns_before]
    return encoded, added_columns

In [3]:
def group_by_feats(df):
    """Left-join group-level statistics onto ``df``.

    Groups by ``NAME_EDUCATION_TYPE`` and currently computes the median of
    ``AMT_INCOME_TOTAL``; the result is merged back as
    ``NEW_BY_NAME_EDUCATION_TYPE_AMT_INCOME_TOTAL_MEDIAN``.

    df: frame containing ``NAME_EDUCATION_TYPE``, ``AMT_INCOME_TOTAL``,
        ``AMT_CREDIT`` and ``AMT_ANNUITY`` (all four are selected before
        grouping, so a missing one raises ``KeyError``).

    Returns a new frame: ``df`` plus one column per (source column, statistic).
    """
    groupby = ['NAME_EDUCATION_TYPE']
    selected = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY'] + groupby
    agg = (
        df[selected]
        .groupby(groupby)
        .agg({
            'AMT_INCOME_TOTAL': ['median'],
            # 'AMT_INCOME_TOTAL': ['mean', 'median']
            # 'AMT_CREDIT': ['mean', 'median'],
            # 'AMT_ANNUITY': ['mean', 'median']
        })
        .reset_index()
    )
    # After reset_index the first len(groupby) columns are the group keys and
    # every remaining column is a (source column, statistic) pair. Slicing by
    # len(groupby) instead of a fixed [-1:] keeps the renaming correct when
    # more aggregations are enabled above.
    stat_cols = agg.columns.tolist()[len(groupby):]
    name_prefix = 'NEW_BY_{}_'.format('_X_'.join(groupby))
    agg.columns = pd.Index(
        groupby + [name_prefix + source + "_" + stat.upper() for source, stat in stat_cols]
    )

    # df['MORE_O_AMT_INCOME_TOTAL__EDU_MEDIAN'] = df['AMT_INCOME_TOTAL'] / df['NEW_BY_NAME_EDUCATION_TYPE_AMT_INCOME_TOTAL_MEDIAN']

    return pd.merge(df, agg, how='left', on=groupby)

In [4]:
# Preprocess application_train.csv and application_test.csv
def application_train_test(num_rows=None, nan_as_category=False):
    """Load the train and test application tables and return one encoded frame.

    Steps, in order: read both CSVs, stack them, drop rows whose CODE_GENDER is
    'XNA', turn the DAYS_EMPLOYED sentinel 365243 into NaN, add two row-wise
    indicator features, integer-encode three binary categoricals and one-hot
    encode the remaining object columns.

    num_rows: passed to ``read_csv`` as ``nrows`` for BOTH files (None = all).
    nan_as_category: forwarded to ``one_hot_encoder``.

    Returns the combined frame. It carries an ``index`` column holding each
    row's position in its source file, because ``reset_index()`` is called
    without ``drop=True``.
    """
    # Read data and merge
    df = pd.read_csv('../data/application_train.csv', nrows=num_rows)
    test_df = pd.read_csv('../data/application_test.csv', nrows=num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the same operation.
    df = pd.concat([df, test_df], sort=True).reset_index()

    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    # .copy() gives an independent frame so the column assignments below do not
    # act on a slice of the concatenated frame.
    df = df[df['CODE_GENDER'] != 'XNA'].copy()

    # NaN values for DAYS_EMPLOYED: 365243 -> nan
    # Assigned back rather than replace(..., inplace=True) on a column
    # selection, which does not update df under pandas Copy-on-Write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

#     df['AAO_CREDIT__INC'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
#     df['AAO_ANNUITY__INC'] = df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL'])
#     df['AAO_ANNUITY__CREDIT'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
#     df['AAO_CREDIT__ANNUITY'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
#     df['AAO_CREDIT__GOODS'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
#     df['AAO_INC__CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
#     df['AAO_INC__FM'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
#     df['AAO_CHLD__FM'] = df['CNT_CHILDREN'] / df['CNT_FAM_MEMBERS']

    docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
    live = [
        _f for _f in df.columns
        if 'FLAG_' in _f and 'FLAG_DOC' not in _f and '_FLAG_' not in _f
    ]
    df['A_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
    # `live` is built before FLAG_OWN_CAR / FLAG_OWN_REALTY are factorized
    # below, so it presumably still contains string columns (TODO confirm).
    # numeric_only=True keeps the sum over the numeric flags only: older pandas
    # fell back to that silently, newer pandas would raise instead.
    df['A_LIVE_IND_SUM'] = df[live].sum(axis=1, numeric_only=True)

#     df = group_by_feats(df)

    # EXT_SOURCES
#     df['AA_SOURCES'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
#     df['AA_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
#     df['AA_SCORES_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
#     df['AA_SCORES_STD'] = df['AA_SCORES_STD'].fillna(df['AA_SCORES_STD'].mean())

    # TO_BIRTH_DAYS, TO_DAYS_EMPLOYED
#     df['AAO_EMPLOYED__BIRTH'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
#     df['AAO_CAR__BIRTH'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
#     df['AAO_CAR__EMPLOY'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
#     df['AAO_PHONE__BIRTH'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
#     df['AAO_PHONE__EMPLOYED'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']

#     df['AAAX_CREDIT_ANNUITY__BIRTH'] = df['AAO_CREDIT__ANNUITY'] * df['DAYS_BIRTH']
#     df['AAAX_CREDIT_ANNUITY__EMPLOYED'] = df['AAO_CREDIT__ANNUITY'] * df['DAYS_EMPLOYED']
#     df['AAAX_SOURCES_MEAN__BIRTH'] = df['AA_SOURCES_MEAN'] * df['DAYS_BIRTH']
#     df['AAAX_SOURCES_MEAN__EMPLOYED'] = df['AA_SOURCES_MEAN'] * df['DAYS_EMPLOYED']
#     df['AAAO_SOURCES_MEAN__EMPLOYED_BIRTH'] = df['AA_SOURCES_MEAN'] / df['AAO_EMPLOYED__BIRTH']

#     df['AAAX_CREDIT_ANNUITY__SOURCE_MEAN'] = df['AAO_CREDIT__ANNUITY'] * df['AA_SOURCES_MEAN']
#     df['AAAX_CREDIT_ANNUITY__SOURCE_3'] = df['AAO_CREDIT__ANNUITY'] * df['EXT_SOURCE_3']
#     df['AAAX_CREDIT_ANNUITY__SOURCE_2'] = df['AAO_CREDIT__ANNUITY'] * df['EXT_SOURCE_2']
#     df['AAAX_CREDIT_ANNUITY__SOURCE_1'] = df['AAO_CREDIT__ANNUITY'] * df['EXT_SOURCE_1']

    # Categorical features with Binary encode (0 or 1; two categories)
    # pd.factorize returns (codes, uniques); only the integer codes are kept.
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], _ = pd.factorize(df[bin_feature])

    # Categorical features with One-Hot encode (the list of new columns is not needed here)
    df, _ = one_hot_encoder(df, nan_as_category)

    del test_df
    gc.collect()

    return df

In [5]:
# Column names excluded from the model inputs. The list is appended to the
# id/target columns in the `feats` comprehensions further down and used only
# in a `not in` membership test, so an entry that matches no column in the
# data has no effect. Commented-out entries are therefore NOT excluded.
useless_feats = [
    'ORGANIZATION_TYPE_Advertising',
    'FLAG_DOCUMENT_21',
    'NAME_FAMILY_STATUS_Unknown',
    'FLAG_DOCUMENT_20',
    'FLAG_DOCUMENT_4',
    'WALLSMATERIAL_MODE_Monolithic',
    'OCCUPATION_TYPE_Realty agents',
    'FLAG_DOCUMENT_2',
    'ORGANIZATION_TYPE_Industry: type 8',
    'NAME_INCOME_TYPE_Businessman',
    'NAME_EDUCATION_TYPE_Academic degree',
    'FLAG_DOCUMENT_19',
    'OCCUPATION_TYPE_IT staff',
    'FLAG_EMP_PHONE',
    'FLAG_DOCUMENT_17',
    'FLAG_MOBIL',
    'OCCUPATION_TYPE_HR staff',
    'FLAG_DOCUMENT_15',
    'FLAG_DOCUMENT_14',
    'FLAG_CONT_MOBILE',
    'HOUSETYPE_MODE_terraced house',
    'FLAG_DOCUMENT_10',
    'ORGANIZATION_TYPE_Cleaning',
    'ORGANIZATION_TYPE_Culture',
    'NAME_INCOME_TYPE_Maternity leave',
    'ORGANIZATION_TYPE_XNA',
    'ORGANIZATION_TYPE_Insurance',
    'NAME_TYPE_SUITE_Group of people',
    'ORGANIZATION_TYPE_Legal Services',
    'ORGANIZATION_TYPE_Industry: type 5',
    'FLAG_DOCUMENT_12',
    'ORGANIZATION_TYPE_Mobile',
    'ORGANIZATION_TYPE_Industry: type 4',
    'ORGANIZATION_TYPE_Industry: type 2',
    'ORGANIZATION_TYPE_Realtor',
    'ORGANIZATION_TYPE_Religion',
    'ORGANIZATION_TYPE_Industry: type 13',
    'NAME_INCOME_TYPE_Unemployed',
    'ORGANIZATION_TYPE_Industry: type 12',
    'NAME_INCOME_TYPE_Student',
    'ORGANIZATION_TYPE_Telecom',
    'ORGANIZATION_TYPE_Trade: type 1',
    'ORGANIZATION_TYPE_Industry: type 10',
    'ORGANIZATION_TYPE_Trade: type 4',
    'ORGANIZATION_TYPE_Trade: type 5',
    'ORGANIZATION_TYPE_Trade: type 6',
    'FLAG_DOCUMENT_7',
    'ORGANIZATION_TYPE_Transport: type 1',
    'NAME_INCOME_TYPE_Pensioner',
    'ORGANIZATION_TYPE_Emergency',
    'ORGANIZATION_TYPE_Industry: type 6',
    'FLAG_DOCUMENT_13',

    'NAME_HOUSING_TYPE_Co-op apartment',
    'ORGANIZATION_TYPE_Agriculture',
    'FLAG_DOCUMENT_9',
    
    'BURO_STATUS_nan_MEAN_MEAN',
    'CLOSED_CREDIT_DAY_OVERDUE_MAX',
    'CLOSED_AMT_CREDIT_SUM_OVERDUE_MEAN',
    'BURO_CREDIT_TYPE_nan_MEAN',
    'CLOSED_CREDIT_DAY_OVERDUE_MEAN',
    'ORGANIZATION_TYPE_University',
    'BURO_CREDIT_TYPE_Unknown type of loan_MEAN',
    'ORGANIZATION_TYPE_Postal',
    'ORGANIZATION_TYPE_Industry: type 1',
    'ORGANIZATION_TYPE_Industry: type 7',
    'BURO_CREDIT_ACTIVE_Bad debt_MEAN',
    'BURO_CREDIT_ACTIVE_nan_MEAN',
    'BURO_CREDIT_CURRENCY_currency 1_MEAN',
    'BURO_CREDIT_CURRENCY_currency 2_MEAN',
    'BURO_CREDIT_CURRENCY_currency 3_MEAN',
    'BURO_CREDIT_CURRENCY_nan_MEAN',
    'BURO_CREDIT_TYPE_Real estate loan_MEAN',
    'BURO_CREDIT_TYPE_Cash loan (non-earmarked)_MEAN',
    'BURO_CREDIT_TYPE_Interbank credit_MEAN',
    'BURO_CREDIT_TYPE_Loan for purchase of shares (margin lending)_MEAN',
    'BURO_CREDIT_TYPE_Loan for the purchase of equipment_MEAN',
    'BURO_CREDIT_TYPE_Loan for working capital replenishment_MEAN',
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'BURO_CREDIT_TYPE_Mobile operator loan_MEAN',
    'BURO_CREDIT_CURRENCY_currency 4_MEAN',
        
    # Worse feats
    'DAYS_DIFF_MEAN',
    'DAYS_DIFF_MAX',
    'DAYS_DIFF_VAR',
    
    'BURO_CREDIT_TYPE_Loan for business development_MEAN',
    'OCCUPATION_TYPE_Cooking staff',
    'OCCUPATION_TYPE_Secretaries',
    
#     'PREV_O_APP__CREDIT_MAX',
#     'PREV_O_APP__CREDIT_MIN',
#     'PREV_O_APP__CREDIT_MEAN',
#     'PREV_O_APP__CREDIT_VAR',
    'PREV_O_APP__GOODS_MAX',
    'PREV_O_APP__GOODS_MIN',
    'PREV_O_APP__GOODS_MEAN',
    'PREV_O_APP__GOODS_VAR',
    'PREV_O_APP__DOWN_MAX',
    'PREV_O_APP__DOWN_MIN',
    'PREV_O_APP__DOWN_MEAN',
    'PREV_O_APP__DOWN_VAR',
    'PREV_O_APP__ANNUITY_MAX',
    'PREV_O_APP__ANNUITY_MIN',
    'PREV_O_APP__ANNUITY_MEAN',
    'PREV_O_APP__ANNUITY_VAR',
    'PREV_O_CREDIT__ANNUITY_MAX',
    'PREV_O_CREDIT__ANNUITY_MIN',
    'PREV_O_CREDIT__ANNUITY_MEAN',
    'PREV_O_CREDIT__ANNUITY_VAR',
    'PREV_O_CREDIT__GOODS_MAX',
    'PREV_O_CREDIT__GOODS_MIN',
    'PREV_O_CREDIT__GOODS_MEAN',
    'PREV_O_CREDIT__GOODS_VAR',
    'PREV_O_CREDIT__DOWN_MAX',
    'PREV_O_CREDIT__DOWN_MIN',
    'PREV_O_CREDIT__DOWN_MEAN',
    'PREV_O_CREDIT__DOWN_VAR',
    'PREV_O_GOODS__ANNUITY_MAX',
    'PREV_O_GOODS__ANNUITY_MIN',
    'PREV_O_GOODS__ANNUITY_MEAN',
    'PREV_O_GOODS__ANNUITY_VAR',
    'PREV_O_GOODS__DOWN_MAX',
    'PREV_O_GOODS__DOWN_MIN',
    'PREV_O_GOODS__DOWN_MEAN',
    'PREV_O_GOODS__DOWN_VAR',
    'PREV_O_DOWN__ANNUITY_MAX',
    'PREV_O_DOWN__ANNUITY_MIN',
    'PREV_O_DOWN__ANNUITY_MEAN',
    'PREV_O_DOWN__ANNUITY_VAR',
    
#     'PREV_APPROVED_O_APP__CREDIT_MAX',
#     'PREV_APPROVED_O_APP__CREDIT_MIN',
#     'PREV_APPROVED_O_APP__CREDIT_MEAN',
#     'PREV_APPROVED_O_APP__CREDIT_VAR',
    'PREV_APPROVED_O_APP__GOODS_MAX',
    'PREV_APPROVED_O_APP__GOODS_MIN',
    'PREV_APPROVED_O_APP__GOODS_MEAN',
    'PREV_APPROVED_O_APP__GOODS_VAR',
    'PREV_APPROVED_O_APP__DOWN_MAX',
    'PREV_APPROVED_O_APP__DOWN_MIN',
    'PREV_APPROVED_O_APP__DOWN_MEAN',
    'PREV_APPROVED_O_APP__DOWN_VAR',
    'PREV_APPROVED_O_APP__ANNUITY_MAX',
    'PREV_APPROVED_O_APP__ANNUITY_MIN',
    'PREV_APPROVED_O_APP__ANNUITY_MEAN',
    'PREV_APPROVED_O_APP__ANNUITY_VAR',
    'PREV_APPROVED_O_CREDIT__ANNUITY_MAX',
    'PREV_APPROVED_O_CREDIT__ANNUITY_MIN',
    'PREV_APPROVED_O_CREDIT__ANNUITY_MEAN',
    'PREV_APPROVED_O_CREDIT__ANNUITY_VAR',
    'PREV_APPROVED_O_CREDIT__GOODS_MAX',
    'PREV_APPROVED_O_CREDIT__GOODS_MIN',
    'PREV_APPROVED_O_CREDIT__GOODS_MEAN',
    'PREV_APPROVED_O_CREDIT__GOODS_VAR',
    'PREV_APPROVED_O_CREDIT__DOWN_MAX',
    'PREV_APPROVED_O_CREDIT__DOWN_MIN',
    'PREV_APPROVED_O_CREDIT__DOWN_MEAN',
    'PREV_APPROVED_O_CREDIT__DOWN_VAR',
    'PREV_APPROVED_O_GOODS__ANNUITY_MAX',
    'PREV_APPROVED_O_GOODS__ANNUITY_MIN',
    'PREV_APPROVED_O_GOODS__ANNUITY_MEAN',
    'PREV_APPROVED_O_GOODS__ANNUITY_VAR',
    'PREV_APPROVED_O_GOODS__DOWN_MAX',
    'PREV_APPROVED_O_GOODS__DOWN_MIN',
    'PREV_APPROVED_O_GOODS__DOWN_MEAN',
    'PREV_APPROVED_O_GOODS__DOWN_VAR',
    'PREV_APPROVED_O_DOWN__ANNUITY_MAX',
    'PREV_APPROVED_O_DOWN__ANNUITY_MIN',
    'PREV_APPROVED_O_DOWN__ANNUITY_MEAN',
    'PREV_APPROVED_O_DOWN__ANNUITY_VAR',
    
#     'PREV_REFUSED_O_APP__CREDIT_MAX',
#     'PREV_REFUSED_O_APP__CREDIT_MIN',
#     'PREV_REFUSED_O_APP__CREDIT_MEAN',
#     'PREV_REFUSED_O_APP__CREDIT_VAR',
    'PREV_REFUSED_O_APP__GOODS_MAX',
    'PREV_REFUSED_O_APP__GOODS_MIN',
    'PREV_REFUSED_O_APP__GOODS_MEAN',
    'PREV_REFUSED_O_APP__GOODS_VAR',
    'PREV_REFUSED_O_APP__DOWN_MAX',
    'PREV_REFUSED_O_APP__DOWN_MIN',
    'PREV_REFUSED_O_APP__DOWN_MEAN',
    'PREV_REFUSED_O_APP__DOWN_VAR',
    'PREV_REFUSED_O_APP__ANNUITY_MAX',
    'PREV_REFUSED_O_APP__ANNUITY_MIN',
    'PREV_REFUSED_O_APP__ANNUITY_MEAN',
    'PREV_REFUSED_O_APP__ANNUITY_VAR',
    'PREV_REFUSED_O_CREDIT__ANNUITY_MAX',
    'PREV_REFUSED_O_CREDIT__ANNUITY_MIN',
    'PREV_REFUSED_O_CREDIT__ANNUITY_MEAN',
    'PREV_REFUSED_O_CREDIT__ANNUITY_VAR',
    'PREV_REFUSED_O_CREDIT__GOODS_MAX',
    'PREV_REFUSED_O_CREDIT__GOODS_MIN',
    'PREV_REFUSED_O_CREDIT__GOODS_MEAN',
    'PREV_REFUSED_O_CREDIT__GOODS_VAR',
    'PREV_REFUSED_O_CREDIT__DOWN_MAX',
    'PREV_REFUSED_O_CREDIT__DOWN_MIN',
    'PREV_REFUSED_O_CREDIT__DOWN_MEAN',
    'PREV_REFUSED_O_CREDIT__DOWN_VAR',
    'PREV_REFUSED_O_GOODS__ANNUITY_MAX',
    'PREV_REFUSED_O_GOODS__ANNUITY_MIN',
    'PREV_REFUSED_O_GOODS__ANNUITY_MEAN',
    'PREV_REFUSED_O_GOODS__ANNUITY_VAR',
    'PREV_REFUSED_O_GOODS__DOWN_MAX',
    'PREV_REFUSED_O_GOODS__DOWN_MIN',
    'PREV_REFUSED_O_GOODS__DOWN_MEAN',
    'PREV_REFUSED_O_GOODS__DOWN_VAR',
    'PREV_REFUSED_O_DOWN__ANNUITY_MAX',
    'PREV_REFUSED_O_DOWN__ANNUITY_MIN',
    'PREV_REFUSED_O_DOWN__ANNUITY_MEAN',
    'PREV_REFUSED_O_DOWN__ANNUITY_VAR',
    
    'CC_NAME_CONTRACT_STATUS_nan_MEAN',
    'CC_NAME_CONTRACT_STATUS_nan_VAR',
    'CC_AMT_DRAWINGS_ATM_CURRENT_MIN',
    'CC_NAME_CONTRACT_STATUS_nan_SUM',
    'CC_NAME_CONTRACT_STATUS_nan_MIN',
    'ORGANIZATION_TYPE_Trade: type 2',
    'CC_NAME_CONTRACT_STATUS_nan_MAX',
    'EMERGENCYSTATE_MODE_Yes',
    'CC_NAME_CONTRACT_STATUS_Signed_MIN',
    'EMERGENCYSTATE_MODE_No',
    'CC_NAME_CONTRACT_STATUS_Signed_MAX',
    'CC_NAME_CONTRACT_STATUS_Sent proposal_VAR',
    'CC_NAME_CONTRACT_STATUS_Sent proposal_SUM',
    'CC_NAME_CONTRACT_STATUS_Signed_SUM',
    'POS_NAME_CONTRACT_STATUS_nan_MEAN',
    'CC_SK_DPD_DEF_MIN',
    'ORGANIZATION_TYPE_Transport: type 2',
    'POS_NAME_CONTRACT_STATUS_XNA_MEAN',
    'FLAG_DOCUMENT_5',
    'PREV_WEEKDAY_APPR_PROCESS_START_nan_MEAN',
    'POS_NAME_CONTRACT_STATUS_Demand_MEAN',
    'POS_NAME_CONTRACT_STATUS_Canceled_MEAN',
    'POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN',
    'NAME_TYPE_SUITE_Other_A',
    'ORGANIZATION_TYPE_Business Entity Type 2',
    'CC_SK_DPD_MIN',
    'CC_NAME_CONTRACT_STATUS_Sent proposal_MIN',
    'ORGANIZATION_TYPE_Housing',
    'ORGANIZATION_TYPE_Industry: type 11',
    'ORGANIZATION_TYPE_Services',
    'CC_CNT_DRAWINGS_ATM_CURRENT_MIN',
    'CC_NAME_CONTRACT_STATUS_Demand_MAX',
    'PREV_PRODUCT_COMBINATION_nan_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Weapon_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Journey_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Medicine_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Hobby_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Other_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Gasification / water supply_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Tourism_MEAN',
    'PREV_NAME_GOODS_CATEGORY_nan_MEAN',
    'PREV_NAME_GOODS_CATEGORY_House Construction_MEAN',
    'PREV_NAME_PAYMENT_TYPE_Cashless from the account of the employer_MEAN',
    'PREV_NAME_PAYMENT_TYPE_nan_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Buying a new car_MEAN',
    'PREV_NAME_PORTFOLIO_Cars_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Buying a home_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Insurance_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Homewares_MEAN',
    'PREV_NAME_PORTFOLIO_nan_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Animals_MEAN',
    'PREV_NAME_CONTRACT_STATUS_nan_MEAN',
    'PREV_NAME_CLIENT_TYPE_XNA_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_nan_MEAN',
    'PREV_NAME_CONTRACT_TYPE_XNA_MEAN',
    'PREV_NAME_CONTRACT_TYPE_nan_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Additional Service_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Wedding / gift / holiday_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Money for a third person_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Refusal to name the goal_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Purchase of electronic equipment_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Direct Sales_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Education_MEAN',
    'PREV_NAME_GOODS_CATEGORY_Fitness_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Payments on other loans_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Buying a garage_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Business development_MEAN',
    'CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN',
    'CC_NAME_CONTRACT_STATUS_Demand_VAR',
    'CC_NAME_CONTRACT_STATUS_Completed_MAX',
    'CC_NAME_CONTRACT_STATUS_Completed_MIN',
    'PREV_NAME_CLIENT_TYPE_nan_MEAN',
    'CC_NAME_CONTRACT_STATUS_Demand_MEAN',
    'CC_NAME_CONTRACT_STATUS_Demand_MIN',
    'CC_NAME_CONTRACT_STATUS_Demand_SUM',
    'CC_NAME_CONTRACT_STATUS_Refused_MAX',
    'CC_NAME_CONTRACT_STATUS_Approved_SUM',
    'CC_AMT_DRAWINGS_OTHER_CURRENT_MIN',
    'CC_NAME_CONTRACT_STATUS_Refused_MEAN',
    'CC_NAME_CONTRACT_STATUS_Refused_MIN',
    'CC_NAME_CONTRACT_STATUS_Refused_SUM',
    'CC_NAME_CONTRACT_STATUS_Refused_VAR',
    'CC_NAME_CONTRACT_STATUS_Sent proposal_MAX',
    'CC_NAME_CONTRACT_STATUS_Approved_VAR',
    'PREV_NAME_YIELD_GROUP_nan_MEAN',
    'PREV_NAME_PRODUCT_TYPE_nan_MEAN',
    'PREV_NAME_SELLER_INDUSTRY_Tourism_MEAN',
    'PREV_NAME_CASH_LOAN_PURPOSE_Building a house or an annex_MEAN',
    'CC_CNT_DRAWINGS_OTHER_CURRENT_MIN',
    'CC_CNT_DRAWINGS_OTHER_CURRENT_SUM',
    'PREV_FLAG_LAST_APPL_PER_CONTRACT_Y_MEAN',
    'PREV_CODE_REJECT_REASON_nan_MEAN',
    'PREV_NAME_SELLER_INDUSTRY_MLM partners_MEAN',
    'PREV_NAME_SELLER_INDUSTRY_nan_MEAN',
    'CC_NAME_CONTRACT_STATUS_Approved_MIN',
    'PREV_CODE_REJECT_REASON_SYSTEM_MEAN',
    'PREV_CHANNEL_TYPE_nan_MEAN',
    'PREV_CHANNEL_TYPE_Car dealer_MEAN',
    'CC_NAME_CONTRACT_STATUS_Active_MAX',
    'CC_NAME_CONTRACT_STATUS_Approved_MAX',
    'CC_NAME_CONTRACT_STATUS_Approved_MEAN',
    'PREV_FLAG_LAST_APPL_PER_CONTRACT_nan_MEAN',
]

In [6]:
def load_or_train(num_folds, n_fold, model, X, y):
    """Give ``model['clf']`` a fitted estimator, using an on-disk pickle cache.

    The cache file is ``../expmodel/<prefix>/<model name>_<n_fold>.pickle``.
    ``prefix`` is a notebook-level global and must be defined before the first
    call. If the file exists it is unpickled into ``model['clf']``; otherwise
    ``model['clf']`` is fitted on ``(X, y)`` with ``model['fit_params']`` and
    then pickled to that path.

    num_folds: accepted for call-site symmetry; not used here.
    n_fold: fold index, used only in the cache file name.
    model: dict with at least 'name', 'clf' and 'fit_params'.
    X, y: training data passed to ``fit`` when no cache file exists.

    Returns the same ``model`` dict.
    """
    folder = os.path.join('..', 'expmodel', prefix)
    # exist_ok avoids the separate exists() check and the race between check and create.
    os.makedirs(folder, exist_ok=True)

    model_filename = os.path.join(folder, '{}_{}.pickle'.format(model['name'], n_fold))
    with timer('Load or train: {}'.format(model_filename)):
        if os.path.exists(model_filename):
            # NOTE: the cache key is only prefix + model name + fold index, so a
            # stale file is reused even if the data or feature list changed.
            # NOTE(security): pickle.load executes arbitrary code; only load
            # cache files this notebook wrote itself.
            with open(model_filename, 'rb') as f:
                model['clf'] = pickle.load(f)
        else:
            print('{} does not exist, going to train.'.format(model_filename))
            model['clf'].fit(X, y, **model['fit_params'])
            with open(model_filename, 'wb') as f:
                pickle.dump(model['clf'], f)

    return model

In [7]:
def train_(model, num_folds, n_fold, X_train, y_train, X_valid=None, y_valid=None):
    """Prepare per-model fit settings, then load or fit the model for one fold.

    For 'LightGBM' / 'XGBoost' the ``model['fit_params']`` dict is edited in
    place: with validation data it gets ``early_stopping_rounds`` = 200 and an
    ``eval_set`` of (train, valid); without it ``early_stopping_rounds`` is
    removed. For 'RandomForest' NaN and +/-inf in the training matrix become 0.
    Other model names get the copied training data unchanged.

    Returns None; the fitted estimator ends up in ``model['clf']``.
    """
    X = X_train.copy()
    model_name = model['name']

    if model_name in ('LightGBM', 'XGBoost'):
        fit_params = model['fit_params']
        has_validation = X_valid is not None and y_valid is not None
        if has_validation:
            fit_params['early_stopping_rounds'] = 200
            fit_params['eval_set'] = [(X, y_train), (X_valid, y_valid)]
        else:
            fit_params.pop('early_stopping_rounds', None)
    elif model_name in ('RandomForest',):
        X = X.fillna(0).replace([np.inf, -np.inf], 0)

    load_or_train(num_folds, n_fold, model, X, y_train)

In [8]:
def predict_(model, X_input, preds_key, val, y=None):
    """Score ``X_input`` and store column 1 of ``predict_proba`` in ``model``.

    preds_key: 'oof_preds' writes the scores into ``model['oof_preds'][val]``
        (``val`` is then an index); any other key adds ``scores / val`` to
        ``model[preds_key]`` in place (``val`` is then a divisor).
    y: optional true labels; when given, the ROC AUC of the scores is printed.

    For 'RandomForest' NaN and +/-inf in the input are replaced by 0 first.
    Returns None.
    """
    X = X_input.copy()
    features = X
    if model['name'] in ('RandomForest',):
        features = X.fillna(0).replace([np.inf, -np.inf], 0)
    p = model['clf'].predict_proba(features)[:, 1]

    if preds_key != 'oof_preds':
        model[preds_key] += p/val
    else:
        model[preds_key][val] = p

    if y is None:
        return
    auc = roc_auc_score(y, p)
    highlight_print(Fore.LIGHTBLUE_EX, '%s: %.6f' % (model['name'], auc))

In [9]:
def kfold_knn(num_folds, random_state, name, feats, X_train, y_train, X_test):
    """Run a shuffled K-fold KNN and collect out-of-fold and test predictions.

    num_folds, random_state: forwarded to ``KFold`` (shuffle is on).
    name, feats: accepted but not used inside this function.
    X_train, X_test: indexed positionally with ``[rows, :]``, so they must be
        array-like in that sense; y_train is indexed with ``.iloc``.

    Returns ``(feature_importance_df, test_preds)``; the first is an empty
    DataFrame that is never filled here, the second is the test prediction
    accumulated as the sum over folds of ``prediction / n_splits``.
    """
    # http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    knn_params = {
        'n_neighbors': 5,
        'weights': 'distance', # 'uniform'
        #'n_jobs': -1
    }
    model = dict(
        name='KNN',
        clf=neighbors.KNeighborsClassifier(**knn_params),
        fit_params={},
        oof_preds=np.zeros(X_train.shape[0]),
        test_preds=np.zeros(X_test.shape[0]),
    )
    feature_importance_df = pd.DataFrame()
    splitter = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)

    for fold_no, (train_idx, valid_idx) in enumerate(splitter.split(X_train), start=1):
        with timer("Generating cross-validated predictions #{}".format(fold_no)):
            train_rows = train_idx.tolist()
            valid_rows = valid_idx.tolist()
            fold_x_train = X_train[train_rows, :]
            fold_y_train = y_train.iloc[train_idx]
            fold_x_valid = X_train[valid_rows, :]
            fold_y_valid = y_train.iloc[valid_idx]

            # train_ expects the 0-based fold index (used in the cache file name).
            train_(
                model, num_folds, fold_no - 1,
                fold_x_train, fold_y_train,
                fold_x_valid, fold_y_valid
            )

            predict_(model, fold_x_valid, 'oof_preds', valid_idx, fold_y_valid)
            predict_(model, X_test, 'test_preds', splitter.n_splits)

    overall_auc = roc_auc_score(y_train, model['oof_preds'])
    highlight_print(Fore.RED, '* Local: %.6f' % overall_auc)

    return feature_importance_df, model['test_preds']

In [10]:
def cross_feats(df):
    """Add ratio columns between aggregated history amounts and application amounts.

    Each new column is ``df[numerator] / df[denominator]``. The columns are
    written onto ``df`` itself, which is also returned. Column names are kept
    exactly as spelled (including 'ANNUNITY').
    """
    # (new column, numerator column, denominator column)
    ratio_specs = (
        ('BAO_CREDIT__ANNUNITY', 'ACTIVE_AMT_CREDIT_SUM_SUM', 'AMT_ANNUITY'),
        ('BAO_CREDIT__INC', 'ACTIVE_AMT_CREDIT_SUM_SUM', 'AMT_INCOME_TOTAL'),
        ('BAO_CREDIT__CREDIT', 'ACTIVE_AMT_CREDIT_SUM_SUM', 'AMT_CREDIT'),
        ('BAO_CREDIT__GOODS', 'ACTIVE_AMT_CREDIT_SUM_SUM', 'AMT_GOODS_PRICE'),

        ('PAO_CREDIT__ANNUNITY', 'PREV_AMT_CREDIT_MEAN', 'AMT_ANNUITY'),
        ('PAO_CREDIT__INC', 'PREV_AMT_CREDIT_MEAN', 'AMT_INCOME_TOTAL'),
        ('PAO_CREDIT__CREDIT', 'PREV_AMT_CREDIT_MEAN', 'AMT_CREDIT'),
        ('PAO_CREDIT__GOODS', 'PREV_AMT_CREDIT_MEAN', 'AMT_GOODS_PRICE'),

        ('PAO_ANNUNITY', 'PREV_AMT_ANNUITY_MEAN', 'AMT_ANNUITY'),
        ('PAO_ANNUNITY__INC', 'PREV_AMT_ANNUITY_MEAN', 'AMT_INCOME_TOTAL'),
        ('PAO_ANNUNITY__CREDIT', 'PREV_AMT_ANNUITY_MEAN', 'AMT_CREDIT'),
    )
    for new_col, numerator, denominator in ratio_specs:
        df[new_col] = df[numerator] / df[denominator]

    return df

In [11]:
# Row limit passed to application_train_test (None = read every row).
num_rows = None

# application
with timer("Load application features:"):
    df = application_train_test(num_rows)
    print("App df shape:", df.shape)

# Precomputed feature tables, each left-joined onto the application frame on
# SK_ID_CURR, in this order.
# (timer title, CSV path, label used in the shape printout)
feature_tables = [
    ("Load bureau features:", '../preprocess/bureau_features.csv', "Bureau"),
    ("Load previous application features:", '../preprocess/previous_application_features.csv', "Previous application"),
    ("Load installments features:", '../preprocess/installments_payments_features.csv', "Installments"),
    ("Load pos_cash features:", '../preprocess/pos_cash_features.csv', "Pos-Cash"),
    ("Load credit_card features:", '../preprocess/credit_card_features.csv', "Credit card"),
]

# merge() returns a new frame, so `df` itself is never modified and no
# up-front copy of it is needed.
df_full = df
for title, csv_path, label in feature_tables:
    with timer(title):
        table = pd.read_csv(csv_path)
        print("{} df shape:".format(label), table.shape)
        df_full = df_full.merge(table, how='left', on=['SK_ID_CURR'])
        # Free the raw table before loading the next one.
        del table
        gc.collect()

# df_full = cross_feats(df_full)
print("Full df shape:", df_full.shape)
# Model inputs: every column except ids, the target, the leftover 'index'
# column and the names listed in useless_feats.
feats = [f for f in df_full.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index'] + useless_feats]
print(len(feats))

Train samples: 307511, test samples: 48744
App df shape: (356251, 245)
[92m[Done] Load application features: in 0:19 (2018-08-16 22:06:06.294877)[0m
Bureau df shape: (305811, 119)
[92m[Done] Load bureau features: in 0:25 (2018-08-16 22:06:31.785130)[0m
Previous application df shape: (338857, 304)
[92m[Done] Load previous application features: in 0:44 (2018-08-16 22:07:15.913596)[0m
Installments df shape: (339587, 27)
[92m[Done] Load installments features: in 0:4 (2018-08-16 22:07:20.746584)[0m
Pos-Cash df shape: (337252, 19)
[92m[Done] Load pos_cash features: in 0:3 (2018-08-16 22:07:24.122655)[0m
Credit card df shape: (103558, 142)
[92m[Done] Load credit_card features: in 0:17 (2018-08-16 22:07:41.547959)[0m
Full df shape: (356251, 851)
608


In [12]:
# Rows with a TARGET value form the training set; rows without one are the test set.
has_target = df_full['TARGET'].notnull()
train_df = df_full[has_target]
test_df = df_full[~has_target]

# Columns that are ids, the target, the leftover 'index' or explicitly excluded.
non_feature_cols = ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index'] + useless_feats
feats = [f for f in df_full.columns if f not in non_feature_cols]

y_train = train_df['TARGET']
X_train = train_df[feats]
print("X_train df shape:", X_train.shape)

X_train df shape: (307507, 608)


In [13]:
from sklearn.preprocessing import StandardScaler


def _zero_fill_non_finite(frame):
    """Return ``frame`` with NaN, +inf and -inf replaced by 0."""
    return frame.fillna(0).replace([np.inf, -np.inf], 0)


# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
KNN_train = scaler.fit_transform(_zero_fill_non_finite(X_train))
KNN_test = scaler.transform(_zero_fill_non_finite(test_df[feats]))

In [15]:
feats = [f for f in df_full.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index'] + useless_feats]
# `prefix` names the experiment folder and is read as a global by load_or_train.
prefix = 'knn_10x_2'
# Run name = prefix plus a timestamp of when this cell ran.
name = '{}-{date:%Y_%m_%d_%H_%M_%S}'.format(prefix, date=datetime.datetime.now())
print('name: {}\nfeats num:{}'.format(name, len(feats)))
model_folder = os.path.join('..', 'expmodel', '{}'.format(prefix))
# The folder is otherwise only created later, inside load_or_train; without
# this, writing feats.txt fails with FileNotFoundError on a fresh run.
os.makedirs(model_folder, exist_ok=True)
# Record the exact feature list used for this experiment (JSON array of column names).
with open(os.path.join(model_folder, 'feats.txt'), 'w') as f:
    json.dump(feats, f, indent=2)

name: knn_10x_2-2018_08_16_22_09_22
feats num:608


In [None]:
# Cross-validation settings for the KNN run.
random_state = 1001
nums_fold = 5

print("KNN_train df shape:", KNN_train.shape)

run_title = "Run KNN with KFold {}".format(nums_fold)
with timer(run_title):
    # kfold_knn returns (feature importance frame, averaged test predictions).
    feat_importance, preds = kfold_knn(
        nums_fold, random_state, name, feats,
        KNN_train, y_train, KNN_test,
    )

KNN_train df shape: (307507, 608)
../expmodel/knn_10x_2/KNN_0.pickle does not exist, going to train.
[92m[Done] Load or train: ../expmodel/knn_10x_2/KNN_0.pickle in 5:32 (2018-08-16 22:15:05.299661)[0m


In [None]:
# Scores kept as a bare string for reference. The "KNN:" lines match the
# per-fold AUC format printed by predict_ and "* Local" the overall
# out-of-fold AUC printed by kfold_knn -- presumably pasted from a finished
# run (TODO confirm which run / settings produced them).
"""
KNN: 0.540095
KNN: 0.543502
KNN: 0.537183
KNN: 0.549209
KNN: 0.551814
* Local: 0.544387
"""