In [29]:
import os
import time
from contextlib import contextmanager

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

@contextmanager
def timer(title):
    """Context manager that reports how long the wrapped block took.

    Args:
        title: Label placed at the start of the printed message.

    Yields:
        None. After the ``with`` body finishes without raising, one line of
        the form ``"<title> - done in <seconds>s"`` is printed, where the
        elapsed wall-clock seconds are formatted with no decimals.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    # No try/finally here: if the body raises, nothing is printed.
    print("{} - done in {:.0f}s".format(title, elapsed))

def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every ``object``-dtype column of ``df``.

    Args:
        df: Input frame; it is not modified.
        nan_as_category: Forwarded to ``pd.get_dummies`` as ``dummy_na``; when
            true, each encoded column also gets an indicator for missing values.

    Returns:
        A ``(encoded, new_columns)`` tuple: the encoded frame, and the list of
        column names present in it that were not in ``df``.
    """
    columns_before = set(df.columns)

    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]

    return encoded, added_columns

In [30]:
def application_train_and_test(data_dir = '../data'):
    """Load the application train/test tables and derive row-level features.

    Reads ``application_train.csv`` and ``application_test.csv`` from
    ``data_dir``, sorts each by ``SK_ID_CURR``, stacks train on top of test,
    and then:

    * replaces ``365243`` in ``DAYS_EMPLOYED`` and ``'XNA'`` in
      ``CODE_GENDER`` / ``ORGANIZATION_TYPE`` with NaN,
    * drops a fixed list of ``FLAG_DOCUMENT_*`` columns,
    * adds ratio, product, row-statistic and squared/interaction columns,
    * replaces the listed categorical columns by their ``pd.factorize`` codes.

    Args:
        data_dir: Directory holding the two CSV files. Defaults to the
            previously hard-coded ``'../data'``.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index, train rows first.
        Columns that exist in only one of the two files are NaN for the rows
        of the other file.

    Notes:
        Several ratios divide by columns that may contain zeros, so ``inf``
        and NaN values can appear in the result; they are left untouched.
    """
    application_train = pd.read_csv(os.path.join(data_dir, 'application_train.csv'))
    application_test = pd.read_csv(os.path.join(data_dir, 'application_test.csv'))

    application_train = application_train.sort_values(by = 'SK_ID_CURR')
    application_test = application_test.sort_values(by = 'SK_ID_CURR')

    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
    # sort=True keeps the column ordering of the recorded run (which sorted and
    # emitted a FutureWarning); ignore_index=True yields a clean index directly,
    # so no helper 'index' column has to be created and dropped again.
    df = pd.concat([application_train, application_test], ignore_index = True, sort = True)

    # Assign the result back instead of calling replace(inplace=True) on a
    # column selection, which does not reliably write through to df.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace({365243: np.nan})
    df['CODE_GENDER'] = df['CODE_GENDER'].replace({'XNA': np.nan})
    df['ORGANIZATION_TYPE'] = df['ORGANIZATION_TYPE'].replace({'XNA': np.nan})

    useless_features = ['FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14',
                        'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_19',
                        'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']

    df = df.drop(useless_features, axis = 1)

    # Remaining document flags, and the other FLAG_* columns (excluding names
    # that merely contain '_FLAG_' somewhere in the middle).
    docs = [col for col in df.columns if 'FLAG_DOC' in col]
    live = [col for col in df.columns
            if 'FLAG_' in col and 'FLAG_DOC' not in col and '_FLAG_' not in col]

    inc_by_org = df.groupby('ORGANIZATION_TYPE')['AMT_INCOME_TOTAL'].median()

    df['NEW_CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['NEW_DOC_IND_AVG'] = df[docs].mean(axis=1)
    df['NEW_DOC_IND_STD'] = df[docs].std(axis=1)
    df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
    # `live` can hold text-valued flags next to numeric ones. numeric_only=True
    # makes the "numeric columns only" behaviour explicit instead of relying on
    # the silent fallback of old pandas (newer versions raise on mixed types).
    df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1, numeric_only=True)
    df['NEW_LIVE_IND_STD'] = df[live].std(axis=1, numeric_only=True)
    df['NEW_LIVE_IND_KURT'] = df[live].kurtosis(axis=1, numeric_only=True)
    df['NEW_INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
    # Must run before ORGANIZATION_TYPE is factorized below: inc_by_org is
    # keyed by the raw category values.
    df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)
    df['NEW_ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL'])
    df['NEW_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['NEW_EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    df['NEW_SCORES_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    df['NEW_SCORES_STD'] = df['NEW_SCORES_STD'].fillna(df['NEW_SCORES_STD'].mean())
    df['NEW_CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['NEW_CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['NEW_PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
    df['NEW_PHONE_TO_EMPLOY_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    df['NEW_CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']

    df['NEW_EXT_SOURCES_MEAN_AMT_INCOME_TOTAL_PROD'] = df['NEW_EXT_SOURCES_MEAN']*df['AMT_INCOME_TOTAL']
    df['NEW_EXT_SOURCES_MEAN_DAYS_BIRTH_PROD'] = df['NEW_EXT_SOURCES_MEAN']*df['DAYS_BIRTH']
    df['NEW_EXT_SOURCES_MEAN_DAYS_EMPLOYED_PROD'] = df['NEW_EXT_SOURCES_MEAN']*df['DAYS_EMPLOYED']
    df['DAYS_REGISTRATION_ID'] = df['DAYS_REGISTRATION']*df['DAYS_ID_PUBLISH']
    df['EXT_SOURCES_PAYMENT_RATE'] = df['NEW_EXT_SOURCES_MEAN']*df['PAYMENT_RATE']
    df['AGE_TO_CAR_AGE_RATIO'] = df['DAYS_BIRTH']/df['OWN_CAR_AGE']
    df['AMT_CREDIT_CNT_CHILD_PROD'] = df['AMT_CREDIT']*df['CNT_CHILDREN']
    df['AMT_ANNUITY_CNT_CHILD_PROD'] = df['AMT_ANNUITY']*df['CNT_CHILDREN']
    df['GOODS_PER_CHILD_CNT_CHILD_PROD'] = df['AMT_GOODS_PRICE']*df['CNT_CHILDREN']
    df['CREDIT_PER_FAMILY_MEMBER'] = df['AMT_CREDIT']/df['CNT_FAM_MEMBERS']
    df['ANNUITY_PER_FAMILY_MEMBER'] = df['AMT_ANNUITY']/df['CNT_FAM_MEMBERS']
    df['GOODS_PER_FAMILY_MEMBER'] = df['AMT_GOODS_PRICE']/df['CNT_FAM_MEMBERS']
    df['FAM_SIZE_PER_POPULATION'] = df['CNT_FAM_MEMBERS']/df['REGION_POPULATION_RELATIVE']

    df['30_CNT_SOCIAL_CIRCLE_RATIO'] = df['DEF_30_CNT_SOCIAL_CIRCLE']/df['OBS_30_CNT_SOCIAL_CIRCLE']
    df['60_CNT_SOCIAL_CIRCLE_RATIO'] = df['DEF_60_CNT_SOCIAL_CIRCLE']/df['OBS_60_CNT_SOCIAL_CIRCLE']
    df['DEF_CNT_SOCIAL_CIRCLE_TOTAL'] = df['DEF_30_CNT_SOCIAL_CIRCLE'] + df['DEF_60_CNT_SOCIAL_CIRCLE']
    df['CNT_SOCIAL_CIRCLE_RATIO_TOTAL'] = df['30_CNT_SOCIAL_CIRCLE_RATIO'] + df['60_CNT_SOCIAL_CIRCLE_RATIO']

    # Feature interactions
    df['EXT_SOURCE_1_1'] = df['EXT_SOURCE_1']**2
    df['EXT_SOURCE_1_2'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2']
    df['EXT_SOURCE_1_3'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_3']
    df['EXT_SOURCE_1_DAYS_BIRTH'] = df['EXT_SOURCE_1'] * df['DAYS_BIRTH']
    df['EXT_SOURCE_2_2'] = df['EXT_SOURCE_2']**2
    df['EXT_SOURCE_2_3'] = df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['EXT_SOURCE_2_DAYS_BIRTH'] = df['EXT_SOURCE_2'] * df['DAYS_BIRTH']
    df['EXT_SOURCE_3_3'] = df['EXT_SOURCE_3']**2
    df['EXT_SOURCE_3_DAYS_BIRTH'] = df['EXT_SOURCE_3'] * df['DAYS_BIRTH']
    df['DAYS_BIRTH_DAYS_BIRTH'] = df['DAYS_BIRTH']**2
    df['DAYS_EMPLOYED_DAYS_BIRTH'] = df['DAYS_EMPLOYED'] * df['DAYS_BIRTH']
    df['DAYS_EMPLOYED_DAYS_EMPLOYED'] = df['DAYS_EMPLOYED']**2
    df['AMT_CREDIT_AMT_ANNUITY'] = df['AMT_CREDIT'] * df['AMT_ANNUITY']
    df['AMT_CREDIT_AMT_CREDIT'] = df['AMT_CREDIT']**2
    df['AMT_ANNUITY_AMT_ANNUITY'] = df['AMT_ANNUITY']**2

    categorical_features = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
                            'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
                            'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
                            'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
                            'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6',
                            'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_11',
                            'FLAG_DOCUMENT_18', 'CODE_GENDER', 'NAME_CONTRACT_TYPE',
                            'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'EMERGENCYSTATE_MODE',
                            'HOUSETYPE_MODE', 'FONDKAPREMONT_MODE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
                            'NAME_HOUSING_TYPE', 'NAME_TYPE_SUITE', 'WALLSMATERIAL_MODE','WEEKDAY_APPR_PROCESS_START',
                            'HOUR_APPR_PROCESS_START', 'NAME_INCOME_TYPE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE']

    # Replace each categorical column by integer codes in order of first
    # appearance; pd.factorize encodes missing values as -1.
    for feature in categorical_features:
        df[feature] = pd.factorize(df[feature])[0]

    return df

In [31]:
def _fold_group_stats(frame, columns, key, prefix = ''):
    """Yield per-fold mean/count aggregates of ``frame[columns]`` grouped by ``key``.

    For each of the 5 shuffled KFold splits, the rows of the training part are
    grouped by ``key`` and aggregated with ``mean`` and ``count``. The resulting
    columns are flattened to ``<prefix><column>_<STAT><fold number>``.
    """
    folds = KFold(n_splits= 5, shuffle=True, random_state=1001)
    for n_fold, (train_idx, _) in enumerate(folds.split(frame)):
        # KFold yields positions; using them with .loc assumes `frame` still
        # carries the default 0..n-1 index that read_csv assigns.
        stats = frame.loc[train_idx, columns].groupby(key).agg(['mean', 'count'])
        stats.columns = pd.Index([prefix + col + "_" + stat.upper() + str(n_fold)
                                  for col, stat in stats.columns.tolist()])
        yield stats


def bureau_and_balance(df, data_dir = '../data'):
    """Merge per-fold aggregates of the bureau categorical indicators into ``df``.

    Steps:

    1. Read ``bureau.csv`` and ``bureau_balance.csv`` from ``data_dir`` and
       one-hot encode their object columns with ``one_hot_encoder``.
    2. For each of 5 KFold splits of ``bureau_balance``, aggregate its dummy
       columns per ``SK_ID_BUREAU`` (mean and count) on the training part of
       the split and left-join the result onto ``bureau``.
    3. For each of 5 KFold splits of ``bureau``, aggregate its dummy columns
       plus the columns joined in step 2 per ``SK_ID_CURR`` and left-merge the
       result (prefixed ``BURO_``) into ``df``.

    The validation part of each split is not used: every fold contributes its
    own set of suffixed columns, so the number of added columns grows with the
    number of folds.

    Args:
        df: Frame with an ``SK_ID_CURR`` column to merge into.
        data_dir: Directory holding the two CSV files. Defaults to the
            previously hard-coded ``'../data'``.

    Returns:
        A new frame: ``df`` with the aggregate columns appended; rows without a
        matching ``SK_ID_CURR`` get NaN in those columns.
    """
    bureau = pd.read_csv(os.path.join(data_dir, 'bureau.csv'))
    bureau_balance = pd.read_csv(os.path.join(data_dir, 'bureau_balance.csv'))

    bureau, bureau_cat_cols = one_hot_encoder(bureau)
    bureau_balance, bureau_balance_cat_cols = one_hot_encoder(bureau_balance)

    # The grouping key has to be part of each column selection.
    bureau_cat_cols.append('SK_ID_CURR')
    bureau_balance_cat_cols.append('SK_ID_BUREAU')

    for balance_stats in _fold_group_stats(bureau_balance, bureau_balance_cat_cols, 'SK_ID_BUREAU'):
        bureau = bureau.join(balance_stats, how='left', on='SK_ID_BUREAU')
        # The joined columns are aggregated again per SK_ID_CURR below.
        bureau_cat_cols += list(balance_stats.columns)

    for bureau_stats in _fold_group_stats(bureau, bureau_cat_cols, 'SK_ID_CURR', prefix='BURO_'):
        df = df.merge(bureau_stats, on = 'SK_ID_CURR', how = 'left')

    # The former in-place drop of SK_ID_BUREAU from the local `bureau` frame
    # was removed: the frame is discarded on return, so it had no effect.
    return df

In [32]:
# Build the combined train + test application feature table.
df = application_train_and_test()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [33]:
# Merge the per-fold bureau / bureau_balance aggregates into df.
# NOTE: df is overwritten with a function of itself, so this cell is not
# idempotent; re-run the previous cell first before executing it again.
df = bureau_and_balance(df)

In [34]:
# (rows, columns) of df after the bureau aggregates were merged in.
df.shape

(356255, 1326)

In [22]:
# Scratch re-run of the first half of bureau_and_balance at top level, so that
# the intermediate `bureau` / `bureau_balance` frames can be inspected below.
bureau = pd.read_csv('../data/bureau.csv')
bureau_balance = pd.read_csv('../data/bureau_balance.csv')

# Shape of the raw bureau table, before encoding and joins.
print(bureau.shape)

bureau, bureau_cat_cols = one_hot_encoder(bureau)
bureau_balance, bureau_balance_cat_cols = one_hot_encoder(bureau_balance)

# Per-fold mean/count aggregation of the dummy columns.
# The grouping keys must be part of the column selections.
bureau_cat_cols.append('SK_ID_CURR')
bureau_balance_cat_cols.append('SK_ID_BUREAU')

folds = KFold(n_splits=5, shuffle=True, random_state=1001)
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(bureau_balance)):
    # Aggregate the training part of this split per SK_ID_BUREAU.
    bureau_balance_cat_aggregations = (
        bureau_balance.loc[train_idx, bureau_balance_cat_cols]
        .groupby('SK_ID_BUREAU')
        .agg(['mean', 'count'])
    )
    # Flatten (column, stat) pairs to "<column>_<STAT><fold number>".
    bureau_balance_cat_aggregations.columns = pd.Index([
        col + "_" + stat.upper() + str(n_fold)
        for col, stat in bureau_balance_cat_aggregations.columns.tolist()
    ])
    bureau = bureau.join(bureau_balance_cat_aggregations, on='SK_ID_BUREAU', how='left')
    bb_cat_agg_list = bureau_balance_cat_aggregations.columns.tolist()
    bureau_cat_cols.extend(bb_cat_agg_list)

# Shape of bureau once every fold's aggregate columns have been joined on.
print(bureau.shape)


(1716428, 17)
(1716428, 130)


In [24]:
# (rows, columns) of the one-hot encoded bureau_balance table.
bureau_balance.shape

(27299925, 11)

In [25]:
# Preview the first rows of bureau_balance after one-hot encoding (STATUS_* dummies).
bureau_balance.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS_0,STATUS_1,STATUS_2,STATUS_3,STATUS_4,STATUS_5,STATUS_C,STATUS_X,STATUS_nan
0,5715448,0,0,0,0,0,0,0,1,0,0
1,5715448,-1,0,0,0,0,0,0,1,0,0
2,5715448,-2,0,0,0,0,0,0,1,0,0
3,5715448,-3,0,0,0,0,0,0,1,0,0
4,5715448,-4,0,0,0,0,0,0,1,0,0


In [26]:
# Preview bureau after the per-fold bureau_balance aggregates were joined on.
bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,...,STATUS_4_MEAN4,STATUS_4_COUNT4,STATUS_5_MEAN4,STATUS_5_COUNT4,STATUS_C_MEAN4,STATUS_C_COUNT4,STATUS_X_MEAN4,STATUS_X_COUNT4,STATUS_nan_MEAN4,STATUS_nan_COUNT4
0,215354,5714462,-497,0,-153.0,-153.0,,0,91323.0,0.0,...,,,,,,,,,,
1,215354,5714463,-208,0,1075.0,,,0,225000.0,171342.0,...,,,,,,,,,,
2,215354,5714464,-203,0,528.0,,,0,464323.5,,...,,,,,,,,,,
3,215354,5714465,-203,0,,,,0,90000.0,,...,,,,,,,,,,
4,215354,5714466,-629,0,1197.0,,77674.5,0,2700000.0,,...,,,,,,,,,,


In [27]:
# Full list of bureau column names (original, dummy and joined aggregate columns).
bureau.columns.tolist()

['SK_ID_CURR',
 'SK_ID_BUREAU',
 'DAYS_CREDIT',
 'CREDIT_DAY_OVERDUE',
 'DAYS_CREDIT_ENDDATE',
 'DAYS_ENDDATE_FACT',
 'AMT_CREDIT_MAX_OVERDUE',
 'CNT_CREDIT_PROLONG',
 'AMT_CREDIT_SUM',
 'AMT_CREDIT_SUM_DEBT',
 'AMT_CREDIT_SUM_LIMIT',
 'AMT_CREDIT_SUM_OVERDUE',
 'DAYS_CREDIT_UPDATE',
 'AMT_ANNUITY',
 'CREDIT_ACTIVE_Active',
 'CREDIT_ACTIVE_Bad debt',
 'CREDIT_ACTIVE_Closed',
 'CREDIT_ACTIVE_Sold',
 'CREDIT_ACTIVE_nan',
 'CREDIT_CURRENCY_currency 1',
 'CREDIT_CURRENCY_currency 2',
 'CREDIT_CURRENCY_currency 3',
 'CREDIT_CURRENCY_currency 4',
 'CREDIT_CURRENCY_nan',
 'CREDIT_TYPE_Another type of loan',
 'CREDIT_TYPE_Car loan',
 'CREDIT_TYPE_Cash loan (non-earmarked)',
 'CREDIT_TYPE_Consumer credit',
 'CREDIT_TYPE_Credit card',
 'CREDIT_TYPE_Interbank credit',
 'CREDIT_TYPE_Loan for business development',
 'CREDIT_TYPE_Loan for purchase of shares (margin lending)',
 'CREDIT_TYPE_Loan for the purchase of equipment',
 'CREDIT_TYPE_Loan for working capital replenishment',
 'CREDIT_TYPE_Micro

In [28]:
# Scratch re-run of the second half of bureau_and_balance at top level.
# Depends on `df`, `bureau` and `bureau_cat_cols` left in the kernel by earlier
# cells, and overwrites `df`, so it is not safe to run twice in a row.
print(np.shape(df))
print(np.shape(bureau))


folds = KFold(n_splits= 5, shuffle=True, random_state=1001)
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(bureau)):
    # Aggregate the training part of this split per SK_ID_CURR (mean and count).
    bureau_cat_aggregations = bureau.loc[train_idx, bureau_cat_cols].groupby('SK_ID_CURR').agg(['mean', 'count'])
    # Flatten (column, stat) pairs to "BURO_<column>_<STAT><fold number>".
    bureau_cat_aggregations.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() + str(n_fold) for e in bureau_cat_aggregations.columns.tolist()])
    # Each merge builds a new, wider copy of df.
    df = df.merge(bureau_cat_aggregations, on = 'SK_ID_CURR', how = 'left')

# The loop widens df and leaves bureau untouched, so report df here
# (the cell previously re-printed the unchanged bureau shape).
print(np.shape(df))



(356255, 1326)
(1716428, 130)


KeyboardInterrupt: 