In [1]:
import os
import gc
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from contextlib import contextmanager

%matplotlib inline

In [2]:
@contextmanager
def timer(title):
    """Context manager that reports how long its ``with`` body took.

    Args:
        title: label placed at the start of the printed message.

    On normal exit prints ``"<title> - done in <N>s"`` where ``<N>`` is the
    elapsed time in seconds formatted with no decimals. If the body raises,
    the exception propagates and nothing is printed.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) when the system clock is adjusted mid-run, which can
    # happen with the wall-clock time.time().
    t0 = time.perf_counter()
    yield
    print("{} - done in {:.0f}s".format(title, time.perf_counter() - t0))
    
# One-hot encode the object-dtype (categorical) columns with get_dummies
def one_hot_encoder(df, nan_as_category=True):
    """Expand every object-dtype column of ``df`` into indicator columns.

    Args:
        df: input DataFrame. The caller's frame is not reassigned; the
            encoded result is returned as a new object.
        nan_as_category: forwarded to ``pd.get_dummies`` as ``dummy_na``.

    Returns:
        A ``(encoded_df, new_columns)`` tuple, where ``new_columns`` lists
        the column names present in the result but absent from the input.
    """
    columns_before = set(df.columns)

    # A column counts as categorical when its dtype compares equal to 'object'.
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

In [3]:
def group_by_feats(df, groupby=None, aggregations=None):
    """Attach group-level aggregate statistics to every row of ``df``.

    The rows are grouped by ``groupby``, the requested statistics are
    computed per group, and the result is left-merged back onto ``df`` so
    each row carries the statistics of its own group.

    Args:
        df: input DataFrame; must contain the grouping and aggregated columns.
        groupby: column name or list of column names to group by.
            Defaults to ``['NAME_EDUCATION_TYPE']``.
        aggregations: mapping ``{column: [statistic, ...]}`` passed to
            ``DataFrameGroupBy.agg``. A bare statistic name is accepted in
            place of a list. Defaults to ``{'AMT_INCOME_TOTAL': ['median']}``.

    Returns:
        A new DataFrame with the columns of ``df`` followed by one column per
        (column, statistic) pair, named
        ``AA_BY_<key1>_X_<key2>..._<column>_<STATISTIC>``; with the defaults
        this is ``AA_BY_NAME_EDUCATION_TYPE_AMT_INCOME_TOTAL_MEDIAN``.
        Because the result comes from ``pd.merge``, the index of ``df`` is
        not carried over.
    """
    if groupby is None:
        groupby = ['NAME_EDUCATION_TYPE']
    elif isinstance(groupby, str):
        groupby = [groupby]
    else:
        groupby = list(groupby)

    if aggregations is None:
        aggregations = {'AMT_INCOME_TOTAL': ['median']}
    # Normalise a bare statistic ('median') to a one-element list so that
    # .agg() always produces two-level (column, statistic) labels.
    aggregations = {
        col: [stats] if isinstance(stats, str) else list(stats)
        for col, stats in aggregations.items()
    }

    # Only the columns actually needed are selected before grouping.
    agg = (
        df[groupby + list(aggregations)]
        .groupby(groupby)
        .agg(aggregations)
        .reset_index()
    )

    # After reset_index() the first len(groupby) labels are the group keys;
    # everything after them is a (column, statistic) pair to be flattened.
    prefix = 'AA_BY_{}_'.format('_X_'.join(groupby))
    stat_cols = agg.columns.tolist()[len(groupby):]
    agg.columns = pd.Index(
        groupby + [prefix + col + "_" + stat.upper() for col, stat in stat_cols]
    )

    return pd.merge(df, agg, how='left', on=groupby)

In [4]:
def application_train_test(num_rows=None, nan_as_category=False):
    """Build the application-level feature table from the train and test CSVs.

    Reads ``../data/raw/application_train.csv`` and
    ``../data/raw/application_test.csv``, stacks them into one frame, derives
    ratio / interaction / aggregate features, integer-encodes three binary
    categoricals and one-hot encodes the remaining object columns.

    Args:
        num_rows: passed to ``pd.read_csv`` as ``nrows`` for both files;
            ``None`` reads everything.
        nan_as_category: forwarded to ``one_hot_encoder``.

    Returns:
        DataFrame holding the train rows followed by the test rows (minus the
        rows whose ``CODE_GENDER`` is ``'XNA'``) with the engineered columns
        added. It includes an ``index`` column holding each row's index in
        the frame it was read from.
    """
    # Read data and merge
    df = pd.read_csv('../data/raw/application_train.csv', nrows=num_rows)
    test_df = pd.read_csv('../data/raw/application_test.csv', nrows=num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    # sort=True orders the columns alphabetically. reset_index() is called
    # without drop=True on purpose so the existing 'index' output column stays.
    df = pd.concat([df, test_df], sort=True).reset_index()

    # The separate test frame is no longer needed once it has been stacked.
    del test_df
    gc.collect()

    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    # .copy() gives an independent frame so the column assignments below do
    # not operate on a slice of the concatenated data.
    df = df[df['CODE_GENDER'] != 'XNA'].copy()

    # NaN values for DAYS_EMPLOYED: 365243 -> nan
    # Assigned back explicitly: an in-place replace on df['DAYS_EMPLOYED'] is
    # a chained update that does not reach df under pandas Copy-on-Write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    # Amount / household ratios (AAO_<numerator>__<denominator>)
    df['AAO_CREDIT__INC'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['AAO_ANNUITY__INC'] = df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL'])
    df['AAO_ANNUITY__CREDIT'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    df['AAO_CREDIT__ANNUITY'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['AAO_CREDIT__GOODS'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['AAO_INC__ADULT'] = df['AMT_INCOME_TOTAL'] / (df['CNT_FAM_MEMBERS'] - df['CNT_CHILDREN'])
    df['AAO_INC__CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
    df['AAO_INC__FM'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['AAO_CHLD__FM'] = df['CNT_CHILDREN'] / df['CNT_FAM_MEMBERS']

    # Row-wise summaries over the flag columns: 'docs' are the FLAG_DOC*
    # columns, 'live' are the other FLAG_ columns whose name does not
    # contain '_FLAG_'.
    docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
    live = [
        _f for _f in df.columns
        if 'FLAG_' in _f and 'FLAG_DOC' not in _f and '_FLAG_' not in _f
    ]
    df['AA_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
    df['AA_LIVE_IND_SUM'] = df[live].sum(axis=1)

    # Median income of the applicant's organisation type, then the
    # group-level features added by group_by_feats.
    inc_by_org = df.groupby('ORGANIZATION_TYPE')['AMT_INCOME_TOTAL'].median()
    df['AA_INC__ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)
    df = group_by_feats(df)

    # EXT_SOURCES
    ext_sources = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
    df['AA_SOURCES'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['AA_SOURCES_MEAN'] = df[ext_sources].mean(axis=1)
    df['AA_SCORES_STD'] = df[ext_sources].std(axis=1)
    # Rows where the std is NaN are filled with the column-wide mean std.
    df['AA_SCORES_STD'] = df['AA_SCORES_STD'].fillna(df['AA_SCORES_STD'].mean())

    # TO_BIRTH_DAYS, TO_DAYS_EMPLOYED
    df['AAO_EMPLOYED__BIRTH'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['AAO_CAR__BIRTH'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['AAO_CAR__EMPLOY'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['AAO_PHONE__BIRTH'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
    df['AAO_PHONE__EMPLOYED'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']

    # Second-order interactions built on the features above
    # (AAAX_ = product, AAAO_ = ratio).
    df['AAAX_CREDIT_ANNUITY__BIRTH'] = df['AAO_CREDIT__ANNUITY'] * df['DAYS_BIRTH']
    df['AAAX_CREDIT_ANNUITY__EMPLOYED'] = df['AAO_CREDIT__ANNUITY'] * df['DAYS_EMPLOYED']
    df['AAAX_SOURCES_MEAN__BIRTH'] = df['AA_SOURCES_MEAN'] * df['DAYS_BIRTH']
    df['AAAX_SOURCES_MEAN__EMPLOYED'] = df['AA_SOURCES_MEAN'] * df['DAYS_EMPLOYED']
    df['AAAO_SOURCES_MEAN__EMPLOYED_BIRTH'] = df['AA_SOURCES_MEAN'] / df['AAO_EMPLOYED__BIRTH']

    df['AAAX_CREDIT_ANNUITY__SOURCE_MEAN'] = df['AAO_CREDIT__ANNUITY'] * df['AA_SOURCES_MEAN']
    df['AAAX_CREDIT_ANNUITY__SOURCE_3'] = df['AAO_CREDIT__ANNUITY'] * df['EXT_SOURCE_3']
    df['AAAX_CREDIT_ANNUITY__SOURCE_2'] = df['AAO_CREDIT__ANNUITY'] * df['EXT_SOURCE_2']
    df['AAAX_CREDIT_ANNUITY__SOURCE_1'] = df['AAO_CREDIT__ANNUITY'] * df['EXT_SOURCE_1']

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        # factorize returns (codes, uniques); only the integer codes are kept.
        df[bin_feature] = pd.factorize(df[bin_feature])[0]

    # Categorical features with One-Hot encode
    df, _ = one_hot_encoder(df, nan_as_category)

    return df

In [5]:
def main(filename, sample=False):
    """Build the application features, save them as CSV and return them.

    Args:
        filename: destination path handed to ``DataFrame.to_csv``.
        sample: when true only the first 10000 rows of each input file are
            read; otherwise all rows are read.

    Returns:
        The DataFrame produced by ``application_train_test``.
    """
    row_limit = None
    if sample:
        row_limit = 10000

    # Feature construction and the CSV write are both inside the timed block.
    with timer("Application"):
        features = application_train_test(row_limit)
        print("application df shape:", features.shape)
        features.to_csv(filename, index=False)
    return features

In [6]:
# Build the full (non-sampled) feature table and write it to the preprocess folder.
df = main('../data/preprocess/application_features.csv')

Train samples: 307511, test samples: 48744
application df shape: (356251, 273)
Application - done in 147s


In [7]:
# Number of feature columns, followed by the column index itself.
df.shape[1], df.columns

(273, Index(['index', 'AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE',
        'AMT_INCOME_TOTAL', 'AMT_REQ_CREDIT_BUREAU_DAY',
        'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_MON',
        'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_WEEK',
        ...
        'WALLSMATERIAL_MODE_Panel', 'WALLSMATERIAL_MODE_Stone, brick',
        'WALLSMATERIAL_MODE_Wooden', 'WEEKDAY_APPR_PROCESS_START_FRIDAY',
        'WEEKDAY_APPR_PROCESS_START_MONDAY',
        'WEEKDAY_APPR_PROCESS_START_SATURDAY',
        'WEEKDAY_APPR_PROCESS_START_SUNDAY',
        'WEEKDAY_APPR_PROCESS_START_THURSDAY',
        'WEEKDAY_APPR_PROCESS_START_TUESDAY',
        'WEEKDAY_APPR_PROCESS_START_WEDNESDAY'],
       dtype='object', length=273))

In [8]:
# Print every column name on its own line (the repr above is truncated).
for column_name in df.columns.tolist():
    print(column_name)

index
AMT_ANNUITY
AMT_CREDIT
AMT_GOODS_PRICE
AMT_INCOME_TOTAL
AMT_REQ_CREDIT_BUREAU_DAY
AMT_REQ_CREDIT_BUREAU_HOUR
AMT_REQ_CREDIT_BUREAU_MON
AMT_REQ_CREDIT_BUREAU_QRT
AMT_REQ_CREDIT_BUREAU_WEEK
AMT_REQ_CREDIT_BUREAU_YEAR
APARTMENTS_AVG
APARTMENTS_MEDI
APARTMENTS_MODE
BASEMENTAREA_AVG
BASEMENTAREA_MEDI
BASEMENTAREA_MODE
CNT_CHILDREN
CNT_FAM_MEMBERS
CODE_GENDER
COMMONAREA_AVG
COMMONAREA_MEDI
COMMONAREA_MODE
DAYS_BIRTH
DAYS_EMPLOYED
DAYS_ID_PUBLISH
DAYS_LAST_PHONE_CHANGE
DAYS_REGISTRATION
DEF_30_CNT_SOCIAL_CIRCLE
DEF_60_CNT_SOCIAL_CIRCLE
ELEVATORS_AVG
ELEVATORS_MEDI
ELEVATORS_MODE
ENTRANCES_AVG
ENTRANCES_MEDI
ENTRANCES_MODE
EXT_SOURCE_1
EXT_SOURCE_2
EXT_SOURCE_3
FLAG_CONT_MOBILE
FLAG_DOCUMENT_10
FLAG_DOCUMENT_11
FLAG_DOCUMENT_12
FLAG_DOCUMENT_13
FLAG_DOCUMENT_14
FLAG_DOCUMENT_15
FLAG_DOCUMENT_16
FLAG_DOCUMENT_17
FLAG_DOCUMENT_18
FLAG_DOCUMENT_19
FLAG_DOCUMENT_2
FLAG_DOCUMENT_20
FLAG_DOCUMENT_21
FLAG_DOCUMENT_3
FLAG_DOCUMENT_4
FLAG_DOCUMENT_5
FLAG_DOCUMENT_6
FLAG_DOCUMENT_7
FLAG_DOCUMENT_8