In [1]:
import os
import gc
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from contextlib import contextmanager

%matplotlib inline

In [2]:
@contextmanager
def timer(title):
    """Context manager that reports the wall-clock duration of the wrapped block.

    Args:
        title: Label placed at the start of the printed message.

    The message is printed after the wrapped block finishes normally; the
    elapsed time is formatted in whole seconds.
    """
    started_at = time.time()
    yield
    elapsed_seconds = time.time() - started_at
    message = "{} - done in {:.0f}s".format(title, elapsed_seconds)
    print(message)
    
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode the text-valued columns of ``df`` with ``pd.get_dummies``.

    Args:
        df: Input DataFrame. The caller's frame is left untouched; the encoded
            result is a new object.
        nan_as_category: Forwarded to ``get_dummies`` as ``dummy_na``, i.e.
            whether missing values get their own indicator column.

    Returns:
        Tuple ``(encoded_df, new_columns)`` where ``new_columns`` lists, in
        frame order, the columns of ``encoded_df`` that were not in the input.
    """
    original_columns = set(df.columns)  # set: O(1) membership checks below

    # Detect text columns by dtype family instead of comparing to 'object'
    # only: text read by recent pandas versions may use the dedicated string
    # dtype, which the plain ``== 'object'`` comparison silently skips.
    is_object = pd.api.types.is_object_dtype
    is_string = pd.api.types.is_string_dtype
    categorical_columns = [
        col for col, dtype in df.dtypes.items()
        if is_object(dtype) or is_string(dtype)
    ]

    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [3]:
# Preprocess previous_applications.csv
def previous_application(num_rows=None, nan_as_category=True):
    """Build per-client aggregate features from ``previous_application.csv``.

    Steps: read the CSV, one-hot encode its text columns, blank out the
    365243 value in the DAYS_* columns, add pairwise amount ratios, then
    aggregate per ``SK_ID_CURR`` over all rows, over approved rows only and
    over refused rows only.

    Args:
        num_rows: Passed to ``pd.read_csv`` as ``nrows``; ``None`` reads the
            whole file.
        nan_as_category: Forwarded to ``one_hot_encoder``.

    Returns:
        DataFrame indexed by ``SK_ID_CURR`` whose columns are prefixed with
        ``PREV_``, ``PREV_APPROVED_`` and ``PREV_REFUSED_``.
    """
    prev = pd.read_csv('../data/previous_application.csv', nrows=num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category=nan_as_category)

    # Days 365243 values -> nan (presumably a "no date" sentinel - confirm
    # against the data description).
    # Assign the result back instead of calling ``replace(..., inplace=True)``
    # on ``prev[col]``: that chained in-place form operates on a temporary
    # under pandas Copy-on-Write and would leave ``prev`` unchanged.
    days_columns = [
        'DAYS_FIRST_DRAWING',
        'DAYS_FIRST_DUE',
        'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE',
        'DAYS_TERMINATION',
    ]
    prev[days_columns] = prev[days_columns].replace(365243, np.nan)

    # Add feature: value ask / value received percentage.
    # One table of (feature, numerator, denominator) drives both the column
    # creation and the aggregation spec so the two cannot drift apart.
    # NOTE(review): a zero denominator produces inf (or NaN for 0/0) and that
    # value flows into the aggregates below - confirm this is intended.
    ratio_features = [
        ('O_APP__CREDIT', 'AMT_APPLICATION', 'AMT_CREDIT'),
        ('O_APP__GOODS', 'AMT_APPLICATION', 'AMT_GOODS_PRICE'),
        ('O_APP__DOWN', 'AMT_APPLICATION', 'AMT_DOWN_PAYMENT'),
        ('O_APP__ANNUITY', 'AMT_APPLICATION', 'AMT_ANNUITY'),
        ('O_CREDIT__ANNUITY', 'AMT_CREDIT', 'AMT_ANNUITY'),
        ('O_CREDIT__GOODS', 'AMT_CREDIT', 'AMT_GOODS_PRICE'),
        ('O_CREDIT__DOWN', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT'),
        ('O_GOODS__ANNUITY', 'AMT_GOODS_PRICE', 'AMT_ANNUITY'),
        ('O_GOODS__DOWN', 'AMT_GOODS_PRICE', 'AMT_DOWN_PAYMENT'),
        ('O_DOWN__ANNUITY', 'AMT_DOWN_PAYMENT', 'AMT_ANNUITY'),
    ]
    for feature, numerator, denominator in ratio_features:
        prev[feature] = prev[numerator] / prev[denominator]

    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    for feature, _, _ in ratio_features:
        num_aggregations[feature] = ['max', 'min', 'mean', 'var']

    # Previous applications categorical features: the mean of a 0/1 indicator
    # is the share of the client's rows falling in that category.
    cat_aggregations = {cat: ['mean'] for cat in cat_cols}

    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    # Flatten the (column, statistic) MultiIndex into PREV_<COLUMN>_<STAT>.
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])

    # Approved, then refused applications - only numerical features. The order
    # is kept so the resulting column order matches the historical output.
    for status, label in (('Approved', 'APPROVED'), ('Refused', 'REFUSED')):
        subset = prev[prev['NAME_CONTRACT_STATUS_' + status] == 1]
        subset_agg = subset.groupby('SK_ID_CURR').agg(num_aggregations)
        subset_agg.columns = pd.Index(
            ['PREV_' + label + '_' + e[0] + "_" + e[1].upper() for e in subset_agg.columns.tolist()]
        )
        # Left join: clients without rows in this status get NaN features.
        prev_agg = prev_agg.join(subset_agg, how='left', on='SK_ID_CURR')

    # Release the large intermediates before returning.
    del subset, subset_agg, prev
    gc.collect()
    return prev_agg

In [4]:
def main(filename, sample=False):
    """Return the previous-application features, using ``filename`` as a CSV cache.

    Args:
        filename: Path of the cache file. If it exists it is loaded and
            returned; otherwise the features are computed and written to it.
        sample: When True, only the first 10000 rows of the raw data are
            processed.

    Returns:
        The feature DataFrame, indexed by its first CSV column when loaded
        from the cache so that it has the same layout as a freshly computed
        frame.
    """
    # NOTE(review): the cache does not record whether it was produced with
    # sample=True, so a sample run and a full run sharing one filename will
    # read each other's file - use distinct filenames.
    if os.path.exists(filename):
        # ``to_csv`` below writes the frame's index as the first column;
        # restore it as the index instead of returning it as an extra data
        # column with a fresh RangeIndex.
        return pd.read_csv(filename, index_col=0)

    num_rows = 10000 if sample else None
    with timer("Process previous application"):
        previous_application_features = previous_application(num_rows)
        print("Previous previous application df shape:", previous_application_features.shape)
        previous_application_features.to_csv(filename)
        return previous_application_features

In [None]:
# Load the feature frame from 'previous_application_features.csv' if that file
# exists; otherwise main() computes the features and writes them to it.
df = main(filename='previous_application_features.csv')

Previous previous application df shape: (338857, 357)


In [13]:
# Display the number of feature columns together with the column index itself.
len(df.columns), df.columns

(303,
 Index(['PREV_AMT_ANNUITY_MIN', 'PREV_AMT_ANNUITY_MAX', 'PREV_AMT_ANNUITY_MEAN',
        'PREV_AMT_APPLICATION_MIN', 'PREV_AMT_APPLICATION_MAX',
        'PREV_AMT_APPLICATION_MEAN', 'PREV_AMT_CREDIT_MIN',
        'PREV_AMT_CREDIT_MAX', 'PREV_AMT_CREDIT_MEAN',
        'PREV_AMT_DOWN_PAYMENT_MIN',
        ...
        'PREV_REFUSED_O_CREDIT__GOODS_MEAN', 'PREV_REFUSED_O_CREDIT__GOODS_VAR',
        'PREV_REFUSED_O_CREDIT__DOWN_MEAN', 'PREV_REFUSED_O_CREDIT__DOWN_VAR',
        'PREV_REFUSED_O_GOODS__ANNUITY_MEAN',
        'PREV_REFUSED_O_GOODS__ANNUITY_VAR', 'PREV_REFUSED_O_GOODS__DOWN_MEAN',
        'PREV_REFUSED_O_GOODS__DOWN_VAR', 'PREV_REFUSED_O_DOWN__ANNUITY_MEAN',
        'PREV_REFUSED_O_DOWN__ANNUITY_VAR'],
       dtype='object', length=303))

In [25]:
# Print every column name on its own line; the default Index display above
# elides the middle entries with "...".
for col in df.columns:
    print(col)

PREV_AMT_ANNUITY_MIN
PREV_AMT_ANNUITY_MAX
PREV_AMT_ANNUITY_MEAN
PREV_AMT_APPLICATION_MIN
PREV_AMT_APPLICATION_MAX
PREV_AMT_APPLICATION_MEAN
PREV_AMT_CREDIT_MIN
PREV_AMT_CREDIT_MAX
PREV_AMT_CREDIT_MEAN
PREV_AMT_DOWN_PAYMENT_MIN
PREV_AMT_DOWN_PAYMENT_MAX
PREV_AMT_DOWN_PAYMENT_MEAN
PREV_AMT_GOODS_PRICE_MIN
PREV_AMT_GOODS_PRICE_MAX
PREV_AMT_GOODS_PRICE_MEAN
PREV_HOUR_APPR_PROCESS_START_MIN
PREV_HOUR_APPR_PROCESS_START_MAX
PREV_HOUR_APPR_PROCESS_START_MEAN
PREV_RATE_DOWN_PAYMENT_MIN
PREV_RATE_DOWN_PAYMENT_MAX
PREV_RATE_DOWN_PAYMENT_MEAN
PREV_DAYS_DECISION_MIN
PREV_DAYS_DECISION_MAX
PREV_DAYS_DECISION_MEAN
PREV_CNT_PAYMENT_MEAN
PREV_CNT_PAYMENT_SUM
PREV_O_APP__CREDIT_MAX
PREV_O_APP__CREDIT_MIN
PREV_O_APP__CREDIT_MEAN
PREV_O_APP__CREDIT_VAR
PREV_O_APP__GOODS_MAX
PREV_O_APP__GOODS_MIN
PREV_O_APP__GOODS_MEAN
PREV_O_APP__GOODS_VAR
PREV_O_APP__DOWN_MAX
PREV_O_APP__DOWN_MIN
PREV_O_APP__DOWN_MEAN
PREV_O_APP__DOWN_VAR
PREV_O_APP__ANNUITY_MAX
PREV_O_APP__ANNUITY_MIN
PREV_O_APP__ANNUITY_MEAN
PREV_O_AP

In [24]:
# Emit the O_* feature columns (overall, approved-only and refused-only) as
# quoted, comma-terminated lines that can be pasted into a Python list.
ratio_prefixes = ('PREV_O_', 'PREV_APPROVED_O_', 'PREV_REFUSED_O_')
for col in df.columns:
    if not col.startswith(ratio_prefixes):
        continue
    print("'{}',".format(col))

'PREV_O_APP__CREDIT_MAX',
'PREV_O_APP__CREDIT_MIN',
'PREV_O_APP__CREDIT_MEAN',
'PREV_O_APP__CREDIT_VAR',
'PREV_O_APP__GOODS_MAX',
'PREV_O_APP__GOODS_MIN',
'PREV_O_APP__GOODS_MEAN',
'PREV_O_APP__GOODS_VAR',
'PREV_O_APP__DOWN_MAX',
'PREV_O_APP__DOWN_MIN',
'PREV_O_APP__DOWN_MEAN',
'PREV_O_APP__DOWN_VAR',
'PREV_O_APP__ANNUITY_MAX',
'PREV_O_APP__ANNUITY_MIN',
'PREV_O_APP__ANNUITY_MEAN',
'PREV_O_APP__ANNUITY_VAR',
'PREV_O_CREDIT__ANNUITY_MAX',
'PREV_O_CREDIT__ANNUITY_MIN',
'PREV_O_CREDIT__ANNUITY_MEAN',
'PREV_O_CREDIT__ANNUITY_VAR',
'PREV_O_CREDIT__GOODS_MAX',
'PREV_O_CREDIT__GOODS_MIN',
'PREV_O_CREDIT__GOODS_MEAN',
'PREV_O_CREDIT__GOODS_VAR',
'PREV_O_CREDIT__DOWN_MAX',
'PREV_O_CREDIT__DOWN_MIN',
'PREV_O_CREDIT__DOWN_MEAN',
'PREV_O_CREDIT__DOWN_VAR',
'PREV_O_GOODS__ANNUITY_MAX',
'PREV_O_GOODS__ANNUITY_MIN',
'PREV_O_GOODS__ANNUITY_MEAN',
'PREV_O_GOODS__ANNUITY_VAR',
'PREV_O_GOODS__DOWN_MAX',
'PREV_O_GOODS__DOWN_MIN',
'PREV_O_GOODS__DOWN_MEAN',
'PREV_O_GOODS__DOWN_VAR',
'PREV_O_DOWN__ANNUITY_M