In [15]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [30]:
df = pd.read_csv('../Input/application_train.csv')
print(df.shape)
test_df = pd.read_csv('../Input/application_test.csv')
print(test_df.shape)
df = df.append(test_df).reset_index()

(307511, 122)
(48744, 121)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


In [22]:
def app_data_engineering(df):
    #replace xna with nan
    df.replace('XNA', np.nan, inplace = True)
    df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace = True)
    
    #Hand craft features
    df['annuity_income_percentage'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['car_to_birth_ratio'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['car_to_employ_ratio'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['children_ratio'] = df['CNT_CHILDREN'] / df['CNT_FAM_MEMBERS']
    df['credit_to_annuity_ratio'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['credit_to_goods_ratio'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['credit_to_income_ratio'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['days_employed_percentage'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['income_credit_percentage'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['income_per_child'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
    df['income_per_person'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['payment_rate'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    df['phone_to_birth_ratio'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
    df['phone_to_employ_ratio'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
    df['cnt_non_child'] = df['CNT_FAM_MEMBERS'] - df['CNT_CHILDREN']
    df['child_to_non_child_ratio'] = df['CNT_CHILDREN'] / df['cnt_non_child']
    df['income_per_non_child'] = df['AMT_INCOME_TOTAL'] / df['cnt_non_child']
    df['credit_per_person'] = df['AMT_CREDIT'] / df['CNT_FAM_MEMBERS']
    df['credit_per_child'] = df['AMT_CREDIT'] / (1 + df['CNT_CHILDREN'])
    df['credit_per_non_child'] = df['AMT_CREDIT'] / df['cnt_non_child']
    
    #External sources
    df['external_sources_weighted'] = df.EXT_SOURCE_1 * 2 + df.EXT_SOURCE_2 * 3 + df.EXT_SOURCE_3 * 4
    for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
        df['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(
            df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)
        

    AGGREGATION_RECIPIES = [
    (['CODE_GENDER', 'NAME_EDUCATION_TYPE'], [('AMT_ANNUITY', 'max'),
                                              ('AMT_CREDIT', 'max'),
                                              ('EXT_SOURCE_1', 'mean'),
                                              ('EXT_SOURCE_2', 'mean'),
                                              ('OWN_CAR_AGE', 'max'),
                                              ('OWN_CAR_AGE', 'sum')]),
    (['CODE_GENDER', 'ORGANIZATION_TYPE'], [('AMT_ANNUITY', 'mean'),
                                            ('AMT_INCOME_TOTAL', 'mean'),
                                            ('DAYS_REGISTRATION', 'mean'),
                                            ('EXT_SOURCE_1', 'mean')]),
    (['CODE_GENDER', 'REG_CITY_NOT_WORK_CITY'], [('AMT_ANNUITY', 'mean'),
                                                 ('CNT_CHILDREN', 'mean'),
                                                 ('DAYS_ID_PUBLISH', 'mean')]),
    (['CODE_GENDER', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('EXT_SOURCE_1', 'mean'),
                                                                                           ('EXT_SOURCE_2', 'mean')]),
    (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE'], [('AMT_CREDIT', 'mean'),
                                                  ('AMT_REQ_CREDIT_BUREAU_YEAR', 'mean'),
                                                  ('APARTMENTS_AVG', 'mean'),
                                                  ('BASEMENTAREA_AVG', 'mean'),
                                                  ('EXT_SOURCE_1', 'mean'),
                                                  ('EXT_SOURCE_2', 'mean'),
                                                  ('EXT_SOURCE_3', 'mean'),
                                                  ('NONLIVINGAREA_AVG', 'mean'),
                                                  ('OWN_CAR_AGE', 'mean'),
                                                  ('YEARS_BUILD_AVG', 'mean')]),
    (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('ELEVATORS_AVG', 'mean'),
                                                                            ('EXT_SOURCE_1', 'mean')]),
    (['OCCUPATION_TYPE'], [('AMT_ANNUITY', 'mean'),
                           ('CNT_CHILDREN', 'mean'),
                           ('CNT_FAM_MEMBERS', 'mean'),
                           ('DAYS_BIRTH', 'mean'),
                           ('DAYS_EMPLOYED', 'mean'),
                           ('DAYS_ID_PUBLISH', 'mean'),
                           ('DAYS_REGISTRATION', 'mean'),
                           ('EXT_SOURCE_1', 'mean'),
                           ('EXT_SOURCE_2', 'mean'),
                           ('EXT_SOURCE_3', 'mean')]),   
    ]
    groupby_aggregate_names = []
    for groupby_cols, specs in tqdm(AGGREGATION_RECIPIES):
        group_object = df.groupby(groupby_cols)
        for select, agg in tqdm(specs):
            groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)
            df = df.merge(group_object[select]
                                  .agg(agg)
                                  .reset_index()
                                  .rename(index=str, columns={select: groupby_aggregate_name})
                                  [groupby_cols + [groupby_aggregate_name]],
                                  on=groupby_cols,
                                  how='left')
            groupby_aggregate_names.append(groupby_aggregate_name)
            
    
    diff_feature_names  = []
    for groupby_cols, specs in tqdm(AGGREGATION_RECIPIES):
        for select, agg in tqdm(specs):
            if agg in ['mean','median','max','min']:
                groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)
                diff_name = '{}_diff'.format(groupby_aggregate_name)
                abs_diff_name = '{}_abs_diff'.format(groupby_aggregate_name)

                df[diff_name] = df[select] - df[groupby_aggregate_name] 
                df[abs_diff_name] = np.abs(df[select] - df[groupby_aggregate_name]) 

                diff_feature_names.append(diff_name)
                diff_feature_names.append(abs_diff_name)
                
    df['DAYS_EMPLOYED'].replace(365243,np.nan, inplace=True)
    df_clean = df[~pd.isnull(df['DAYS_EMPLOYED'])]
    
    df['long_employment'] = (df['DAYS_EMPLOYED'] < -2000).astype(int)
    df['retirement_age'] = (df['DAYS_BIRTH'] < -14000).astype(int)
    
    return df
    

In [4]:
df['CODE_GENDER'].unique()

array(['M', 'F', 'XNA'], dtype=object)

In [5]:
df.loc[df['DAYS_EMPLOYED'] > 0]['DAYS_EMPLOYED'].unique()


array([365243], dtype=int64)

In [6]:
sum(df['ORGANIZATION_TYPE'] == 'XNA')

55374

In [7]:
df['CODE_GENDER'].value_counts()

F      202448
M      105059
XNA         4
Name: CODE_GENDER, dtype: int64

In [9]:
df['CODE_GENDER'].replace('XNA', np.nan, inplace = True)
df['CODE_GENDER'].value_counts()

F    202448
M    105059
Name: CODE_GENDER, dtype: int64

In [11]:
df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace = True)

In [12]:
df['annuity_income_percentage'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
df['car_to_birth_ratio'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
df['car_to_employ_ratio'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
df['children_ratio'] = df['CNT_CHILDREN'] / df['CNT_FAM_MEMBERS']
df['credit_to_annuity_ratio'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
df['credit_to_goods_ratio'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
df['credit_to_income_ratio'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
df['days_employed_percentage'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
df['income_credit_percentage'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
df['income_per_child'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
df['income_per_person'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
df['payment_rate'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
df['phone_to_birth_ratio'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
df['phone_to_employ_ratio'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']

0    100002
1    100003
2    100004
3    100006
4    100007
Name: SK_ID_CURR, dtype: int64

In [35]:
df = app_data_engineering(df)
df.shape

  r = func(a, **kwargs)


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




(356255, 260)

In [24]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,OCCUPATION_TYPE_mean_DAYS_REGISTRATION_diff,OCCUPATION_TYPE_mean_DAYS_REGISTRATION_abs_diff,OCCUPATION_TYPE_mean_EXT_SOURCE_1_diff,OCCUPATION_TYPE_mean_EXT_SOURCE_1_abs_diff,OCCUPATION_TYPE_mean_EXT_SOURCE_2_diff,OCCUPATION_TYPE_mean_EXT_SOURCE_2_abs_diff,OCCUPATION_TYPE_mean_EXT_SOURCE_3_diff,OCCUPATION_TYPE_mean_EXT_SOURCE_3_abs_diff,long_employment,retirement_age
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,1061.98882,1061.98882,-0.339016,0.339016,-0.235471,0.235471,-0.362717,0.362717,0,0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,3189.710229,3189.710229,-0.200252,0.200252,0.094487,0.094487,,,0,1
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,449.98882,449.98882,,,0.057493,0.057493,0.227474,0.227474,0,1
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,-5123.01118,5123.01118,,,0.152022,0.152022,,,1,1
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,64.710229,64.710229,,,-0.20502,0.20502,,,1,1


In [36]:
df.to_csv('../Input/application_engineered.csv')