# [Home Credit Default Risk](https://www.kaggle.com/c/home-credit-default-risk)
# Preprocessing - Auto Feature Engineering
## Platform: Python 3, colab.research.google.com

In [0]:
import featuretools as ft
import featuretools.variable_types as vtypes
import pandas as pd
import os
import pandas as pd
import psutil
import numpy as np
import sys

In [0]:
try:
    from google.colab import drive
    os.environ["runtime"] = "hosted"
    drive.mount('/content/gdrive', force_remount=True)
    project_folder = "/content/gdrive/My Drive/Colab Notebooks/kaggle-home-credit-default-risk"
    if pd.__version__ != "0.24.1":
        !pip install pandas==0.24.1
        import pandas as pd
except:
    os.environ["runtime"] = "local"
    project_folder = "."
print("Runtime: {}".format(os.environ["runtime"]))
print("Total number of cpus detected: {}.".format(psutil.cpu_count()))
print("Total size of system memory: {:.5f} gb.".format(psutil.virtual_memory().total / 1e9))

Mounted at /content/gdrive
Runtime: hosted
Total number of cpus detected: 2.
Total size of system memory: 13.65532 gb.


## Load data

In [0]:
# Read the cleaned application table; row 0 is the header and the first CSV
# column becomes the DataFrame index.
app = pd.read_csv(
    "{}/data/app_clean.csv".format(project_folder),
    header=0,
    index_col=0,
)
app.shape

(356255, 99)

In [0]:
# Process the application table one shard at a time to limit RAM usage.
# Change the index used for current_shard to switch shards.
filename_shards = ["part_1", "part_2"]
current_shard = filename_shards[1]
# First index label that belongs to the second shard.
# Assumes app has sorted integer index labels, as the original slicing did.
shard_boundary = 180000
if current_shard == filename_shards[0]:
    # .loc slices by label and includes BOTH endpoints, so stop one label short
    # of the boundary; otherwise the boundary row would be part of both shards.
    app = app.loc[:shard_boundary - 1, :]
elif current_shard == filename_shards[-1]:
    app = app.loc[shard_boundary:, :]
app.shape

(176255, 99)

In [0]:
# Print every app column next to its dtype, one pair per line.
for column_name, column_dtype in zip(app.columns, app.dtypes):
    print(column_name, column_dtype)

AMT_ANNUITY float64
AMT_CREDIT float64
AMT_GOODS_PRICE float64
AMT_INCOME_TOTAL float64
AMT_REQ_CREDIT_BUREAU_DAY float64
AMT_REQ_CREDIT_BUREAU_HOUR float64
AMT_REQ_CREDIT_BUREAU_MON float64
AMT_REQ_CREDIT_BUREAU_QRT float64
AMT_REQ_CREDIT_BUREAU_WEEK float64
AMT_REQ_CREDIT_BUREAU_YEAR float64
APARTMENTS_AVG float64
APARTMENTS_MEDI float64
APARTMENTS_MODE float64
CNT_CHILDREN int64
CNT_FAM_MEMBERS float64
CODE_GENDER object
DAYS_BIRTH int64
DAYS_EMPLOYED int64
DAYS_ID_PUBLISH int64
DAYS_LAST_PHONE_CHANGE float64
DAYS_REGISTRATION float64
DEF_30_CNT_SOCIAL_CIRCLE float64
DEF_60_CNT_SOCIAL_CIRCLE float64
ELEVATORS_AVG float64
ELEVATORS_MEDI float64
ELEVATORS_MODE float64
EMERGENCYSTATE_MODE object
ENTRANCES_AVG float64
ENTRANCES_MEDI float64
ENTRANCES_MODE float64
EXT_SOURCE_1 float64
EXT_SOURCE_2 float64
EXT_SOURCE_3 float64
FLAG_CONT_MOBILE int64
FLAG_DOCUMENT_10 int64
FLAG_DOCUMENT_11 int64
FLAG_DOCUMENT_12 int64
FLAG_DOCUMENT_13 int64
FLAG_DOCUMENT_14 int64
FLAG_DOCUMENT_15 int64
FLA

In [0]:
# Read the cleaned bureau table; row 0 is the header and the first CSV column
# becomes the DataFrame index.
bureau = pd.read_csv(
    "{}/data/bureau_clean.csv".format(project_folder),
    header=0,
    index_col=0,
)
bureau.shape

  mask |= (ar1 == a)


(1716428, 16)

In [0]:
# Print every bureau column next to its dtype, one pair per line.
for column_name, column_dtype in zip(bureau.columns, bureau.dtypes):
    print(column_name, column_dtype)

SK_ID_CURR int64
SK_ID_BUREAU int64
CREDIT_ACTIVE object
CREDIT_CURRENCY object
DAYS_CREDIT int64
CREDIT_DAY_OVERDUE int64
DAYS_CREDIT_ENDDATE float64
DAYS_ENDDATE_FACT float64
AMT_CREDIT_MAX_OVERDUE float64
CNT_CREDIT_PROLONG int64
AMT_CREDIT_SUM float64
AMT_CREDIT_SUM_DEBT float64
AMT_CREDIT_SUM_LIMIT float64
AMT_CREDIT_SUM_OVERDUE float64
CREDIT_TYPE object
DAYS_CREDIT_UPDATE int64


In [0]:
# Read the cleaned bureau_balance table; row 0 is the header and the first CSV
# column becomes the DataFrame index.
bureau_balance = pd.read_csv(
    "{}/data/bureau_balance_clean.csv".format(project_folder),
    header=0,
    index_col=0,
)
bureau_balance.shape

  mask |= (ar1 == a)


(27299925, 3)

In [0]:
# Print every bureau_balance column next to its dtype, one pair per line.
for column_name, column_dtype in zip(bureau_balance.columns, bureau_balance.dtypes):
    print(column_name, column_dtype)

SK_ID_BUREAU int64
MONTHS_BALANCE int64
STATUS object


## Set up featuretools (ft)

In [0]:
# Start from an empty entity set named "loans"; tables and relationships are
# registered on it in the cells below.
es = ft.EntitySet(id="loans")

All index columns must be integers:

In [0]:
# Convert every id column used as an index or join key to int64.
id_columns = (
    (app, "SK_ID_CURR"),
    (bureau, "SK_ID_CURR"),
    (bureau, "SK_ID_BUREAU"),
    (bureau_balance, "SK_ID_BUREAU"),
)
for frame, id_column in id_columns:
    frame[id_column] = np.int64(frame[id_column])

Define boolean and ordinal variables:

App types

In [0]:
# Column name -> featuretools variable type for the app table; filled in below.
app_types = dict()

In [0]:
# Ordinal types: flag the rating / hour columns, but only those present in app.
ordinal_candidates = (
    "REGION_RATING_CLIENT",
    "REGION_RATING_CLIENT_W_CITY",
    "HOUR_APPR_PROCESS_START",
)
app_types.update(
    (name, vtypes.Ordinal) for name in ordinal_candidates if name in app.columns
)

In [0]:
# Boolean types: float columns of app that hold exactly two distinct values.
two_valued_float_columns = [
    name
    for name in app
    if app[name].nunique() == 2 and app[name].dtype == float
]
app_types.update(dict.fromkeys(two_valued_float_columns, vtypes.Boolean))

In [0]:
# TARGET (the column later used to split train and test rows) must not get a
# variable-type override. The default of None avoids a KeyError when the
# Boolean detection did not flag TARGET, e.g. in a shard where it has fewer
# than two distinct values.
app_types.pop("TARGET", None)
print(app_types)

{'REGION_RATING_CLIENT': <class 'featuretools.variable_types.variable.Ordinal'>, 'REGION_RATING_CLIENT_W_CITY': <class 'featuretools.variable_types.variable.Ordinal'>, 'HOUR_APPR_PROCESS_START': <class 'featuretools.variable_types.variable.Ordinal'>}


In [0]:
# The three text columns of bureau are all treated as categorical.
bureau_types = dict.fromkeys(
    ["CREDIT_ACTIVE", "CREDIT_CURRENCY", "CREDIT_TYPE"],
    vtypes.Categorical,
)

In [0]:
# Boolean types: float columns of bureau that hold exactly two distinct values.
two_valued_float_columns = [
    name
    for name in bureau
    if bureau[name].nunique() == 2 and bureau[name].dtype == float
]
bureau_types.update(dict.fromkeys(two_valued_float_columns, vtypes.Boolean))

In [0]:
# Show the variable types assigned to bureau columns so far.
print(bureau_types)

{'CREDIT_ACTIVE': <class 'featuretools.variable_types.variable.Categorical'>, 'CREDIT_CURRENCY': <class 'featuretools.variable_types.variable.Categorical'>, 'CREDIT_TYPE': <class 'featuretools.variable_types.variable.Categorical'>}


In [0]:
# STATUS is the only bureau_balance column given an explicit variable type.
bureau_balance_types = dict(STATUS=vtypes.Categorical)

Define entities (tables):

In [0]:
# Register the three tables on the entity set, in order.
# app and bureau are indexed by an existing id column; bureau_balance has no
# unique index of its own, so featuretools is asked to create one (make_index).
entity_specs = [
    dict(
        entity_id="app",
        dataframe=app,
        index="SK_ID_CURR",
        variable_types=app_types,
    ),
    dict(
        entity_id="bureau",
        dataframe=bureau,
        index='SK_ID_BUREAU',
        variable_types=bureau_types,
    ),
    dict(
        entity_id="bureau_balance",
        dataframe=bureau_balance,
        make_index=True,
        index="bureaubalance_index",
        variable_types=bureau_balance_types,
    ),
]
for entity_spec in entity_specs:
    es = es.entity_from_dataframe(**entity_spec)

Define relationships:

In [0]:
# Relate bureau to app through SK_ID_CURR and bureau_balance to bureau through
# SK_ID_BUREAU, then attach both relationships to the entity set.
r_app_bureau = ft.Relationship(
    es["app"]["SK_ID_CURR"],
    es["bureau"]["SK_ID_CURR"],
)
r_bureau_bureau_balance = ft.Relationship(
    es["bureau"]["SK_ID_BUREAU"],
    es["bureau_balance"]["SK_ID_BUREAU"],
)
es = es.add_relationships([r_app_bureau, r_bureau_bureau_balance])
es

Entityset: loans
  Entities:
    app [Rows: 176255, Columns: 99]
    bureau [Rows: 1716428, Columns: 16]
    bureau_balance [Rows: 27299925, Columns: 4]
  Relationships:
    bureau.SK_ID_CURR -> app.SK_ID_CURR
    bureau_balance.SK_ID_BUREAU -> bureau.SK_ID_BUREAU

In [0]:
# Report the size sys.getsizeof gives for the entity set, converted to gigabytes.
entityset_bytes = sys.getsizeof(es)
print('Total size of entityset: {:.5f} gb.'.format(entityset_bytes / 1e9))

Total size of entityset: 4.85400 gb.


Feature primitives:

In [0]:
# List the primitives featuretools offers and keep only the transform ones.
primitives = ft.list_primitives()
is_transform = primitives["type"] == "transform"
primitives.loc[is_transform, :]

Unnamed: 0,name,type,description
19,haversine,transform,Calculate the approximate haversine distance i...
20,or,transform,"For two boolean values, determine if one value..."
21,weeks,transform,Transform a Timedelta feature into the number ...
22,latitude,transform,Returns the first value of the tuple base feat...
23,minutes,transform,Transform a Timedelta feature into the number ...
24,not,transform,"For each value of the base feature, negates th..."
25,seconds,transform,Transform a Timedelta feature into the number ...
26,hours,transform,Transform a Timedelta feature into the number ...
27,isin,transform,"For each value of the base feature, checks whe..."
28,days,transform,Transform a Timedelta feature into the number ...


In [0]:
# Aggregation primitives handed to DFS; "skew", "std" and "sum" are currently disabled.
agg_primitives = ["max", "median", "min", "count", "num_unique"]
# Transform primitives handed to DFS.
trans_primitives = ["percentile"]

In [0]:
target_entity = "app"
max_depth = 2

# Dry run: with features_only=True, dfs returns only the feature definitions,
# which are then printed one per line for inspection.
features = ft.dfs(
    entityset=es,
    target_entity=target_entity,
    agg_primitives=agg_primitives,
    trans_primitives=trans_primitives,
    max_depth=max_depth,
    features_only=True,
)
for feature in features:
    print(feature)

<Feature: AMT_ANNUITY>
<Feature: AMT_CREDIT>
<Feature: AMT_GOODS_PRICE>
<Feature: AMT_INCOME_TOTAL>
<Feature: AMT_REQ_CREDIT_BUREAU_DAY>
<Feature: AMT_REQ_CREDIT_BUREAU_HOUR>
<Feature: AMT_REQ_CREDIT_BUREAU_MON>
<Feature: AMT_REQ_CREDIT_BUREAU_QRT>
<Feature: AMT_REQ_CREDIT_BUREAU_WEEK>
<Feature: AMT_REQ_CREDIT_BUREAU_YEAR>
<Feature: APARTMENTS_AVG>
<Feature: APARTMENTS_MEDI>
<Feature: APARTMENTS_MODE>
<Feature: CNT_CHILDREN>
<Feature: CNT_FAM_MEMBERS>
<Feature: CODE_GENDER>
<Feature: DAYS_BIRTH>
<Feature: DAYS_EMPLOYED>
<Feature: DAYS_ID_PUBLISH>
<Feature: DAYS_LAST_PHONE_CHANGE>
<Feature: DAYS_REGISTRATION>
<Feature: DEF_30_CNT_SOCIAL_CIRCLE>
<Feature: DEF_60_CNT_SOCIAL_CIRCLE>
<Feature: ELEVATORS_AVG>
<Feature: ELEVATORS_MEDI>
<Feature: ELEVATORS_MODE>
<Feature: EMERGENCYSTATE_MODE>
<Feature: ENTRANCES_AVG>
<Feature: ENTRANCES_MEDI>
<Feature: ENTRANCES_MODE>
<Feature: EXT_SOURCE_1>
<Feature: EXT_SOURCE_2>
<Feature: EXT_SOURCE_3>
<Feature: FLAG_CONT_MOBILE>
<Feature: FLAG_DOCUMENT_10>

In [0]:
# Number of feature definitions returned by ft.dfs(features_only=True).
len(features)

308

In [0]:
# Full run: compute the feature matrix together with the feature definitions.
# This is the long-running step (the recorded progress bar shows several hours).
# NOTE(review): chunk_size=0.02 presumably means 2% of the rows per chunk (the
# progress output reports 51 chunks); confirm against the featuretools docs.
feature_matrix, features = ft.dfs(
    entityset=es,
    target_entity=target_entity,
    agg_primitives=agg_primitives,
    trans_primitives=trans_primitives,
    max_depth=max_depth,
    features_only=False,
    chunk_size=0.02,
    n_jobs=1,
    verbose=1,
)

Built 308 features
Elapsed: 4:06:49 | Remaining: 1:24:28 | Progress:  75%|███████▍  | Calculated: 38/51 chunks

In [0]:
# Preview the first rows of the feature matrix (head() defaults to 5 rows).
feature_matrix.head()

In [0]:
# (rows, columns) of the computed feature matrix.
feature_matrix.shape

In [0]:
# Print the name of every column in the feature matrix, one per line.
for column_name in feature_matrix.columns:
    print(column_name)

In [0]:
# Build the per-column fill values for missing data:
# median for float64 columns, most frequent value (mode) for everything else.
feature_matrix_null = feature_matrix.isnull().sum()
fill_na = {}
for column_name in feature_matrix.columns:
    # TARGET is never imputed, and columns without nulls need no fill value.
    if column_name == "TARGET" or not (feature_matrix_null[column_name] > 0):
        continue
    column = feature_matrix[column_name]
    if column.dtypes == 'float64':
        fill_na[column_name] = column.median()
    else:
        fill_na[column_name] = column.mode()[0]
fill_na

In [0]:
# Apply the fill values, then verify that the only nulls left are in TARGET.
feature_matrix = feature_matrix.fillna(fill_na)
target_null_count = feature_matrix["TARGET"].isnull().sum()
total_null_count = feature_matrix.isnull().sum().sum()
assert target_null_count == total_null_count

In [0]:
# Rows with a known TARGET form the training matrix; write them to a CSV named
# after the current shard.
has_target = feature_matrix["TARGET"].notnull()
feature_matrix_train = feature_matrix.loc[has_target, :]
train_csv_path = "{}/data/featuretools_matrix_train_{}.csv".format(
    project_folder, current_shard)
feature_matrix_train.to_csv(train_csv_path)
feature_matrix_train.shape

In [0]:
# Read the training CSV back and check that its shape matches what was written.
feature_matrix_train_loaded = pd.read_csv(
    "{}/data/featuretools_matrix_train_{}.csv".format(project_folder, current_shard),
    header=0,
    index_col=0,
)
assert feature_matrix_train_loaded.shape == feature_matrix_train.shape

In [0]:
# Rows with a null TARGET are written out as the test matrix (without the TARGET
# column). This is only done while processing the last shard.
if current_shard == filename_shards[-1]:
    feature_matrix_test = feature_matrix.loc[feature_matrix["TARGET"].isnull(), :]
    feature_matrix_test = feature_matrix_test.drop(columns="TARGET")
    feature_matrix_test.to_csv("{}/data/featuretools_matrix_test.csv".format(project_folder))
    # A bare expression inside an `if` body is not displayed by the notebook,
    # so print the shape explicitly.
    print(feature_matrix_test.shape)