In [1]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# featuretools for automated feature engineering
import featuretools as ft

import featuretools.variable_types as vtypes

import sys
import psutil

import os

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Load every raw table, swapping the anomalous value 365243 for NaN
def _load_table(csv_path):
    """Read one CSV file and replace every occurrence of 365243 with NaN.

    NOTE(review): the replacement is applied to the whole frame, i.e. to every
    column of the table, not only to the columns where 365243 is a placeholder.
    """
    raw_table = pd.read_csv(csv_path)
    return raw_table.replace({365243: np.nan})

app_train = _load_table('../kaggle_home_credit/input/application_train.csv')
app_test = _load_table('../kaggle_home_credit/input/application_test.csv')
bureau = _load_table('../kaggle_home_credit/input/bureau.csv')
bureau_balance = _load_table('../kaggle_home_credit/input/bureau_balance.csv')
cash = _load_table('../kaggle_home_credit/input/POS_CASH_balance.csv')
credit = _load_table('../kaggle_home_credit/input/credit_card_balance.csv')
previous = _load_table('../kaggle_home_credit/input/previous_application.csv')
installments = _load_table('../kaggle_home_credit/input/installments_payments.csv')

In [3]:
# Give the test frame an all-NaN TARGET column so it has the same columns as the training frame
app_test['TARGET'] = np.nan

# Stack training and testing rows into a single frame.
# `pd.concat` is used instead of `DataFrame.append`, which is deprecated since
# pandas 1.4 and removed in pandas 2.0. The result is the same: a fresh
# 0..n-1 row index (ignore_index) and alphabetically sorted columns (sort).
app = pd.concat([app_train, app_test], ignore_index = True, sort = True)

In [4]:
# The tables were loaded with a blanket `.replace({365243: np.nan})`, which also
# hits the ID columns: an identifier equal to 365243 becomes NaN. Filling those
# NaNs with 0 would silently relabel the affected records as ID 0, so the
# replaced value is put back before the column is cast to a 64-bit integer.
# NOTE(review): assumes the raw ID columns contain no genuinely missing values,
# so that every NaN seen here was produced by the replacement — confirm against the CSVs.
REPLACED_ID_VALUE = 365243

for index in ['SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_BUREAU']:
    for dataset in [app, bureau, bureau_balance, cash, credit, previous, installments]:
        # Not every table carries every ID column
        if index in dataset.columns:
            dataset[index] = dataset[index].fillna(REPLACED_ID_VALUE).astype(np.int64)

In [5]:
# Empty entity set with id 'clients'; the tables are registered into it in a later cell
es = ft.EntitySet(id = 'clients')


def _boolean_variable_types(frame):
    """Return {column: vtypes.Boolean} for the two-valued float columns of `frame`.

    A column qualifies when its dtype is float and it holds exactly two
    distinct non-null values (`nunique()` ignores NaN by default).
    Relies on `vtypes` (featuretools.variable_types) imported at the top of the notebook.
    """
    return {col: vtypes.Boolean
            for col in frame
            if (frame[col].nunique() == 2) and (frame[col].dtype == float)}


# Handle the Boolean variables:
app_types = _boolean_variable_types(app)

# Remove the `TARGET`. `pop` with a default does not raise KeyError when TARGET
# was not picked up by the detection above.
app_types.pop('TARGET', None)

print('There are {} Boolean variables in the application data.'.format(len(app_types)))

# Ordinal variables
app_types['REGION_RATING_CLIENT'] = vtypes.Ordinal
app_types['REGION_RATING_CLIENT_W_CITY'] = vtypes.Ordinal
app_types['HOUR_APPR_PROCESS_START'] = vtypes.Ordinal

# Handle the Boolean variables:
previous_types = _boolean_variable_types(previous)

print('There are {} Boolean variables in the previous data.'.format(len(previous_types)))

# Drop SK_ID_CURR from these three tables. errors='ignore' keeps the cell safe
# to re-run once the column is already gone.
installments = installments.drop(columns = ['SK_ID_CURR'], errors = 'ignore')
credit = credit.drop(columns = ['SK_ID_CURR'], errors = 'ignore')
cash = cash.drop(columns = ['SK_ID_CURR'], errors = 'ignore')

There are 32 Boolean variables in the application data.
There are 2 Boolean variables in the previous data.


In [6]:
# Entities with a unique index: (entity id, dataframe, index column, extra keyword options)
keyed_tables = [
    ('app', app, 'SK_ID_CURR', {'variable_types': app_types}),
    ('bureau', bureau, 'SK_ID_BUREAU', {}),
    ('previous', previous, 'SK_ID_PREV', {'variable_types': previous_types}),
]
for entity_name, entity_frame, key_column, extra_options in keyed_tables:
    es = es.entity_from_dataframe(entity_id = entity_name, dataframe = entity_frame,
                                  index = key_column, **extra_options)

# Entities that do not have a unique index: featuretools is asked to create one (make_index)
unkeyed_tables = [
    ('bureau_balance', bureau_balance, 'bureaubalance_index'),
    ('cash', cash, 'cash_index'),
    ('installments', installments, 'installments_index'),
    ('credit', credit, 'credit_index'),
]
for entity_name, entity_frame, created_index in unkeyed_tables:
    es = es.entity_from_dataframe(entity_id = entity_name, dataframe = entity_frame,
                                  make_index = True, index = created_index)

# One (parent entity, child entity, shared key column) triple per relationship:
#   app -> bureau -> bureau_balance
#   app -> previous -> cash / installments / credit
relationship_specs = [
    ('app', 'bureau', 'SK_ID_CURR'),
    ('bureau', 'bureau_balance', 'SK_ID_BUREAU'),
    ('app', 'previous', 'SK_ID_CURR'),
    ('previous', 'cash', 'SK_ID_PREV'),
    ('previous', 'installments', 'SK_ID_PREV'),
    ('previous', 'credit', 'SK_ID_PREV'),
]
(r_app_bureau, r_bureau_balance, r_app_previous,
 r_previous_cash, r_previous_installments, r_previous_credit) = [
    ft.Relationship(es[parent][key], es[child][key])
    for parent, child, key in relationship_specs
]

# Add in the defined relationships
es = es.add_relationships([r_app_bureau, r_bureau_balance, r_app_previous,
                           r_previous_cash, r_previous_installments, r_previous_credit])

# Print out the EntitySet
es

Entityset: clients
  Entities:
    app [Rows: 356255, Columns: 122]
    bureau [Rows: 1716428, Columns: 17]
    previous [Rows: 1670214, Columns: 37]
    bureau_balance [Rows: 27299925, Columns: 4]
    cash [Rows: 10001358, Columns: 8]
    installments [Rows: 13605401, Columns: 8]
    credit [Rows: 3840312, Columns: 23]
  Relationships:
    bureau.SK_ID_CURR -> app.SK_ID_CURR
    bureau_balance.SK_ID_BUREAU -> bureau.SK_ID_BUREAU
    previous.SK_ID_CURR -> app.SK_ID_CURR
    cash.SK_ID_PREV -> previous.SK_ID_PREV
    installments.SK_ID_PREV -> previous.SK_ID_PREV
    credit.SK_ID_PREV -> previous.SK_ID_PREV

In [7]:
# Size that sys.getsizeof reports for the entity set, scaled from bytes to gigabytes (1e9 bytes)
es_size_bytes = sys.getsizeof(es)
es_size_bytes / 1e9

11.626818235

# Convert to Categorical Types

In [8]:
# Second, initially empty entity set (id 'clients_cat'). It is filled from the same
# dataframes after their object columns have been converted to the category dtype.
es_cat = ft.EntitySet(id = 'clients_cat')

In [9]:
# Convert every object-dtype column of every table to the pandas category dtype.
# The columns are reassigned on the existing frames, so the frames are modified in place.
for frame in [app, bureau, bureau_balance, cash, credit, previous, installments]:
    object_columns = [name for name in frame if frame[name].dtype.name == "object"]
    for name in object_columns:
        frame[name] = frame[name].astype("category")

In [10]:
# Entities_cat with a unique index: (entity id, dataframe, index column, extra keyword options)
keyed_tables_cat = [
    ('app', app, 'SK_ID_CURR', {'variable_types': app_types}),
    ('bureau', bureau, 'SK_ID_BUREAU', {}),
    ('previous', previous, 'SK_ID_PREV', {'variable_types': previous_types}),
]
for entity_name, entity_frame, key_column, extra_options in keyed_tables_cat:
    es_cat = es_cat.entity_from_dataframe(entity_id = entity_name, dataframe = entity_frame,
                                          index = key_column, **extra_options)

# Entities_cat that do not have a unique index: featuretools is asked to create one (make_index)
unkeyed_tables_cat = [
    ('bureau_balance', bureau_balance, 'bureaubalance_index'),
    ('cash', cash, 'cash_index'),
    ('installments', installments, 'installments_index'),
    ('credit', credit, 'credit_index'),
]
for entity_name, entity_frame, created_index in unkeyed_tables_cat:
    es_cat = es_cat.entity_from_dataframe(entity_id = entity_name, dataframe = entity_frame,
                                          make_index = True, index = created_index)

# One (parent entity, child entity, shared key column) triple per relationship:
#   app -> bureau -> bureau_balance
#   app -> previous -> cash / installments / credit
relationship_specs_cat = [
    ('app', 'bureau', 'SK_ID_CURR'),
    ('bureau', 'bureau_balance', 'SK_ID_BUREAU'),
    ('app', 'previous', 'SK_ID_CURR'),
    ('previous', 'cash', 'SK_ID_PREV'),
    ('previous', 'installments', 'SK_ID_PREV'),
    ('previous', 'credit', 'SK_ID_PREV'),
]
(r_app_bureau, r_bureau_balance, r_app_previous,
 r_previous_cash, r_previous_installments, r_previous_credit) = [
    ft.Relationship(es_cat[parent][key], es_cat[child][key])
    for parent, child, key in relationship_specs_cat
]

# Add in the defined relationships
es_cat = es_cat.add_relationships([r_app_bureau, r_bureau_balance, r_app_previous,
                                   r_previous_cash, r_previous_installments, r_previous_credit])

# Display entityset so far
es_cat

Entityset: clients_cat
  Entities:
    app [Rows: 356255, Columns: 122]
    bureau [Rows: 1716428, Columns: 17]
    previous [Rows: 1670214, Columns: 37]
    bureau_balance [Rows: 27299925, Columns: 4]
    cash [Rows: 10001358, Columns: 8]
    installments [Rows: 13605401, Columns: 8]
    credit [Rows: 3840312, Columns: 23]
  Relationships:
    bureau.SK_ID_CURR -> app.SK_ID_CURR
    bureau_balance.SK_ID_BUREAU -> bureau.SK_ID_BUREAU
    previous.SK_ID_CURR -> app.SK_ID_CURR
    cash.SK_ID_PREV -> previous.SK_ID_PREV
    installments.SK_ID_PREV -> previous.SK_ID_PREV
    credit.SK_ID_PREV -> previous.SK_ID_PREV

In [11]:
# Size that sys.getsizeof reports for the categorical entity set, in gigabytes (1e9 bytes)
es_cat_size_bytes = sys.getsizeof(es_cat)
es_cat_size_bytes / 1e9

11.626818235

In [12]:
# Column dtypes of the dataframe stored inside the 'app' entity of the categorical entity set
app_entity_cat = es_cat['app']
app_entity_cat.df.dtypes

SK_ID_CURR                       int64
AMT_ANNUITY                    float64
AMT_CREDIT                     float64
AMT_GOODS_PRICE                float64
AMT_INCOME_TOTAL               float64
AMT_REQ_CREDIT_BUREAU_DAY      float64
AMT_REQ_CREDIT_BUREAU_HOUR     float64
AMT_REQ_CREDIT_BUREAU_MON      float64
AMT_REQ_CREDIT_BUREAU_QRT      float64
AMT_REQ_CREDIT_BUREAU_WEEK     float64
AMT_REQ_CREDIT_BUREAU_YEAR     float64
APARTMENTS_AVG                 float64
APARTMENTS_MEDI                float64
APARTMENTS_MODE                float64
BASEMENTAREA_AVG               float64
BASEMENTAREA_MEDI              float64
BASEMENTAREA_MODE              float64
CNT_CHILDREN                   float64
CNT_FAM_MEMBERS                float64
COMMONAREA_AVG                 float64
COMMONAREA_MEDI                float64
COMMONAREA_MODE                float64
DAYS_BIRTH                     float64
DAYS_EMPLOYED                  float64
DAYS_ID_PUBLISH                float64
DAYS_LAST_PHONE_CHANGE   

In [13]:
# Column dtypes of the source `app` dataframe, for comparison with the dtypes held by the entity set
app.dtypes

AMT_ANNUITY                      float64
AMT_CREDIT                       float64
AMT_GOODS_PRICE                  float64
AMT_INCOME_TOTAL                 float64
AMT_REQ_CREDIT_BUREAU_DAY        float64
AMT_REQ_CREDIT_BUREAU_HOUR       float64
AMT_REQ_CREDIT_BUREAU_MON        float64
AMT_REQ_CREDIT_BUREAU_QRT        float64
AMT_REQ_CREDIT_BUREAU_WEEK       float64
AMT_REQ_CREDIT_BUREAU_YEAR       float64
APARTMENTS_AVG                   float64
APARTMENTS_MEDI                  float64
APARTMENTS_MODE                  float64
BASEMENTAREA_AVG                 float64
BASEMENTAREA_MEDI                float64
BASEMENTAREA_MODE                float64
CNT_CHILDREN                     float64
CNT_FAM_MEMBERS                  float64
CODE_GENDER                     category
COMMONAREA_AVG                   float64
COMMONAREA_MEDI                  float64
COMMONAREA_MODE                  float64
DAYS_BIRTH                       float64
DAYS_EMPLOYED                    float64
DAYS_ID_PUBLISH 

In [14]:
# Primitives handed to deep feature synthesis: aggregations and transforms
agg_primitives =  ["sum", "max", "min", "mean", "count", "percent_true", "num_unique", "mode"]
trans_primitives = ['percentile', 'and']

# Options for the run; features_only = True asks for the feature definitions only
dfs_options = dict(agg_primitives = agg_primitives,
                   trans_primitives = trans_primitives,
                   n_jobs = -1,
                   verbose = 1,
                   features_only = True,
                   max_depth = 2)

# Deep feature synthesis on the original entity set, targeting the 'app' entity
feature_names = ft.dfs(entityset = es, target_entity = 'app', **dfs_options)

Built 1820 features


In [None]:
# Persist the feature definitions built by `ft.dfs(..., features_only = True)` on `es`.
# `save_features` needs the list of features to serialize; calling it with no
# arguments raises TypeError.
# NOTE(review): the output location below is a suggestion — point it at wherever
# the feature definitions should be stored.
ft.save_features(feature_names, '../kaggle_home_credit/input/features.json')


In [15]:
# Primitives handed to deep feature synthesis: aggregations and transforms
agg_primitives =  ["sum", "max", "min", "mean", "count", "percent_true", "num_unique", "mode"]
trans_primitives = ['percentile', 'and']

# Options for the run; features_only = True asks for the feature definitions only
dfs_options_cat = dict(agg_primitives = agg_primitives,
                       trans_primitives = trans_primitives,
                       n_jobs = -1,
                       verbose = 1,
                       features_only = True,
                       max_depth = 2)

# Deep feature synthesis on the categorical entity set, targeting the 'app' entity
feature_names_cat = ft.dfs(entityset = es_cat, target_entity = 'app', **dfs_options_cat)

Built 1820 features
