In [25]:
import math
import psutil
from datetime import datetime

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client

import featuretools as ft
import featuretools.variable_types as vtypes

from tqdm import tqdm

In [26]:
print("Available Memory: {} Mb".format(psutil.virtual_memory().available / 1000000))
# When this cell is re-executed, shut down the client left over from the
# previous run before starting a new local cluster.
try:
    client.close()
except NameError:
    # First execution in this kernel: `client` has not been defined yet.
    pass
except Exception as exc:
    # Best effort only: a stale client that fails to close must not prevent a
    # fresh start. Unlike a bare `except:`, this lets KeyboardInterrupt and
    # SystemExit propagate.
    print("Ignoring error while closing previous client: {!r}".format(exc))
client = Client()
client

Available Memory: 7882.99776 Mb


0,1
Client  Scheduler: tcp://127.0.0.1:50254  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 16  Memory: 17.18 GB


In [27]:
%%time
print("Reading raw data...")
blocksize = "40MB"


def _load_table(csv_path):
    """Lazily read one CSV with dask and replace the value 365243 by NaN in every column."""
    raw = dd.read_csv(csv_path, blocksize=blocksize)
    # NOTE(review): the replacement is applied to all columns, ID columns
    # included, so an ID equal to 365243 would also become NaN (and is later
    # turned into 0 by the fillna(0) in the preparation cell) -- confirm this
    # is intended.
    return raw.replace({365243: np.nan})


# Read in the datasets and replace the anomalous values
app_train = _load_table('data/home-credit-default-risk/application_train.csv')
app_test = _load_table('data/home-credit-default-risk/application_test.csv')
bureau = _load_table('data/home-credit-default-risk/bureau.csv')
bureau_balance = _load_table('data/home-credit-default-risk/bureau_balance.csv')
cash = _load_table('data/home-credit-default-risk/POS_CASH_balance.csv')
credit = _load_table('data/home-credit-default-risk/credit_card_balance.csv')
previous = _load_table('data/home-credit-default-risk/previous_application.csv')
installments = _load_table('data/home-credit-default-risk/installments_payments.csv')

Reading raw data...
CPU times: user 231 ms, sys: 43.9 ms, total: 275 ms
Wall time: 302 ms


In [28]:
%%time
print("Preparing data...")
# The test frame gets an all-NaN TARGET column so that it can be selected with
# the training frame's column list below.
app_test['TARGET'] = np.nan
# Stack train and test rows, with the test columns ordered like the training
# columns. dd.concat is used instead of DataFrame.append, which is
# deprecated/removed in recent pandas and dask releases; with default
# arguments it performs the same outer, row-wise concatenation.
app = dd.concat([app_train, app_test[app_train.columns]])

# Cast every key column that is present to int64, in place on each dask frame.
# NOTE(review): missing keys are filled with 0 before the cast, so any NaN id
# ends up as id 0 -- confirm this is acceptable.
for index in ['SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_BUREAU']:
    for dataset in [app, bureau, bureau_balance, cash, credit, previous, installments]:
        if index in dataset.columns:
            dataset[index] = dataset[index].fillna(0).astype(np.int64)

es = ft.EntitySet(id='clients')

# Drop SK_ID_CURR from these three tables; they are related to `previous`
# through SK_ID_PREV when the relationships are added later.
installments = installments.drop(columns=['SK_ID_CURR'])
credit = credit.drop(columns=['SK_ID_CURR'])
cash = cash.drop(columns=['SK_ID_CURR'])

Preparing data...
CPU times: user 265 ms, sys: 14.3 ms, total: 279 ms
Wall time: 283 ms


In [29]:
# Explicit featuretools variable type for every column of the application
# table; passed as `variable_types` when the 'app' entity is created.
app_vtypes = {
    'SK_ID_CURR': vtypes.Index,
    'AMT_ANNUITY': vtypes.Numeric,
    'AMT_CREDIT': vtypes.Numeric,
    'AMT_GOODS_PRICE': vtypes.Numeric,
    'AMT_INCOME_TOTAL': vtypes.Numeric,
    'AMT_REQ_CREDIT_BUREAU_DAY': vtypes.Numeric,
    'AMT_REQ_CREDIT_BUREAU_HOUR': vtypes.Numeric,
    'AMT_REQ_CREDIT_BUREAU_MON': vtypes.Numeric,
    'AMT_REQ_CREDIT_BUREAU_QRT': vtypes.Numeric,
    'AMT_REQ_CREDIT_BUREAU_WEEK': vtypes.Numeric,
    'AMT_REQ_CREDIT_BUREAU_YEAR': vtypes.Numeric,
    'APARTMENTS_AVG': vtypes.Numeric,
    'APARTMENTS_MEDI': vtypes.Numeric,
    'APARTMENTS_MODE': vtypes.Numeric,
    'BASEMENTAREA_AVG': vtypes.Numeric,
    'BASEMENTAREA_MEDI': vtypes.Numeric,
    'BASEMENTAREA_MODE': vtypes.Numeric,
    'CNT_CHILDREN': vtypes.Numeric,
    'CNT_FAM_MEMBERS': vtypes.Numeric,
    'CODE_GENDER': vtypes.Categorical,
    'COMMONAREA_AVG': vtypes.Numeric,
    'COMMONAREA_MEDI': vtypes.Numeric,
    'COMMONAREA_MODE': vtypes.Numeric,
    'DAYS_BIRTH': vtypes.Numeric,
    'DAYS_EMPLOYED': vtypes.Numeric,
    'DAYS_ID_PUBLISH': vtypes.Numeric,
    'DAYS_LAST_PHONE_CHANGE': vtypes.Numeric,
    'DAYS_REGISTRATION': vtypes.Numeric,
    'DEF_30_CNT_SOCIAL_CIRCLE': vtypes.Numeric,
    'DEF_60_CNT_SOCIAL_CIRCLE': vtypes.Numeric,
    'ELEVATORS_AVG': vtypes.Numeric,
    'ELEVATORS_MEDI': vtypes.Numeric,
    'ELEVATORS_MODE': vtypes.Numeric,
    'EMERGENCYSTATE_MODE': vtypes.Categorical,
    'ENTRANCES_AVG': vtypes.Numeric,
    'ENTRANCES_MEDI': vtypes.Numeric,
    'ENTRANCES_MODE': vtypes.Numeric,
    'EXT_SOURCE_1': vtypes.Numeric,
    'EXT_SOURCE_2': vtypes.Numeric,
    'EXT_SOURCE_3': vtypes.Numeric,
    'FLAG_CONT_MOBILE': vtypes.Boolean,
    'FLAG_DOCUMENT_10': vtypes.Boolean,
    'FLAG_DOCUMENT_11': vtypes.Boolean,
    'FLAG_DOCUMENT_12': vtypes.Boolean,
    'FLAG_DOCUMENT_13': vtypes.Boolean,
    'FLAG_DOCUMENT_14': vtypes.Boolean,
    'FLAG_DOCUMENT_15': vtypes.Boolean,
    'FLAG_DOCUMENT_16': vtypes.Boolean,
    'FLAG_DOCUMENT_17': vtypes.Boolean,
    'FLAG_DOCUMENT_18': vtypes.Boolean,
    'FLAG_DOCUMENT_19': vtypes.Boolean,
    'FLAG_DOCUMENT_2': vtypes.Boolean,
    'FLAG_DOCUMENT_20': vtypes.Boolean,
    'FLAG_DOCUMENT_21': vtypes.Boolean,
    'FLAG_DOCUMENT_3': vtypes.Boolean,
    'FLAG_DOCUMENT_4': vtypes.Boolean,
    'FLAG_DOCUMENT_5': vtypes.Boolean,
    'FLAG_DOCUMENT_6': vtypes.Boolean,
    'FLAG_DOCUMENT_7': vtypes.Boolean,
    'FLAG_DOCUMENT_8': vtypes.Boolean,
    'FLAG_DOCUMENT_9': vtypes.Boolean,
    'FLAG_EMAIL': vtypes.Boolean,
    'FLAG_EMP_PHONE': vtypes.Boolean,
    'FLAG_MOBIL': vtypes.Boolean,
    'FLAG_OWN_CAR': vtypes.Categorical,
    'FLAG_OWN_REALTY': vtypes.Categorical,
    'FLAG_PHONE': vtypes.Boolean,
    'FLAG_WORK_PHONE': vtypes.Boolean,
    'FLOORSMAX_AVG': vtypes.Numeric,
    'FLOORSMAX_MEDI': vtypes.Numeric,
    'FLOORSMAX_MODE': vtypes.Numeric,
    'FLOORSMIN_AVG': vtypes.Numeric,
    'FLOORSMIN_MEDI': vtypes.Numeric,
    'FLOORSMIN_MODE': vtypes.Numeric,
    'FONDKAPREMONT_MODE': vtypes.Categorical,
    'HOUR_APPR_PROCESS_START': vtypes.Numeric,
    'HOUSETYPE_MODE': vtypes.Categorical,
    'LANDAREA_AVG': vtypes.Numeric,
    'LANDAREA_MEDI': vtypes.Numeric,
    'LANDAREA_MODE': vtypes.Numeric,
    'LIVE_CITY_NOT_WORK_CITY': vtypes.Boolean,
    'LIVE_REGION_NOT_WORK_REGION': vtypes.Boolean,
    'LIVINGAPARTMENTS_AVG': vtypes.Numeric,
    'LIVINGAPARTMENTS_MEDI': vtypes.Numeric,
    'LIVINGAPARTMENTS_MODE': vtypes.Numeric,
    'LIVINGAREA_AVG': vtypes.Numeric,
    'LIVINGAREA_MEDI': vtypes.Numeric,
    'LIVINGAREA_MODE': vtypes.Numeric,
    'NAME_CONTRACT_TYPE': vtypes.Categorical,
    'NAME_EDUCATION_TYPE': vtypes.Categorical,
    'NAME_FAMILY_STATUS': vtypes.Categorical,
    'NAME_HOUSING_TYPE': vtypes.Categorical,
    'NAME_INCOME_TYPE': vtypes.Categorical,
    'NAME_TYPE_SUITE': vtypes.Categorical,
    'NONLIVINGAPARTMENTS_AVG': vtypes.Numeric,
    'NONLIVINGAPARTMENTS_MEDI': vtypes.Numeric,
    'NONLIVINGAPARTMENTS_MODE': vtypes.Numeric,
    'NONLIVINGAREA_AVG': vtypes.Numeric,
    'NONLIVINGAREA_MEDI': vtypes.Numeric,
    'NONLIVINGAREA_MODE': vtypes.Numeric,
    'OBS_30_CNT_SOCIAL_CIRCLE': vtypes.Numeric,
    'OBS_60_CNT_SOCIAL_CIRCLE': vtypes.Numeric,
    'OCCUPATION_TYPE': vtypes.Categorical,
    'ORGANIZATION_TYPE': vtypes.Categorical,
    'OWN_CAR_AGE': vtypes.Numeric,
    'REGION_POPULATION_RELATIVE': vtypes.Numeric,
    'REGION_RATING_CLIENT': vtypes.Numeric,
    'REGION_RATING_CLIENT_W_CITY': vtypes.Numeric,
    'REG_CITY_NOT_LIVE_CITY': vtypes.Boolean,
    'REG_CITY_NOT_WORK_CITY': vtypes.Boolean,
    'REG_REGION_NOT_LIVE_REGION': vtypes.Boolean,
    'REG_REGION_NOT_WORK_REGION': vtypes.Boolean,
    'TARGET': vtypes.Numeric,
    'TOTALAREA_MODE': vtypes.Numeric,
    'WALLSMATERIAL_MODE': vtypes.Categorical,
    'WEEKDAY_APPR_PROCESS_START': vtypes.Categorical,
    'YEARS_BEGINEXPLUATATION_AVG': vtypes.Numeric,
    'YEARS_BEGINEXPLUATATION_MEDI': vtypes.Numeric,
    'YEARS_BEGINEXPLUATATION_MODE': vtypes.Numeric,
    'YEARS_BUILD_AVG': vtypes.Numeric,
    'YEARS_BUILD_MEDI': vtypes.Numeric,
    'YEARS_BUILD_MODE': vtypes.Numeric,
}

# Variable types for the bureau table: SK_ID_BUREAU is its index and
# SK_ID_CURR is an Id column pointing at the application table.
bureau_vtypes = {
    'SK_ID_BUREAU': vtypes.Index,
    'SK_ID_CURR': vtypes.Id,
    'CREDIT_ACTIVE': vtypes.Categorical,
    'CREDIT_CURRENCY': vtypes.Categorical,
    'DAYS_CREDIT': vtypes.Numeric,
    'CREDIT_DAY_OVERDUE': vtypes.Numeric,
    'DAYS_CREDIT_ENDDATE': vtypes.Numeric,
    'DAYS_ENDDATE_FACT': vtypes.Numeric,
    'AMT_CREDIT_MAX_OVERDUE': vtypes.Numeric,
    'CNT_CREDIT_PROLONG': vtypes.Numeric,
    'AMT_CREDIT_SUM': vtypes.Numeric,
    'AMT_CREDIT_SUM_DEBT': vtypes.Numeric,
    'AMT_CREDIT_SUM_LIMIT': vtypes.Numeric,
    'AMT_CREDIT_SUM_OVERDUE': vtypes.Numeric,
    'CREDIT_TYPE': vtypes.Categorical,
    'DAYS_CREDIT_UPDATE': vtypes.Numeric,
    'AMT_ANNUITY': vtypes.Numeric,
}

# Variable types for the previous-application table: SK_ID_PREV is its index
# and SK_ID_CURR is an Id column pointing at the application table.
previous_vtypes = {
    'SK_ID_PREV': vtypes.Index,
    'SK_ID_CURR': vtypes.Id,
    'NAME_CONTRACT_TYPE': vtypes.Categorical,
    'AMT_ANNUITY': vtypes.Numeric,
    'AMT_APPLICATION': vtypes.Numeric,
    'AMT_CREDIT': vtypes.Numeric,
    'AMT_DOWN_PAYMENT': vtypes.Numeric,
    'AMT_GOODS_PRICE': vtypes.Numeric,
    'WEEKDAY_APPR_PROCESS_START': vtypes.Categorical,
    'HOUR_APPR_PROCESS_START': vtypes.Numeric,
    'FLAG_LAST_APPL_PER_CONTRACT': vtypes.Categorical,
    'NFLAG_LAST_APPL_IN_DAY': vtypes.Boolean,
    'RATE_DOWN_PAYMENT': vtypes.Numeric,
    'RATE_INTEREST_PRIMARY': vtypes.Numeric,
    'RATE_INTEREST_PRIVILEGED': vtypes.Numeric,
    'NAME_CASH_LOAN_PURPOSE': vtypes.Categorical,
    'NAME_CONTRACT_STATUS': vtypes.Categorical,
    'DAYS_DECISION': vtypes.Numeric,
    'NAME_PAYMENT_TYPE': vtypes.Categorical,
    'CODE_REJECT_REASON': vtypes.Categorical,
    'NAME_TYPE_SUITE': vtypes.Categorical,
    'NAME_CLIENT_TYPE': vtypes.Categorical,
    'NAME_GOODS_CATEGORY': vtypes.Categorical,
    'NAME_PORTFOLIO': vtypes.Categorical,
    'NAME_PRODUCT_TYPE': vtypes.Categorical,
    'CHANNEL_TYPE': vtypes.Categorical,
    'SELLERPLACE_AREA': vtypes.Numeric,
    'NAME_SELLER_INDUSTRY': vtypes.Categorical,
    'CNT_PAYMENT': vtypes.Numeric,
    'NAME_YIELD_GROUP': vtypes.Categorical,
    'PRODUCT_COMBINATION': vtypes.Categorical,
    'DAYS_FIRST_DRAWING': vtypes.Numeric,
    'DAYS_FIRST_DUE': vtypes.Numeric,
    'DAYS_LAST_DUE_1ST_VERSION': vtypes.Numeric,
    'DAYS_LAST_DUE': vtypes.Numeric,
    'DAYS_TERMINATION': vtypes.Numeric,
    'NFLAG_INSURED_ON_APPROVAL': vtypes.Numeric,
}

# Variable types for the bureau-balance table. 'bureaubalance_index' is the
# index column name requested with make_index=True at entity creation.
bureau_balance_vtypes = {
    'bureaubalance_index': vtypes.Index,
    'SK_ID_BUREAU': vtypes.Id,
    'MONTHS_BALANCE': vtypes.Numeric,
    'STATUS': vtypes.Categorical,
}

# Variable types for the POS/cash balance table. 'cash_index' is the index
# column name requested with make_index=True at entity creation.
cash_vtypes = {
    'cash_index': vtypes.Index,
    'SK_ID_PREV': vtypes.Id,
    'MONTHS_BALANCE': vtypes.Numeric,
    'CNT_INSTALMENT': vtypes.Numeric,
    'CNT_INSTALMENT_FUTURE': vtypes.Numeric,
    'NAME_CONTRACT_STATUS': vtypes.Categorical,
    'SK_DPD': vtypes.Numeric,
    'SK_DPD_DEF': vtypes.Numeric,
}

# Variable types for the installments table. 'installments_index' is the
# index column name requested with make_index=True at entity creation.
installments_vtypes = {
    'installments_index': vtypes.Index,
    'SK_ID_PREV': vtypes.Id,
    'NUM_INSTALMENT_VERSION': vtypes.Numeric,
    'NUM_INSTALMENT_NUMBER': vtypes.Numeric,
    'DAYS_INSTALMENT': vtypes.Numeric,
    'DAYS_ENTRY_PAYMENT': vtypes.Numeric,
    'AMT_INSTALMENT': vtypes.Numeric,
    'AMT_PAYMENT': vtypes.Numeric,
}

# Variable types for the credit-card balance table. 'credit_index' is the
# index column name requested with make_index=True at entity creation.
credit_vtypes = {
    'credit_index': vtypes.Index,
    'SK_ID_PREV': vtypes.Id,
    'MONTHS_BALANCE': vtypes.Numeric,
    'AMT_BALANCE': vtypes.Numeric,
    'AMT_CREDIT_LIMIT_ACTUAL': vtypes.Numeric,
    'AMT_DRAWINGS_ATM_CURRENT': vtypes.Numeric,
    'AMT_DRAWINGS_CURRENT': vtypes.Numeric,
    'AMT_DRAWINGS_OTHER_CURRENT': vtypes.Numeric,
    'AMT_DRAWINGS_POS_CURRENT': vtypes.Numeric,
    'AMT_INST_MIN_REGULARITY': vtypes.Numeric,
    'AMT_PAYMENT_CURRENT': vtypes.Numeric,
    'AMT_PAYMENT_TOTAL_CURRENT': vtypes.Numeric,
    'AMT_RECEIVABLE_PRINCIPAL': vtypes.Numeric,
    'AMT_RECIVABLE': vtypes.Numeric,
    'AMT_TOTAL_RECEIVABLE': vtypes.Numeric,
    'CNT_DRAWINGS_ATM_CURRENT': vtypes.Numeric,
    'CNT_DRAWINGS_CURRENT': vtypes.Numeric,
    'CNT_DRAWINGS_OTHER_CURRENT': vtypes.Numeric,
    'CNT_DRAWINGS_POS_CURRENT': vtypes.Numeric,
    'CNT_INSTALMENT_MATURE_CUM': vtypes.Numeric,
    'NAME_CONTRACT_STATUS': vtypes.Categorical,
    'SK_DPD': vtypes.Numeric,
    'SK_DPD_DEF': vtypes.Numeric,
}

In [30]:
%%time
print("Creating entityset...")

# Tables that already carry a unique key column:
# (entity id, dataframe, index column, variable types).
indexed_tables = [
    ('app', app, 'SK_ID_CURR', app_vtypes),
    ('bureau', bureau, 'SK_ID_BUREAU', bureau_vtypes),
    ('previous', previous, 'SK_ID_PREV', previous_vtypes),
]
for table_id, table_df, index_col, table_vtypes in indexed_tables:
    es = es.entity_from_dataframe(entity_id=table_id, dataframe=table_df,
                                  index=index_col, variable_types=table_vtypes)

# Tables without a unique key: featuretools is asked to create the named
# index column via make_index=True.
unindexed_tables = [
    ('bureau_balance', bureau_balance, 'bureaubalance_index', bureau_balance_vtypes),
    ('cash', cash, 'cash_index', cash_vtypes),
    ('installments', installments, 'installments_index', installments_vtypes),
    ('credit', credit, 'credit_index', credit_vtypes),
]
for table_id, table_df, index_col, table_vtypes in unindexed_tables:
    es = es.entity_from_dataframe(entity_id=table_id, dataframe=table_df,
                                  make_index=True, index=index_col,
                                  variable_types=table_vtypes)

print("Adding relationships...")


def _link(parent_id, child_id, key):
    """Build a relationship joining a parent and a child entity on the same-named key column."""
    return ft.Relationship(es[parent_id][key], es[child_id][key])


# app -> bureau -> bureau_balance
r_app_bureau = _link('app', 'bureau', 'SK_ID_CURR')
r_bureau_balance = _link('bureau', 'bureau_balance', 'SK_ID_BUREAU')

# app -> previous -> cash / installments / credit
r_app_previous = _link('app', 'previous', 'SK_ID_CURR')
r_previous_cash = _link('previous', 'cash', 'SK_ID_PREV')
r_previous_installments = _link('previous', 'installments', 'SK_ID_PREV')
r_previous_credit = _link('previous', 'credit', 'SK_ID_PREV')

# Register all relationships, then show the resulting EntitySet.
es = es.add_relationships([r_app_bureau, r_bureau_balance, r_app_previous,
                           r_previous_cash, r_previous_installments, r_previous_credit])
print(es)

Creating entityset...
Adding relationships...
Entityset: clients
  Entities:
    app [Rows: Delayed('int-f0a2fa9a-46e7-48ab-98dc-e5cce95394e6'), Columns: 122]
    bureau [Rows: Delayed('int-fb73e688-ecc6-4564-902c-f8db72858cb7'), Columns: 17]
    previous [Rows: Delayed('int-db9ad8ca-bcbd-497e-b548-e6cfe8a72ee8'), Columns: 37]
    bureau_balance [Rows: Delayed('int-d72769fb-4e84-4f03-82ba-285790339634'), Columns: 4]
    cash [Rows: Delayed('int-e841d30a-239a-47b6-acdf-a02f46ec581a'), Columns: 8]
    installments [Rows: Delayed('int-51d2d83a-7849-4c1d-b664-3bc6bf22baaf'), Columns: 8]
    credit [Rows: Delayed('int-3d362c09-8a68-4eac-9366-1eb92abb1a5f'), Columns: 23]
  Relationships:
    bureau.SK_ID_CURR -> app.SK_ID_CURR
    bureau_balance.SK_ID_BUREAU -> bureau.SK_ID_BUREAU
    previous.SK_ID_CURR -> app.SK_ID_CURR
    cash.SK_ID_PREV -> previous.SK_ID_PREV
    installments.SK_ID_PREV -> previous.SK_ID_PREV
    credit.SK_ID_PREV -> previous.SK_ID_PREV
CPU times: user 151 ms, sys: 4.99

In [31]:
%%time
# One cutoff row per application id: the id column is renamed to
# "instance_id" and every row gets the same "time" value (the moment this
# cell runs). The result is materialised as a pandas DataFrame.
cutoff_times = (
    app['SK_ID_CURR']
    .to_frame()
    .rename(columns={"SK_ID_CURR": "instance_id"})
    .assign(time=datetime.now())
    .compute()
)

CPU times: user 182 ms, sys: 46.2 ms, total: 228 ms
Wall time: 3.16 s


In [32]:
%%time
# Alternative primitive sets (disabled):
# agg_primitives = ["sum", "max", "min", "mean", "count", "any", "all"]
# trans_primitives = ["and", "add_numeric", "negate"]
trans_primitives = []
agg_primitives = ["sum", "max", "min"]


def _dfs_on_app(**extra):
    """Call ft.dfs on the 'app' entity with the settings shared by both passes below."""
    return ft.dfs(entityset=es, target_entity='app',
                  trans_primitives=trans_primitives,
                  agg_primitives=agg_primitives,
                  where_primitives=[], seed_features=[],
                  max_depth=2, cutoff_time=cutoff_times,
                  **extra)


print("Running DFS...")
# First pass: only build the feature definitions so they can be counted.
features = _dfs_on_app(verbose=1, features_only=True)

# Scale the number of 'app' partitions by the (rounded-up) ratio of generated
# features to input columns before the feature matrix is built.
app_df = es['app'].df
new_partitions = app_df.npartitions * math.ceil(len(features) / len(app_df.columns))
print("New Partitions: {}".format(new_partitions))
es['app'].df = app_df.repartition(npartitions=new_partitions)

# Second pass: DFS with the specified primitives, returning the feature matrix too.
fm, features = _dfs_on_app(verbose=0)

Running DFS...
Built 697 features
New Partitions: 36
Aggregation dict lentgth: 1
Aggregation dict lentgth: 5
Aggregation dict lentgth: 20
Aggregation dict lentgth: 6
Aggregation dict lentgth: 15
Aggregation dict lentgth: 13
Aggregation dict lentgth: 23
Aggregation dict lentgth: 38
Aggregation dict lentgth: 24
Aggregation dict lentgth: 111
Aggregation dict to large - splitting


  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


CPU times: user 3.41 s, sys: 269 ms, total: 3.68 s
Wall time: 3.64 s


In [33]:
# %%time
# print("Output feature matrix to CSV...")
# fm.to_csv("dask-test-tmp/test_dask-*.csv")

In [34]:
%%time
print("Computing feature matrix...")
# Materialise the feature matrix, then report its shape and memory footprint.
fm_computed = fm.compute()
fm_shape = fm_computed.shape
fm_megabytes = fm_computed.memory_usage().sum() / 1000000
print("Shape: {}".format(fm_shape))
print("Memory: {} MB".format(fm_megabytes))

Computing feature matrix...
Shape: (356255, 698)
Memory: 1992.17796 MB
CPU times: user 34.6 s, sys: 7.65 s, total: 42.3 s
Wall time: 2min 38s


In [35]:
# Compare the computed feature matrix with the input application table:
# the partition multiplier, the column growth and the memory growth.
partition_ratio = math.ceil(len(features) / len(es['app'].df.columns))
column_ratio = len(fm_computed.columns) / len(app.columns)
fm_bytes = fm_computed.memory_usage().sum()
app_bytes = app.compute().memory_usage().sum()
print("Partition Ratio:", partition_ratio)
print("Column Ratio:", column_ratio)
print("Memory Ratio:", fm_bytes / app_bytes)

Partition Ratio: 6
Column Ratio: 5.721311475409836
Memory Ratio: 5.682926829268292


In [36]:
# Close the Dask distributed client that was created earlier in the notebook.
client.close()

In [37]:
# Load the reference feature matrix from CSV; the cells below compare it with fm_computed.
# NOTE(review): presumably produced by an earlier, non-Dask run of the same pipeline -- confirm.
notebook_fm = pd.read_csv('home_credit_fm_notebook_agg_v1.csv')

In [38]:
# Report shape and memory footprint of the reference feature matrix.
reference_shape = notebook_fm.shape
reference_megabytes = notebook_fm.memory_usage().sum() / 1000000
print("Shape: {}".format(reference_shape))
print("Memory: {} MB".format(reference_megabytes))

Shape: (356255, 698)
Memory: 1909.526928 MB


In [39]:
# Column labels of the Dask-computed feature matrix.
fm_cols = fm_computed.columns

In [40]:
# Column labels of the reference feature matrix loaded from CSV.
notebook_cols = notebook_fm.columns

In [41]:
# Reference columns that do not appear in the computed matrix (reference order kept).
# Only this direction is checked; extra columns in fm_computed are not reported.
missing = notebook_cols[~notebook_cols.isin(fm_cols)].tolist()
print("Columns missing: ", len(missing))

Columns missing:  0


In [42]:
# Bring both matrices into the same layout before comparing them: rows sorted
# by SK_ID_CURR, a fresh 0..n-1 index, and fm2's columns in fm1's order.
fm1 = (
    notebook_fm
    .sort_values(by='SK_ID_CURR')
    .reset_index(drop=True)
)
fm2 = (
    fm_computed
    .sort_values(by='SK_ID_CURR')
    .loc[:, fm1.columns]
    .reset_index(drop=True)
)

In [43]:
# Preview the first rows of the sorted reference matrix.
fm1.head()

Unnamed: 0,SK_ID_CURR,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,MIN(credit.previous.DAYS_TERMINATION),MIN(credit.previous.AMT_GOODS_PRICE),MIN(credit.previous.DAYS_FIRST_DUE),MIN(credit.previous.DAYS_DECISION),MIN(credit.previous.AMT_APPLICATION),MIN(credit.previous.AMT_ANNUITY),MIN(credit.previous.DAYS_LAST_DUE),MIN(credit.previous.SELLERPLACE_AREA),MIN(credit.previous.RATE_DOWN_PAYMENT),MIN(credit.previous.NFLAG_INSURED_ON_APPROVAL)
0,0,53595.0,715095.0,675000.0,90000.0,0.0,0.0,1.0,0.0,0.0,...,,,,,,,,,,
1,100001,20560.5,568800.0,450000.0,135000.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,100002,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,100003,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,100004,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [44]:
# Preview the first rows of the sorted, column-aligned computed matrix.
fm2.head()

Unnamed: 0,SK_ID_CURR,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,MIN(credit.previous.DAYS_TERMINATION),MIN(credit.previous.AMT_GOODS_PRICE),MIN(credit.previous.DAYS_FIRST_DUE),MIN(credit.previous.DAYS_DECISION),MIN(credit.previous.AMT_APPLICATION),MIN(credit.previous.AMT_ANNUITY),MIN(credit.previous.DAYS_LAST_DUE),MIN(credit.previous.SELLERPLACE_AREA),MIN(credit.previous.RATE_DOWN_PAYMENT),MIN(credit.previous.NFLAG_INSURED_ON_APPROVAL)
0,0,53595.0,715095.0,675000.0,90000.0,0.0,0.0,1.0,0.0,0.0,...,,,,,,,,,,
1,100001,20560.5,568800.0,450000.0,135000.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,100002,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,100003,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,100004,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [45]:
# Cast every fm2 column whose dtype differs from the reference to the
# reference dtype, so that later comparisons are not tripped up by dtype alone.
for col in tqdm(fm1.columns):
    reference_dtype = fm1[col].dtype
    if reference_dtype == fm2[col].dtype:
        continue
    fm2[col] = fm2[col].astype(reference_dtype)

100%|██████████| 698/698 [00:36<00:00, 19.05it/s]


In [46]:
# pd.testing.assert_frame_equal(fm1, fm2)

In [None]:
# Compare the two matrices column by column. Each column that differs is
# printed and also collected in `mismatched_cols` for later inspection.
mismatched_cols = []
for col in tqdm(fm1.columns):
    try:
        pd.testing.assert_series_equal(fm1[col], fm2[col])
    except AssertionError:
        # Only a failed comparison counts as a mismatch. A bare `except:` would
        # also swallow KeyboardInterrupt, making this long loop impossible to
        # interrupt cleanly, and would hide unrelated errors.
        print(col)
        mismatched_cols.append(col)

 23%|██▎       | 162/698 [01:36<11:40,  1.31s/it]

SUM(previous.DAYS_LAST_DUE_1ST_VERSION)


 23%|██▎       | 163/698 [01:54<55:42,  6.25s/it]

SUM(previous.DAYS_DECISION)


 23%|██▎       | 164/698 [01:55<40:42,  4.57s/it]

SUM(previous.RATE_INTEREST_PRIVILEGED)


 24%|██▎       | 165/698 [02:13<1:16:08,  8.57s/it]

SUM(previous.AMT_GOODS_PRICE)


 24%|██▍       | 166/698 [02:22<1:18:40,  8.87s/it]

SUM(previous.NFLAG_INSURED_ON_APPROVAL)


 24%|██▍       | 167/698 [02:26<1:04:39,  7.31s/it]

SUM(previous.DAYS_FIRST_DRAWING)


 24%|██▍       | 168/698 [02:45<1:35:06, 10.77s/it]

SUM(previous.AMT_ANNUITY)


 24%|██▍       | 169/698 [03:03<1:53:36, 12.89s/it]

SUM(previous.DAYS_FIRST_DUE)


 24%|██▍       | 170/698 [03:21<2:07:49, 14.53s/it]

SUM(previous.AMT_APPLICATION)


 24%|██▍       | 171/698 [03:38<2:13:44, 15.23s/it]

SUM(previous.DAYS_LAST_DUE)


 25%|██▍       | 172/698 [03:56<2:22:12, 16.22s/it]

SUM(previous.CNT_PAYMENT)


 25%|██▍       | 173/698 [04:15<2:27:48, 16.89s/it]

SUM(previous.AMT_CREDIT)


 25%|██▍       | 174/698 [04:19<1:53:52, 13.04s/it]

SUM(previous.HOUR_APPR_PROCESS_START)


 25%|██▌       | 175/698 [04:31<1:51:35, 12.80s/it]

SUM(previous.AMT_DOWN_PAYMENT)


 25%|██▌       | 176/698 [04:49<2:05:38, 14.44s/it]

SUM(previous.SELLERPLACE_AREA)


 25%|██▌       | 177/698 [05:07<2:12:31, 15.26s/it]

SUM(previous.DAYS_TERMINATION)


 26%|██▌       | 178/698 [05:07<1:34:25, 10.89s/it]

SUM(previous.RATE_INTEREST_PRIMARY)


 26%|██▌       | 179/698 [05:15<1:24:44,  9.80s/it]

SUM(previous.RATE_DOWN_PAYMENT)


In [None]:
# Close the Dask distributed client.
# NOTE(review): an earlier cell already calls client.close() and no new Client
# is created in between, so this call looks redundant -- confirm before removing.
client.close()