In this notebook we develop functions to perform deep feature synthesis (DFS) from an existing entityset. The code reads in the data tables, builds the entityset, creates the seed features and the interesting values, and then runs deep feature synthesis on the entityset. This code is specific to the Home Credit competition (for now).

In [1]:
import featuretools as ft
import featuretools.variable_types as vtypes

import pandas as pd
import numpy as np

import sys

In [8]:
def _boolean_variable_types(frame):
    """Map every float column of `frame` with exactly two unique values to `vtypes.Boolean`."""
    return {col: vtypes.Boolean for col in frame
            if (frame[col].nunique() == 2) and (frame[col].dtype == float)}


def _build_seed_features(es):
    """Build the hand-crafted seed features on the child entities of `es`."""
    # Bureau: any days overdue, and annuity relative to the credit sum
    credit_overdue = ft.Feature(es['bureau']['CREDIT_DAY_OVERDUE']) > 0.0
    credit_overdue = credit_overdue.rename('CREDIT_OVERDUE')

    credit_loan_rate = (ft.Feature(es['bureau']['AMT_ANNUITY']) /
                        ft.Feature(es['bureau']['AMT_CREDIT_SUM']))
    credit_loan_rate = credit_loan_rate.rename('PREVIOUS_OTHER_LOAN_RATE')

    # Bureau balance: monthly status is one of the codes '1'..'5'
    balance_past_due = ft.Feature(es['bureau_balance']['STATUS']).isin(['1', '2', '3', '4', '5'])
    balance_past_due = balance_past_due.rename('PREVIOUS_OTHER_MONTHLY_PAST_DUE')

    # Previous applications: amount applied for minus credit, and annuity / credit
    previous_difference = (ft.Feature(es['previous']['AMT_APPLICATION']) -
                           ft.Feature(es['previous']['AMT_CREDIT']))
    previous_difference = previous_difference.rename('PREVIOUS_APPLICATION_RECEIVED_DIFFERENCE')

    previous_loan_rate = (ft.Feature(es['previous']['AMT_ANNUITY']) /
                          ft.Feature(es['previous']['AMT_CREDIT']))
    previous_loan_rate = previous_loan_rate.rename('PREVIOUS_LOAN_RATE')

    # Credit card and cash balances: SK_DPD above zero
    credit_card_past_due = ft.Feature(es['credit']['SK_DPD']) > 0.0
    credit_card_past_due = credit_card_past_due.rename('CREDIT_CARD_PAST_DUE')

    cash_past_due = ft.Feature(es['cash']['SK_DPD']) > 0.0
    cash_past_due = cash_past_due.rename('CASH_PAST_DUE')

    # Installments: DAYS_ENTRY_PAYMENT greater than DAYS_INSTALMENT, and
    # AMT_PAYMENT lower than AMT_INSTALMENT
    installments_late = (ft.Feature(es['installments']['DAYS_ENTRY_PAYMENT']) >
                         ft.Feature(es['installments']['DAYS_INSTALMENT']))
    installments_late = installments_late.rename('INSTALLMENT_LATE')

    installments_low_payment = (ft.Feature(es['installments']['AMT_PAYMENT']) <
                                ft.Feature(es['installments']['AMT_INSTALMENT']))
    installments_low_payment = installments_low_payment.rename('INSTALLMENT_LOW')

    return [installments_low_payment, installments_late,
            cash_past_due, credit_card_past_due,
            previous_difference, previous_loan_rate,
            balance_past_due, credit_loan_rate, credit_overdue]


def entityset_from_filepath(path, return_seed_features=False):
    """Build the Home Credit `clients` entityset from the csv files in `path`.

    Reads app.csv, bureau.csv, bureau_balance.csv, previous.csv, credit.csv,
    installments.csv and cash.csv from the directory `path`, casts the id
    columns to integers, adds ratio/summary columns to `app`, registers the
    seven entities and their relationships, and sets the interesting values.

    Args:
        path: directory containing the seven csv files (no trailing slash
            is needed; the file name is appended after a '/').
        return_seed_features: when True, also build and return the list of
            hand-crafted seed features. Defaults to False, in which case only
            the entityset is returned and the seed features are not built.

    Returns:
        The entityset, or the tuple `(entityset, seed_features)` when
        `return_seed_features` is True.
    """
    app, bureau, bureau_balance, previous, credit, installments, cash = [
        pd.read_csv('%s/%s.csv' % (path, name))
        for name in ('app', 'bureau', 'bureau_balance', 'previous',
                     'credit', 'installments', 'cash')]

    # All ids should be integers. Missing ids are filled with 0 before the
    # cast (not sure why values are missing).
    # NOTE(review): every row with a missing id ends up sharing id 0 - confirm
    # this cannot collide on an entity index.
    for dataset in (app, bureau, bureau_balance, cash, credit, previous, installments):
        for index in ('SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_BUREAU'):
            if index in dataset.columns:
                dataset[index] = dataset[index].fillna(0).astype(np.int64)

    # Boolean variables are detected before the derived columns are added.
    # `TARGET` must not be typed as Boolean, so drop it if it was picked up.
    app_types = _boolean_variable_types(app)
    app_types.pop('TARGET', None)

    previous_types = _boolean_variable_types(previous)

    es = ft.EntitySet(id='clients')

    # Derived columns on the main application table
    ext_sources = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
    app['LOAN_RATE'] = app['AMT_ANNUITY'] / app['AMT_CREDIT']
    app['CREDIT_INCOME_RATIO'] = app['AMT_CREDIT'] / app['AMT_INCOME_TOTAL']
    app['EMPLOYED_BIRTH_RATIO'] = app['DAYS_EMPLOYED'] / app['DAYS_BIRTH']
    app['EXT_SOURCE_SUM'] = app[ext_sources].sum(axis=1)
    app['EXT_SOURCE_MEAN'] = app[ext_sources].mean(axis=1)
    app['AMT_REQ_SUM'] = app[[x for x in app.columns if 'AMT_REQ_' in x]].sum(axis=1)

    # Entities with a unique index
    es = es.entity_from_dataframe(entity_id='app', dataframe=app, index='SK_ID_CURR',
                                  variable_types=app_types)

    es = es.entity_from_dataframe(entity_id='bureau', dataframe=bureau, index='SK_ID_BUREAU')

    es = es.entity_from_dataframe(entity_id='previous', dataframe=previous, index='SK_ID_PREV',
                                  variable_types=previous_types)

    # Entities that do not have a unique index: let featuretools make one
    es = es.entity_from_dataframe(entity_id='bureau_balance', dataframe=bureau_balance,
                                  make_index=True, index='bureaubalance_index')

    es = es.entity_from_dataframe(entity_id='cash', dataframe=cash,
                                  make_index=True, index='cash_index')

    es = es.entity_from_dataframe(entity_id='installments', dataframe=installments,
                                  make_index=True, index='installments_index')

    es = es.entity_from_dataframe(entity_id='credit', dataframe=credit,
                                  make_index=True, index='credit_index')

    # Relationship between app and bureau
    r_app_bureau = ft.Relationship(es['app']['SK_ID_CURR'], es['bureau']['SK_ID_CURR'])

    # Relationship between bureau and bureau balance
    r_bureau_balance = ft.Relationship(es['bureau']['SK_ID_BUREAU'],
                                       es['bureau_balance']['SK_ID_BUREAU'])

    # Relationship between current app and previous apps
    r_app_previous = ft.Relationship(es['app']['SK_ID_CURR'], es['previous']['SK_ID_CURR'])

    # Relationships between previous apps and cash, installments, and credit
    r_previous_cash = ft.Relationship(es['previous']['SK_ID_PREV'], es['cash']['SK_ID_PREV'])
    r_previous_installments = ft.Relationship(es['previous']['SK_ID_PREV'],
                                              es['installments']['SK_ID_PREV'])
    r_previous_credit = ft.Relationship(es['previous']['SK_ID_PREV'], es['credit']['SK_ID_PREV'])

    # Add in the defined relationships
    es = es.add_relationships([r_app_bureau, r_bureau_balance, r_app_previous,
                               r_previous_cash, r_previous_installments, r_previous_credit])

    # Interesting values
    es['bureau']['CREDIT_ACTIVE'].interesting_values = ['Active', 'Closed']
    es['previous']['NAME_CONTRACT_STATUS'].interesting_values = ['Approved', 'Refused']
    es['credit']['NAME_CONTRACT_STATUS'].interesting_values = ['Active', 'Completed']
    es['cash']['NAME_CONTRACT_STATUS'].interesting_values = ['Active', 'Completed']

    # Seed features are only built when the caller asks for them
    if return_seed_features:
        return es, _build_seed_features(es)

    return es

In [4]:
def feature_names_from_entityset(es, seed_features, agg_primitives = None, 
                       trans_primitives = None, where_primitives = None):
    
    """Run deep feature synthesis on an entityset and return the feature definitions.
    
    Specific to the Home Credit Competition: the target entity is 'app'.
    
    Args:
        es: entityset containing an 'app' entity.
        seed_features: seed features passed through to `ft.dfs`.
        agg_primitives: aggregation primitives; None selects the default list below.
        trans_primitives: transform primitives; None selects the default list below.
        where_primitives: where primitives; None selects the default list below.
    
    An explicitly passed empty list is forwarded unchanged (it is not replaced
    by the defaults), so a family of primitives can be switched off with `[]`.
    
    Returns:
        Whatever `ft.dfs` returns when called with `features_only = True`
        (the feature definitions; no feature matrix is requested).
    """
    
    # Only fall back to the defaults when the argument was left out entirely
    if agg_primitives is None:
        agg_primitives =  ["sum", "max", "min", "mean", "count", "percent_true", "num_unique", "mode"]
        
    if trans_primitives is None:
        trans_primitives = ['percentile', 'and']
    
    if where_primitives is None:
        where_primitives = ['percent_true', 'mean', 'sum']
    
    # Deep feature synthesis with domain knowledge (only features)
    feature_names = ft.dfs(entityset=es, target_entity='app',
                           agg_primitives = agg_primitives,
                           trans_primitives = trans_primitives,
                           seed_features = seed_features,
                           where_primitives = where_primitives,
                           n_jobs = 1, verbose = 1, features_only = True,
                           max_depth = 2)
    
    return feature_names

In [5]:
def feature_matrix_from_entityset(es, feature_names):
    """Calculate the feature matrix for `feature_names` on the entityset `es`.

    Thin wrapper around `ft.calculate_feature_matrix` that fixes
    `n_jobs = 1` and `verbose = 1`. Specific to the Home Credit Competition.
    """
    # Single job per call; the result of the calculation is returned as-is
    calculation_options = dict(entityset=es, n_jobs=1, verbose=1)
    return ft.calculate_feature_matrix(feature_names, **calculation_options)

In [12]:
# Load the feature definitions from disk. The same path is written by the
# `ft.save_features` cell further down (cell numbers show it was run earlier).
feature_names = ft.load_features('../../data/kaggle_home_credit/features.txt')

In [7]:
import dask
from dask import delayed
from dask.diagnostics import ProgressBar
from dask.distributed import Client, progress

# No scheduler address is passed to Client().
# NOTE(review): presumably this starts a local cluster and registers it as the
# default scheduler for later `.compute()` calls - confirm against the dask docs.
client = Client()  # use dask.distributed by default

In [21]:
import os
from timeit import default_timer as timer

In [None]:
# Build one delayed task chain per partition directory (entityset -> feature
# matrix), concatenate the per-partition matrices, and time the whole run.
start = timer()
base = "../../data/kaggle_home_credit/partitions/"
# Every entry of the partitions directory is treated as a partition path.
# NOTE(review): os.listdir order is arbitrary and includes any stray files -
# confirm the directory only holds partition folders.
paths = os.listdir(base)

fms = []
for p in paths:
    # `base` ends with '/', so plain concatenation yields the partition path
    es = delayed(entityset_from_filepath)(base+p)
    fm = delayed(feature_matrix_from_entityset)(es, feature_names)
    fms.append(fm)


# dask.config.set(scheduler='processes')
# Row-wise concatenation of all partition feature matrices (still lazy here)
out = delayed(pd.concat)(fms, axis=0)

# NOTE(review): ProgressBar comes from dask.diagnostics while a
# dask.distributed Client was created earlier; the bar may not report anything
# under the distributed scheduler - confirm, `dask.distributed.progress` is
# already imported as an alternative.
with ProgressBar():
    x = out.compute()

# Elapsed time; as the last expression of the cell it is displayed
timer() - start

In [None]:
# Write the combined feature matrix to the working directory, passing
# chunksize = 1000 to `to_csv`.
x.to_csv('feature_matrix.csv', chunksize = 1000)

In [11]:
# Persist the feature definitions so they can be reloaded with
# `ft.load_features` from the same path.
ft.save_features(feature_names, '../../data/kaggle_home_credit/features.txt')

In [None]:
# Display the delayed concatenation object (inspection only; nothing is computed)
out

In [None]:
%debug