From 8a30486a551fca759352277bda450d3b3094d8d6 Mon Sep 17 00:00:00 2001 From: Nate Parsons <4307001+thehomebrewnerd@users.noreply.github.com> Date: Tue, 12 May 2020 16:12:12 -0500 Subject: [PATCH] Updates for running home credit example with Dask (#953) * home credit tests * update home credit test * Improve column assignment for trans features * update home credit test * testing updates * home credit test updates * home credit notebook update * home credit test updates * home credit test updates * home credit test updates * home credit test updates * home credit updates * update feature_set_calculator.py * remove unnecessary repartitioning from notebook * update notebook text * update notebook to use os.path.join --- .../feature_set_calculator.py | 7 +- ...Home Credit Feature Matrix with Dask.ipynb | 611 ++++++++++++++++++ 2 files changed, 614 insertions(+), 4 deletions(-) create mode 100644 featuretools/dask-tests-tmp/Home Credit Feature Matrix with Dask.ipynb diff --git a/featuretools/computational_backends/feature_set_calculator.py b/featuretools/computational_backends/feature_set_calculator.py index b0fe2be7ee..6035ee644f 100644 --- a/featuretools/computational_backends/feature_set_calculator.py +++ b/featuretools/computational_backends/feature_set_calculator.py @@ -732,7 +732,8 @@ def last_n(df): # the column (in actuality grouping by both index and group would # work) if isinstance(base_frame, dd.core.DataFrame): - to_merge = base_frame.groupby(base_frame[groupby_var]).agg(to_agg) + to_merge = base_frame.groupby(groupby_var).agg(to_agg) + else: to_merge = base_frame.groupby(base_frame[groupby_var], observed=True, sort=False).agg(to_agg) @@ -747,9 +748,7 @@ def last_n(df): to_merge.index = to_merge.index.astype(object).astype(categories) if isinstance(frame, dd.core.DataFrame): - child_merge_var = to_merge.index.name - frame = dd.merge(left=frame, right=to_merge.reset_index(), - left_on=parent_merge_var, right_on=child_merge_var, how='left') + frame = frame.merge(to_merge, left_on=parent_merge_var, right_index=True, how='left') else: frame = pd.merge(left=frame, right=to_merge, left_index=True, right_index=True, how='left') diff --git a/featuretools/dask-tests-tmp/Home Credit Feature Matrix with Dask.ipynb b/featuretools/dask-tests-tmp/Home Credit Feature Matrix with Dask.ipynb new file mode 100644 index 0000000000..d5ca4537cb --- /dev/null +++ b/featuretools/dask-tests-tmp/Home Credit Feature Matrix with Dask.ipynb @@ -0,0 +1,611 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generating a Feature Matrix with Dask\n", + "\n", + "This notebook walks through an example of how to generate a feature matrix using Featuretools and an entityset created from Dask dataframes. This example uses the Home Credit Default Risk dataset which can be obtained from [Kaggle](https://www.kaggle.com/c/home-credit-default-risk/data).\n", + "\n", + "Before running this notebook, you should download the data and save all of the CSV files from the dataset into a directory called `data/homehome-credit-default-risk`. If you place the data in a different location you will need to update the code used to read in the CSV files so they can be found.\n", + "\n", + "Set the `version` variable in the following cell to one of the keys in the primitive dictionary that follows to select a set of primitives to use. This selection will also set the number of workers in the Dask client as well as the blocksize used to create the Dask dataframes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "version = \"v1\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dict: {version_key : ([trans_primitives], [agg_primtivies], num_workers, blocksize)}\n", + "primitive_dict = {\n", + " \"v1\": ([\"and\"], [\"sum\", \"max\"], 4, \"40MB\"), # 937 features\n", + " \"v2\": ([\"and\"], [\"sum\", \"max\", \"min\", \"mean\"], 1, \"100MB\"), # 1545 features\n", + " \"v3\": ([\"and\", \"add_numeric\", \"negate\"], [], 4, \"1MB\"), #5946 features\n", + " \"v4\": ([\"and\", \"negate\"], [\"sum\", \"max\", \"min\", \"mean\", \"count\", \"any\", \"all\"], 1, \"100MB\"), #2083 features\n", + "}\n", + "trans_primitives = primitive_dict[version][0]\n", + "agg_primitives = primitive_dict[version][1]\n", + "num_workers = primitive_dict[version][2]\n", + "blocksize = primitive_dict[version][3]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "import os\n", + "from datetime import datetime\n", + "\n", + "import dask.dataframe as dd\n", + "import numpy as np\n", + "import pandas as pd\n", + "from dask.distributed import Client\n", + "\n", + "import featuretools as ft\n", + "import featuretools.variable_types as vtypes\n", + "\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a Dask client with the correct number of workers for the primitives being used." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " client.close()\n", + "except:\n", + " pass\n", + "client = Client(n_workers=num_workers)\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create variable for input and output directories. If you have placed your data in a different location, update the values in the following cell accordingly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_dir = os.path.join(\"data\", \"home-credit-default-risk\")\n", + "output_dir = os.path.join(\"data\", \"home-credit-default-risk\", \"output\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read in the raw CSV data and store the data in Dask dataframes. The blocksize is set from the value contained in `primitive_dict` above. These values were found to work well on a MacBook Pro with 4 cores and 16GB of memory, but may need to be adjusted based on the available memory in your system." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "# Read in the datasets and replace the anomalous values\n", + "app_train = dd.read_csv(os.path.join(input_dir, \"application_train.csv\"), blocksize=blocksize).replace({365243: np.nan})\n", + "app_test = dd.read_csv(os.path.join(input_dir, \"application_test.csv\"), blocksize=blocksize).replace({365243: np.nan})\n", + "bureau = dd.read_csv(os.path.join(input_dir, \"bureau.csv\"), blocksize=blocksize).replace({365243: np.nan})\n", + "bureau_balance = dd.read_csv(os.path.join(input_dir, \"bureau_balance.csv\"), blocksize=blocksize).replace({365243: np.nan})\n", + "cash = dd.read_csv(os.path.join(input_dir, \"POS_CASH_balance.csv\"), blocksize=blocksize).replace({365243: np.nan})\n", + "credit = dd.read_csv(os.path.join(input_dir, \"credit_card_balance.csv\"), blocksize=blocksize).replace({365243: np.nan})\n", + "previous = dd.read_csv(os.path.join(input_dir, \"previous_application.csv\"), blocksize=blocksize).replace({365243: np.nan})\n", + "installments = dd.read_csv(os.path.join(input_dir, \"installments_payments.csv\"), blocksize=blocksize).replace({365243: np.nan})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform a few cleanup operations on the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "app_test['TARGET'] = np.nan\n", + "app = app_train.append(app_test[app_train.columns])\n", + "\n", + "for index in ['SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_BUREAU']:\n", + " for dataset in [app, bureau, bureau_balance, cash, credit, previous, installments]:\n", + " if index in list(dataset.columns):\n", + " dataset[index] = dataset[index].fillna(0).astype(np.int64)\n", + "\n", + "installments = installments.drop(columns=['SK_ID_CURR'])\n", + "credit = credit.drop(columns=['SK_ID_CURR'])\n", + "cash = cash.drop(columns=['SK_ID_CURR'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The current implementation of Dask entities does not support inferring variable types. As a result, the user must specify the proper Featuretools variable types for all of the columns in the dataframes used to create the entities. The following cell sets the appropriate datatypes for our entities." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "app_vtypes = {\n", + " 'SK_ID_CURR': ft.variable_types.variable.Index,\n", + " 'AMT_ANNUITY': ft.variable_types.variable.Numeric,\n", + " 'AMT_CREDIT': ft.variable_types.variable.Numeric,\n", + " 'AMT_GOODS_PRICE': ft.variable_types.variable.Numeric,\n", + " 'AMT_INCOME_TOTAL': ft.variable_types.variable.Numeric,\n", + " 'AMT_REQ_CREDIT_BUREAU_DAY': ft.variable_types.variable.Numeric,\n", + " 'AMT_REQ_CREDIT_BUREAU_HOUR': ft.variable_types.variable.Numeric,\n", + " 'AMT_REQ_CREDIT_BUREAU_MON': ft.variable_types.variable.Numeric,\n", + " 'AMT_REQ_CREDIT_BUREAU_QRT': ft.variable_types.variable.Numeric,\n", + " 'AMT_REQ_CREDIT_BUREAU_WEEK': ft.variable_types.variable.Numeric,\n", + " 'AMT_REQ_CREDIT_BUREAU_YEAR': ft.variable_types.variable.Numeric,\n", + " 'APARTMENTS_AVG': ft.variable_types.variable.Numeric,\n", + " 'APARTMENTS_MEDI': ft.variable_types.variable.Numeric,\n", + " 'APARTMENTS_MODE': ft.variable_types.variable.Numeric,\n", + " 'BASEMENTAREA_AVG': ft.variable_types.variable.Numeric,\n", + " 'BASEMENTAREA_MEDI': ft.variable_types.variable.Numeric,\n", + " 'BASEMENTAREA_MODE': ft.variable_types.variable.Numeric,\n", + " 'CNT_CHILDREN': ft.variable_types.variable.Numeric,\n", + " 'CNT_FAM_MEMBERS': ft.variable_types.variable.Numeric,\n", + " 'CODE_GENDER': ft.variable_types.variable.Categorical,\n", + " 'COMMONAREA_AVG': ft.variable_types.variable.Numeric,\n", + " 'COMMONAREA_MEDI': ft.variable_types.variable.Numeric,\n", + " 'COMMONAREA_MODE': ft.variable_types.variable.Numeric,\n", + " 'DAYS_BIRTH': ft.variable_types.variable.Numeric,\n", + " 'DAYS_EMPLOYED': ft.variable_types.variable.Numeric,\n", + " 'DAYS_ID_PUBLISH': ft.variable_types.variable.Numeric,\n", + " 'DAYS_LAST_PHONE_CHANGE': ft.variable_types.variable.Numeric,\n", + " 'DAYS_REGISTRATION': ft.variable_types.variable.Numeric,\n", + " 'DEF_30_CNT_SOCIAL_CIRCLE': ft.variable_types.variable.Numeric,\n", + " 'DEF_60_CNT_SOCIAL_CIRCLE': ft.variable_types.variable.Numeric,\n", + " 'ELEVATORS_AVG': ft.variable_types.variable.Numeric,\n", + " 'ELEVATORS_MEDI': ft.variable_types.variable.Numeric,\n", + " 'ELEVATORS_MODE': ft.variable_types.variable.Numeric,\n", + " 'EMERGENCYSTATE_MODE': ft.variable_types.variable.Categorical,\n", + " 'ENTRANCES_AVG': ft.variable_types.variable.Numeric,\n", + " 'ENTRANCES_MEDI': ft.variable_types.variable.Numeric,\n", + " 'ENTRANCES_MODE': ft.variable_types.variable.Numeric,\n", + " 'EXT_SOURCE_1': ft.variable_types.variable.Numeric,\n", + " 'EXT_SOURCE_2': ft.variable_types.variable.Numeric,\n", + " 'EXT_SOURCE_3': ft.variable_types.variable.Numeric,\n", + " 'FLAG_CONT_MOBILE': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_10': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_11': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_12': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_13': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_14': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_15': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_16': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_17': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_18': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_19': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_2': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_20': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_21': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_3': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_4': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_5': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_6': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_7': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_8': ft.variable_types.variable.Boolean,\n", + " 'FLAG_DOCUMENT_9': ft.variable_types.variable.Boolean,\n", + " 'FLAG_EMAIL': ft.variable_types.variable.Boolean,\n", + " 'FLAG_EMP_PHONE': ft.variable_types.variable.Boolean,\n", + " 'FLAG_MOBIL': ft.variable_types.variable.Boolean,\n", + " 'FLAG_OWN_CAR': ft.variable_types.variable.Categorical,\n", + " 'FLAG_OWN_REALTY': ft.variable_types.variable.Categorical,\n", + " 'FLAG_PHONE': ft.variable_types.variable.Boolean,\n", + " 'FLAG_WORK_PHONE': ft.variable_types.variable.Boolean,\n", + " 'FLOORSMAX_AVG': ft.variable_types.variable.Numeric,\n", + " 'FLOORSMAX_MEDI': ft.variable_types.variable.Numeric,\n", + " 'FLOORSMAX_MODE': ft.variable_types.variable.Numeric,\n", + " 'FLOORSMIN_AVG': ft.variable_types.variable.Numeric,\n", + " 'FLOORSMIN_MEDI': ft.variable_types.variable.Numeric,\n", + " 'FLOORSMIN_MODE': ft.variable_types.variable.Numeric,\n", + " 'FONDKAPREMONT_MODE': ft.variable_types.variable.Categorical,\n", + " 'HOUR_APPR_PROCESS_START': ft.variable_types.variable.Numeric,\n", + " 'HOUSETYPE_MODE': ft.variable_types.variable.Categorical,\n", + " 'LANDAREA_AVG': ft.variable_types.variable.Numeric,\n", + " 'LANDAREA_MEDI': ft.variable_types.variable.Numeric,\n", + " 'LANDAREA_MODE': ft.variable_types.variable.Numeric,\n", + " 'LIVE_CITY_NOT_WORK_CITY': ft.variable_types.variable.Boolean,\n", + " 'LIVE_REGION_NOT_WORK_REGION': ft.variable_types.variable.Boolean,\n", + " 'LIVINGAPARTMENTS_AVG': ft.variable_types.variable.Numeric,\n", + " 'LIVINGAPARTMENTS_MEDI': ft.variable_types.variable.Numeric,\n", + " 'LIVINGAPARTMENTS_MODE': ft.variable_types.variable.Numeric,\n", + " 'LIVINGAREA_AVG': ft.variable_types.variable.Numeric,\n", + " 'LIVINGAREA_MEDI': ft.variable_types.variable.Numeric,\n", + " 'LIVINGAREA_MODE': ft.variable_types.variable.Numeric,\n", + " 'NAME_CONTRACT_TYPE': ft.variable_types.variable.Categorical,\n", + " 'NAME_EDUCATION_TYPE': ft.variable_types.variable.Categorical,\n", + " 'NAME_FAMILY_STATUS': ft.variable_types.variable.Categorical,\n", + " 'NAME_HOUSING_TYPE': ft.variable_types.variable.Categorical,\n", + " 'NAME_INCOME_TYPE': ft.variable_types.variable.Categorical,\n", + " 'NAME_TYPE_SUITE': ft.variable_types.variable.Categorical,\n", + " 'NONLIVINGAPARTMENTS_AVG': ft.variable_types.variable.Numeric,\n", + " 'NONLIVINGAPARTMENTS_MEDI': ft.variable_types.variable.Numeric,\n", + " 'NONLIVINGAPARTMENTS_MODE': ft.variable_types.variable.Numeric,\n", + " 'NONLIVINGAREA_AVG': ft.variable_types.variable.Numeric,\n", + " 'NONLIVINGAREA_MEDI': ft.variable_types.variable.Numeric,\n", + " 'NONLIVINGAREA_MODE': ft.variable_types.variable.Numeric,\n", + " 'OBS_30_CNT_SOCIAL_CIRCLE': ft.variable_types.variable.Numeric,\n", + " 'OBS_60_CNT_SOCIAL_CIRCLE': ft.variable_types.variable.Numeric,\n", + " 'OCCUPATION_TYPE': ft.variable_types.variable.Categorical,\n", + " 'ORGANIZATION_TYPE': ft.variable_types.variable.Categorical,\n", + " 'OWN_CAR_AGE': ft.variable_types.variable.Numeric,\n", + " 'REGION_POPULATION_RELATIVE': ft.variable_types.variable.Numeric,\n", + " 'REGION_RATING_CLIENT': ft.variable_types.variable.Numeric,\n", + " 'REGION_RATING_CLIENT_W_CITY': ft.variable_types.variable.Numeric,\n", + " 'REG_CITY_NOT_LIVE_CITY': ft.variable_types.variable.Boolean,\n", + " 'REG_CITY_NOT_WORK_CITY': ft.variable_types.variable.Boolean,\n", + " 'REG_REGION_NOT_LIVE_REGION': ft.variable_types.variable.Boolean,\n", + " 'REG_REGION_NOT_WORK_REGION': ft.variable_types.variable.Boolean,\n", + " 'TARGET': ft.variable_types.variable.Numeric,\n", + " 'TOTALAREA_MODE': ft.variable_types.variable.Numeric,\n", + " 'WALLSMATERIAL_MODE': ft.variable_types.variable.Categorical,\n", + " 'WEEKDAY_APPR_PROCESS_START': ft.variable_types.variable.Categorical,\n", + " 'YEARS_BEGINEXPLUATATION_AVG': ft.variable_types.variable.Numeric,\n", + " 'YEARS_BEGINEXPLUATATION_MEDI': ft.variable_types.variable.Numeric,\n", + " 'YEARS_BEGINEXPLUATATION_MODE': ft.variable_types.variable.Numeric,\n", + " 'YEARS_BUILD_AVG': ft.variable_types.variable.Numeric,\n", + " 'YEARS_BUILD_MEDI': ft.variable_types.variable.Numeric,\n", + " 'YEARS_BUILD_MODE': ft.variable_types.variable.Numeric\n", + "}\n", + "\n", + "bureau_vtypes = {\n", + " 'SK_ID_BUREAU': ft.variable_types.variable.Index,\n", + " 'SK_ID_CURR': ft.variable_types.variable.Id,\n", + " 'CREDIT_ACTIVE': ft.variable_types.variable.Categorical,\n", + " 'CREDIT_CURRENCY': ft.variable_types.variable.Categorical,\n", + " 'DAYS_CREDIT': ft.variable_types.variable.Numeric,\n", + " 'CREDIT_DAY_OVERDUE': ft.variable_types.variable.Numeric,\n", + " 'DAYS_CREDIT_ENDDATE': ft.variable_types.variable.Numeric,\n", + " 'DAYS_ENDDATE_FACT': ft.variable_types.variable.Numeric,\n", + " 'AMT_CREDIT_MAX_OVERDUE': ft.variable_types.variable.Numeric,\n", + " 'CNT_CREDIT_PROLONG': ft.variable_types.variable.Numeric,\n", + " 'AMT_CREDIT_SUM': ft.variable_types.variable.Numeric,\n", + " 'AMT_CREDIT_SUM_DEBT': ft.variable_types.variable.Numeric,\n", + " 'AMT_CREDIT_SUM_LIMIT': ft.variable_types.variable.Numeric,\n", + " 'AMT_CREDIT_SUM_OVERDUE': ft.variable_types.variable.Numeric,\n", + " 'CREDIT_TYPE': ft.variable_types.variable.Categorical,\n", + " 'DAYS_CREDIT_UPDATE': ft.variable_types.variable.Numeric,\n", + " 'AMT_ANNUITY': ft.variable_types.variable.Numeric\n", + "}\n", + "\n", + "previous_vtypes = {\n", + " 'SK_ID_PREV': ft.variable_types.variable.Index,\n", + " 'SK_ID_CURR': ft.variable_types.variable.Id,\n", + " 'NAME_CONTRACT_TYPE': ft.variable_types.variable.Categorical,\n", + " 'AMT_ANNUITY': ft.variable_types.variable.Numeric,\n", + " 'AMT_APPLICATION': ft.variable_types.variable.Numeric,\n", + " 'AMT_CREDIT': ft.variable_types.variable.Numeric,\n", + " 'AMT_DOWN_PAYMENT': ft.variable_types.variable.Numeric,\n", + " 'AMT_GOODS_PRICE': ft.variable_types.variable.Numeric,\n", + " 'WEEKDAY_APPR_PROCESS_START': ft.variable_types.variable.Categorical,\n", + " 'HOUR_APPR_PROCESS_START': ft.variable_types.variable.Numeric,\n", + " 'FLAG_LAST_APPL_PER_CONTRACT': ft.variable_types.variable.Categorical,\n", + " 'NFLAG_LAST_APPL_IN_DAY': ft.variable_types.variable.Boolean,\n", + " 'RATE_DOWN_PAYMENT': ft.variable_types.variable.Numeric,\n", + " 'RATE_INTEREST_PRIMARY': ft.variable_types.variable.Numeric,\n", + " 'RATE_INTEREST_PRIVILEGED': ft.variable_types.variable.Numeric,\n", + " 'NAME_CASH_LOAN_PURPOSE': ft.variable_types.variable.Categorical,\n", + " 'NAME_CONTRACT_STATUS': ft.variable_types.variable.Categorical,\n", + " 'DAYS_DECISION': ft.variable_types.variable.Numeric,\n", + " 'NAME_PAYMENT_TYPE': ft.variable_types.variable.Categorical,\n", + " 'CODE_REJECT_REASON': ft.variable_types.variable.Categorical,\n", + " 'NAME_TYPE_SUITE': ft.variable_types.variable.Categorical,\n", + " 'NAME_CLIENT_TYPE': ft.variable_types.variable.Categorical,\n", + " 'NAME_GOODS_CATEGORY': ft.variable_types.variable.Categorical,\n", + " 'NAME_PORTFOLIO': ft.variable_types.variable.Categorical,\n", + " 'NAME_PRODUCT_TYPE': ft.variable_types.variable.Categorical,\n", + " 'CHANNEL_TYPE': ft.variable_types.variable.Categorical,\n", + " 'SELLERPLACE_AREA': ft.variable_types.variable.Numeric,\n", + " 'NAME_SELLER_INDUSTRY': ft.variable_types.variable.Categorical,\n", + " 'CNT_PAYMENT': ft.variable_types.variable.Numeric,\n", + " 'NAME_YIELD_GROUP': ft.variable_types.variable.Categorical,\n", + " 'PRODUCT_COMBINATION': ft.variable_types.variable.Categorical,\n", + " 'DAYS_FIRST_DRAWING': ft.variable_types.variable.Numeric,\n", + " 'DAYS_FIRST_DUE': ft.variable_types.variable.Numeric,\n", + " 'DAYS_LAST_DUE_1ST_VERSION': ft.variable_types.variable.Numeric,\n", + " 'DAYS_LAST_DUE': ft.variable_types.variable.Numeric,\n", + " 'DAYS_TERMINATION': ft.variable_types.variable.Numeric,\n", + " 'NFLAG_INSURED_ON_APPROVAL': ft.variable_types.variable.Numeric\n", + "}\n", + "\n", + "bureau_balance_vtypes = {\n", + " 'bureaubalance_index': ft.variable_types.variable.Index,\n", + " 'SK_ID_BUREAU': ft.variable_types.variable.Id,\n", + " 'MONTHS_BALANCE': ft.variable_types.variable.Numeric,\n", + " 'STATUS': ft.variable_types.variable.Categorical\n", + "}\n", + "\n", + "cash_vtypes = {\n", + " 'cash_index': ft.variable_types.variable.Index,\n", + " 'SK_ID_PREV': ft.variable_types.variable.Id,\n", + " 'MONTHS_BALANCE': ft.variable_types.variable.Numeric,\n", + " 'CNT_INSTALMENT': ft.variable_types.variable.Numeric,\n", + " 'CNT_INSTALMENT_FUTURE': ft.variable_types.variable.Numeric,\n", + " 'NAME_CONTRACT_STATUS': ft.variable_types.variable.Categorical,\n", + " 'SK_DPD': ft.variable_types.variable.Numeric,\n", + " 'SK_DPD_DEF': ft.variable_types.variable.Numeric\n", + "}\n", + "\n", + "installments_vtypes = {\n", + " 'installments_index': ft.variable_types.variable.Index,\n", + " 'SK_ID_PREV': ft.variable_types.variable.Id,\n", + " 'NUM_INSTALMENT_VERSION': ft.variable_types.variable.Numeric,\n", + " 'NUM_INSTALMENT_NUMBER': ft.variable_types.variable.Numeric,\n", + " 'DAYS_INSTALMENT': ft.variable_types.variable.Numeric,\n", + " 'DAYS_ENTRY_PAYMENT': ft.variable_types.variable.Numeric,\n", + " 'AMT_INSTALMENT': ft.variable_types.variable.Numeric,\n", + " 'AMT_PAYMENT': ft.variable_types.variable.Numeric\n", + "}\n", + "\n", + "credit_vtypes = {\n", + " 'credit_index': ft.variable_types.variable.Index,\n", + " 'SK_ID_PREV': ft.variable_types.variable.Id,\n", + " 'MONTHS_BALANCE': ft.variable_types.variable.Numeric,\n", + " 'AMT_BALANCE': ft.variable_types.variable.Numeric,\n", + " 'AMT_CREDIT_LIMIT_ACTUAL': ft.variable_types.variable.Numeric,\n", + " 'AMT_DRAWINGS_ATM_CURRENT': ft.variable_types.variable.Numeric,\n", + " 'AMT_DRAWINGS_CURRENT': ft.variable_types.variable.Numeric,\n", + " 'AMT_DRAWINGS_OTHER_CURRENT': ft.variable_types.variable.Numeric,\n", + " 'AMT_DRAWINGS_POS_CURRENT': ft.variable_types.variable.Numeric,\n", + " 'AMT_INST_MIN_REGULARITY': ft.variable_types.variable.Numeric,\n", + " 'AMT_PAYMENT_CURRENT': ft.variable_types.variable.Numeric,\n", + " 'AMT_PAYMENT_TOTAL_CURRENT': ft.variable_types.variable.Numeric,\n", + " 'AMT_RECEIVABLE_PRINCIPAL': ft.variable_types.variable.Numeric,\n", + " 'AMT_RECIVABLE': ft.variable_types.variable.Numeric,\n", + " 'AMT_TOTAL_RECEIVABLE': ft.variable_types.variable.Numeric,\n", + " 'CNT_DRAWINGS_ATM_CURRENT': ft.variable_types.variable.Numeric,\n", + " 'CNT_DRAWINGS_CURRENT': ft.variable_types.variable.Numeric,\n", + " 'CNT_DRAWINGS_OTHER_CURRENT': ft.variable_types.variable.Numeric,\n", + " 'CNT_DRAWINGS_POS_CURRENT': ft.variable_types.variable.Numeric,\n", + " 'CNT_INSTALMENT_MATURE_CUM': ft.variable_types.variable.Numeric,\n", + " 'NAME_CONTRACT_STATUS': ft.variable_types.variable.Categorical,\n", + " 'SK_DPD': ft.variable_types.variable.Numeric,\n", + " 'SK_DPD_DEF': ft.variable_types.variable.Numeric\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we will create the entityset using the previously created Dask dataframes. The process for creating entities and entitysets from Dask dataframes is the same as the process of creating entities and entitysets from pandas dataframes, with the one exception being that the `variable_types` parameter must be used when creating entities from Dask dataframes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "es = ft.EntitySet(id='clients')\n", + "\n", + "# Entities with a unique index\n", + "es = es.entity_from_dataframe(entity_id='app', dataframe=app, index='SK_ID_CURR',\n", + " variable_types=app_vtypes)\n", + "\n", + "es = es.entity_from_dataframe(entity_id='bureau', dataframe=bureau, index='SK_ID_BUREAU',\n", + " variable_types=bureau_vtypes)\n", + "\n", + "es = es.entity_from_dataframe(entity_id='previous', dataframe=previous, index='SK_ID_PREV',\n", + " variable_types=previous_vtypes)\n", + "\n", + "# Entities that do not have a unique index\n", + "es = es.entity_from_dataframe(entity_id='bureau_balance', dataframe=bureau_balance,\n", + " make_index=True, index='bureaubalance_index',\n", + " variable_types=bureau_balance_vtypes)\n", + "\n", + "es = es.entity_from_dataframe(entity_id='cash', dataframe=cash,\n", + " make_index=True, index='cash_index',\n", + " variable_types=cash_vtypes)\n", + "\n", + "es = es.entity_from_dataframe(entity_id='installments', dataframe=installments,\n", + " make_index=True, index='installments_index',\n", + " variable_types=installments_vtypes)\n", + "\n", + "es = es.entity_from_dataframe(entity_id='credit', dataframe=credit,\n", + " make_index=True, index='credit_index',\n", + " variable_types=credit_vtypes)\n", + "\n", + "print(\"Adding relationships...\")\n", + "# Relationship between app_train and bureau\n", + "r_app_bureau = ft.Relationship(es['app']['SK_ID_CURR'], es['bureau']['SK_ID_CURR'])\n", + "\n", + "# Relationship between bureau and bureau balance\n", + "r_bureau_balance = ft.Relationship(es['bureau']['SK_ID_BUREAU'], es['bureau_balance']['SK_ID_BUREAU'])\n", + "\n", + "# Relationship between current app and previous apps\n", + "r_app_previous = ft.Relationship(es['app']['SK_ID_CURR'], es['previous']['SK_ID_CURR'])\n", + "\n", + "# Relationships between previous apps and cash, installments, and credit\n", + "r_previous_cash = ft.Relationship(es['previous']['SK_ID_PREV'], es['cash']['SK_ID_PREV'])\n", + "r_previous_installments = ft.Relationship(es['previous']['SK_ID_PREV'], es['installments']['SK_ID_PREV'])\n", + "r_previous_credit = ft.Relationship(es['previous']['SK_ID_PREV'], es['credit']['SK_ID_PREV'])\n", + "\n", + "# Add in the defined relationships\n", + "es = es.add_relationships([r_app_bureau, r_bureau_balance, r_app_previous,\n", + " r_previous_cash, r_previous_installments, r_previous_credit])\n", + "# Print out the EntitySet\n", + "print(es)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a `cutoff_times` dataframe to pass to `ft.dfs()`. This step is optional and the `cutoff_time` parameter can be omitted when calling DFS. With the current implementation, supplying a cutoff time dataframe is slightly faster than supplying a single cutoff time value, although both approaches should work equivalently. This is something that should be improved in future updates of Featuretools." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "cutoff_times = app[\"SK_ID_CURR\"].to_frame().rename(columns={\"SK_ID_CURR\":\"instance_id\"})\n", + "cutoff_times[\"time\"] = datetime.now()\n", + "cutoff_times = cutoff_times.compute()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we can run `ft.dfs()` to generate the feature matrix. The feature matrix will be returned as a Dask dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "fm, features = ft.dfs(entityset=es, target_entity=\"app\",\n", + " trans_primitives=trans_primitives,\n", + " agg_primitives=agg_primitives,\n", + " where_primitives=[], seed_features=[],\n", + " max_depth=2, verbose=0,\n", + " cutoff_time=cutoff_times)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The feature matrix can now be saved to disk. Note, this process may take several minutes to complete, depending on the size of the feature matrix that was generated.\n", + "\n", + "At times this process may fail due to memory issues. These issues can sometimes be resolved by using a smaller partition size when reading in the original CSV data so that Dask has smaller chunks of data to work with. Another potential solution is to use workers with more available memory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "fm.to_csv(os.path.join(output_dir, f\"fm_{version}-*.csv\"), index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, the feature matrix can be brought into memory by running `.compute()` on the Dask feature matrix returned from `ft.dfs()`. Note, this process may fail depending on the size of the feature matrix generated and the available system memory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "fm_computed = fm.compute()\n", + "print(\"Shape: {}\".format(fm_computed.shape))\n", + "print(\"Memory: {} MB\".format(fm_computed.memory_usage().sum() / 1000000))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fm_computed.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we are finished, we can close our Dask client." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.7.4 64-bit ('env': venv)", + "language": "python", + "name": "python37464bitenvvenvb0576f6089f14bba82265783ef39e793" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}