This notebook requires [Featuretools](https://github.com/Featuretools/featuretools) version 0.1.17 or higher. To get started, download the appointment no-show dataset from [Kaggle](https://www.kaggle.com/joniarroba/noshowappointments/data) into a `data` folder in this directory.


In [1]:
import pandas as pd
import featuretools as ft
# Display the installed Featuretools version (this notebook expects 0.1.17 or higher)
ft.__version__

'0.1.17'

# Step 1: Set up an EntitySet structure for Featuretools

In [2]:
# Load the raw Kaggle CSV and key every row by its appointment identifier
data = pd.read_csv("data/KaggleV2-May-2016.csv")
data.index = data['AppointmentID']

# Fix the misspelled column headers and make the remaining names consistent
column_renames = {
    'Hipertension': 'Hypertension',
    'Handcap': 'Handicap',
    'PatientId': 'PatientID',
    'No-show': 'NoShow',
}
data = data.rename(columns=column_renames)
data.head()

Unnamed: 0_level_0,PatientID,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMS_received,NoShow
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5642903,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
5642503,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
5642549,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
5642828,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
5642494,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [None]:
import featuretools.variable_types as vtypes

# Column metadata for featuretools: maps each column name to the variable type
# it should be treated as when the `appointments` entity is built
variable_types = dict(
    Gender=vtypes.Categorical,
    Age=vtypes.Categorical,
    Scholarship=vtypes.Boolean,
    Hypertension=vtypes.Boolean,
    Diabetes=vtypes.Boolean,
    Alcoholism=vtypes.Boolean,
    Handicap=vtypes.Boolean,
    SMS_received=vtypes.Boolean,
    ScheduledDay=vtypes.Datetime,
    AppointmentDay=vtypes.Datetime,
    Neighbourhood=vtypes.Categorical,
)

# Description of the 'appointments' entity: the dataframe itself plus the
# metadata featuretools needs (index column, time index, variable types)
appointments_spec = {
    'entity_id': "appointments",
    'dataframe': data,
    'index': 'AppointmentID',
    'time_index': 'ScheduledDay',
    'variable_types': variable_types,
}

# Start an `EntitySet` named `appointment_data` and register the entity on it
es = ft.EntitySet('appointment_data')
es = es.entity_from_dataframe(**appointments_spec)

In [3]:
# Split a `patients` entity out of `appointments`, keyed on PatientID.
# Gender, Age and the preexisting-condition columns describe the patient rather
# than a single appointment, so they move to the new entity.
# This also creates the parent-child relationship on PatientID between the two entities.
patient_variables = ['Gender', 'Age', 'Scholarship', 'Hypertension',
                     'Diabetes', 'Alcoholism', 'Handicap', 'SMS_received']

es.normalize_entity(
    'appointments', 'patients', 'PatientID',
    additional_variables=patient_variables,
    make_time_index=True,
    make_secondary_time_index={'ScheduledDay': []},
    new_entity_secondary_time_index='last_appointments_time',
)


# Create `ages` and `genders` entities from `patients` as well,
# each keyed on the corresponding patient column
for new_entity_id, key_column in [('ages', 'Age'), ('genders', 'Gender')]:
    es.normalize_entity('patients', new_entity_id, key_column)

# Cutoff-time table: one row per appointment, pairing its ID with its AppointmentDay
cutoff_columns = ['AppointmentID', 'AppointmentDay']
cutoff_times = data[cutoff_columns]

# Step 2: Create features with Deep Feature Synthesis

In [4]:
# Custom primitive: TODO fix to correctly incorporate label data while training
from featuretools.primitives import make_agg_primitive
def probability(boolean):
    """Return the fraction of entries in ``boolean`` that equal 1 (i.e. True).

    Parameters
    ----------
    boolean : sized iterable of 0/1 or bool values
        The values to summarise, e.g. a list or a pandas Series.

    Returns
    -------
    float
        Number of entries equal to 1 divided by the total number of entries.
        Entries that do not compare equal to 1 (including missing values)
        count towards the total only. Returns NaN for an empty input, where
        the ratio is undefined, instead of raising ``ZeroDivisionError``.
    """
    total = len(boolean)
    if total == 0:
        # No observations: the ratio is undefined rather than an error.
        return float('nan')
    # Count matches lazily rather than building a throwaway list.
    numtrue = sum(1 for x in boolean if x == 1)
    # float() keeps this a true division even under Python 2 integer semantics.
    return numtrue / float(total)

# Wrap `probability` as an aggregation primitive: Boolean input, Numeric output
prob_signature = {
    'input_types': [vtypes.Boolean],
    'return_type': vtypes.Numeric,
}
Prob = make_agg_primitive(probability, **prob_signature)
    
    

In [7]:
from featuretools.primitives import Weekday, Hour
# Run Deep Feature Synthesis on the `appointments` entity using the first 10000
# rows of the cutoff-time table; features_only=False asks for the computed
# feature matrix as well as the feature definitions.
dfs_settings = dict(
    entityset=es,
    target_entity='appointments',
    agg_primitives=[Prob],
    trans_primitives=[Weekday],
    max_depth=3,
    cutoff_time=cutoff_times[0:10000],
    features_only=False,
    verbose=True,
)
fm, features = ft.dfs(**dfs_settings)
fm

Building features: 71it [00:00, 5707.85it/s]
Progress: 100%|██████████| 20/20 [04:05<00:00, 12.29s/cutoff time]


Unnamed: 0_level_0,PatientID,NoShow,Neighbourhood,WEEKDAY(ScheduledDay),WEEKDAY(AppointmentDay),patients.Gender,patients.Age,patients.WEEKDAY(last_appointments_time),patients.WEEKDAY(first_appointments_time),patients.ages.PROBABILITY(patients.Scholarship),...,patients.ages.PROBABILITY(patients.Handicap),patients.ages.PROBABILITY(patients.SMS_received),patients.ages.WEEKDAY(first_patients_time),patients.genders.PROBABILITY(patients.Scholarship),patients.genders.PROBABILITY(patients.Hypertension),patients.genders.PROBABILITY(patients.Diabetes),patients.genders.PROBABILITY(patients.Alcoholism),patients.genders.PROBABILITY(patients.Handicap),patients.genders.PROBABILITY(patients.SMS_received),patients.genders.WEEKDAY(first_patients_time)
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5217179,1.423329e+12,No,SANTO ANDRÉ,1,4,M,84,,1,0.005495,...,0.076923,0.263736,1,0.048847,0.167573,0.061295,0.042152,0.020132,0.335505,3
5218520,4.616858e+12,No,REDENÇÃO,1,4,F,83,1.0,1,0.000000,...,0.099448,0.337017,1,0.117390,0.212581,0.076212,0.014184,0.014408,0.375668,1
5235449,5.558963e+13,No,MONTE BELO,0,4,F,74,0.0,0,0.002874,...,0.054598,0.385057,0,0.117390,0.212581,0.076212,0.014184,0.014408,0.375668,1
5235643,9.189694e+13,No,GURIGICA,0,4,F,70,0.0,0,0.033784,...,0.038288,0.391892,1,0.117390,0.212581,0.076212,0.014184,0.014408,0.375668,1
5235655,1.534482e+12,No,JUCUTUQUARA,0,4,F,87,,0,0.000000,...,0.084746,0.279661,0,0.117390,0.212581,0.076212,0.014184,0.014408,0.375668,1
5236116,3.136481e+14,No,REDENÇÃO,0,4,M,71,0.0,0,0.022222,...,0.037037,0.316049,0,0.048847,0.167573,0.061295,0.042152,0.020132,0.335505,3
5236380,1.596183e+14,No,PRAIA DO CANTO,0,4,F,88,0.0,0,0.000000,...,0.054348,0.271739,0,0.117390,0.212581,0.076212,0.014184,0.014408,0.375668,1
5303666,9.646768e+13,No,RESISTÊNCIA,4,4,F,1,4.0,4,0.034578,...,0.000692,0.306362,4,0.117390,0.212581,0.076212,0.014184,0.014408,0.375668,1
5304747,7.437646e+14,No,MARUÍPE,4,4,M,48,4.0,4,0.087610,...,0.013767,0.352941,0,0.048847,0.167573,0.061295,0.042152,0.020132,0.335505,3
5317449,7.414865e+12,Yes,JESUS DE NAZARETH,1,4,F,77,,1,0.003175,...,0.044444,0.365079,0,0.117390,0.212581,0.076212,0.014184,0.014408,0.375668,1


In [6]:
# Dimensions of the feature matrix returned by ft.dfs, as (rows, columns)
fm.shape

(110527, 13)

# Step 3: Predict