This notebook requires [Featuretools](https://github.com/Featuretools/featuretools) version 0.1.17 or higher. To get started, download data from [Kaggle](https://www.kaggle.com/joniarroba/noshowappointments/data) on appointment noshows into a `data` folder in this directory.


In [1]:
import numpy as np
import pandas as pd
import featuretools as ft
ft.__version__

'0.1.17'

# Step 1: Set an EntitySet structure for Featuretools

In [2]:
data = pd.read_csv("data/KaggleV2-May-2016.csv")
data.index = data['AppointmentID']
data.rename(columns = {'Hipertension': 'Hypertension',
                       'Handcap': 'Handicap',
                       'PatientId': 'PatientID',
                       'No-show': 'NoShow'}, inplace = True)
data['NoShow'] = data['NoShow'].map({'No': 0, 'Yes': 1})
data.head()

cutoff_times = data[['AppointmentID', 'AppointmentDay']]

In [3]:
import featuretools.variable_types as vtypes

# Give featuretools column metadata
variable_types = {'Gender': vtypes.Categorical,
                  'Age': vtypes.Categorical,
                  'Scholarship': vtypes.Boolean,
                  'Hypertension': vtypes.Boolean,
                  'Diabetes': vtypes.Boolean,
                  'Alcoholism': vtypes.Boolean,
                  'Handicap': vtypes.Boolean,
                  'ScheduledDay': vtypes.Datetime, 
                  'AppointmentDay': vtypes.Datetime,
                  'Neighbourhood': vtypes.Categorical,
                  'NoShow': vtypes.Boolean,
                  'SMS_received': vtypes.Boolean}

# Create an `EntitySet` named `appointment_data`
es = ft.EntitySet('appointment_data')

# Make an entity named 'appointments' which stores dataset metadata with the dataframe
es = es.entity_from_dataframe(entity_id="appointments",
                              dataframe=data,
                              index='AppointmentID',
                              time_index='ScheduledDay',
                              secondary_time_index={'AppointmentDay': ['NoShow']},
                              variable_types=variable_types)

es.normalize_entity('appointments', 'patients', 'PatientID', 
                    make_time_index=True)
es.normalize_entity('appointments', 'ages', 'Age',
                    make_time_index=False)
es.normalize_entity('appointments', 'genders', 'Gender',
                    make_time_index=False)

Entityset: appointment_data
  Entities:
    appointments (shape = [110527, 14])
    patients (shape = [62299, 2])
    ages (shape = [104, 1])
    genders (shape = [2, 1])
  Relationships:
    appointments.PatientID -> patients.PatientID
    appointments.Age -> ages.Age
    appointments.Gender -> genders.Gender

# Step 2: Create features with Deep Feature Synthesis

In [4]:
# Custom primitive: TODO fix to correctly incorporate label data while training
from featuretools.primitives import make_agg_primitive
def probability(boolean):
    numtrue = len([x for x in boolean if x==1])
    return numtrue/len(boolean)

Prob = make_agg_primitive(probability,
                          input_types=[vtypes.Boolean],
                          return_type=vtypes.Numeric)
    
    

In [5]:
from featuretools.primitives import Weekday, Hour, Count
fm, features = ft.dfs(entityset=es,
                      target_entity='appointments',
                      agg_primitives=[Prob, Count],
                      trans_primitives=[Weekday, Hour],
                      max_depth=3,
                      cutoff_time=cutoff_times,
                      features_only=False,
                      verbose=True)
fm, features = ft.synthesis.encode_features(fm, features, 
                                            top_n=5, 
                                            include_unknown=False, 
                                            to_encode=None, 
                                            inplace=False, 
                                            verbose=False)
fm.tail(20)

Building features: 74it [00:00, 5712.52it/s]
Progress: 100%|██████████| 27/27 [00:58<00:00,  2.15s/cutoff time]


Unnamed: 0_level_0,PatientID = 822145925426128.0,PatientID = 99637671331.0,PatientID = 26886125921145.0,PatientID = 33534783483176.0,PatientID = 871374938638855.0,Gender = F,Gender = M,Age = 0,Age = 1,Age = 52,...,ages.PROBABILITY(appointments.SMS_received),ages.COUNT(appointments),genders.PROBABILITY(appointments.Scholarship),genders.PROBABILITY(appointments.Hypertension),genders.PROBABILITY(appointments.Diabetes),genders.PROBABILITY(appointments.Alcoholism),genders.PROBABILITY(appointments.Handicap),genders.PROBABILITY(appointments.NoShow),genders.PROBABILITY(appointments.SMS_received),genders.COUNT(appointments)
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5790352,0,0,0,0,0,1,0,0,0,0,...,0.35124,1452,0.123232,0.213516,0.078035,0.017024,0.016439,0.19613,0.336888,71840
5790355,0,0,0,0,0,1,0,0,0,0,...,0.332814,1283,0.123232,0.213516,0.078035,0.017024,0.016439,0.19613,0.336888,71840
5790358,0,0,0,0,0,1,0,0,0,0,...,0.372389,1101,0.123232,0.213516,0.078035,0.017024,0.016439,0.19613,0.336888,71840
5790362,0,0,0,0,0,1,0,0,0,0,...,0.355067,1411,0.123232,0.213516,0.078035,0.017024,0.016439,0.19613,0.336888,71840
5790364,0,0,0,0,0,1,0,0,0,0,...,0.355088,1425,0.123232,0.213516,0.078035,0.017024,0.016439,0.19613,0.336888,71840
5790367,0,0,0,0,0,0,1,0,0,0,...,0.338542,1536,0.051904,0.167033,0.060408,0.055238,0.022256,0.1927,0.291571,38687
5790369,0,0,0,0,0,1,0,0,0,0,...,0.34047,1448,0.123232,0.213516,0.078035,0.017024,0.016439,0.19613,0.336888,71840
5790370,0,0,0,0,0,1,0,0,0,0,...,0.319007,1652,0.123232,0.213516,0.078035,0.017024,0.016439,0.19613,0.336888,71840
5790372,0,0,0,0,0,0,1,0,0,0,...,0.241935,434,0.051904,0.167033,0.060408,0.055238,0.022256,0.1927,0.291571,38687
5790376,0,0,0,0,0,1,0,0,0,0,...,0.350791,1012,0.123232,0.213516,0.078035,0.017024,0.016439,0.19613,0.336888,71840


# Step 3: Predict

In [6]:
from sklearn.model_selection import train_test_split
#labels = fm.pop('NoShow')
labels = es['appointments'].df['NoShow']
X_train, X_test, y_train, y_test = train_test_split(fm, labels, test_size=0.40)


In [7]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=50)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

In [8]:
from sklearn.metrics import accuracy_score, roc_auc_score
print(accuracy_score(preds, y_test))
print(roc_auc_score(preds, y_test))

0.7785166587500848
0.5505542832094379


In [9]:
feature_imps = [(imp, fm.columns[i]) for i, imp in enumerate(clf.feature_importances_)]
feature_imps.sort()
feature_imps.reverse()
feature_imps[0:10]


[(0.117336226072013, 'ages.PROBABILITY(appointments.NoShow)'),
 (0.09935994489520195, 'genders.PROBABILITY(appointments.NoShow)'),
 (0.06677283204124322, 'patients.COUNT(appointments)'),
 (0.04717495980141458, 'patients.PROBABILITY(appointments.SMS_received)'),
 (0.037867433716065484, 'ages.PROBABILITY(appointments.SMS_received)'),
 (0.0377183964568325, 'ages.PROBABILITY(appointments.Handicap)'),
 (0.03714108051762628, 'ages.COUNT(appointments)'),
 (0.03663328801915529, 'ages.PROBABILITY(appointments.Scholarship)'),
 (0.03442352350782013, 'ages.PROBABILITY(appointments.Hypertension)'),
 (0.034218493017104314, 'ages.PROBABILITY(appointments.Alcoholism)')]