This notebook requires [Featuretools](https://github.com/Featuretools/featuretools) version 0.1.17 or higher. To get started, download the appointment no-show dataset from [Kaggle](https://www.kaggle.com/joniarroba/noshowappointments/data) into a `data` folder in this directory.


In [1]:
import numpy as np
import pandas as pd
import featuretools as ft
ft.__version__

'0.1.17'

# Step 1: Set an EntitySet structure for Featuretools

In [2]:
# Load the raw Kaggle export of appointment records.
data = pd.read_csv("data/KaggleV2-May-2016.csv")

# Index the rows by appointment while keeping 'AppointmentID' as a regular
# column, because it is selected by name for `cutoff_times` below.
data.index = data['AppointmentID']

# Fix the misspelled / inconsistently cased column names of the raw file.
# A non-inplace rename keeps the statement a plain assignment.
data = data.rename(columns={'Hipertension': 'Hypertension',
                            'Handcap': 'Handicap',
                            'PatientId': 'PatientID',
                            'No-show': 'NoShow'})

# Encode the target as 0/1. `.map` turns any value missing from the mapping
# into NaN, so fail loudly instead of carrying silent missing labels forward.
data['NoShow'] = data['NoShow'].map({'No': 0, 'Yes': 1})
assert data['NoShow'].notnull().all(), "unexpected value in the 'No-show' column"

# One cutoff time per appointment: (instance id, time) pairs passed to ft.dfs.
cutoff_times = data[['AppointmentID', 'AppointmentDay']]

# Keep the preview as the last expression so the notebook actually renders it.
data.head()

In [3]:
import featuretools.variable_types as vtypes

# Column metadata for featuretools, grouped by the variable type each column gets.
boolean_columns = ['Scholarship', 'Hypertension', 'Diabetes', 'Alcoholism',
                   'Handicap', 'NoShow', 'SMS_received']
categorical_columns = ['Gender', 'Age', 'Neighbourhood']
datetime_columns = ['ScheduledDay', 'AppointmentDay']

variable_types = {}
variable_types.update((column, vtypes.Boolean) for column in boolean_columns)
variable_types.update((column, vtypes.Categorical) for column in categorical_columns)
variable_types.update((column, vtypes.Datetime) for column in datetime_columns)

# Start from an empty `EntitySet` called `appointment_data` ...
es = ft.EntitySet('appointment_data')

# ... and register the dataframe as the 'appointments' entity. 'NoShow' and
# 'SMS_received' are tied to 'AppointmentDay' through the secondary time index.
es = es.entity_from_dataframe(
    entity_id="appointments",
    dataframe=data,
    index='AppointmentID',
    time_index='ScheduledDay',
    secondary_time_index={'AppointmentDay': ['NoShow', 'SMS_received']},
    variable_types=variable_types)

# Split three new entities out of 'appointments'; only 'patients' gets a time index.
for new_entity_id, key_column, with_time_index in [('patients', 'PatientID', True),
                                                   ('ages', 'Age', False),
                                                   ('genders', 'Gender', False)]:
    es.normalize_entity('appointments', new_entity_id, key_column,
                        make_time_index=with_time_index)

# Display the resulting EntitySet structure.
es

Entityset: appointment_data
  Entities:
    appointments (shape = [110527, 14])
    patients (shape = [62299, 2])
    ages (shape = [104, 1])
    genders (shape = [2, 1])
  Relationships:
    appointments.PatientID -> patients.PatientID
    appointments.Age -> ages.Age
    appointments.Gender -> genders.Gender

# Step 2: Create features with Deep Feature Synthesis

In [None]:
# Custom aggregation primitive.
# TODO: fix this so that label data is incorporated correctly while training.
from featuretools.primitives import make_agg_primitive
def probability(boolean):
    """Return the fraction of entries in `boolean` that are equal to 1.

    Args:
        boolean: a sized iterable (list, pandas Series, ...) of 0/1 or
            True/False values. Entries that do not compare equal to 1 count
            as "not true" but still count towards the total.

    Returns:
        float: number of entries equal to 1 divided by the number of entries,
        or NaN when `boolean` is empty (the fraction is undefined).
    """
    total = len(boolean)
    if total == 0:
        # Avoid ZeroDivisionError on an empty group.
        return float('nan')
    numtrue = sum(1 for x in boolean if x == 1)
    # float() keeps this a true division under Python 2 as well.
    return float(numtrue) / total

# Wrap `probability` as an aggregation primitive: Boolean input, Numeric output.
Prob = make_agg_primitive(
    probability,
    input_types=[vtypes.Boolean],
    return_type=vtypes.Numeric,
)
    
    

In [None]:
from featuretools.primitives import Weekday, Hour, Count
# Deep Feature Synthesis settings: build features for every appointment as of
# its own cutoff time, skipping anything whose name contains 'AppointmentDay'.
dfs_options = {
    'entityset': es,
    'target_entity': 'appointments',
    'agg_primitives': [Prob, Count],
    'trans_primitives': [Weekday, Hour],
    'max_depth': 3,
    'drop_contains': ['AppointmentDay'],
    'cutoff_time': cutoff_times,
    'features_only': False,
    'verbose': True,
}

# Encoding settings: only 'Neighbourhood' and 'Gender' are encoded, keeping
# the 5 most frequent values of each and no "unknown" column.
encode_options = {
    'top_n': 5,
    'include_unknown': False,
    'to_encode': ['Neighbourhood', 'Gender'],
    'inplace': False,
    'verbose': True,
}

fm, features = ft.dfs(**dfs_options)
fm, features = ft.synthesis.encode_features(fm, features, **encode_options)

# Preview the last rows of the encoded feature matrix.
fm.tail(20)

Building features: 74it [00:00, 7838.24it/s]
Progress:  59%|█████▉    | 16/27 [00:35<00:24,  2.24s/cutoff time]

# Step 3: Predict

In [None]:
from sklearn.model_selection import train_test_split
# Build the label vector keyed by AppointmentID and line it up with the rows
# of `fm`. train_test_split pairs X and y by position, not by index, so labels
# taken in the entity dataframe's own row order are not guaranteed to match
# the feature matrix row-for-row.
appointments_df = es['appointments'].df
labels = pd.Series(appointments_df['NoShow'].values,
                   index=appointments_df['AppointmentID'].values,
                   name='NoShow').loc[fm.index]

# The target must not be one of the model inputs. Dropping it from `fm` itself
# keeps `fm.columns` in step with the columns the classifier is trained on;
# errors='ignore' makes the cell safe to re-run.
fm = fm.drop('NoShow', axis=1, errors='ignore')

# Fixed seed so the split (and everything downstream) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(fm, labels,
                                                    test_size=0.40,
                                                    random_state=0)


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forests are stochastic; pin the seed so that the fitted model, its
# predictions and its feature importances are reproducible across runs.
clf = RandomForestClassifier(n_estimators=50, random_state=0)
clf.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
preds = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score
# scikit-learn metrics take the ground truth first: (y_true, y_pred / y_score).
# Accuracy is symmetric in its arguments, but ROC AUC is not, so the order
# matters for the second number.
print(accuracy_score(y_test, preds))
print(roc_auc_score(y_test, preds))

In [None]:
# Pair every importance with its column name and rank them, largest first.
feature_imps = sorted(zip(clf.feature_importances_, fm.columns), reverse=True)

# Show the 20 most important features.
feature_imps[0:20]


In [None]:
# Time handling in action: the probability is 0 for every row up to the first
# cutoff time, and is then updated at each later cutoff time.
time_handling_columns = ['Age', 'ages.PROBABILITY(appointments.NoShow)']
fm[time_handling_columns]