In [3]:
import pandas as pd
import numpy as np 
import tensorflow as tf 
import xgboost as xgb

from sklearn.model_selection import train_test_split


In [5]:
# set some jupyter magic
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
no_show_df = pd.read_csv("data/no_show_feature_engineered_no_extreme_locations.csv")
no_show_df.head()

Unnamed: 0,patientId,appointmentId,gender,scheduledDay,appointmentDay,age,neighborhood,scholarship,hypertension,diabetes,...,appointmentDayDOW__Monday,appointmentDayDOW__Saturday,appointmentDayDOW__Thursday,appointmentDayDOW__Tuesday,appointmentDayDOW__Wednesday,lat,lon,distanceFromCenter,distanceFromCenterLat,distanceFromCenterLon
0,29872500000000.0,5642903,F,2016-04-29 18:38:08,2016-04-29 00:00:00,62,JARDIM DA PENHA,False,True,False,...,0,0,0,0,0,0.490525,-0.169426,-0.198132,-0.280227,-0.201355
1,558997800000000.0,5642503,M,2016-04-29 16:08:27,2016-04-29 00:00:00,56,JARDIM DA PENHA,False,False,False,...,0,0,0,0,0,0.490525,-0.169426,-0.198132,-0.280227,-0.201355
2,4262962000000.0,5642549,F,2016-04-29 16:19:04,2016-04-29 00:00:00,62,MATA DA PRAIA,False,False,False,...,0,0,0,0,0,0.788316,-0.166879,-0.190882,0.089317,-0.199223
3,867951200000.0,5642828,F,2016-04-29 17:29:31,2016-04-29 00:00:00,8,PONTAL DE CAMBURI,False,False,False,...,0,0,0,0,0,0.341636,-0.170699,-0.201469,-0.464992,-0.200074
4,8841186000000.0,5642494,F,2016-04-29 16:07:23,2016-04-29 00:00:00,56,JARDIM DA PENHA,False,True,True,...,0,0,0,0,0,0.490525,-0.169426,-0.198132,-0.280227,-0.201355


- Change how gender is encoded
- Select features
- Define the baseline model
- Define xgboost model
- Define tensorflow model

In [9]:
# encode the gender as a binary
# NOTE: the gender didn't seem to affect no show by itself, but we're going to keep it and
#       and verify if the model can still use it in conjunction with other variables
no_show_df["isFemale"] = (no_show_df["gender"] == "F")*1

# select the columns that we want to keep
FEATURE_COLS = ["age","scholarship","hypertension","diabetes","alcoholism","handicap","smsSent",
                "daysInAdvance","lat","lon","isFemale","distanceFromCenterLat","scheduledDayHour"]
days_of_weeks_cols = [col_name for col_name in no_show_df.columns if "appointmentDayDOW__" in col_name]
FEATURE_COLS += days_of_weeks_cols
print(FEATURE_COLS)

# target column
TARGET_COLUMN = "noShow"

# prepare dataset for models
no_show_df["age"] = (no_show_df["age"]-no_show_df["age"].mean())/no_show_df["age"].std()
no_show_df["noShow"] = (no_show_df["noShow"])*1

['age', 'scholarship', 'hypertension', 'diabetes', 'alcoholism', 'handicap', 'smsSent', 'daysInAdvance', 'lat', 'lon', 'isFemale', 'distanceFromCenterLat', 'scheduledDayHour', 'appointmentDayDOW__Friday', 'appointmentDayDOW__Monday', 'appointmentDayDOW__Saturday', 'appointmentDayDOW__Thursday', 'appointmentDayDOW__Tuesday', 'appointmentDayDOW__Wednesday']


In [11]:
# get training, validation, and test after 
from imblearn.over_sampling import SMOTE
X = no_show_df[FEATURE_COLS].values
y = no_show_df[TARGET_COLUMN].values

# using SMOTE for generating synthetic samples to help with class imbalance
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
print("Number of samples in the oversampled dataset:",len(X))
# get train = 60%, validation = 20%, test = 20%
train_X, test_X, train_y, test_y = train_test_split(X,y,test_size=0.2) # split between train and test
train_X, val_X, train_y, val_y = train_test_split(train_X,train_y,train_size=0.75) # split train to get validation

Number of samples in the oversampled dataset: 176348


## Defining a baseline model

As reported in publications studying hospitals no-shows, most clinics currently do not have any way to identify patients that are likely not to show up to appointments (Srinivas, Sharan, and A. Ravi Ravindran, 2018). For this reason, and given the class imbalance of the dataset, we can assume they naively decide on either assuming everyone will show up, or that they won't. 

From an accuracy perspective, the baseline model assuming that every patient always shows up gives the highest value (as most patients actually show up). In practice, even if hospitals cannot predict if a given patient will show up or not, they are likely to know a percentage of no-shows per day and use it to overbook appointments. 

## The Wide and Deep model



In [13]:
from models.deep_and_wide import tuner
from tensorflow.keras import callbacks
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)
# set callbacks

class EndEpochCallback(callbacks.Callback):
    def on_epoch_end(self,epoch,logs):
        print("\ntrain loss: {:.4f}, val loss: {:.4f}, ".format(logs["loss"],logs["val_loss"]))

In [1]:
# search!
tuner.search(train_X, train_y,
            epochs= 5,
             validation_data=(val_X, val_y),
             verbose=2)

NameError: name 'tuner' is not defined