This dataset contains about 110k medical appointments (110,527 rows), each with characteristics (features) of the patient and a record of whether the patient showed up for the appointment. We will build a neural network that, given the characteristics of a patient, predicts the probability of that patient showing up.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import numpy as np

In [2]:

# Load the raw appointment records from the CSV file next to the notebook.
df = pd.read_csv('data.csv')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
PatientId         110527 non-null float64
AppointmentID     110527 non-null int64
Gender            110527 non-null object
ScheduledDay      110527 non-null object
AppointmentDay    110527 non-null object
Age               110527 non-null int64
Neighbourhood     110527 non-null object
Scholarship       110527 non-null int64
Hipertension      110527 non-null int64
Diabetes          110527 non-null int64
Alcoholism        110527 non-null int64
Handcap           110527 non-null int64
SMS_received      110527 non-null int64
No-show           110527 non-null object
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB
None


In [3]:
# Parse both timestamp columns into pandas datetime values.
for date_column in ("ScheduledDay", "AppointmentDay"):
    df[date_column] = pd.to_datetime(df[date_column])

# "AwaitingTime" is the gap between the moment the appointment was scheduled
# and the appointment itself, expressed as an absolute number of (fractional) days.
# NOTE(review): .abs() discards the sign of the difference, so a row whose
# appointment timestamp precedes its scheduling timestamp looks like an
# ordinary wait -- confirm this is intended.
elapsed = df["AppointmentDay"] - df["ScheduledDay"]
df["AwaitingTime"] = (elapsed / np.timedelta64(1, 'D')).abs()

##### Recode Target Variable

0 = Did not show up to appointment

1 = Showed up to appointment


In [4]:
# Encode gender as a binary feature: male -> 0, female -> 1.
# The mapped column is assigned back to the frame. Calling
# replace(..., inplace=True) on a selected column acts on an intermediate
# object and is not guaranteed to update `df` (pandas warns about this
# pattern and ignores it under Copy-on-Write).
df["Gender"] = df["Gender"].map({"M": 0, "F": 1}).astype("int64")

# Encode the target so that 1 = showed up and 0 = did not show up.
# The column is named "No-show", so "No" marks a patient who did attend and
# "Yes" marks a missed appointment; mapping "No" -> 0 produced the opposite
# of the intended encoding.
# Any value outside the mapping becomes NaN and makes astype() raise, so
# unexpected labels (or re-running this cell on already encoded data) fail
# loudly instead of passing through silently.
df["No-show"] = df["No-show"].map({"No": 1, "Yes": 0}).astype("int64")

In [5]:
# "Handcap" holds a small set of integer codes (five distinct values in this
# data, yielding Handicap_0 .. Handicap_4 below), so treat it as categorical.
df['Handcap'] = pd.Categorical(df['Handcap'])

# One indicator column per code.
# dtype=int pins 0/1 integers: recent pandas versions return bool columns by
# default, and a frame that mixes bool with numeric columns becomes an object
# array when `.values` is taken to feed the model.
Handicap = pd.get_dummies(df['Handcap'], prefix='Handicap', dtype=int)
df = pd.concat([df, Handicap], axis=1)

In [6]:
# Display the column index of the frame to check which columns are present
# at this point (original columns plus the derived ones added so far).
df.columns

Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
       'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show',
       'AwaitingTime', 'Handicap_0', 'Handicap_1', 'Handicap_2', 'Handicap_3',
       'Handicap_4'],
      dtype='object')

In [7]:
# Remove the identifier columns, the two raw timestamp columns, the original
# "Handcap" column and the neighbourhood name from the frame.
df = df.drop(
    columns=[
        "PatientId",
        "AppointmentID",
        "ScheduledDay",
        "Handcap",
        "AppointmentDay",
        "Neighbourhood",
    ]
)

In [8]:
# Feature matrix: the binary patient flags, the five handicap indicator
# columns, age and the waiting time (12 columns in total).
# NOTE(review): "Alcoholism" is still present in df but is not selected
# here -- confirm the omission is deliberate.
X = df.loc[:, [
    'Gender', 'Diabetes', 'Hipertension', 'Scholarship', 'SMS_received',
    'Handicap_0', 'Handicap_1', 'Handicap_2', 'Handicap_3', 'Handicap_4',
    'Age', 'AwaitingTime',
]]

# Target vector: the recoded "No-show" column.
y = df.loc[:, "No-show"]

In [9]:
# Preview the first rows of the feature matrix as a sanity check.
X.head()

Unnamed: 0,Gender,Diabetes,Hipertension,Scholarship,SMS_received,Handicap_0,Handicap_1,Handicap_2,Handicap_3,Handicap_4,Age,AwaitingTime
0,1,0,1,0,0,1,0,0,0,0,62,0.776481
1,0,0,0,0,0,1,0,0,0,0,56,0.672535
2,1,0,0,0,0,1,0,0,0,0,62,0.679907
3,1,0,0,0,0,1,0,0,0,0,8,0.728831
4,1,1,1,0,0,1,0,0,0,0,56,0.671794


In [11]:
# Imports used by this cell, grouped so its dependencies are visible at a glance.
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense

# 1. Fix NumPy's global random seed.
# NOTE(review): this seeds NumPy only; if the Keras backend keeps its own
# random generator, runs may still differ -- confirm for the backend in use.
seed = 7
np.random.seed(seed)

# 2. Fully connected network: three hidden layers of 12 sigmoid units and a
# single sigmoid output unit that yields a value in (0, 1).
# The input width is taken from X so the model follows the feature list
# instead of a hard-coded 12.
n_features = X.shape[1]
model = Sequential()
model.add(Dense(12, input_shape=(n_features,), kernel_initializer='uniform', activation='sigmoid'))
model.add(Dense(12, kernel_initializer='uniform', activation='sigmoid'))
model.add(Dense(12, kernel_initializer='uniform', activation='sigmoid'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
model.summary()

# 3. Write TensorBoard logs (including the graph) to /tmp/keras_logs.
tbCallBack = keras.callbacks.TensorBoard(log_dir='/tmp/keras_logs', write_graph=True)

# 4. Compile and train.
# The target is binary and the output is a sigmoid probability, so binary
# cross-entropy is the matching loss; mean squared error treats the task as
# regression and gives weak gradients when the sigmoid saturates.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# validation_split=0.3 holds 30% of the rows out of training for validation.
model.fit(X.values, y.values, epochs=9, batch_size=50,  verbose=1, validation_split=0.3, callbacks=[tbCallBack])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 12)                156       
_________________________________________________________________
dense_6 (Dense)              (None, 12)                156       
_________________________________________________________________
dense_7 (Dense)              (None, 12)                156       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 13        
Total params: 481
Trainable params: 481
Non-trainable params: 0
_________________________________________________________________
Train on 77368 samples, validate on 33159 samples
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<keras.callbacks.History at 0x7fdbc1d69f60>