In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

In [2]:
# Import data
path = "clean_df.csv"

noshow_df = pd.read_csv(path)
noshow_df

Unnamed: 0,Age,Gender,Insurance_Type,Chronic_Disease,Mental_Health_Status,Employment_Status,Education_Level,Transportation_Access,Distance_from_Facility,Area_Type,Days Diff,Appointment_Outcome
0,93.0,Male,Medicare,False,True,Student,Advanced Degree,True,37.785038,Suburban,285,Show
1,52.0,Male,Uninsured,False,False,Employed,College,True,,Rural,257,No-Show
2,8.0,Male,Medicare,True,,,Advanced Degree,True,27.277654,Rural,622,Show
3,5.0,Other,Medicare,False,True,Student,College,True,44.618708,,366,Show
4,15.0,Female,Medicaid,,False,Unemployed,High School,True,34.033755,Suburban,345,Show
...,...,...,...,...,...,...,...,...,...,...,...,...
45037,57.0,Female,Private,True,False,Student,High School,True,28.410940,Suburban,35,Show
45038,48.0,Female,,True,True,Student,,True,26.576418,Urban,113,No-Show
45039,5.0,Other,Medicaid,True,True,Student,,False,20.674054,Suburban,6,No-Show
45040,53.0,Male,Medicaid,False,False,Employed,College,False,21.190068,Suburban,231,No-Show


In [3]:
noshow_df.dtypes

Age                       float64
Gender                     object
Insurance_Type             object
Chronic_Disease            object
Mental_Health_Status       object
Employment_Status          object
Education_Level            object
Transportation_Access      object
Distance_from_Facility    float64
Area_Type                  object
Days Diff                   int64
Appointment_Outcome        object
dtype: object

In [4]:
noshow_df.nunique()

Age                         101
Gender                        3
Insurance_Type                4
Chronic_Disease               2
Mental_Health_Status          2
Employment_Status             4
Education_Level               3
Transportation_Access         2
Distance_from_Facility    40494
Area_Type                     3
Days Diff                   724
Appointment_Outcome           2
dtype: int64

In [5]:
age_counts = noshow_df['Age'].value_counts()
age_counts

Age
28.0    452
94.0    443
96.0    441
2.0     440
9.0     438
       ... 
58.0    365
51.0    364
17.0    362
66.0    361
71.0    339
Name: count, Length: 101, dtype: int64

In [6]:
gender_counts = noshow_df['Gender'].value_counts()
gender_counts

Gender
Female    13656
Other     13501
Male      13444
Name: count, dtype: int64

In [7]:
insurance_counts = noshow_df['Insurance_Type'].value_counts()
insurance_counts

Insurance_Type
Uninsured    10211
Medicare     10163
Private      10141
Medicaid     10025
Name: count, dtype: int64

In [8]:
chronic_disease_counts = noshow_df['Chronic_Disease'].value_counts()
chronic_disease_counts

Chronic_Disease
False    20343
True     20233
Name: count, dtype: int64

In [9]:
mental_health_status_counts = noshow_df['Mental_Health_Status'].value_counts()
mental_health_status_counts

Mental_Health_Status
False    20381
True     20095
Name: count, dtype: int64

In [10]:
employment_status_counts = noshow_df['Employment_Status'].value_counts()
employment_status_counts

Employment_Status
Employed      10192
Retired       10118
Student       10089
Unemployed    10031
Name: count, dtype: int64

In [11]:
education_level_counts = noshow_df['Education_Level'].value_counts()
education_level_counts

# Missing 'None' values

Education_Level
High School        10277
Advanced Degree    10261
College             9872
Name: count, dtype: int64

In [12]:
transportation_access_counts = noshow_df['Transportation_Access'].value_counts()
transportation_access_counts

Transportation_Access
False    20308
True     20212
Name: count, dtype: int64

In [13]:
distance_counts = noshow_df['Distance_from_Facility'].value_counts()
distance_counts

Distance_from_Facility
37.785038    1
7.728993     1
39.040855    1
44.856200    1
44.440638    1
            ..
39.135709    1
12.547019    1
21.085779    1
13.770227    1
21.190068    1
Name: count, Length: 40494, dtype: int64

In [14]:
area_counts = noshow_df['Area_Type'].value_counts()
area_counts

Area_Type
Urban       13586
Rural       13505
Suburban    13475
Name: count, dtype: int64

In [15]:
days_diff_counts = noshow_df['Days Diff'].value_counts()
days_diff_counts

Days Diff
307    113
123    110
106    108
101    106
217    105
      ... 
714      1
723      1
727      1
710      1
713      1
Name: count, Length: 724, dtype: int64

In [16]:
appt_outcome_counts = noshow_df['Appointment_Outcome'].value_counts()
appt_outcome_counts

Appointment_Outcome
Show       22575
No-Show    22467
Name: count, dtype: int64

In [17]:
# Map Appointment Outcome column to 0's and 1's
noshow_df['Appointment_Outcome'] = noshow_df['Appointment_Outcome'].map({'Show':1, 'No-Show':0})

In [18]:
# Convert categorical data to numeric with `pd.get_dummies`
noshow_df_dummies = pd.get_dummies(noshow_df)
noshow_df_dummies.dtypes

Age                                float64
Distance_from_Facility             float64
Days Diff                            int64
Appointment_Outcome                  int64
Gender_Female                         bool
Gender_Male                           bool
Gender_Other                          bool
Insurance_Type_Medicaid               bool
Insurance_Type_Medicare               bool
Insurance_Type_Private                bool
Insurance_Type_Uninsured              bool
Chronic_Disease_False                 bool
Chronic_Disease_True                  bool
Mental_Health_Status_False            bool
Mental_Health_Status_True             bool
Employment_Status_Employed            bool
Employment_Status_Retired             bool
Employment_Status_Student             bool
Employment_Status_Unemployed          bool
Education_Level_Advanced Degree       bool
Education_Level_College               bool
Education_Level_High School           bool
Transportation_Access_False           bool
Transportat

In [19]:
noshow_df_dummies

Unnamed: 0,Age,Distance_from_Facility,Days Diff,Appointment_Outcome,Gender_Female,Gender_Male,Gender_Other,Insurance_Type_Medicaid,Insurance_Type_Medicare,Insurance_Type_Private,...,Employment_Status_Student,Employment_Status_Unemployed,Education_Level_Advanced Degree,Education_Level_College,Education_Level_High School,Transportation_Access_False,Transportation_Access_True,Area_Type_Rural,Area_Type_Suburban,Area_Type_Urban
0,93.0,37.785038,285,1,False,True,False,False,True,False,...,True,False,True,False,False,False,True,False,True,False
1,52.0,,257,0,False,True,False,False,False,False,...,False,False,False,True,False,False,True,True,False,False
2,8.0,27.277654,622,1,False,True,False,False,True,False,...,False,False,True,False,False,False,True,True,False,False
3,5.0,44.618708,366,1,False,False,True,False,True,False,...,True,False,False,True,False,False,True,False,False,False
4,15.0,34.033755,345,1,True,False,False,True,False,False,...,False,True,False,False,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45037,57.0,28.410940,35,1,True,False,False,False,False,True,...,True,False,False,False,True,False,True,False,True,False
45038,48.0,26.576418,113,0,True,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,True
45039,5.0,20.674054,6,0,False,False,True,True,False,False,...,True,False,False,False,False,True,False,False,True,False
45040,53.0,21.190068,231,0,False,True,False,True,False,False,...,False,False,False,True,False,True,False,False,True,False


In [20]:
# Split our preprocessed data into our features and target arrays
X = noshow_df_dummies.drop('Appointment_Outcome', axis=1)
y = noshow_df_dummies['Appointment_Outcome']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

In [21]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [22]:
len(X_train_scaled[0])

26

In [23]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 216       
                                                                 
 dense_1 (Dense)             (None, 5)                 45        
                                                                 
 dense_2 (Dense)             (None, 1)                 6         
                                                                 
Total params: 267
Trainable params: 267
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [25]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [26]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

282/282 - 0s - loss: 0.6931 - accuracy: 0.5052 - 475ms/epoch - 2ms/step
Loss: 0.6931353211402893, Accuracy: 0.5051615238189697
