In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt

# Load the appointment data set from the CSV file named by `path`
path = "significant_df.csv"
noshow_df = pd.read_csv(filepath_or_buffer=path)

# Show the loaded frame as the cell output
noshow_df

Unnamed: 0,Patient_ID,Age,Gender,Insurance_Type,Chronic_Disease,Mental_Health_Status,Employment_Status,Education_Level,Transportation_Access,Distance_from_Facility,Area_Type,Booking_Date,Appointment_Date,Days Diff,Appointment_Outcome
0,75763,93.0,Male,Medicare,False,True,Student,Advanced Degree,True,37.785038,Suburban,2023-11-04,2024-08-15,285,Show
1,84945,52.0,Male,Uninsured,False,False,Employed,College,True,,Rural,2023-10-28,2024-07-11,257,No-Show
2,88559,8.0,Male,Medicare,True,,,Advanced Degree,True,27.277654,Rural,2023-02-12,2024-10-26,622,Show
3,72217,5.0,Other,Medicare,False,True,Student,College,True,44.618708,,2023-01-30,2024-01-31,366,Show
4,34403,15.0,Female,Medicaid,,False,Unemployed,High School,True,34.033755,Suburban,2023-08-13,2024-07-23,345,Show
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45037,91361,57.0,Female,Private,True,False,Student,High School,True,28.410940,Suburban,2024-01-26,2024-03-01,35,Show
45038,16333,48.0,Female,,True,True,Student,,True,26.576418,Urban,2023-11-24,2024-03-16,113,No-Show
45039,11153,5.0,Other,Medicaid,True,True,Student,,False,20.674054,Suburban,2023-11-29,2023-12-05,6,No-Show
45040,41988,53.0,Male,Medicaid,False,False,Employed,College,False,21.190068,Suburban,2023-05-02,2023-12-19,231,No-Show


In [2]:
# Drop the patient identifier and the two date columns
noshow_df = noshow_df.drop(
    ['Patient_ID', 'Booking_Date', 'Appointment_Date'],
    axis="columns",
)

In [3]:
# Preprocessing
# Missing values are filled by assigning the filled column back to the frame.
# Calling `fillna(..., inplace=True)` on a column selected with `df[col]` acts
# on an intermediate object: pandas 2.x emits a FutureWarning for that pattern,
# and with Copy-on-Write enabled it leaves `noshow_df` unchanged.

# Numeric columns: impute with the column median
for column in ['Age', 'Distance_from_Facility']:
    noshow_df[column] = noshow_df[column].fillna(noshow_df[column].median())

# Remaining feature columns: mark missing entries with an explicit 'Unknown' level
for column in [
    'Gender',
    'Insurance_Type',
    'Chronic_Disease',
    'Mental_Health_Status',
    'Employment_Status',
    'Education_Level',
    'Transportation_Access',
    'Area_Type',
]:
    noshow_df[column] = noshow_df[column].fillna('Unknown')

# Encode the target: 'Show' -> 1, 'No-Show' -> 0.
# NOTE: any value not in the mapping becomes NaN, including the already-encoded
# 1/0 if this cell is executed a second time on the same frame.
noshow_df['Appointment_Outcome'] = noshow_df['Appointment_Outcome'].map({'Show': 1, 'No-Show': 0})

In [4]:
# Display the current state of the frame
noshow_df

Unnamed: 0,Age,Gender,Insurance_Type,Chronic_Disease,Mental_Health_Status,Employment_Status,Education_Level,Transportation_Access,Distance_from_Facility,Area_Type,Days Diff,Appointment_Outcome
0,93.0,Male,Medicare,False,True,Student,Advanced Degree,True,37.785038,Suburban,285,1
1,52.0,Male,Uninsured,False,False,Employed,College,True,25.101971,Rural,257,0
2,8.0,Male,Medicare,True,Unknown,Unknown,Advanced Degree,True,27.277654,Rural,622,1
3,5.0,Other,Medicare,False,True,Student,College,True,44.618708,Unknown,366,1
4,15.0,Female,Medicaid,Unknown,False,Unemployed,High School,True,34.033755,Suburban,345,1
...,...,...,...,...,...,...,...,...,...,...,...,...
45037,57.0,Female,Private,True,False,Student,High School,True,28.410940,Suburban,35,1
45038,48.0,Female,Unknown,True,True,Student,Unknown,True,26.576418,Urban,113,0
45039,5.0,Other,Medicaid,True,True,Student,Unknown,False,20.674054,Suburban,6,0
45040,53.0,Male,Medicaid,False,False,Employed,College,False,21.190068,Suburban,231,0


In [5]:
# Count the missing values in each column
noshow_df.isnull().sum()

Age                       0
Gender                    0
Insurance_Type            0
Chronic_Disease           0
Mental_Health_Status      0
Employment_Status         0
Education_Level           0
Transportation_Access     0
Distance_from_Facility    0
Area_Type                 0
Days Diff                 0
Appointment_Outcome       0
dtype: int64

In [6]:
# Show the dtype of each column
noshow_df.dtypes

Age                       float64
Gender                     object
Insurance_Type             object
Chronic_Disease            object
Mental_Health_Status       object
Employment_Status          object
Education_Level            object
Transportation_Access      object
Distance_from_Facility    float64
Area_Type                  object
Days Diff                   int64
Appointment_Outcome         int64
dtype: object

In [7]:
# Count the distinct values in each column
noshow_df.nunique()

Age                         101
Gender                        4
Insurance_Type                5
Chronic_Disease               3
Mental_Health_Status          3
Employment_Status             5
Education_Level               4
Transportation_Access         3
Distance_from_Facility    40495
Area_Type                     4
Days Diff                   724
Appointment_Outcome           2
dtype: int64

In [8]:
# Replace the numeric Age column with one of four age-group labels
age_bins = [0, 24, 50, 75, 100]
age_labels = ['0-24', '25-50', '51-75', '76-100']

# pd.cut intervals are closed on the right; include_lowest=True also places a
# value equal to the first edge (0) in the first group. Values outside the
# outer edges fall in no interval and become NaN.
noshow_df['Age'] = pd.cut(
    x=noshow_df['Age'],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)

In [9]:
# Replace the numeric Distance_from_Facility column with one of five range labels
distance_bins = [0, 10, 20, 30, 40, 50]
distance_labels = ['0-10','11-20','21-30','31-40','41-50']

# pd.cut intervals are closed on the right; include_lowest=True also places a
# value equal to the first edge (0) in the first group. Values outside the
# outer edges fall in no interval and become NaN.
noshow_df['Distance_from_Facility'] = pd.cut(
    x=noshow_df['Distance_from_Facility'],
    bins=distance_bins,
    labels=distance_labels,
    include_lowest=True,
)

In [10]:
# Replace the numeric 'Days Diff' column with one of eight 100-day range labels
days_diff_bins = [0, 100, 200, 300, 400, 500, 600, 700, 800]
days_diff_labels = ['0-100','101-200', '201-300', '301-400', '401-500', '501-600', '601-700', '701-800']

# pd.cut intervals are closed on the right; include_lowest=True also places a
# value equal to the first edge (0) in the first group. Values outside the
# outer edges fall in no interval and become NaN.
noshow_df['Days Diff'] = pd.cut(
    x=noshow_df['Days Diff'],
    bins=days_diff_bins,
    labels=days_diff_labels,
    include_lowest=True,
)

In [11]:
# Count the distinct values in each column again, now that the numeric columns are binned
noshow_df.nunique()

Age                       4
Gender                    4
Insurance_Type            5
Chronic_Disease           3
Mental_Health_Status      3
Employment_Status         5
Education_Level           4
Transportation_Access     3
Distance_from_Facility    5
Area_Type                 4
Days Diff                 8
Appointment_Outcome       2
dtype: int64

In [12]:
#for column in noshow_df.columns:
#    plt.figure(figsize=(12, 4))

    # Check if the column is numeric or categorical
#    if pd.api.types.is_numeric_dtype(noshow_df[column]):
#        sns.histplot(data=noshow_df[column], kde=True)
#    else:
#        sns.countplot(data=noshow_df, x=column)

#    plt.title(f'Distribution of {column}')
#    plt.show()

In [13]:
# One-hot encode the non-numeric columns with `pd.get_dummies`
noshow_df_dummies = pd.get_dummies(data=noshow_df)

# Inspect the dtypes of the encoded frame
noshow_df_dummies.dtypes

Appointment_Outcome                int64
Age_0-24                            bool
Age_25-50                           bool
Age_51-75                           bool
Age_76-100                          bool
Gender_Female                       bool
Gender_Male                         bool
Gender_Other                        bool
Gender_Unknown                      bool
Insurance_Type_Medicaid             bool
Insurance_Type_Medicare             bool
Insurance_Type_Private              bool
Insurance_Type_Uninsured            bool
Insurance_Type_Unknown              bool
Chronic_Disease_False               bool
Chronic_Disease_True                bool
Chronic_Disease_Unknown             bool
Mental_Health_Status_False          bool
Mental_Health_Status_True           bool
Mental_Health_Status_Unknown        bool
Employment_Status_Employed          bool
Employment_Status_Retired           bool
Employment_Status_Student           bool
Employment_Status_Unemployed        bool
Employment_Statu

In [14]:
# Count the missing values in each column of the encoded frame
noshow_df_dummies.isnull().sum()

Appointment_Outcome                0
Age_0-24                           0
Age_25-50                          0
Age_51-75                          0
Age_76-100                         0
Gender_Female                      0
Gender_Male                        0
Gender_Other                       0
Gender_Unknown                     0
Insurance_Type_Medicaid            0
Insurance_Type_Medicare            0
Insurance_Type_Private             0
Insurance_Type_Uninsured           0
Insurance_Type_Unknown             0
Chronic_Disease_False              0
Chronic_Disease_True               0
Chronic_Disease_Unknown            0
Mental_Health_Status_False         0
Mental_Health_Status_True          0
Mental_Health_Status_Unknown       0
Employment_Status_Employed         0
Employment_Status_Retired          0
Employment_Status_Student          0
Employment_Status_Unemployed       0
Employment_Status_Unknown          0
Education_Level_Advanced Degree    0
Education_Level_College            0
E

In [15]:
# Separate the target column from the feature columns
y = noshow_df_dummies['Appointment_Outcome']
X = noshow_df_dummies.drop(columns=['Appointment_Outcome'])

# Hold out 30% of the rows for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=50,
)

In [16]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler on the training features only
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split, then to the test split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [17]:
# Length of the first row of the scaled training data, i.e. the number of input features
len(X_train_scaled[0])

48

In [18]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# The feature count is read from the column dimension of the scaled training
# array, so it does not depend on indexing the first row.
number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 20
hidden_nodes_layer2 = 10

nn = tf.keras.models.Sequential()

# Declare the input shape with an explicit Input object. The legacy
# `input_dim` keyword on the first Dense layer triggers a warning in Keras 3,
# which recommends an Input object as the first layer of a Sequential model.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: a single sigmoid unit for the binary target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 20)                980       
                                                                 
 dense_1 (Dense)             (None, 10)                210       
                                                                 
 dense_2 (Dense)             (None, 1)                 11        
                                                                 
Total params: 1,201
Trainable params: 1,201
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Compile the model: binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [20]:
# Train the model for 50 epochs on the scaled training data and keep what `fit` returns
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=50,
)

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [21]:
# Score the trained model on the held-out test data and report loss and accuracy
evaluation = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

423/423 - 1s - loss: 0.7254 - accuracy: 0.4954 - 535ms/epoch - 1ms/step
Loss: 0.725372314453125, Accuracy: 0.4953748285770416
