In [1]:
# Import dependencies and upload the dataset.
# `google.colab` is imported here, so this cell expects a Colab runtime.
from google.colab import files
# The returned mapping is indexed by file name in the next cell to get the raw bytes.
uploaded = files.upload()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
import pandas as pd
import io
import tensorflow as tf
# NOTE(review): `plt` conventionally aliases matplotlib.pyplot; here it is bound
# to the top-level plotly package. Confirm this is intended before using `plt`.
import plotly as plt

Saving urgentcare_clean.csv to urgentcare_clean (2).csv


In [2]:
# Read the uploaded CSV into a DataFrame and preview the first rows.
# Prefer the expected file name, but fall back to the first uploaded entry:
# the upload log shows Colab saving the file under a de-duplicated name
# ("urgentcare_clean (2).csv"), and the key in `uploaded` may follow that saved
# name rather than the original one.
# NOTE(review): confirm the key behaviour for the Colab version in use.
csv_name = "urgentcare_clean.csv"
if not uploaded:
    # Nothing was selected in the file picker; fail with an actionable message.
    raise ValueError("No file was uploaded; re-run the upload cell and choose urgentcare_clean.csv.")
csv_key = csv_name if csv_name in uploaded else next(iter(uploaded))
urgentcare_df = pd.read_csv(io.StringIO(uploaded[csv_key].decode("utf-8")), sep=",")
urgentcare_df.head()

Unnamed: 0,diagnosis_category,diagnosis_sub_category,treatment_category,treatment_sub_category,determination,type,age_range,patient_gender
0,Infectious,Hepatitis,Pharmacy/Prescription Drugs,Anti-virals,Overturned Decision of Health Plan,Medical Necessity,41-50,Male
1,Mental,Eating Disorder,Mental Health Treatment,Residential Treatment Center - Admission,Upheld Decision of Health Plan,Medical Necessity,21-30,Female
2,Autism Spectrum,Autism-PDD-NOS,Autism Related Treatment,Speech Therapy,Upheld Decision of Health Plan,Medical Necessity,0-10,Female
3,Prevention/Good Health,,"Diagnostic Imaging, Screening and Testing",Mammography,Overturned Decision of Health Plan,Experimental/Investigational,65+,Female
4,Prevention/Good Health,,"Diagnostic Imaging, Screening and Testing",Lab Work,Upheld Decision of Health Plan,Experimental/Investigational,21-30,Male


In [3]:
# Count the distinct non-null values found in every column.
urgentcare_df.nunique(axis=0, dropna=True)

diagnosis_category         27
diagnosis_sub_category    145
treatment_category         30
treatment_sub_category    120
determination               2
type                        3
age_range                   7
patient_gender              2
dtype: int64

In [4]:
# View the full list of columns together with their non-null counts and dtypes.
urgentcare_df.info(buf=None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1307 entries, 0 to 1306
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   diagnosis_category      1307 non-null   object
 1   diagnosis_sub_category  1171 non-null   object
 2   treatment_category      1306 non-null   object
 3   treatment_sub_category  1298 non-null   object
 4   determination           1307 non-null   object
 5   type                    1307 non-null   object
 6   age_range               1307 non-null   object
 7   patient_gender          1307 non-null   object
dtypes: object(8)
memory usage: 81.8+ KB


In [5]:
# The diagnosis_category, diagnosis_sub_category, treatment_category and
# treatment_sub_category columns will need their rare values binned into an
# 'Other' category.

# Collect the names of the object-dtype columns, then count the distinct
# values in each of them.
data_cat = [column for column, dtype in urgentcare_df.dtypes.items() if dtype == "object"]
urgentcare_df.loc[:, data_cat].nunique()

diagnosis_category         27
diagnosis_sub_category    145
treatment_category         30
treatment_sub_category    120
determination               2
type                        3
age_range                   7
patient_gender              2
dtype: int64

In [6]:
# 'determination' is the target column: it will be separated from the feature
# columns and used as the label for supervised learning.
# Show how many rows fall into each outcome.
urgentcare_df.loc[:, 'determination'].value_counts()

Overturned Decision of Health Plan    724
Upheld Decision of Health Plan        583
Name: determination, dtype: int64

In [7]:
# Binning starts with diagnosis_category: tally how often each category occurs
# (most frequent first) and keep the counts for the threshold filter below.
diagnoses = urgentcare_df.loc[:, 'diagnosis_category'].value_counts(sort=True, ascending=False)
diagnoses

Infectious                               244
Orthopedic/ Musculoskeletal              187
Mental                                   154
OB-Gyn/ Pregnancy                        108
Prevention/Good Health                   105
Cancer                                    86
Central Nervous System/ Neuromuscular     70
Cardiac/Circulatory                       57
Digestive System/ Gastrointestinal        48
Endocrine/ Metabolic                      36
Autism Spectrum                           31
Skin                                      28
Respiratory System                        23
Immunologic                               21
Genitourinary/ Kidney                     15
Pediatrics                                14
Chronic Pain                              12
Morbid Obesity                            11
Blood Related                              9
Foot                                       8
Ears, Nose, Throat                         8
Not Applicable                             8
Genetic   

In [8]:
# Keep only the diagnosis categories that occur fewer than 45 times; these are
# the categories that will be replaced with 'Other'.
diagnosis_types_to_replace = diagnoses.loc[diagnoses.lt(45)]
diagnosis_types_to_replace

Endocrine/ Metabolic          36
Autism Spectrum               31
Skin                          28
Respiratory System            23
Immunologic                   21
Genitourinary/ Kidney         15
Pediatrics                    14
Chronic Pain                  12
Morbid Obesity                11
Blood Related                  9
Foot                           8
Ears, Nose, Throat             8
Not Applicable                 8
Genetic                        7
Vision                         5
Trauma/Injuries                5
Dental                         5
Post Surgical Complication     2
Name: diagnosis_category, dtype: int64

In [9]:
# Build a boolean row mask for the rows whose diagnosis_category is rare
# (fewer than 45 occurrences) and count how many rows will move into 'Other'.
# The mask is derived from the `diagnoses` counts rather than from the previous
# value of `diagnosis_types_to_replace`: this cell rebinds that name from a
# counts Series to a row mask, so reading `.index` from it on a re-run would
# compare category labels against row numbers and select nothing.
diagnosis_types_to_replace = urgentcare_df['diagnosis_category'].isin(diagnoses[diagnoses < 45].index)
diagnosis_types_to_replace.sum()

248

In [10]:
# Relabel every row flagged by the boolean mask as 'Other', then confirm the
# resulting category counts.
urgentcare_df['diagnosis_category'] = urgentcare_df['diagnosis_category'].mask(
    diagnosis_types_to_replace, 'Other'
)
urgentcare_df.loc[:, 'diagnosis_category'].value_counts()

Other                                    248
Infectious                               244
Orthopedic/ Musculoskeletal              187
Mental                                   154
OB-Gyn/ Pregnancy                        108
Prevention/Good Health                   105
Cancer                                    86
Central Nervous System/ Neuromuscular     70
Cardiac/Circulatory                       57
Digestive System/ Gastrointestinal        48
Name: diagnosis_category, dtype: int64

In [11]:
# Next column to bin: inspect how often each diagnosis_sub_category occurs.
urgentcare_df.loc[:, 'diagnosis_sub_category'].value_counts()

Hepatitis                  233
Other                      196
Female Breast Disorder      47
Back Pain                   43
Breast Cancer               37
                          ... 
Carpal Tunnel Syndrome       1
Urinary Tract Infection      1
Decay/ Cavities              1
Viral Infection              1
Paralysis                    1
Name: diagnosis_sub_category, Length: 145, dtype: int64

In [12]:
# List the bare counts (largest first) to pick a working threshold (20).
urgentcare_df['diagnosis_sub_category'].value_counts().tolist()

[233,
 196,
 47,
 43,
 37,
 29,
 23,
 22,
 22,
 21,
 19,
 19,
 17,
 16,
 16,
 15,
 15,
 13,
 13,
 12,
 12,
 11,
 11,
 11,
 9,
 9,
 9,
 8,
 8,
 7,
 7,
 7,
 6,
 6,
 6,
 6,
 6,
 6,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [13]:
# Tally the occurrences of each diagnosis_sub_category (a Series of counts).
sub_diagnoses = urgentcare_df.loc[:, 'diagnosis_sub_category'].value_counts()

# Keep only the sub-categories seen fewer than 20 times; these will be binned.
sub_diagnosis_types_to_replace = sub_diagnoses.loc[sub_diagnoses.lt(20)]
sub_diagnosis_types_to_replace

Fracture                      19
Substance Abuse/ Addiction    19
Thyroid Problems              17
Multiple Sclerosis            16
Crohn's Disease               16
                              ..
Carpal Tunnel Syndrome         1
Urinary Tract Infection        1
Decay/ Cavities                1
Viral Infection                1
Paralysis                      1
Name: diagnosis_sub_category, Length: 135, dtype: int64

In [14]:
# Build a boolean row mask for the rows whose diagnosis_sub_category is rare
# (fewer than 20 occurrences) and count how many rows will move into 'Other'.
# The mask is derived from the `sub_diagnoses` counts rather than from the
# previous value of `sub_diagnosis_types_to_replace`: this cell rebinds that
# name from a counts Series to a row mask, so reading `.index` from it on a
# re-run would compare labels against row numbers and select nothing.
sub_diagnosis_types_to_replace = urgentcare_df['diagnosis_sub_category'].isin(
    sub_diagnoses[sub_diagnoses < 20].index
)
sub_diagnosis_types_to_replace.sum()

498

In [15]:
# Relabel every row flagged by the boolean mask as 'Other', then confirm the
# resulting sub-category counts.
urgentcare_df['diagnosis_sub_category'] = urgentcare_df['diagnosis_sub_category'].mask(
    sub_diagnosis_types_to_replace, 'Other'
)
urgentcare_df.loc[:, 'diagnosis_sub_category'].value_counts()

Other                     694
Hepatitis                 233
Female Breast Disorder     47
Back Pain                  43
Breast Cancer              37
Autism-PDD-NOS             29
Depression - Severe        23
Eating Disorder            22
Knee Problem               22
Osteoarthritis             21
Name: diagnosis_sub_category, dtype: int64

In [16]:
# Looking at this list, it seems important to perhaps come back and test the
# model without binning, because there are list items that seem infrequent but
# important; or, at least, increase from 10 to 15-18 etc.
# Next column to bin: inspect how often each treatment_category occurs.
urgentcare_df.loc[:, 'treatment_category'].value_counts()

Pharmacy/Prescription Drugs                                       421
Diagnostic Imaging, Screening and Testing                         355
Mental Health Treatment                                           122
Durable Medical Equipment                                          53
Orthopedic                                                         41
Cancer Treatment                                                   30
Autism Related Treatment                                           29
Pain Management                                                    28
General Surgery                                                    28
Electrical/ Thermal/ Radiofreq. Interventions                      25
Reconstructive/Plastic Surgery                                     24
Special Procedure                                                  20
Diagnostic/Physician Evaluation                                    16
Rehabilitation Services - Skilled Nursing Facility - Inpatient     14
Emergency/Urgent Car

In [17]:
# Tally the occurrences of each treatment_category (a Series of counts).
treatments = urgentcare_df.loc[:, 'treatment_category'].value_counts()

# Keep only the categories seen fewer than 10 times; these will be binned.
treatment_types_to_replace = treatments.loc[treatments.lt(10)]
treatment_types_to_replace

Home Health Care                   9
Ear, Nose and Throat Procedures    8
Dental/Orthodontic                 7
OB/GYN Procedures                  7
Urology                            5
Neurosugery                        4
Alternative Treatment              3
Not Applicable                     2
Vision                             2
Chiropractic                       2
Ophthalmology                      2
Preventive Health Screening        1
Name: treatment_category, dtype: int64

In [18]:
# Build a boolean row mask for the rows whose treatment_category is rare
# (fewer than 10 occurrences) and count how many rows will move into 'Other'.
# The mask is derived from the `treatments` counts rather than from the
# previous value of `treatment_types_to_replace`: this cell rebinds that name
# from a counts Series to a row mask, so reading `.index` from it on a re-run
# would compare labels against row numbers and select nothing.
treatment_types_to_replace = urgentcare_df['treatment_category'].isin(treatments[treatments < 10].index)
treatment_types_to_replace.sum()

52

In [19]:
# Relabel every row flagged by the boolean mask as 'Other', then confirm the
# resulting category counts.
urgentcare_df['treatment_category'] = urgentcare_df['treatment_category'].mask(
    treatment_types_to_replace, 'Other'
)
urgentcare_df.loc[:, 'treatment_category'].value_counts()

Pharmacy/Prescription Drugs                                       421
Diagnostic Imaging, Screening and Testing                         355
Mental Health Treatment                                           122
Durable Medical Equipment                                          53
Other                                                              52
Orthopedic                                                         41
Cancer Treatment                                                   30
Autism Related Treatment                                           29
General Surgery                                                    28
Pain Management                                                    28
Electrical/ Thermal/ Radiofreq. Interventions                      25
Reconstructive/Plastic Surgery                                     24
Special Procedure                                                  20
Diagnostic/Physician Evaluation                                    16
Rehabilitation Servi

In [20]:
# List the bare counts (largest first) to pick a working threshold (19).
urgentcare_df['treatment_sub_category'].value_counts().tolist()

[237,
 233,
 171,
 83,
 47,
 30,
 23,
 22,
 19,
 18,
 16,
 15,
 15,
 14,
 14,
 12,
 11,
 10,
 10,
 10,
 10,
 9,
 9,
 9,
 8,
 8,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 6,
 6,
 5,
 5,
 5,
 5,
 5,
 4,
 4,
 4,
 4,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [21]:
# Tally the occurrences of each treatment_sub_category (a Series of counts).
sub_treatments = urgentcare_df.loc[:, 'treatment_sub_category'].value_counts()

# Keep only the sub-categories seen 18 times or fewer; these will be binned.
sub_treatment_types_to_replace = sub_treatments.loc[sub_treatments.le(18)]
sub_treatment_types_to_replace

Partial Hospitalization                            18
X-Ray                                              16
Hormones                                           15
ABA-Applied Behavioral Analysis                    15
Skin Treatment                                     14
                                                   ..
Eye Test                                            1
Compression Garments                                1
EECP-Enhanced External Counter Pulsation Device     1
Psychotherapy                                       1
Medical/Surgical Unit                               1
Name: treatment_sub_category, Length: 111, dtype: int64

In [22]:
# Build a boolean row mask for the rows whose treatment_sub_category is rare
# (18 occurrences or fewer) and count how many rows will move into 'Other'.
# This number is a little large; it may be worth revisiting the cutoff.
# The mask is derived from the `sub_treatments` counts rather than from the
# previous value of `sub_treatment_types_to_replace`: this cell rebinds that
# name from a counts Series to a row mask, so reading `.index` from it on a
# re-run would compare labels against row numbers and select nothing.
sub_treatment_types_to_replace = urgentcare_df['treatment_sub_category'].isin(
    sub_treatments[sub_treatments <= 18].index
)
sub_treatment_types_to_replace.sum()

433

In [23]:
# Relabel every row flagged by the boolean mask as 'Other', then confirm the
# resulting sub-category counts.
urgentcare_df['treatment_sub_category'] = urgentcare_df['treatment_sub_category'].mask(
    sub_treatment_types_to_replace, 'Other'
)
urgentcare_df.loc[:, 'treatment_sub_category'].value_counts()

Other                                       670
Anti-virals                                 233
Mammography                                 171
Lab Work                                     83
Residential Treatment Center - Admission     47
Acute Psychiatric Facility Admission         30
MRI                                          23
Analgesics                                   22
Arthritis Medications                        19
Name: treatment_sub_category, dtype: int64

In [24]:
# The 'type' column is left unchanged (no binning); just show its value counts.
urgentcare_df.loc[:, 'type'].value_counts()

Medical Necessity               821
Experimental/Investigational    468
Urgent Care                      18
Name: type, dtype: int64

In [25]:
# First step of splitting the target from the features: make the target binary
# (1 = "Overturned Decision of Health Plan", 0 = any other value).
import numpy as np

# Encode only while the column is not already made up of 0/1 values. After the
# first run no value equals the label string any more, so an unguarded re-run
# of np.where would silently overwrite every row with 0.
if not urgentcare_df['determination'].isin([0, 1]).all():
    urgentcare_df['determination'] = np.where(
        urgentcare_df["determination"] == "Overturned Decision of Health Plan", 1, 0
    )
urgentcare_df['determination'].value_counts()

1    724
0    583
Name: determination, dtype: int64

In [26]:
# Convert the categorical columns to numeric indicator columns with
# `pd.get_dummies`. `dtype=int` pins the indicators to 0/1 integers so the
# encoded frame looks the same regardless of the pandas version's default
# indicator dtype.
# NOTE(review): `dummy_na` is not set, so missing values (the .info() output
# shows nulls in three columns) presumably get no indicator column of their
# own — confirm this is the intended treatment.
dummies_df = pd.get_dummies(urgentcare_df, dtype=int)
dummies_df.head()

Unnamed: 0,determination,diagnosis_category_Cancer,diagnosis_category_Cardiac/Circulatory,diagnosis_category_Central Nervous System/ Neuromuscular,diagnosis_category_Digestive System/ Gastrointestinal,diagnosis_category_Infectious,diagnosis_category_Mental,diagnosis_category_OB-Gyn/ Pregnancy,diagnosis_category_Orthopedic/ Musculoskeletal,diagnosis_category_Other,...,type_Urgent Care,age_range_0-10,age_range_11_20,age_range_21-30,age_range_31-40,age_range_41-50,age_range_51-64,age_range_65+,patient_gender_Female,patient_gender_Male
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [28]:
# Now assign the target (y) and the features (X): y is the 'determination'
# column as an array, X is every remaining column.
y = dummies_df['determination'].to_numpy()
X = dummies_df.drop(columns='determination')

In [29]:
# Hold out 33% of the preprocessed rows for testing; the fixed random_state
# keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [30]:
# Create a StandardScaler instance.
scaler = StandardScaler()

# Fit the scaler on the training features only and scale them in one step.
X_train_scaled = scaler.fit_transform(X_train)

# Keep the fitted scaler under the name `X_scaler` as well.
X_scaler = scaler

# Scale the test features with the statistics learned from the training data.
X_test_scaled = X_scaler.transform(X_test)

In [31]:
# Define, compile, train and evaluate a deep neural net: two ReLU hidden layers
# followed by a single sigmoid output unit for the binary target.

# Seed Python, NumPy and TensorFlow so that repeated runs start from the same
# random state, in line with the fixed random_state used for the data split.
tf.keras.utils.set_random_seed(42)

# Number of input features and hidden nodes for each layer.
input_layer = X_train_scaled.shape[1]
hidden_layer_1 = 350
hidden_layer_2 = 30

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input width)
nn.add(tf.keras.layers.Dense(units=hidden_layer_1, activation="relu", input_dim=input_layer))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_layer_2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()


# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])


# Train the model on the scaled training data
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

# Evaluate the model using the scaled test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 350)               21350     
                                                                 
 dense_1 (Dense)             (None, 30)                10530     
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 31,911
Trainable params: 31,911
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
