# Healthcare ML Model
## Author: Aron Gu
## Date: October 19, 2024

In [89]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error, mean_absolute_error, roc_curve, auc, roc_auc_score, f1_score, recall_score, precision_score, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

## Load the Healthcare Dataset

In [90]:
# Render every column of a frame instead of eliding the middle ones with "...".
pd.set_option("display.max_columns", None)

# Read the preprocessed healthcare records (path is relative to the working directory).
health_data = pd.read_csv(filepath_or_buffer="Healthcare_Data_Preprocessed.csv")

## Analyze Healthcare Dataset

### Display Rows of Dataset

In [91]:
# Preview the first five records of the dataset.
health_data.head(n=5)

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,Target,Smoking,Alcohol,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type_Vegan,Diet_Type_Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
0,37.0,26.0,111.0,198.0,99.0,72.0,37.0,37.0,37.0,37.0,1,2,2,1,2,1,0,1,False,True,True,False,False
1,37.0,24.0,121.0,199.0,103.0,75.0,37.0,37.0,37.0,37.0,1,0,1,1,2,1,2,2,False,False,True,False,False
2,81.0,27.0,147.0,203.0,100.0,74.0,81.0,81.0,81.0,81.0,0,2,1,2,0,0,1,0,True,False,False,False,False
3,25.0,21.0,150.0,199.0,102.0,70.0,25.0,25.0,25.0,25.0,0,2,0,1,2,1,2,0,True,False,False,True,False
4,24.0,26.0,146.0,202.0,99.0,76.0,24.0,24.0,24.0,24.0,0,0,1,2,0,2,0,2,False,True,False,True,False


In [92]:
# Preview the last five records of the dataset.
health_data.tail(n=5)

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,Target,Smoking,Alcohol,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type_Vegan,Diet_Type_Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
9995,5.0,22.0,109.0,203.0,98.0,75.0,5.0,5.0,5.0,5.0,0,2,0,0,2,2,1,0,True,False,True,False,False
9996,94.0,26.0,144.0,203.0,96.0,72.0,94.0,94.0,94.0,94.0,0,0,2,1,0,1,0,2,False,True,False,True,False
9997,37.0,23.0,185.0,198.0,103.0,72.0,37.0,37.0,37.0,37.0,0,1,0,1,0,2,0,1,True,False,True,False,False
9998,50.0,29.0,166.0,200.0,100.0,74.0,50.0,50.0,50.0,50.0,0,1,1,2,0,0,1,1,True,False,True,False,False
9999,69.0,29.0,178.0,203.0,100.0,75.0,69.0,69.0,69.0,69.0,0,2,0,1,2,1,2,0,False,False,True,False,False


### Get Dimensions/Shape of Data Set

In [93]:
# (number of rows, number of columns) of the loaded frame
health_data.shape

(10000, 23)

### Get Data Types for Each Column

In [94]:
# Storage dtype of every column (float64, int64 and bool columns are handled differently below)
health_data.dtypes

Age                     float64
BMI                     float64
Blood_Pressure          float64
Cholesterol             float64
Glucose_Level           float64
Heart_Rate              float64
Sleep_Hours             float64
Exercise_Hours          float64
Water_Intake            float64
Stress_Level            float64
Target                    int64
Smoking                   int64
Alcohol                   int64
Diet                      int64
MentalHealth              int64
PhysicalActivity          int64
MedicalHistory            int64
Allergies                 int64
Diet_Type_Vegan            bool
Diet_Type_Vegetarian       bool
Blood_Group_AB             bool
Blood_Group_B              bool
Blood_Group_O              bool
dtype: object

### Get Numerical Columns

In [95]:
# Numerical features are the float64 columns only; the int64 columns are
# excluded because this dataset treats them as categorical.
health_data.select_dtypes(include=['float64']).dtypes

Age               float64
BMI               float64
Blood_Pressure    float64
Cholesterol       float64
Glucose_Level     float64
Heart_Rate        float64
Sleep_Hours       float64
Exercise_Hours    float64
Water_Intake      float64
Stress_Level      float64
dtype: object

### Get Categorical Columns

In [96]:
# Categorical features are the int64 columns, because this dataset stores
# its categories as integer values.
health_data.select_dtypes(include=['int64']).dtypes

Target              int64
Smoking             int64
Alcohol             int64
Diet                int64
MentalHealth        int64
PhysicalActivity    int64
MedicalHistory      int64
Allergies           int64
dtype: object

### Get General Data Information

In [97]:
# Per-column overview: index range, non-null count and dtype of each column
health_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   10000 non-null  float64
 1   BMI                   10000 non-null  float64
 2   Blood_Pressure        10000 non-null  float64
 3   Cholesterol           10000 non-null  float64
 4   Glucose_Level         10000 non-null  float64
 5   Heart_Rate            10000 non-null  float64
 6   Sleep_Hours           10000 non-null  float64
 7   Exercise_Hours        10000 non-null  float64
 8   Water_Intake          10000 non-null  float64
 9   Stress_Level          10000 non-null  float64
 10  Target                10000 non-null  int64  
 11  Smoking               10000 non-null  int64  
 12  Alcohol               10000 non-null  int64  
 13  Diet                  10000 non-null  int64  
 14  MentalHealth          10000 non-null  int64  
 15  PhysicalActivity    

### Get Summary on Dataset

In [98]:
# Summary statistics (count, mean, std, quartiles, min/max) for the float64 features
health_data.describe(include=['float64'])

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,40.228,25.7115,131.0273,199.2521,100.1157,73.5314,41.344,41.344,41.344,41.344
std,24.350238,1.944594,27.631176,2.105941,2.211471,1.724329,27.791396,27.791396,27.791396,27.791396
min,0.0,19.0,22.0,192.0,93.0,67.0,0.0,0.0,0.0,0.0
25%,25.0,24.0,115.0,198.0,99.0,72.0,25.0,25.0,25.0,25.0
50%,37.0,26.0,135.0,199.0,100.0,74.0,37.0,37.0,37.0,37.0
75%,49.0,27.0,150.0,201.0,102.0,75.0,49.0,49.0,49.0,49.0
max,100.0,32.0,225.0,207.0,107.0,80.0,201.0,201.0,201.0,201.0


In [99]:
# Frequency table for every integer-coded (categorical) column.
int_cols = health_data.select_dtypes(include=['int64'])
for _, column_values in int_cols.items():
    # One print per column: header, the counts, then a "\n" line as a visual gap.
    print("Counts:", column_values.value_counts(), "\n", sep="\n")

Counts:
Target
0    5001
1    4999
Name: count, dtype: int64


Counts:
Smoking
0    3354
1    3347
2    3299
Name: count, dtype: int64


Counts:
Alcohol
0    3370
1    3333
2    3297
Name: count, dtype: int64


Counts:
Diet
2    3358
1    3339
0    3303
Name: count, dtype: int64


Counts:
MentalHealth
0    3405
2    3372
1    3223
Name: count, dtype: int64


Counts:
PhysicalActivity
1    3440
2    3283
0    3277
Name: count, dtype: int64


Counts:
MedicalHistory
1    3375
2    3322
0    3303
Name: count, dtype: int64


Counts:
Allergies
0    3366
1    3337
2    3297
Name: count, dtype: int64




In [100]:
# Summary (count, unique, most frequent value and its frequency) for the
# boolean columns, i.e. the features that arrive already one-hot encoded
health_data.describe(include=['bool'])

Unnamed: 0,Diet_Type_Vegan,Diet_Type_Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
count,10000,10000,10000,10000,10000
unique,2,2,2,2,2
top,False,False,False,False,False
freq,6693,6640,7550,7471,7462


### Identify Missing Values in Dataset

In [101]:
# Number of missing (NaN) entries in each column
health_data.isna().sum()

Age                     0
BMI                     0
Blood_Pressure          0
Cholesterol             0
Glucose_Level           0
Heart_Rate              0
Sleep_Hours             0
Exercise_Hours          0
Water_Intake            0
Stress_Level            0
Target                  0
Smoking                 0
Alcohol                 0
Diet                    0
MentalHealth            0
PhysicalActivity        0
MedicalHistory          0
Allergies               0
Diet_Type_Vegan         0
Diet_Type_Vegetarian    0
Blood_Group_AB          0
Blood_Group_B           0
Blood_Group_O           0
dtype: int64

- The dataset contains no missing (NaN) values. However, according to the problem statement, some entries in the numerical columns are invalid: their values are recorded as either 0 or -0.

In [102]:
# Names of the float64 (numerical) features
float_cols = health_data.select_dtypes(include='float64').columns

# Show every record in which at least one float64 feature is zero or negative
health_data.loc[health_data[float_cols].le(0).any(axis='columns')]


Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,Target,Smoking,Alcohol,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type_Vegan,Diet_Type_Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
93,0.0,24.0,106.0,198.0,98.0,74.0,0.0,0.0,0.0,0.0,1,2,2,2,2,1,2,2,False,False,False,False,False
227,0.0,27.0,148.0,197.0,104.0,74.0,0.0,0.0,0.0,0.0,1,2,2,2,2,1,1,0,False,False,False,True,False
255,-0.0,27.0,104.0,198.0,102.0,75.0,-0.0,-0.0,-0.0,-0.0,1,2,0,0,2,1,0,2,False,False,False,False,False
464,0.0,25.0,132.0,198.0,102.0,74.0,0.0,0.0,0.0,0.0,0,2,0,2,1,1,1,0,True,False,True,False,False
568,0.0,27.0,155.0,197.0,103.0,73.0,0.0,0.0,0.0,0.0,1,2,0,2,0,0,2,2,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9674,0.0,25.0,64.0,198.0,99.0,71.0,0.0,0.0,0.0,0.0,1,1,1,0,1,1,1,0,True,False,False,True,False
9852,0.0,23.0,126.0,202.0,98.0,74.0,0.0,0.0,0.0,0.0,0,1,1,0,2,1,1,1,False,True,True,False,False
9868,0.0,25.0,155.0,199.0,102.0,74.0,0.0,0.0,0.0,0.0,1,1,1,1,2,2,2,0,False,True,False,False,True
9894,-0.0,27.0,129.0,199.0,102.0,75.0,-0.0,-0.0,-0.0,-0.0,0,2,2,1,0,0,1,2,True,False,False,True,False


- The dataset has 96 rows in which some of the float column values are either 0 or -0. These values are filled in with median imputation: each one is replaced by the median of the non-zero entries of its column (the median is used since age cannot be a decimal number).

In [103]:
# Per-column median computed over strictly positive entries only:
# non-positive values are blanked out first, and median() skips the blanks.
medians = health_data[float_cols].where(lambda frame: frame > 0).median()

# Overwrite every value <= 0 with the median of its own column
# (axis=1 lines the medians up with the column labels).
health_data[float_cols] = health_data[float_cols].mask(
    lambda frame: frame <= 0, medians, axis=1
)

## Convert int64 Columns to categorical

In [104]:
# Integer-coded columns hold category labels, not quantities
int_cols = health_data.select_dtypes(include='int64').columns

# Re-type each of those columns as 'category' via an explicit column -> dtype mapping
health_data[int_cols] = health_data[int_cols].astype(dict.fromkeys(int_cols, 'category'))
health_data.head(n=5)

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,Target,Smoking,Alcohol,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type_Vegan,Diet_Type_Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
0,37.0,26.0,111.0,198.0,99.0,72.0,37.0,37.0,37.0,37.0,1,2,2,1,2,1,0,1,False,True,True,False,False
1,37.0,24.0,121.0,199.0,103.0,75.0,37.0,37.0,37.0,37.0,1,0,1,1,2,1,2,2,False,False,True,False,False
2,81.0,27.0,147.0,203.0,100.0,74.0,81.0,81.0,81.0,81.0,0,2,1,2,0,0,1,0,True,False,False,False,False
3,25.0,21.0,150.0,199.0,102.0,70.0,25.0,25.0,25.0,25.0,0,2,0,1,2,1,2,0,True,False,False,True,False
4,24.0,26.0,146.0,202.0,99.0,76.0,24.0,24.0,24.0,24.0,0,0,1,2,0,2,0,2,False,True,False,True,False


## Segregating Variables: Independent and Dependent Variables

In [105]:
# Dependent variable: the label to predict
y = health_data.Target
# Independent variables: every column except the label
X = health_data.drop(columns='Target')
(X.shape, y.shape)

((10000, 22), (10000,))

## One-Hot Encode the Categorical Columns in Independent Features

In [106]:
# Columns of X that carry the 'category' dtype
categorical_cols = X.select_dtypes(include='category').columns

# Expand each categorical column into one indicator column per level
# (all levels are kept; none is dropped as a reference level)
X_encoded = pd.get_dummies(data=X, columns=categorical_cols)

X_encoded.head(n=5)

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,Diet_Type_Vegan,Diet_Type_Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O,Smoking_0,Smoking_1,Smoking_2,Alcohol_0,Alcohol_1,Alcohol_2,Diet_0,Diet_1,Diet_2,MentalHealth_0,MentalHealth_1,MentalHealth_2,PhysicalActivity_0,PhysicalActivity_1,PhysicalActivity_2,MedicalHistory_0,MedicalHistory_1,MedicalHistory_2,Allergies_0,Allergies_1,Allergies_2
0,37.0,26.0,111.0,198.0,99.0,72.0,37.0,37.0,37.0,37.0,False,True,True,False,False,False,False,True,False,False,True,False,True,False,False,False,True,False,True,False,True,False,False,False,True,False
1,37.0,24.0,121.0,199.0,103.0,75.0,37.0,37.0,37.0,37.0,False,False,True,False,False,True,False,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,True,False,False,True
2,81.0,27.0,147.0,203.0,100.0,74.0,81.0,81.0,81.0,81.0,True,False,False,False,False,False,False,True,False,True,False,False,False,True,True,False,False,True,False,False,False,True,False,True,False,False
3,25.0,21.0,150.0,199.0,102.0,70.0,25.0,25.0,25.0,25.0,True,False,False,True,False,False,False,True,True,False,False,False,True,False,False,False,True,False,True,False,False,False,True,True,False,False
4,24.0,26.0,146.0,202.0,99.0,76.0,24.0,24.0,24.0,24.0,False,True,False,True,False,True,False,False,False,True,False,False,False,True,True,False,False,False,False,True,True,False,False,False,False,True


## Split Data into Train and Test Sets

In [107]:
# Split the one-hot encoded features (X_encoded), not the raw frame X.
# Splitting X would leave the encoding step above unused and pass the nominal
# category codes (0/1/2) to the model as if they were ordered numbers.
# - stratify=y keeps the class proportions of Target the same in both partitions.
# - random_state=42 makes the shuffle reproducible.
# - no test_size is given, so scikit-learn's default split proportion applies.
# NOTE(review): the output names, including the inconsistently capitalised
# `test_Y`, are kept unchanged because later cells may reference them.
train_X, test_X, train_y, test_Y = train_test_split(
    X_encoded, y, random_state=42, stratify=y
)