# Healthcare Logistic Regression Model
## Author: Aron Gu
## Date: October 26, 2024

In [70]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, r2_score, accuracy_score, mean_squared_error, mean_absolute_error, roc_curve, auc, roc_auc_score, f1_score, recall_score, precision_score, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

## Load the Healthcare Dataset

In [71]:
# Show every column when a DataFrame is rendered
pd.options.display.max_columns = None

# Read the preprocessed healthcare data into a DataFrame
health_data = pd.read_csv("Healthcare_Dataset_Preprocessed.csv")

## Analyze Healthcare Dataset

### Display Rows of Dataset

In [72]:
# Preview the first five rows
health_data.head(n=5)

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,Target,Smoking,Alcohol,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type_Vegan,Diet_Type_Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
0,2.0,26.0,111.0,198.0,99.0,72.0,4.0,1.0,5.0,5.0,1,2,2,1,2,1,0,1,0,1,1,0,0
1,8.0,24.0,121.0,199.0,103.0,75.0,2.0,1.0,2.0,9.0,1,0,1,1,2,1,2,2,0,0,1,0,0
2,81.0,27.0,147.0,203.0,100.0,74.0,10.0,-0.0,5.0,1.0,0,2,1,2,0,0,1,0,1,0,0,0,0
3,25.0,21.0,150.0,199.0,102.0,70.0,7.0,3.0,3.0,3.0,0,2,0,1,2,1,2,0,1,0,0,1,0
4,24.0,26.0,146.0,202.0,99.0,76.0,10.0,2.0,5.0,1.0,0,0,1,2,0,2,0,2,0,1,0,1,0


In [73]:
# Preview the last five rows
health_data.tail(n=5)

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,Target,Smoking,Alcohol,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type_Vegan,Diet_Type_Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
9544,5.0,22.0,109.0,203.0,98.0,75.0,8.0,1.0,6.0,0.0,0,2,0,0,2,2,1,0,1,0,1,0,0
9545,94.0,26.0,144.0,203.0,96.0,72.0,8.0,4.0,2.0,1.0,0,0,2,1,0,1,0,2,0,1,0,1,0
9546,10.0,23.0,185.0,198.0,103.0,72.0,4.0,5.0,5.0,6.0,0,1,0,1,0,2,0,1,1,0,1,0,0
9547,50.0,29.0,166.0,200.0,100.0,74.0,8.0,2.0,3.0,4.0,0,1,1,2,0,0,1,1,1,0,1,0,0
9548,69.0,29.0,178.0,203.0,100.0,75.0,10.0,-0.0,6.0,2.0,0,2,0,1,2,1,2,0,0,0,1,0,0


### Get Dimensions/Shape of Data Set

In [74]:
# Dimensions of the dataset as a (rows, columns) tuple
health_data.shape

(9549, 23)

### Get Data Types for Each Column

In [75]:
# Data type of every column
health_data.dtypes

Age                     float64
BMI                     float64
Blood_Pressure          float64
Cholesterol             float64
Glucose_Level           float64
Heart_Rate              float64
Sleep_Hours             float64
Exercise_Hours          float64
Water_Intake            float64
Stress_Level            float64
Target                    int64
Smoking                   int64
Alcohol                   int64
Diet                      int64
MentalHealth              int64
PhysicalActivity          int64
MedicalHistory            int64
Allergies                 int64
Diet_Type_Vegan           int64
Diet_Type_Vegetarian      int64
Blood_Group_AB            int64
Blood_Group_B             int64
Blood_Group_O             int64
dtype: object

### Get Numerical Columns

In [76]:
# Numerical columns are the float64 ones; the int64 columns are excluded
# because they are treated as categorical in this dataset.
health_data.select_dtypes(include=['float64']).dtypes

Age               float64
BMI               float64
Blood_Pressure    float64
Cholesterol       float64
Glucose_Level     float64
Heart_Rate        float64
Sleep_Hours       float64
Exercise_Hours    float64
Water_Intake      float64
Stress_Level      float64
dtype: object

### Get Categorical Columns

In [77]:
# Categorical columns are the int64 ones, since this dataset stores its
# categorical variables as integers.
health_data.select_dtypes(include=['int64']).dtypes

Target                  int64
Smoking                 int64
Alcohol                 int64
Diet                    int64
MentalHealth            int64
PhysicalActivity        int64
MedicalHistory          int64
Allergies               int64
Diet_Type_Vegan         int64
Diet_Type_Vegetarian    int64
Blood_Group_AB          int64
Blood_Group_B           int64
Blood_Group_O           int64
dtype: object

### Get General Data Information

In [78]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes
health_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9549 entries, 0 to 9548
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   9549 non-null   float64
 1   BMI                   9549 non-null   float64
 2   Blood_Pressure        9549 non-null   float64
 3   Cholesterol           9549 non-null   float64
 4   Glucose_Level         9549 non-null   float64
 5   Heart_Rate            9549 non-null   float64
 6   Sleep_Hours           9549 non-null   float64
 7   Exercise_Hours        9549 non-null   float64
 8   Water_Intake          9549 non-null   float64
 9   Stress_Level          9549 non-null   float64
 10  Target                9549 non-null   int64  
 11  Smoking               9549 non-null   int64  
 12  Alcohol               9549 non-null   int64  
 13  Diet                  9549 non-null   int64  
 14  MentalHealth          9549 non-null   int64  
 15  PhysicalActivity     

### Get Summary on Dataset

In [79]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numerical (float64) columns
health_data.select_dtypes(include='float64').describe()

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level
count,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0
mean,33.806786,25.660697,130.382658,199.091528,100.225678,73.613782,6.951409,1.892345,3.580899,4.382134
std,24.566473,1.942369,27.878476,1.969234,2.157999,1.681538,2.352152,1.378714,1.622874,2.078593
min,0.0,19.0,22.0,192.0,93.0,67.0,0.0,-0.0,-0.0,0.0
25%,14.0,24.0,113.0,198.0,99.0,73.0,5.0,1.0,2.0,3.0
50%,29.0,26.0,134.0,199.0,100.0,74.0,7.0,2.0,4.0,4.0
75%,50.0,27.0,150.0,200.0,102.0,75.0,9.0,3.0,5.0,6.0
max,100.0,32.0,225.0,207.0,107.0,80.0,14.0,8.0,10.0,12.0


In [80]:
# Frequency table for every categorical (int64) column
int_cols = health_data.select_dtypes(include=['int64'])
for col in int_cols.columns:
    # A single print with newline separators emits the same text as printing
    # the "Counts:" header, the value counts and a blank "\n" line separately.
    print("Counts:", health_data[col].value_counts(), "\n", sep="\n")

Counts:
Target
1    4979
0    4570
Name: count, dtype: int64


Counts:
Smoking
0    3221
1    3198
2    3130
Name: count, dtype: int64


Counts:
Alcohol
0    3207
1    3181
2    3161
Name: count, dtype: int64


Counts:
Diet
2    3206
1    3193
0    3150
Name: count, dtype: int64


Counts:
MentalHealth
0    3232
2    3217
1    3100
Name: count, dtype: int64


Counts:
PhysicalActivity
1    3303
2    3139
0    3107
Name: count, dtype: int64


Counts:
MedicalHistory
1    3230
2    3182
0    3137
Name: count, dtype: int64


Counts:
Allergies
0    3228
1    3195
2    3126
Name: count, dtype: int64


Counts:
Diet_Type_Vegan
0    6404
1    3145
Name: count, dtype: int64


Counts:
Diet_Type_Vegetarian
0    6343
1    3206
Name: count, dtype: int64


Counts:
Blood_Group_AB
0    7210
1    2339
Name: count, dtype: int64


Counts:
Blood_Group_B
0    7125
1    2424
Name: count, dtype: int64


Counts:
Blood_Group_O
0    7137
1    2412
Name: count, dtype: int64




### Identify Missing Values in Dataset

In [81]:
# Number of missing values in each column
health_data.isna().sum()

Age                     0
BMI                     0
Blood_Pressure          0
Cholesterol             0
Glucose_Level           0
Heart_Rate              0
Sleep_Hours             0
Exercise_Hours          0
Water_Intake            0
Stress_Level            0
Target                  0
Smoking                 0
Alcohol                 0
Diet                    0
MentalHealth            0
PhysicalActivity        0
MedicalHistory          0
Allergies               0
Diet_Type_Vegan         0
Diet_Type_Vegetarian    0
Blood_Group_AB          0
Blood_Group_B           0
Blood_Group_O           0
dtype: int64

- There are no missing values in the dataset. However, according to the problem statement, some entries in the numerical columns are problematic because their values are either 0 or -0.

In [82]:
# Names of the float64 (numerical) columns
float_cols = health_data.select_dtypes(include='float64').columns

# Show the rows in which at least one numerical column is zero or negative
health_data.loc[health_data[float_cols].le(0).any(axis="columns")]

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,Target,Smoking,Alcohol,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type_Vegan,Diet_Type_Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
2,81.0,27.0,147.0,203.0,100.0,74.0,10.0,-0.0,5.0,1.0,0,2,1,2,0,0,1,0,1,0,0,0,0
7,17.0,26.0,129.0,198.0,97.0,74.0,6.0,-0.0,5.0,3.0,1,2,2,1,1,1,0,0,0,0,1,0,0
35,17.0,25.0,73.0,197.0,101.0,74.0,4.0,-0.0,5.0,6.0,1,2,1,0,1,0,0,2,1,0,0,0,0
39,57.0,25.0,105.0,202.0,103.0,73.0,5.0,0.0,1.0,7.0,0,2,0,0,1,0,0,1,0,1,0,1,0
56,23.0,28.0,100.0,199.0,102.0,74.0,1.0,-0.0,3.0,9.0,1,1,1,1,2,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9533,37.0,25.0,111.0,197.0,104.0,75.0,4.0,-0.0,3.0,8.0,0,2,0,0,0,1,1,1,1,0,1,0,0
9540,13.0,25.0,136.0,200.0,102.0,76.0,4.0,0.0,4.0,6.0,0,2,0,0,0,1,2,0,0,0,1,0,0
9542,60.0,27.0,126.0,200.0,97.0,77.0,6.0,0.0,1.0,6.0,1,1,0,2,0,1,2,0,0,0,0,0,0
9544,5.0,22.0,109.0,203.0,98.0,75.0,8.0,1.0,6.0,0.0,0,2,0,0,2,2,1,0,1,0,1,0,0


- In the dataset, 1911 rows have at least one float column whose value is 0 or -0. Each such value is replaced with the median of the non-zero entries in the same column; the median is used since age cannot be a decimal number.

In [83]:
# Median imputation of the non-positive entries in the float columns.
# Values <= 0 (the 0 / -0 entries) are treated as missing: they are left out
# of the median and then overwritten with it.

# Per-column median computed from the strictly positive values only
medians = health_data[float_cols].apply(lambda col: col[col > 0].median())

# A column without any positive value has a NaN median; imputing with it would
# silently write NaNs into the data, so fail loudly instead.
if medians.isna().to_numpy().any():
    raise ValueError(
        "Cannot impute: no positive values to take a median from in columns "
        f"{medians.index[medians.isna()].tolist()}"
    )

# Replace values <= 0 with the corresponding column median
health_data[float_cols] = health_data[float_cols].apply(
    lambda col: col.mask(col <= 0, medians[col.name])
)

# Post-condition: no zero or negative value is left in the float columns
assert not (health_data[float_cols] <= 0).to_numpy().any(), (
    "non-positive values remain after median imputation"
)