In [44]:
# --- import necessary packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import ipywidgets as widgets
from IPython.display import display
from sklearn.preprocessing import StandardScaler

# 1) id: unique identifier
# 2) gender: "Male", "Female" or "Other"
# 3) age: age of the patient
# 4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension 
# 5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease 
# 6) ever_married: "No" or "Yes" 
# 7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
# 8) Residence_type: "Rural" or "Urban" 
# 9) avg_glucose_level: average glucose level in blood 
# 10) bmi: body mass index 
# 11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"* 
# 12) stroke: 1 if the patient had a stroke or 0 if not 
# *Note: "Unknown" in smoking_status means that the information is unavailable for this patient

# Acknowledgements
# (Confidential Source) - Use only for educational purposes. If you use this dataset in your research, please credit the author.


# ---------------------------------------------------------------------------------------------

# --- Step 1: Load CSV into data frame ---
# The CSV is read by relative path, so it must sit in the working directory.
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

# Optional diagnostics (uncomment to inspect dtypes and per-column null counts):
# print("Initial Data Frame:")
# df.info()
# print("\n")
# df.isnull().sum()

# --- Step 2: Print initial data shape ---
# Unpack (rows, columns) directly from the frame's shape tuple.
rows_before, cols_before = df.shape
total_cells_before = rows_before * cols_before
print("Before filtering:")
print("\n".join([
    f" - Number of rows: {rows_before}",
    f" - Number of columns: {cols_before}",
    f" - Total number of cells: {total_cells_before}",
]))


# --- Step 3: Drop entries with 'bmi' column NaN values ---
print("\nDropping Entries:")

# Count the missing 'bmi' values before filtering so the effect of the drop is visible.
nan_before = df['bmi'].isna().sum()
print(f"Number of NaN entries in 'bmi' before filtering: {nan_before}")

# Restrict the drop to 'bmi'. A bare dropna() would also discard rows that are
# missing a value in any other column, which is not what this step reports.
df_cleaned = df.dropna(subset=['bmi'])

# Re-count on the cleaned frame and report the stored value.
nan_after = df_cleaned['bmi'].isna().sum()
print(f"Number of NaN entries in 'bmi' after filtering: {nan_after}")


# --- Step 4: Drop entries with 'smoking_status' column Unknown values ---

# Tally the 'Unknown' smoking_status values up front.
# NOTE: this is counted on the original frame `df`, not on `df_cleaned`, so the
# figure is unaffected by any rows already removed from `df_cleaned`.
unknown_before = df['smoking_status'].eq('Unknown').sum()
print(f"Number of 'Unknown' smoking_status entries before filtering: {unknown_before}")

# Rows with an unknown smoking status are treated as incomplete and removed.
df_cleaned = df_cleaned.loc[df_cleaned['smoking_status'].ne('Unknown')]

# Tally again on the filtered frame to confirm none remain.
unknown_after = df_cleaned['smoking_status'].eq('Unknown').sum()
print(f"Number of 'Unknown' smoking_status entries after filtering: {unknown_after}")

# --- Step 5: Drop 'id' column
# Remove the 'id' column by name, then announce the step.
df_cleaned = df_cleaned.drop(columns='id')
print("\n Dropping id column")

# ------- TRANSFORMED DATA SHAPE ----------------------------------------------------------------
# Same summary as the "before" report, now taken on the cleaned frame.
rows_after, cols_after = df_cleaned.shape
total_cells_after = rows_after * cols_after
print("\nAfter filtering:")
print("\n".join([
    f" - Number of rows: {rows_after}",
    f" - Number of columns: {cols_after}",
    f" - Total number of cells: {total_cells_after}",
]))

# ------- Implement One-Hot Encoding ----------------------------------------------------------------

# Columns holding string categories that must become indicator columns
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Expand each categorical column into one indicator column per category.
# drop_first=False keeps every category, so no level is folded into a baseline.
df_encoded = pd.get_dummies(df_cleaned, columns=categorical_cols, drop_first=False)

# Display the first few rows of the encoded DataFrame.
# A bare `df_encoded.head()` is only rendered when it is the last expression of
# a cell, so mid-cell it must go through display() to actually show up.
display(df_encoded.head())

# Continuous columns to standardise (the 0/1 and indicator columns are left untouched)
numerical_cols = ['age', 'avg_glucose_level', 'bmi']

# Create the scaler
scaler = StandardScaler()

# Fit on the numerical columns, then write the scaled values back over the originals.
# NOTE(review): the scaler is fit on every row of df_encoded; if a train/test
# split happens later, confirm whether fitting on the training rows only is wanted.
scaled_values = scaler.fit_transform(df_encoded[numerical_cols])
df_encoded[numerical_cols] = scaled_values

# Last expression of the cell: shows the first few rows so the scaling can be verified
df_encoded.head()

Before filtering:
 - Number of rows: 5110
 - Number of columns: 12
 - Total number of cells: 61320

Dropping Entries:
Number of NaN entries in 'bmi' before filtering: 201
Number of NaN entries in 'bmi' after filtering: 0
Number of 'Unknown' smoking_status entries before filtering: 1544
Number of 'Unknown' smoking_status entries after filtering: 0

 Dropping id column

After filtering:
 - Number of rows: 3426
 - Number of columns: 11
 - Total number of cells: 37686


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0.973768,0,1,2.523621,0.864982,1,False,True,False,False,...,False,False,True,False,False,False,True,True,False,False
2,1.663479,0,1,-0.050358,0.302945,1,False,True,False,False,...,False,False,True,False,False,True,False,False,True,False
3,0.018784,0,0,1.318923,0.563401,1,True,False,False,False,...,False,False,True,False,False,False,True,False,False,True
4,1.610424,1,0,1.379514,-0.862253,1,True,False,False,False,...,False,False,False,True,False,True,False,False,True,False
5,1.716533,0,0,1.632992,-0.176842,1,False,True,False,False,...,False,False,True,False,False,False,True,True,False,False
