In [None]:
# Uncomment the following lines the first time you run this notebook, then comment them out again so you don't have to install every time
# %pip install pandas
# %pip install scikit-learn

In [151]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer

In [None]:
# Dataset source (Kaggle): "https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset/data"
# The CSV is read from the notebook's folder; replace the file name with a full path if it's stored elsewhere.
df = pd.read_csv(filepath_or_buffer="healthcare-dataset-stroke-data.csv")
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [None]:
# Drop insufficient and unneeded data:
#   - the "id" column
#   - rows whose gender is "Other"
# `df` is rebound instead of being mutated with inplace=True, and
# errors="ignore" tolerates an already-missing "id" column, so re-running
# this cell no longer raises KeyError.
df = (
    df
    .drop(columns=["id"], errors="ignore")
    # Same label-based row removal as before: drop the index labels of the matching rows.
    .drop(index=df.index[df["gender"] == "Other"])
)

In [None]:
# Column groups consumed by the column transformer below.
# Columns that are copied through unchanged:
features = [
    "gender",
    "hypertension",
    "heart_disease",
    "stroke",
    "bmi",
]
# Categorical columns that get one-hot encoded:
categorical_features = [
    "smoking_status",
    "work_type",
    "ever_married",
    "Residence_type",
]
# Numerical columns that get standardised (bmi is to be added later, after the linear regression model):
numerical_features = ["age", "avg_glucose_level"]

# Preprocessing plan, one step per column group:
#   1. leave the `features` columns untouched
#   2. one-hot encode the categorical columns (dense output) so the model only sees numeric values
#   3. rescale the numerical columns with StandardScaler so their values are on a comparable scale
column_steps = [
    ("passthrough", features),
    (OneHotEncoder(sparse_output=False), categorical_features),
    (StandardScaler(), numerical_features),
]
preprocess = make_column_transformer(*column_steps)

In [155]:
# Fit the column transformer on the cleaned frame and apply it in a single step.
df_transformed = preprocess.fit_transform(df)

# Rebuild the column labels in the transformer's output order:
# passthrough columns, then the generated one-hot columns, then the scaled columns.
onehot_columns = preprocess.named_transformers_["onehotencoder"].get_feature_names_out().tolist()

# The passthrough group contains the string column "gender", so the stacked
# result is an object-typed array and every column of the rebuilt frame would
# otherwise be dtype object. infer_objects() restores numeric dtypes for the
# columns whose values allow it, leaving "gender" as object.
df = pd.DataFrame(
    data=df_transformed,
    columns=features + onehot_columns + numerical_features,
    index=df.index,  # keep the original row labels
).infer_objects()

In [156]:
# Display the preprocessed frame: passthrough columns, one-hot columns, then scaled numerical columns.
df

Unnamed: 0,gender,hypertension,heart_disease,stroke,bmi,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,ever_married_No,ever_married_Yes,Residence_type_Rural,Residence_type_Urban,age,avg_glucose_level
0,Male,0,1,1,36.6,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.051242,2.70645
1,Female,0,0,1,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.785889,2.121652
2,Male,0,1,1,32.5,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.626174,-0.004867
3,Female,0,0,1,34.4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.255182,1.437473
4,Female,1,0,1,24.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.581949,1.501297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,1,0,0,,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.626174,-0.494481
5106,Female,0,0,0,40.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.6704,0.420922
5107,Female,0,0,0,30.6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-0.363976,-0.511266
5108,Male,0,0,0,25.6,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.343633,1.328375
