# Stroke Prediction - Binary Classification of Strokes Given Patient Characteristics

## 1. Reading and pre-processing datasets

In [16]:
import numpy as np
import pandas as pd
import mltools as ml

# data1 = np.genfromtxt('healthcare-dataset-stroke-data.csv', delimiter=',')
# data2 = np.genfromtxt('train.csv', delimiter=',')
# print(f"Read healthcare-dataset-stroke-data.csv: {data1.shape}")
# print(f"Read train.csv: {data2.shape}")

# data = np.concatenate((data1, data2), axis=0)
# print(data.shape)

# X = data[:,0]
# X = np.atleast_2d(X).T
# Y = data[:,1]
# Xtr,Xva,Ytr,Yva = ml.splitData(X,Y,0.75)

# print(f"Xtr {Xtr.shape}")
# print(f"Xva {Xva.shape}")
# print(f"Ytr {Ytr.shape}")
# print(f"Yva {Yva.shape}")

### Dropping rows with missing data

In [17]:
df1 = pd.read_csv("data/healthcare-dataset-stroke-data.csv")
df2 = pd.read_csv("data/train.csv")
df = pd.concat([df1, df2])

print("Dataset before processing:")
print("-" * 25)
print(f"Shape: {df.shape}\n")

print(df.astype('object').describe(include='all').loc['unique', :])
print()
print()

print("Dataset after processing:")
print("-" * 25)

df = df[df.smoking_status != "Unknown"]
print(f"Shape after filtering out 'Unknown' values from smoking_status: {df.shape}")

df = df.dropna()
print(f"Shape after dropping NA values: {df.shape}\n")

print(df.astype('object').describe(include='all').loc['unique', :])

Dataset before processing:
-------------------------
Shape: (20414, 12)

id                    19309
gender                    3
age                   106.0
hypertension              2
heart_disease             2
ever_married              2
work_type                 5
Residence_type            2
avg_glucose_level    4644.0
bmi                   440.0
smoking_status            4
stroke                    2
Name: unique, dtype: object


Dataset after processing:
-------------------------
Shape after filtering out 'Unknown' values from smoking_status: (14327, 12)
Shape after dropping NA values: (14187, 12)

id                    13712
gender                    3
age                    82.0
hypertension              2
heart_disease             2
ever_married              2
work_type                 5
Residence_type            2
avg_glucose_level    4053.0
bmi                   406.0
smoking_status            3
stroke                    2
Name: unique, dtype: object


### Standardizing numerical data

In [18]:
from sklearn.preprocessing import StandardScaler

num_cols = df.columns[df.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
num_cols = num_cols.delete([0, 2, 3, 6])
print(f"numerical data to standardize: {num_cols}\n")

scaler = StandardScaler()

df[num_cols] = scaler.fit_transform(df[num_cols])
print(df)

numerical data to standardize: Index(['age', 'avg_glucose_level', 'bmi'], dtype='object')

          id  gender       age  hypertension  heart_disease ever_married  \
0       9046    Male  1.091476             0              1          Yes   
2      31112    Male  1.816131             0              1          Yes   
3      60182  Female  0.088109             0              0          Yes   
4       1665  Female  1.760388             1              0          Yes   
5      56669    Male  1.871874             0              0          Yes   
...      ...     ...       ...           ...            ...          ...   
15298  15298  Female -1.695656             0              0           No   
15299  15299  Female -1.416943             0              0           No   
15300  15300  Female -0.079119             1              0          Yes   
15301  15301  Female  1.537418             0              0          Yes   
15303  15303  Female -1.862884             0              0           No 

### One-hot encoding categorical features

In [19]:
df = pd.get_dummies(df, columns=['gender', 'work_type', 'Residence_type', 'smoking_status'],  dtype=int)
df.replace({"Yes": 1, "No": 0}, inplace=True)

print(df)

          id       age  hypertension  heart_disease  ever_married  \
0       9046  1.091476             0              1             1   
2      31112  1.816131             0              1             1   
3      60182  0.088109             0              0             1   
4       1665  1.760388             1              0             1   
5      56669  1.871874             0              0             1   
...      ...       ...           ...            ...           ...   
15298  15298 -1.695656             0              0             0   
15299  15299 -1.416943             0              0             0   
15300  15300 -0.079119             1              0             1   
15301  15301  1.537418             0              0             1   
15303  15303 -1.862884             0              0             0   

       avg_glucose_level       bmi  stroke  gender_Female  gender_Male  ...  \
0               3.922342  1.080068       1              0            1  ...   
2            

## 2. Dataset and feature exploration