In [252]:
from sklearn.linear_model import LogisticRegression as lr

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics

In [253]:
# Demo-only setting: install a global filter that ignores all warnings.
# Remove this cell while debugging so that real warnings stay visible.
import warnings
warnings.filterwarnings(action='ignore')

In [255]:
import math
import numpy as np
import pandas as pd
# import scipy.stats as stats
import statsmodels.api as sm

import matplotlib.pyplot as plt
# Apply matplotlib's built-in 'classic' style sheet to figures created from here on.
plt.style.use('classic')
%matplotlib inline

import seaborn as sns
# Seaborn defaults with two overrides: figure size (16, 10) and fonts scaled by 1.3.
sns.set(rc={'figure.figsize':(16,10)}, font_scale=1.3)

In [256]:
# Load the raw stroke dataset; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='./healthcare-dataset-stroke-data.csv')

In [257]:
# Show the raw frame exactly as loaded, before any cleaning.
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [258]:
# Keep only complete records: every row holding at least one NA value is discarded.
df_na = df.dropna(axis=0, how='any')

In [259]:
# Inspect the frame after rows with missing values were dropped.
df_na

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [260]:
# Renumber the rows 0..n-1 after the NA drop and expose that position as an
# explicit 'index' column (later cells drop it by name).
# The frame is rebound instead of mutated with inplace=True, and an 'index'
# column left over from a previous execution is discarded first, so the cell
# can be re-run: the original second reset_index raised
# "cannot insert index, already exists" when executed twice.
df_na = (
    df_na
    .drop(columns='index', errors='ignore')
    .reset_index(drop=True)
    .reset_index()
)

In [261]:
# Preview the first 20 rows after re-indexing.
df_na.head(20)

Unnamed: 0,index,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
5,5,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
6,6,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
7,7,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1
8,8,12109,Female,81.0,1,0,Yes,Private,Rural,80.43,29.7,never smoked,1
9,9,12095,Female,61.0,0,1,Yes,Govt_job,Rural,120.46,36.8,smokes,1


In [262]:
# Per-column flag: True when the column still contains at least one missing value.
df_na.isna().any()

index                False
id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [263]:
# Encode three categorical columns as 0/1 indicators in a single step.
# Any value that is missing from a mapping becomes NaN.
df_na = df_na.assign(
    # ever_married: 1 = Yes, 0 = No
    ever_married=lambda frame: frame['ever_married'].map({'Yes': 1, 'No': 0}),
    # gender: 1 = Male, 0 = Female
    gender=lambda frame: frame['gender'].map({'Male': 1, 'Female': 0}),
    # Residence_type: 1 = Urban, 0 = Rural
    Residence_type=lambda frame: frame['Residence_type'].map({'Urban': 1, 'Rural': 0}),
)

In [264]:
# Preview the first 20 rows after the categorical columns were encoded.
df_na.head(20)

Unnamed: 0,index,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,9046,1.0,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1
1,1,31112,1.0,80.0,0,1,1,Private,0,105.92,32.5,never smoked,1
2,2,60182,0.0,49.0,0,0,1,Private,1,171.23,34.4,smokes,1
3,3,1665,0.0,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1
4,4,56669,1.0,81.0,0,0,1,Private,1,186.21,29.0,formerly smoked,1
5,5,53882,1.0,74.0,1,1,1,Private,0,70.09,27.4,never smoked,1
6,6,10434,0.0,69.0,0,0,0,Private,1,94.39,22.8,never smoked,1
7,7,60491,0.0,78.0,0,0,1,Private,1,58.57,24.2,Unknown,1
8,8,12109,0.0,81.0,1,0,1,Private,0,80.43,29.7,never smoked,1
9,9,12095,0.0,61.0,0,1,1,Govt_job,0,120.46,36.8,smokes,1


In [265]:
# Re-check for missing values after encoding: gender is now flagged because
# the one sample with gender 'Other' has no entry in the mapping.
df_na.isna().any()

index                False
id                   False
gender                True
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [266]:
# Delete the row whose 'gender' value was 'Other' (it is NaN after encoding).
# Limiting the check to the gender column states that intent explicitly and
# avoids silently discarding rows should another column ever contain NaN.
df_na = df_na.dropna(subset=['gender'])

In [267]:
# Inspect the frame after the row with missing gender was removed.
df_na

Unnamed: 0,index,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,9046,1.0,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1
1,1,31112,1.0,80.0,0,1,1,Private,0,105.92,32.5,never smoked,1
2,2,60182,0.0,49.0,0,0,1,Private,1,171.23,34.4,smokes,1
3,3,1665,0.0,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1
4,4,56669,1.0,81.0,0,0,1,Private,1,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4904,4904,14180,0.0,13.0,0,0,0,children,0,103.08,18.6,Unknown,0
4905,4905,44873,0.0,81.0,0,0,1,Self-employed,1,125.20,40.0,never smoked,0
4906,4906,19723,0.0,35.0,0,0,1,Self-employed,0,82.99,30.6,never smoked,0
4907,4907,37544,1.0,51.0,0,0,1,Private,0,166.29,25.6,formerly smoked,0


In [268]:
def BMI_check(x, threshold=30):
    """Return 1 if a BMI value reaches the treatment threshold, else 0.

    Parameters
    ----------
    x : number
        BMI value to classify.
    threshold : number, optional
        Inclusive lower bound of the treated group. Defaults to 30, the
        cut-off used in this notebook.

    Returns
    -------
    int
        1 when ``x >= threshold``, otherwise 0. NaN compares False against
        any number and therefore yields 0.
    """
    return 1 if x >= threshold else 0
    
# bmi is the treatment variable with a cut-off of 30:
# 1 marks bmi >= 30, 0 marks bmi below 30.
df_na['treatment_bmi'] = df_na['bmi'].map(BMI_check)

In [269]:
# Inspect the frame with the new treatment_bmi column appended.
df_na

Unnamed: 0,index,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,treatment_bmi
0,0,9046,1.0,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1,1
1,1,31112,1.0,80.0,0,1,1,Private,0,105.92,32.5,never smoked,1,1
2,2,60182,0.0,49.0,0,0,1,Private,1,171.23,34.4,smokes,1,1
3,3,1665,0.0,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1,0
4,4,56669,1.0,81.0,0,0,1,Private,1,186.21,29.0,formerly smoked,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4904,4904,14180,0.0,13.0,0,0,0,children,0,103.08,18.6,Unknown,0,0
4905,4905,44873,0.0,81.0,0,0,1,Self-employed,1,125.20,40.0,never smoked,0,1
4906,4906,19723,0.0,35.0,0,0,1,Self-employed,0,82.99,30.6,never smoked,0,1
4907,4907,37544,1.0,51.0,0,0,1,Private,0,166.29,25.6,formerly smoked,0,0


In [270]:
# Number of rows flagged as treated (summing a 0/1 column counts the ones).
df_na.treatment_bmi.sum()

1920

In [271]:
# Build df_data without the row-number helper column, the id column and the
# raw bmi values (bmi is represented by treatment_bmi from here on).
df_data = df_na.drop(['index', 'id', 'bmi'], axis=1)
df_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status,stroke,treatment_bmi
0,1.0,67.0,0,1,1,Private,1,228.69,formerly smoked,1,1
1,1.0,80.0,0,1,1,Private,0,105.92,never smoked,1,1
2,0.0,49.0,0,0,1,Private,1,171.23,smokes,1,1
3,0.0,79.0,1,0,1,Self-employed,0,174.12,never smoked,1,0
4,1.0,81.0,0,0,1,Private,1,186.21,formerly smoked,1,0
...,...,...,...,...,...,...,...,...,...,...,...
4904,0.0,13.0,0,0,0,children,0,103.08,Unknown,0,0
4905,0.0,81.0,0,0,1,Self-employed,1,125.20,never smoked,0,1
4906,0.0,35.0,0,0,1,Self-employed,0,82.99,never smoked,0,1
4907,1.0,51.0,0,0,1,Private,0,166.29,formerly smoked,0,0


In [272]:
# Persist the processed frame to disk. The DataFrame index is written too
# (the pandas default), so the first column of the file has no header name.
df_data.to_csv(path_or_buf='stroke_data_processed.csv', index=True)
df_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status,stroke,treatment_bmi
0,1.0,67.0,0,1,1,Private,1,228.69,formerly smoked,1,1
1,1.0,80.0,0,1,1,Private,0,105.92,never smoked,1,1
2,0.0,49.0,0,0,1,Private,1,171.23,smokes,1,1
3,0.0,79.0,1,0,1,Self-employed,0,174.12,never smoked,1,0
4,1.0,81.0,0,0,1,Private,1,186.21,formerly smoked,1,0
...,...,...,...,...,...,...,...,...,...,...,...
4904,0.0,13.0,0,0,0,children,0,103.08,Unknown,0,0
4905,0.0,81.0,0,0,1,Self-employed,1,125.20,never smoked,0,1
4906,0.0,35.0,0,0,1,Self-employed,0,82.99,never smoked,0,1
4907,1.0,51.0,0,0,1,Private,0,166.29,formerly smoked,0,0
