In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [2]:
data=pd.read_csv('healthcare-dataset-stroke-data.csv')

In [3]:
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
data.shape

(5110, 12)

In [5]:
data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [7]:
data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

### DataType Conversion

In [8]:
data['age'] = data['age'].astype(int)

### Handling Missing Values

In [9]:
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

### Data Encoding

In [10]:
oneHotEncoder = OneHotEncoder(sparse=False)
data['gender']=oneHotEncoder.fit_transform(data['gender'].values.reshape(-1,1))
data['ever_married']=oneHotEncoder.fit_transform(data['ever_married'].values.reshape(-1,1))
data['work_type']=oneHotEncoder.fit_transform(data['work_type'].values.reshape(-1,1))
data['Residence_type']=oneHotEncoder.fit_transform(data['Residence_type'].values.reshape(-1,1))
data['smoking_status']=oneHotEncoder.fit_transform(data['smoking_status'].values.reshape(-1,1))

### Data Scaling

In [11]:
sc = StandardScaler()
data['bmi'] = sc.fit_transform(data['bmi'].values.reshape(-1,1))
data['avg_glucose_level'] = sc.fit_transform(data['avg_glucose_level'].values.reshape(-1,1))

In [12]:
data.corr()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,1.0,0.011426,0.003503,0.00355,-0.001296,0.001004,0.002999,-0.002144,0.001092,0.007812,-0.019243,0.006388
gender,0.011426,1.0,0.11958,0.065063,-0.021856,-0.327653,0.10622,0.099111,0.023885,-0.350934,-0.504546,-0.004129
age,0.003503,0.11958,1.0,0.276303,0.263668,0.073279,0.326271,0.028026,0.23806,0.236904,-0.378638,0.245128
hypertension,0.00355,0.065063,0.276303,1.0,0.108306,0.03124,0.160189,-0.021143,0.174474,0.058853,-0.141501,0.127904
heart_disease,-0.001296,-0.021856,0.263668,0.108306,1.0,0.044049,0.038899,-0.085617,0.161857,0.066804,-0.066731,0.134914
ever_married,0.001004,-0.327653,0.073279,0.03124,0.044049,1.0,0.085579,-0.011306,0.017646,-0.195571,-0.281177,0.008939
work_type,0.002999,0.10622,0.326271,0.160189,0.038899,0.085579,1.0,0.025797,0.168751,0.104257,-0.264956,0.038947
Residence_type,-0.002144,0.099111,0.028026,-0.021143,-0.085617,-0.011306,0.025797,1.0,-0.054902,-0.043603,-0.059393,-0.009027
avg_glucose_level,0.001092,0.023885,0.23806,0.174474,0.161857,0.017646,0.168751,-0.054902,1.0,0.068111,-0.095131,0.131945
bmi,0.007812,-0.350934,0.236904,0.058853,0.066804,-0.195571,0.104257,-0.043603,0.068111,1.0,-0.301156,0.064556


In [13]:
data = data.drop(['id'],axis=1)

In [14]:
X = data.drop(['stroke'],axis=1)
Y = data['stroke']

### SMOTE Technique

In [15]:
sm =  SMOTE(random_state=12, sampling_strategy=0.8)

X, y =  sm.fit_resample(X,Y)
#X_sm.shape, y_sm.shape
print('Original dataset shape {}'.format(Counter(Y)))
print('Resampled dataset shape {}'.format(Counter(y)))

Original dataset shape Counter({0: 4861, 1: 249})
Resampled dataset shape Counter({0: 4861, 1: 3888})


In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)

### Machine Learning

### Naive Bayes

In [17]:
nb = GaussianNB()
model = nb.fit(X_train,y_train)


In [18]:
y_pred = model.predict(X_test)

In [19]:
print(accuracy_score(y_pred,y_test))

0.7291428571428571


In [20]:
print(confusion_matrix(y_pred,y_test))

[[716 229]
 [245 560]]


In [21]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.75      0.76      0.75       945
           1       0.71      0.70      0.70       805

    accuracy                           0.73      1750
   macro avg       0.73      0.73      0.73      1750
weighted avg       0.73      0.73      0.73      1750



### Logistic Regression

In [22]:
lr = LogisticRegression()
model_lr = lr.fit(X_train,y_train)

In [23]:
y_pred_lr = model_lr.predict(X_test)

In [24]:
print('The accuracy score for the Logistic Regression:', round(accuracy_score(y_pred_lr,y_test),2))

The accuracy score for the Logistic Regression: 0.77


In [25]:
print('Confusion matrix for Logistic Regression: \n',confusion_matrix(y_pred_lr,y_test))

Confusion matrix for Logistic Regression: 
 [[758 205]
 [203 584]]


In [26]:
print('Classification Report for Logistic Regression: \n',classification_report(y_pred_lr,y_test))

Classification Report for Logistic Regression: 
               precision    recall  f1-score   support

           0       0.79      0.79      0.79       963
           1       0.74      0.74      0.74       787

    accuracy                           0.77      1750
   macro avg       0.76      0.76      0.76      1750
weighted avg       0.77      0.77      0.77      1750



### Decision Trees

In [27]:
dt = DecisionTreeClassifier()
model_dt = dt.fit(X_train,y_train)

In [28]:
y_pred_dt = model_dt.predict(X_test)

In [29]:
print('The accuracy score for the Decision Tree:',round(accuracy_score(y_pred_dt,y_test),2))

The accuracy score for the Decision Tree: 0.94


In [30]:
print('Confusion matrix for Decision Trees: \n',confusion_matrix(y_pred_dt,y_test))

Confusion matrix for Decision Trees: 
 [[892  43]
 [ 69 746]]


In [31]:
print('Classification Report for Decision Trees: \n',classification_report(y_pred_dt,y_test))

Classification Report for Decision Trees: 
               precision    recall  f1-score   support

           0       0.93      0.95      0.94       935
           1       0.95      0.92      0.93       815

    accuracy                           0.94      1750
   macro avg       0.94      0.93      0.94      1750
weighted avg       0.94      0.94      0.94      1750



### Random Forest

In [32]:
rf= RandomForestClassifier()
model_rf = rf.fit(X_train,y_train)

In [33]:
y_pred_rf = model_rf.predict(X_test)

In [34]:
print(accuracy_score(y_pred_rf,y_test))

0.9605714285714285


In [35]:
print('Confusion matrix for Random Forest: \n',confusion_matrix(y_pred_rf,y_test))

Confusion matrix for Random Forest: 
 [[927  35]
 [ 34 754]]


In [36]:
print('Classification Report for Random Forest: \n',classification_report(y_pred_rf,y_test))

Classification Report for Random Forest: 
               precision    recall  f1-score   support

           0       0.96      0.96      0.96       962
           1       0.96      0.96      0.96       788

    accuracy                           0.96      1750
   macro avg       0.96      0.96      0.96      1750
weighted avg       0.96      0.96      0.96      1750



### Kneighbors

In [37]:
kn = KNeighborsClassifier()
model_kn = kn.fit(X_train,y_train)

In [38]:
y_pred_kn = model_kn.predict(X_test)

In [39]:
print('The accuracy score for KNN:',round(accuracy_score(y_pred_kn,y_test),2))

The accuracy score for KNN: 0.88


In [40]:
print('Confusion matrix for KNN: \n',confusion_matrix(y_pred_kn,y_test))

Confusion matrix for KNN: 
 [[784  26]
 [177 763]]


In [41]:
print('Classification Report for KNN: \n',classification_report(y_pred_kn,y_test))

Classification Report for KNN: 
               precision    recall  f1-score   support

           0       0.82      0.97      0.89       810
           1       0.97      0.81      0.88       940

    accuracy                           0.88      1750
   macro avg       0.89      0.89      0.88      1750
weighted avg       0.90      0.88      0.88      1750

