In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

Load our chosen dataest and view the first 5 rows

In [2]:
stroke_df = pd.read_csv("Stroke.csv")
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


Describe each column

In [3]:
stroke_df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


Check which columns have null values

In [4]:
stroke_df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

Use mean method to handle null values

In [5]:
numImputer = SimpleImputer(missing_values = np.nan, strategy='mean')
numImputer = numImputer.fit(stroke_df[['bmi']])
stroke_df.bmi = numImputer.transform(stroke_df[['bmi']])
stroke_df.head(60)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,28.893237,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


Drop Unknown and Other rows

In [6]:
stroke_df = stroke_df.replace(['Unknown'], [np.nan])
stroke_df = stroke_df.replace(['Other'], [np.nan])
stroke_df = stroke_df.dropna()
stroke_df.head(50)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
10,12109,Female,81.0,1,0,Yes,Private,Rural,80.43,29.7,never smoked,1
11,12095,Female,61.0,0,1,Yes,Govt_job,Rural,120.46,36.8,smokes,1


Drop irrelevant columns

In [7]:
stroke_df = stroke_df.drop(['id'], axis=1)
stroke_df.head(50)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
10,Female,81.0,1,0,Yes,Private,Rural,80.43,29.7,never smoked,1
11,Female,61.0,0,1,Yes,Govt_job,Rural,120.46,36.8,smokes,1


In [8]:
stroke_df.gender = stroke_df.gender.replace(['Male','Female'], [0,1])
stroke_df.Residence_type = stroke_df.Residence_type.replace(['Urban','Rural'], [0,1])
stroke_df.ever_married = stroke_df.ever_married.replace(['Yes','No'], ['1','0'])
stroke_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,Private,0,228.69,36.6,formerly smoked,1
1,1,61.0,0,0,1,Self-employed,1,202.21,28.893237,never smoked,1
2,0,80.0,0,1,1,Private,1,105.92,32.5,never smoked,1
3,1,49.0,0,0,1,Private,0,171.23,34.4,smokes,1
4,1,79.0,1,0,1,Self-employed,1,174.12,24.0,never smoked,1


In [9]:
X= stroke_df.drop('stroke', axis = 1)
Y= stroke_df.stroke

In [10]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,0,67.0,0,1,1,Private,0,228.69,36.600000,formerly smoked
1,1,61.0,0,0,1,Self-employed,1,202.21,28.893237,never smoked
2,0,80.0,0,1,1,Private,1,105.92,32.500000,never smoked
3,1,49.0,0,0,1,Private,0,171.23,34.400000,smokes
4,1,79.0,1,0,1,Self-employed,1,174.12,24.000000,never smoked
...,...,...,...,...,...,...,...,...,...,...
5102,1,57.0,0,0,1,Private,1,77.93,21.700000,never smoked
5105,1,80.0,1,0,1,Private,0,83.75,28.893237,never smoked
5106,1,81.0,0,0,1,Self-employed,0,125.20,40.000000,never smoked
5107,1,35.0,0,0,1,Self-employed,1,82.99,30.600000,never smoked


In [11]:
stroke_df.work_type.value_counts()

Private          2284
Self-employed     663
Govt_job          535
children           69
Never_worked       14
Name: work_type, dtype: int64

In [12]:
stroke_df.smoking_status.value_counts()

never smoked       1892
formerly smoked     884
smokes              789
Name: smoking_status, dtype: int64

In [13]:
labelencoder = LabelEncoder()

for col in X.columns:
    if X[col].dtype =='object':
        X[col] = labelencoder.fit_transform(X[col])

In [14]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,0,67.0,0,1,1,2,0,228.69,36.600000,0
1,1,61.0,0,0,1,3,1,202.21,28.893237,1
2,0,80.0,0,1,1,2,1,105.92,32.500000,1
3,1,49.0,0,0,1,2,0,171.23,34.400000,2
4,1,79.0,1,0,1,3,1,174.12,24.000000,1
...,...,...,...,...,...,...,...,...,...,...
5102,1,57.0,0,0,1,2,1,77.93,21.700000,1
5105,1,80.0,1,0,1,2,0,83.75,28.893237,1
5106,1,81.0,0,0,1,3,0,125.20,40.000000,1
5107,1,35.0,0,0,1,3,1,82.99,30.600000,1


Now that the data has been cleaned, split our data into train and test models

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)

In [16]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
Y.value_counts()

0    3363
1     202
Name: stroke, dtype: int64

Since the data is unbalanced with respect to the distribution of class labels, we employ SMOTE before classification

In [18]:
smote = SMOTE(random_state=42)
X_train_sm, Y_train_sm = smote.fit_resample(X_train_scaled, Y_train)

print("Number of samples after SMOTE:", len(X_train_sm), len(Y_train_sm))

Y_train_sm.value_counts()

Number of samples after SMOTE: 5400 5400


0    2700
1    2700
Name: stroke, dtype: int64

Begin Classification, firstly we evaluate the performance of a Decision Tree model

In [19]:
dt_classifier = DecisionTreeClassifier(random_state = 42)
dt_classifier.fit(X_train_sm, Y_train_sm)

In [20]:
Y_pred_dt = dt_classifier.predict(X_test_scaled)
Y_pred_dt

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,

Review and evaluate the performance of our Decision Tree model

In [21]:
acc = metrics.accuracy_score(Y_test, Y_pred_dt)
acc

0.8569424964936886

In [22]:
recall = metrics.recall_score(Y_test, Y_pred_dt)
recall

0.1

In [23]:
precision = metrics.precision_score(Y_test, Y_pred_dt)
precision

0.08064516129032258

In [24]:
f1 = metrics.f1_score(Y_test, Y_pred_dt)
f1

0.08928571428571427

In [25]:
cm = confusion_matrix(Y_test, Y_pred_dt)
cm

array([[606,  57],
       [ 45,   5]], dtype=int64)

In [26]:
print ('accuracy:%.2f\n\n'%(acc))
print('Confusion Matrix:')
print(cm, '\n\n')
print('-------------------------------------')
result = metrics.classification_report(Y_test, Y_pred_dt)
print('Classification Report: \n')
print(result)

accuracy:0.86


Confusion Matrix:
[[606  57]
 [ 45   5]] 


-------------------------------------
Classification Report: 

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       663
           1       0.08      0.10      0.09        50

    accuracy                           0.86       713
   macro avg       0.51      0.51      0.51       713
weighted avg       0.87      0.86      0.86       713



Perform k-Nearest Neighbours classification

In [27]:
knn_classifier = KNeighborsClassifier(n_neighbors = 3)
knn_classifier.fit(X_train_sm, Y_train_sm)

In [28]:
Y_pred_knn = knn_classifier.predict(X_test_scaled)

In [29]:
acc = metrics.accuracy_score(Y_test, Y_pred_knn)
acc

0.8288920056100981

In [30]:
recall = metrics.recall_score(Y_test, Y_pred_knn)
recall

0.14

In [31]:
precision = metrics.precision_score(Y_test, Y_pred_knn)
precision

0.08139534883720931

In [32]:
f1 = metrics.f1_score(Y_test, Y_pred_knn)
f1

0.10294117647058824

In [33]:
cm = confusion_matrix(Y_test, Y_pred_knn)
cm

array([[584,  79],
       [ 43,   7]], dtype=int64)

In [34]:
print ('accuracy:%.2f\n\n'%(acc))
print('Confusion Matrix:')
print(cm, '\n\n')
print('-------------------------------------')
result = metrics.classification_report(Y_test, Y_pred_knn)
print('Classification Report: \n')
print(result)

accuracy:0.83


Confusion Matrix:
[[584  79]
 [ 43   7]] 


-------------------------------------
Classification Report: 

              precision    recall  f1-score   support

           0       0.93      0.88      0.91       663
           1       0.08      0.14      0.10        50

    accuracy                           0.83       713
   macro avg       0.51      0.51      0.50       713
weighted avg       0.87      0.83      0.85       713

