# Constants

In [81]:
ID = "ID"
GENDER = "Gender"
AGE = "Age"
HYPERTENSION = "Hypertension"
HEART_DISEASE = "Heart Disease"
EVER_MARRIED = "Ever Married"
WORK_TYPE = "Work Type"
RESIDENCE_TYPE = "Residence Type"
AVG_GLUCOSE_LEVEL = "Average Glucose Level"
BMI = "BMI"
SMOKING_STATUS = "Smoking Status"
STROKE = "Stroke"

# Import

In [82]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Loading Dataset

In [83]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Data Cleaning

In [84]:
df.fillna(0, inplace=True)
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [85]:
if "number" in df.columns: df.drop(["number"], axis=1)
df.columns = [ID, GENDER, AGE, HYPERTENSION, HEART_DISEASE, EVER_MARRIED, WORK_TYPE, RESIDENCE_TYPE, AVG_GLUCOSE_LEVEL, BMI, SMOKING_STATUS, STROKE]
df.head()

Unnamed: 0,ID,Gender,Age,Hypertension,Heart Disease,Ever Married,Work Type,Residence Type,Average Glucose Level,BMI,Smoking Status,Stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,0.0,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [86]:
df.drop(ID, axis=1, inplace=True)

In [87]:
# gender: male = 0, female = 1, other = 2 ;
# ever married: yes = 1, no = 0 ;
# work type: children = 0, govt job = 1, never worked = 2, private = 3, self-employed = 4 ;
# residence type: rural = 0, urban = 1 ;
# smoking status: formerly smoked = 0, never smoked = 1, smokes = 2, unknown = 3

df[GENDER] = df[GENDER].replace({"Male": 0, "Female": 1, "Other": 2}).astype(int)
df[EVER_MARRIED] = df[EVER_MARRIED].replace({"Yes": 1, "No": 0}).astype(int)
df[WORK_TYPE] = df[WORK_TYPE].replace({"children": 0, "Govt_job": 1, "Never_worked": 2, "Private": 3, "Self-employed": 4}).astype(int)
df[RESIDENCE_TYPE] = df[RESIDENCE_TYPE].replace({"Rural": 0, "Urban": 1}).astype(int)
df[SMOKING_STATUS] = df[SMOKING_STATUS].replace({"formerly smoked": 0, "never smoked": 1, "smokes": 2, "Unknown": 3}).astype(int)
df.head()

  df[GENDER] = df[GENDER].replace({"Male": 0, "Female": 1, "Other": 2}).astype(int)
  df[EVER_MARRIED] = df[EVER_MARRIED].replace({"Yes": 1, "No": 0}).astype(int)
  df[WORK_TYPE] = df[WORK_TYPE].replace({"children": 0, "Govt_job": 1, "Never_worked": 2, "Private": 3, "Self-employed": 4}).astype(int)
  df[RESIDENCE_TYPE] = df[RESIDENCE_TYPE].replace({"Rural": 0, "Urban": 1}).astype(int)
  df[SMOKING_STATUS] = df[SMOKING_STATUS].replace({"formerly smoked": 0, "never smoked": 1, "smokes": 2, "Unknown": 3}).astype(int)


Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Ever Married,Work Type,Residence Type,Average Glucose Level,BMI,Smoking Status,Stroke
0,0,67.0,0,1,1,3,1,228.69,36.6,0,1
1,1,61.0,0,0,1,4,0,202.21,0.0,1,1
2,0,80.0,0,1,1,3,0,105.92,32.5,1,1
3,1,49.0,0,0,1,3,1,171.23,34.4,2,1
4,1,79.0,1,0,1,4,0,174.12,24.0,1,1


# Separating Data and Target

In [88]:
df_data = df.iloc[:, :-1]
df_target = df.iloc[:, -1]
df_data

Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Ever Married,Work Type,Residence Type,Average Glucose Level,BMI,Smoking Status
0,0,67.0,0,1,1,3,1,228.69,36.6,0
1,1,61.0,0,0,1,4,0,202.21,0.0,1
2,0,80.0,0,1,1,3,0,105.92,32.5,1
3,1,49.0,0,0,1,3,1,171.23,34.4,2
4,1,79.0,1,0,1,4,0,174.12,24.0,1
...,...,...,...,...,...,...,...,...,...,...
5105,1,80.0,1,0,1,3,1,83.75,0.0,1
5106,1,81.0,0,0,1,4,1,125.20,40.0,1
5107,1,35.0,0,0,1,4,0,82.99,30.6,1
5108,0,51.0,0,0,1,3,0,166.29,25.6,0


In [89]:
df_target

0       1
1       1
2       1
3       1
4       1
       ..
5105    0
5106    0
5107    0
5108    0
5109    0
Name: Stroke, Length: 5110, dtype: int64

# Splitting into Training and Testing Sets

In [90]:
data_train, data_test, target_train, target_test = train_test_split(df_data, df_target, test_size=0.2, shuffle=True, random_state=0)

# k-NN Classifier

In [91]:
K = 3
knn = KNeighborsClassifier(K)
knn.fit(data_train, target_train)
target_pred = knn.predict(data_test)
print(target_pred)

[0 0 0 ... 0 0 0]


# Evaluating the Model's Accuracy

In [92]:
accuracy_score(target_test, target_pred)

0.9393346379647749

# K-Fold Cross Validation

In [93]:
k = 5
kf = KFold(n_splits=k , shuffle=True, random_state=42)

# Train and Evaluate the Model with K-Fold Cross-Validation

In [94]:
k_neighbors = 3
accuracies = []

for train_index, test_index in kf.split(df_data):
    data_train, data_test = df_data.iloc[train_index], df_data.iloc[test_index]
    target_train, target_test = df_target.iloc[train_index], df_target.iloc[test_index]

    knn = KNeighborsClassifier(n_neighbors=k_neighbors)
    knn.fit(data_train, target_train)

    target_pred = knn.predict(data_test)

    accuracy = accuracy_score(target_test, target_pred)
    accuracies.append(accuracy)

average_accuracy = np.mean(accuracies)
print(average_accuracy)

0.9397260273972602
