In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the stroke dataset from a CSV file (relative path) into the working frame `df`.
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

## 資料探索

In [3]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,983.0,983.0,983.0,983.0,983.0,913.0,983.0
mean,36449.054934,48.636012,0.1353,0.087487,113.480427,29.611391,0.253306
std,21514.707247,23.124822,0.342218,0.282692,51.125018,7.916709,0.435126
min,91.0,0.16,0.0,0.0,55.22,13.2,0.0
25%,17008.5,31.0,0.0,0.0,78.935,24.2,0.0
50%,36811.0,52.0,0.0,0.0,93.96,28.4,0.0
75%,54874.0,68.0,0.0,0.0,127.52,33.6,1.0
max,72918.0,82.0,1.0,1.0,271.74,64.8,1.0


In [5]:
# Remove the `id` column (presumably a record identifier with no predictive
# meaning -- confirm against the dataset description) before modelling.
df.drop(columns=["id"], inplace=True)
# Show the remaining columns with their dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 983 entries, 0 to 982
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             983 non-null    object 
 1   age                983 non-null    float64
 2   hypertension       983 non-null    int64  
 3   heart_disease      983 non-null    int64  
 4   ever_married       983 non-null    object 
 5   work_type          983 non-null    object 
 6   Residence_type     983 non-null    object 
 7   avg_glucose_level  983 non-null    float64
 8   bmi                913 non-null    float64
 9   smoking_status     983 non-null    object 
 10  stroke             983 non-null    int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 84.6+ KB


In [6]:
# Class balance of the target column.
df["stroke"].value_counts()

0    734
1    249
Name: stroke, dtype: int64

In [7]:
# Frequency of each marital-status value.
df["ever_married"].value_counts()

Yes    708
No     275
Name: ever_married, dtype: int64

In [8]:
# Frequency of each work-type category.
df["work_type"].value_counts()

Private          579
Self-employed    165
Govt_job         129
children         107
Never_worked       3
Name: work_type, dtype: int64

In [9]:
# Frequency of each residence-type value.
df["Residence_type"].value_counts()

Urban    496
Rural    487
Name: Residence_type, dtype: int64

In [10]:
# Frequency of each smoking-status category (includes an "Unknown" level).
df["smoking_status"].value_counts()

never smoked       362
Unknown            280
formerly smoked    182
smokes             159
Name: smoking_status, dtype: int64

In [11]:
# Mean of every numeric feature within each target class (stroke = 0 vs. 1).
df.groupby("stroke").mean(numeric_only=True)

Unnamed: 0_level_0,age,hypertension,heart_disease,avg_glucose_level,bmi
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,42.159237,0.091281,0.053134,107.013106,29.356108
1,67.728193,0.26506,0.188755,132.544739,30.471292


## 處理缺失數據

In [12]:
# Missing-value count per column, largest first.
df.isnull().sum().sort_values(ascending=False)

bmi                  70
stroke                0
smoking_status        0
avg_glucose_level     0
Residence_type        0
work_type             0
ever_married          0
heart_disease         0
hypertension          0
age                   0
gender                0
dtype: int64

# Preview: for every row, the mean BMI of that row's gender group
# (same length and index as `df`).
df.groupby("gender")["bmi"].transform("mean")

In [13]:
# Impute missing BMI values with the mean BMI of the row's gender group.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `df["bmi"]` selection: that chained in-place
# form emits a FutureWarning in pandas 2.x and leaves `df` unchanged when
# Copy-on-Write is active.
gender_mean_bmi = df.groupby("gender")["bmi"].transform("mean")
df["bmi"] = df["bmi"].fillna(gender_mean_bmi)
# Verify that no missing values remain in any column.
df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

## 類別資料的處理

In [14]:
# One-hot encode the categorical columns into integer (0/1) indicator columns;
# every level of each listed column gets its own `<column>_<level>` column.
df = pd.get_dummies(
    df,
    columns=[
        "gender",
        "ever_married",
        "work_type",
        "Residence_type",
        "smoking_status",
    ],
    dtype=int,
)
df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.600000,1,0,1,0,1,...,0,1,0,0,0,1,0,1,0,0
1,61.0,0,0,202.21,29.807366,1,1,0,0,1,...,0,0,1,0,1,0,0,0,1,0
2,80.0,0,1,105.92,32.500000,1,0,1,0,1,...,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.400000,1,1,0,0,1,...,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.000000,1,1,0,0,1,...,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
978,80.0,1,0,83.75,29.807366,0,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
979,81.0,0,0,125.20,40.000000,0,1,0,0,1,...,0,0,1,0,0,1,0,0,1,0
980,35.0,0,0,82.99,30.600000,0,1,0,0,1,...,0,0,1,0,1,0,0,0,1,0
981,51.0,0,0,166.29,25.600000,0,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0


In [15]:
# Check column names, dtypes and non-null counts after one-hot encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 983 entries, 0 to 982
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             983 non-null    float64
 1   hypertension                    983 non-null    int64  
 2   heart_disease                   983 non-null    int64  
 3   avg_glucose_level               983 non-null    float64
 4   bmi                             983 non-null    float64
 5   stroke                          983 non-null    int64  
 6   gender_Female                   983 non-null    int64  
 7   gender_Male                     983 non-null    int64  
 8   ever_married_No                 983 non-null    int64  
 9   ever_married_Yes                983 non-null    int64  
 10  work_type_Govt_job              983 non-null    int64  
 11  work_type_Never_worked          983 non-null    int64  
 12  work_type_Private               983 

In [16]:
# Pairwise correlation matrix of all columns (features and target).
df.corr()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
age,1.0,0.287779,0.294929,0.273895,0.27206,0.481116,0.005575,-0.005575,-0.615252,0.615252,...,-0.080518,0.129511,0.300313,-0.630825,-0.064437,0.064437,-0.32726,0.226206,0.085299,0.050786
hypertension,0.287779,1.0,0.109096,0.188672,0.146757,0.220959,-0.037485,0.037485,-0.13389,0.13389,...,-0.021886,0.003997,0.14066,-0.138247,0.012545,-0.012545,-0.163977,0.056474,0.080292,0.036244
heart_disease,0.294929,0.109096,1.0,0.220357,0.0261,0.208752,-0.106563,0.106563,-0.112766,0.112766,...,-0.017132,0.024477,0.063241,-0.108216,-0.02597,0.02597,-0.075755,0.065603,-0.034863,0.069321
avg_glucose_level,0.273895,0.188672,0.220357,1.0,0.230443,0.2173,-0.11236,0.11236,-0.194678,0.194678,...,-0.005476,0.051442,0.043678,-0.137073,-0.00913,0.00913,-0.107517,0.060016,0.026928,0.033206
bmi,0.27206,0.146757,0.0261,0.230443,1.0,0.054963,0.031212,-0.031212,-0.302718,0.302718,...,-0.037304,0.198613,0.061385,-0.413091,0.002926,-0.002926,-0.234847,0.081808,0.079779,0.097061
stroke,0.481116,0.220959,0.208752,0.2173,0.054963,1.0,-0.025715,0.025715,-0.211876,0.211876,...,-0.032225,0.011104,0.145231,-0.188539,-0.043791,0.043791,-0.123998,0.14392,-0.008229,0.010954
gender_Female,0.005575,-0.037485,-0.106563,-0.11236,0.031212,-0.025715,1.0,-1.0,-0.017031,0.017031,...,-0.028627,0.006513,0.049665,-0.059165,0.006804,-0.006804,-0.053299,-0.074578,0.094896,0.019694
gender_Male,-0.005575,0.037485,0.106563,0.11236,-0.031212,0.025715,-1.0,1.0,0.017031,-0.017031,...,0.028627,-0.006513,-0.049665,0.059165,-0.006804,0.006804,0.053299,0.074578,-0.094896,-0.019694
ever_married_No,-0.615252,-0.13389,-0.112766,-0.194678,-0.302718,-0.211876,-0.017031,0.017031,1.0,-1.0,...,0.088776,-0.17033,-0.146502,0.560777,-0.005626,0.005626,0.319696,-0.192052,-0.09055,-0.070663
ever_married_Yes,0.615252,0.13389,0.112766,0.194678,0.302718,0.211876,0.017031,-0.017031,-1.0,1.0,...,-0.088776,0.17033,0.146502,-0.560777,0.005626,-0.005626,-0.319696,0.192052,0.09055,0.070663


## 移除不需要的欄位

In [17]:
# Drop selected indicator columns from each one-hot group; the dropped levels
# become the implicit baseline for that category (two levels are dropped for
# both work_type and smoking_status).
df.drop(
    columns=[
        "gender_Female",
        "work_type_Never_worked",
        "work_type_children",
        "ever_married_No",
        "Residence_type_Rural",
        "smoking_status_Unknown",
        "smoking_status_never smoked",
    ],
    inplace=True,
)
df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,ever_married_Yes,work_type_Govt_job,work_type_Private,work_type_Self-employed,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.600000,1,1,1,0,1,0,1,1,0
1,61.0,0,0,202.21,29.807366,1,0,1,0,0,1,0,0,0
2,80.0,0,1,105.92,32.500000,1,1,1,0,1,0,0,0,0
3,49.0,0,0,171.23,34.400000,1,0,1,0,1,0,1,0,1
4,79.0,1,0,174.12,24.000000,1,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
978,80.0,1,0,83.75,29.807366,0,0,1,0,1,0,1,0,0
979,81.0,0,0,125.20,40.000000,0,0,1,0,0,1,1,0,0
980,35.0,0,0,82.99,30.600000,0,0,1,0,0,1,0,0,0
981,51.0,0,0,166.29,25.600000,0,1,1,0,1,0,0,1,0


## 特徵縮放 normalization 

In [18]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the continuous features. A column counts as continuous when it
# has more than five distinct values; the 0/1 indicator columns and the target
# are left untouched.
scale_cols = [col_name for col_name in df.columns if df[col_name].nunique() > 5]

# Fit ONE scaler on all continuous columns at once. MinMaxScaler scales each
# column independently, so the scaled values are the same as with per-column
# fitting. The difference is that `scal` now keeps the min/max of every scaled
# column, so it can be reused to transform new data at inference time;
# re-fitting the same scaler inside a per-column loop left it holding only the
# last column's parameters.
# NOTE(review): the scaler is still fitted before the train/test split, so the
# test rows influence the scaling range -- consider fitting on training rows only.
scal = MinMaxScaler()
if scale_cols:  # fit_transform would raise on an empty column selection
    df[scale_cols] = scal.fit_transform(df[scale_cols])
df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,ever_married_Yes,work_type_Govt_job,work_type_Private,work_type_Self-employed,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_smokes
0,0.816716,0,1,0.801173,0.453488,1,1,1,0,1,0,1,1,0
1,0.743402,0,0,0.678875,0.321848,1,0,1,0,0,1,0,0,0
2,0.975562,0,1,0.234159,0.374031,1,1,1,0,1,0,0,0,0
3,0.596774,0,0,0.535793,0.410853,1,0,1,0,1,0,1,0,1
4,0.963343,1,0,0.549141,0.209302,1,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
978,0.975562,1,0,0.131766,0.321848,0,0,1,0,1,0,1,0,0
979,0.987781,0,0,0.323203,0.519380,0,0,1,0,0,1,1,0,0
980,0.425709,0,0,0.128256,0.337209,0,0,1,0,0,1,0,0,0
981,0.621212,0,0,0.512978,0.240310,0,1,1,0,1,0,0,1,0


from sklearn.preprocessing import LabelEncoder

# Integer-encode any column that still has dtype `object`.
# NOTE(review): the categorical columns were already one-hot encoded earlier in
# the notebook, so this loop is expected to find nothing -- confirm and
# consider deleting this cell.
le = LabelEncoder()
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    df[name] = le.fit_transform(df[name])
df

In [19]:
# Separate the target (`stroke`) from the predictor columns.
y = df["stroke"]
X = df.drop(columns=["stroke"])

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 1. Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier, with the solver capped at 600 iterations.
lr = LogisticRegression(max_iter=600)
# Fit on the training split; as the last expression this also echoes the estimator.
lr.fit(X_train, y_train)

LogisticRegression(max_iter=600)

In [22]:
# Predict the class label for every held-out row with the logistic-regression model.
# NOTE: `predictions` is reused by every model section below, so the metric
# cells always describe whichever model predicted most recently.
predictions = lr.predict(X_test)
predictions

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0])

from sklearn.metrics import accuracy_score

# Test-set accuracy, as a percentage rounded to two decimals.
# The variable was previously named `accuracy_using_decision_tree`, although in
# notebook order `predictions` at this point come from the logistic-regression
# model `lr`; it is renamed to match what it actually holds.
accuracy_using_logistic_regression = round(accuracy_score(y_test, predictions)*100, 2)
print("Model accuracy: ", accuracy_using_logistic_regression, "%")

In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score

# Report accuracy (as a percentage), recall and precision of the current
# `predictions` against the held-out labels.
print(f"Accuracy: {round(accuracy_score(y_test, predictions)*100, 2)} %")
print(f"Recall: {recall_score(y_test, predictions)}")
print(f"Precision: {precision_score(y_test, predictions)}")

Accuracy: 78.98 %
Recall: 0.4057971014492754
Precision: 0.5714285714285714


In [24]:
# Confusion matrix as a labelled table: rows are the true classes, columns the
# predicted classes.
pd.DataFrame(
    data=confusion_matrix(y_test, predictions),
    index=['True not Stroke', 'True Stroke'],
    columns=['Predicted Not Stroke', 'Predicted Stroke'],
)

Unnamed: 0,Predicted Not Stroke,Predicted Stroke
True not Stroke,205,21
True Stroke,41,28


# 2. Decision Tree Classifier

In [25]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree classifier. The seed is fixed so the fitted tree and its metrics
# are reproducible across runs, consistent with the train/test split and the
# other seeded estimators in this notebook (all use random_state=42).
dt = DecisionTreeClassifier(random_state=42)
# Fit on the training split; as the last expression this also echoes the estimator.
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [26]:
# Inspect the held-out feature rows (scaled continuous columns plus 0/1 indicators).
X_test

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Male,ever_married_Yes,work_type_Govt_job,work_type_Private,work_type_Self-employed,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_smokes
810,0.780059,1,0,0.122206,0.350775,1,1,0,1,0,1,1,0
801,0.511241,0,0,0.133059,0.352713,0,0,0,1,0,1,0,0
813,0.890029,0,0,0.122668,0.302326,0,1,0,1,0,0,0,0
497,0.865591,0,0,0.163172,0.507752,0,1,0,1,0,1,1,0
67,0.926686,0,0,0.227462,0.137597,1,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
817,0.144673,0,0,0.285516,0.094961,1,0,0,0,0,0,0,0
558,0.963343,0,1,0.191992,0.222868,1,1,0,0,1,1,0,0
318,0.633431,0,0,0.107057,1.000000,1,1,0,0,1,1,0,0
568,0.535679,0,0,0.294615,0.315891,1,1,0,1,0,0,0,0


In [27]:
# Predict the class label for every held-out row with the decision tree
# (overwrites the previous model's `predictions`).
predictions = dt.predict(X_test)
predictions

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0])

In [28]:
# Accuracy (as a percentage), recall and precision of the current `predictions`.
print(f"Accuracy: {round(accuracy_score(y_test, predictions)*100, 2)} %")
print(f"Recall: {recall_score(y_test, predictions)}")
print(f"Precision: {precision_score(y_test, predictions)}")

Accuracy: 71.19 %
Recall: 0.4492753623188406
Precision: 0.3974358974358974


In [29]:
# Labelled confusion matrix: rows are the true classes, columns the predicted classes.
pd.DataFrame(
    data=confusion_matrix(y_test, predictions),
    index=['True not Stroke', 'True Stroke'],
    columns=['Predicted Not Stroke', 'Predicted Stroke'],
)

Unnamed: 0,Predicted Not Stroke,Predicted Stroke
True not Stroke,179,47
True Stroke,38,31


# 3. Averaged Perceptron

In [30]:
from sklearn.linear_model import Perceptron

# Perceptron with learning rate 0.1, at most 100 passes and a fixed seed.
# NOTE(review): scikit-learn's `Perceptron` keeps the final weight vector rather
# than an average over updates; if an *averaged* perceptron is intended,
# SGDClassifier(loss="perceptron", average=True) is the likely equivalent --
# confirm against the scikit-learn documentation.
pt = Perceptron(
    eta0=0.1,
    max_iter=100,
    random_state=42,
)
pt.fit(X_train, y_train)

Perceptron(eta0=0.1, max_iter=100, random_state=42)

In [31]:
# Predict the class label for every held-out row with the perceptron
# (overwrites the previous model's `predictions`).
predictions = pt.predict(X_test)
predictions

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0])

In [32]:
# Accuracy (as a percentage), recall and precision of the current `predictions`.
print(f"Accuracy: {round(accuracy_score(y_test, predictions)*100, 2)} %")
print(f"Recall: {recall_score(y_test, predictions)}")
print(f"Precision: {precision_score(y_test, predictions)}")

Accuracy: 72.88 %
Recall: 0.7246376811594203
Precision: 0.45045045045045046


In [33]:
# Labelled confusion matrix: rows are the true classes, columns the predicted classes.
pd.DataFrame(
    data=confusion_matrix(y_test, predictions),
    index=['True not Stroke', 'True Stroke'],
    columns=['Predicted Not Stroke', 'Predicted Stroke'],
)

Unnamed: 0,Predicted Not Stroke,Predicted Stroke
True not Stroke,165,61
True Stroke,19,50


# 4. Random Forest Classification

In [34]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest classifier with a fixed seed so the fitted ensemble is reproducible.
rf = RandomForestClassifier(random_state=42)
# Fit on the training split; as the last expression this also echoes the estimator.
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [35]:
# Predict the class label for every held-out row with the random forest
# (overwrites the previous model's `predictions`).
predictions = rf.predict(X_test)
predictions

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0])

In [36]:
# Accuracy (as a percentage), recall and precision of the current `predictions`.
print(f"Accuracy: {round(accuracy_score(y_test, predictions)*100, 2)} %")
print(f"Recall: {recall_score(y_test, predictions)}")
print(f"Precision: {precision_score(y_test, predictions)}")

Accuracy: 78.64 %
Recall: 0.463768115942029
Precision: 0.5517241379310345


In [37]:
# Labelled confusion matrix: rows are the true classes, columns the predicted classes.
pd.DataFrame(
    data=confusion_matrix(y_test, predictions),
    index=['True not Stroke', 'True Stroke'],
    columns=['Predicted Not Stroke', 'Predicted Stroke'],
)

Unnamed: 0,Predicted Not Stroke,Predicted Stroke
True not Stroke,200,26
True Stroke,37,32


# 5. Support Vector Machine

In [38]:
from sklearn.svm import SVC

# Support-vector classifier with a fixed seed.
# The class is imported directly so that the name `svm` is only ever bound to
# the fitted estimator. Previously `svm = svm.SVC(...)` rebound the imported
# `sklearn.svm` module name to the estimator instance. The variable name `svm`
# is kept because the prediction and export cells refer to it.
svm = SVC(random_state=42)
# Fit on the training split; as the last expression this also echoes the estimator.
svm.fit(X_train, y_train)

SVC(random_state=42)

In [39]:
# Predict the class label for every held-out row with the support-vector
# classifier (overwrites the previous model's `predictions`).
predictions = svm.predict(X_test)
predictions

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0])

In [40]:
# Accuracy (as a percentage), recall and precision of the current `predictions`.
print(f"Accuracy: {round(accuracy_score(y_test, predictions)*100, 2)} %")
print(f"Recall: {recall_score(y_test, predictions)}")
print(f"Precision: {precision_score(y_test, predictions)}")

Accuracy: 75.25 %
Recall: 0.2463768115942029
Precision: 0.4473684210526316


In [41]:
# Labelled confusion matrix: rows are the true classes, columns the predicted classes.
pd.DataFrame(
    data=confusion_matrix(y_test, predictions),
    index=['True not Stroke', 'True Stroke'],
    columns=['Predicted Not Stroke', 'Predicted Stroke'],
)

Unnamed: 0,Predicted Not Stroke,Predicted Stroke
True not Stroke,205,21
True Stroke,52,17


# 6. Neural Networks

In [42]:
from sklearn.neural_network import MLPClassifier

# Small multilayer perceptron: a single hidden layer of 6 units, trained with
# the 'lbfgs' solver for at most 900 iterations, with a fixed seed.
clf = MLPClassifier(
    hidden_layer_sizes=(6,),
    solver='lbfgs',
    alpha=1e-5,
    max_iter=900,
    random_state=42,
)

# Fit on the training split; as the last expression this also echoes the estimator.
clf.fit(X_train, y_train)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(6,), max_iter=900,
              random_state=42, solver='lbfgs')

In [43]:
# Predict the class label for every held-out row with the neural network
# (overwrites the previous model's `predictions`).
predictions = clf.predict(X_test)
predictions

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0])

In [44]:
# Accuracy (as a percentage), recall and precision of the current `predictions`.
print(f"Accuracy: {round(accuracy_score(y_test, predictions)*100, 2)} %")
print(f"Recall: {recall_score(y_test, predictions)}")
print(f"Precision: {precision_score(y_test, predictions)}")

Accuracy: 73.9 %
Recall: 0.4492753623188406
Precision: 0.44285714285714284


In [45]:
# Labelled confusion matrix: rows are the true classes, columns the predicted classes.
pd.DataFrame(
    data=confusion_matrix(y_test, predictions),
    index=['True not Stroke', 'True Stroke'],
    columns=['Predicted Not Stroke', 'Predicted Stroke'],
)

Unnamed: 0,Predicted Not Stroke,Predicted Stroke
True not Stroke,187,39
True Stroke,38,31


# Model Export

In [46]:
import joblib

# Persist every fitted model to its own compressed (level 3) file.
# NOTE(review): only the estimators are saved here; the MinMaxScaler used during
# preprocessing is not exported, so inference code must reproduce the same
# scaling -- confirm how the saved models are consumed.
for fitted_model, file_name in (
    (lr, "Stroke-LR.pkl"),
    (dt, "Stroke-DT.pkl"),
    (pt, "Stroke-PT.pkl"),
    (rf, "Stroke-RF.pkl"),
    (svm, "Stroke-SVM.pkl"),
):
    joblib.dump(fitted_model, file_name, compress=3)
# Kept as the final expression so the cell still echoes the last written path.
joblib.dump(clf, "Stroke-CLF.pkl", compress=3)

['Stroke-CLF.pkl']