In [1]:
# import dependencies for training, testing, scaling, and model creation
from path import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

# import dependencies for model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
# Load the cleaned and preprocessed stroke dataset from a relative path.
# NOTE(review): `Path` is imported from the `path` package at the top of this
# notebook, not from the stdlib `pathlib` -- confirm that is intended.
path = Path("../resources/clean_stroke_df.csv")

clean_stroke_df = pd.read_csv(path)
# Preview the first rows of the loaded frame.
clean_stroke_df.head()

Unnamed: 0.1,Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,9046,67,0,1,228,36,1,0,1,...,0,1,0,0,0,1,0,1,0,0
1,2,31112,80,0,1,105,32,1,0,1,...,0,1,0,0,1,0,0,0,1,0
2,3,60182,49,0,0,171,34,1,1,0,...,0,1,0,0,0,1,0,0,0,1
3,4,1665,79,1,0,174,24,1,1,0,...,0,0,1,0,1,0,0,0,1,0
4,5,56669,81,0,0,186,29,1,0,1,...,0,1,0,0,0,1,0,1,0,0


In [3]:
# Drop the "id" column and the "Unnamed: 0" column (the latter is a result of
# to_csv() having been called with index=True).
# errors="ignore" keeps this cell safe to re-run: the frame is reassigned to the
# same name, so a second execution would otherwise raise KeyError because the
# columns have already been removed.
clean_stroke_df = clean_stroke_df.drop(columns=["Unnamed: 0", "id"], errors="ignore")
clean_stroke_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67,0,1,228,36,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
1,80,0,1,105,32,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
2,49,0,0,171,34,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
3,79,1,0,174,24,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
4,81,0,0,186,29,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0


In [4]:
# Summarize the remaining columns: names, non-null counts and dtypes.
clean_stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4909 entries, 0 to 4908
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype
---  ------                          --------------  -----
 0   age                             4909 non-null   int64
 1   hypertension                    4909 non-null   int64
 2   heart_disease                   4909 non-null   int64
 3   avg_glucose_level               4909 non-null   int64
 4   bmi                             4909 non-null   int64
 5   stroke                          4909 non-null   int64
 6   gender_Female                   4909 non-null   int64
 7   gender_Male                     4909 non-null   int64
 8   gender_Other                    4909 non-null   int64
 9   ever_married_No                 4909 non-null   int64
 10  ever_married_Yes                4909 non-null   int64
 11  work_type_Govt_job              4909 non-null   int64
 12  work_type_Never_worked          4909 non-null   int64
 13  wor

In [5]:
# Check the class distribution of the "stroke" target vector. The recorded
# output shows a heavy imbalance (4700 negatives vs. 209 positives).
clean_stroke_df["stroke"].value_counts()

0    4700
1     209
Name: stroke, dtype: int64

In [6]:
# Build the feature matrix (every column except "stroke") and the target vector
# (the "stroke" column), then print both shapes to confirm the row counts match.
# to_numpy() is used instead of .values, as recommended by the pandas docs.
X = clean_stroke_df.drop(columns=["stroke"]).to_numpy()
y = clean_stroke_df["stroke"].to_numpy()
print(X.shape, y.shape)

(4909, 21) (4909,)


In [7]:
# Peek at the first five rows of the feature matrix.
X[:5]

array([[ 67,   0,   1, 228,  36,   0,   1,   0,   0,   1,   0,   0,   1,
          0,   0,   0,   1,   0,   1,   0,   0],
       [ 80,   0,   1, 105,  32,   0,   1,   0,   0,   1,   0,   0,   1,
          0,   0,   1,   0,   0,   0,   1,   0],
       [ 49,   0,   0, 171,  34,   1,   0,   0,   0,   1,   0,   0,   1,
          0,   0,   0,   1,   0,   0,   0,   1],
       [ 79,   1,   0, 174,  24,   1,   0,   0,   0,   1,   0,   0,   0,
          1,   0,   1,   0,   0,   0,   1,   0],
       [ 81,   0,   0, 186,  29,   0,   1,   0,   0,   1,   0,   0,   1,
          0,   0,   0,   1,   0,   1,   0,   0]], dtype=int64)

In [8]:
# Peek at the first five target labels.
y[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [9]:
# Hold out 25% of the rows for testing. The split is stratified on y and uses a
# fixed random_state so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Report the shape of each partition on a single line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(3681, 21) (1228, 21) (3681,) (1228,)


In [10]:
# Standardize the features: the scaler is fitted on the training split only and
# the fitted scaler is then applied to both the training and testing splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Print the first two scaled rows of each split, separated by a dashed rule.
# The "\n " separator reproduces the single leading space before the rule and
# before the second array.
print(
    "\n ".join(
        [str(X_train_scaled[:2]), "----------", str(X_test_scaled[:2])]
    )
)

[[ 0.66644509 -0.32568345  4.35952231  2.66626516 -0.32047536 -1.21674082
   1.21742795 -0.01648451 -0.72011413  0.72011413 -0.38211902 -0.06178863
   0.85380369 -0.43468781 -0.38909116  1.00572134 -1.00572134 -0.64395731
  -0.4548464  -0.78252934  2.32896624]
 [-0.67061015 -0.32568345 -0.22938293  0.03762395  1.21305703  0.82186772
  -0.82140385 -0.01648451  1.38866877 -1.38866877 -0.38211902 -0.06178863
   0.85380369 -0.43468781 -0.38909116 -0.99431121  0.99431121 -0.64395731
  -0.4548464   1.27790736 -0.42937505]]
 ----------
 [[-0.58147313 -0.32568345 -0.22938293 -0.49701494 -0.70385845  0.82186772
  -0.82140385 -0.01648451 -0.72011413  0.72011413  2.61698568 -0.06178863
  -1.17122942 -0.43468781 -0.38909116 -0.99431121  0.99431121  1.55289798
  -0.4548464  -0.78252934 -0.42937505]
 [ 0.26532852 -0.32568345 -0.22938293 -0.38563184  0.44629084 -1.21674082
   1.21742795 -0.01648451 -0.72011413  0.72011413 -0.38211902 -0.06178863
   0.85380369 -0.43468781 -0.38909116  1.00572134 -1.00