In [1]:
# Required imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

## Load the Data Into a Pandas DataFrame

In [2]:
# Read in the CSV file as a Pandas Dataframe
strokeinfo_df = pd.read_csv(
    Path("Resources/Stroke_prediction_Data.csv")
)

strokeinfo_df.head()


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
#drop rows with null values
strokeinfo_df = strokeinfo_df.dropna()

#copy strokeinfo_df into new dataframe
strokeinfo_drop_df = strokeinfo_df.copy()
strokeinfo_drop_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [4]:
# Get Dummies to transfrom column with encoding function
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
existing_categorical_cols = [col for col in categorical_cols if col in strokeinfo_drop_df.columns]
# Apply get_dummies
strokeinfo_drop_df = pd.get_dummies(strokeinfo_drop_df, columns=existing_categorical_cols, drop_first=True)

In [5]:
strokeinfo_drop_df.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,True,False,True,False,True,False,False,True,True,False,False
2,31112,80.0,0,1,105.92,32.5,1,True,False,True,False,True,False,False,False,False,True,False
3,60182,49.0,0,0,171.23,34.4,1,False,False,True,False,True,False,False,True,False,False,True
4,1665,79.0,1,0,174.12,24.0,1,False,False,True,False,False,True,False,False,False,True,False
5,56669,81.0,0,0,186.21,29.0,1,True,False,True,False,True,False,False,True,True,False,False


In [6]:
# @title avg_glucose_level

from matplotlib import pyplot as plt
strokeinfo_drop_df['avg_glucose_level'].plot(kind='hist', bins=20, title='avg_glucose_level')
plt.gca().spines[['top', 'right',]].set_visible(False)

In [7]:
# Scale age, avg_glucose_level and bmi
scaler = StandardScaler()
strokeinfo_drop_df[['age', 'avg_glucose_level', 'bmi']] = scaler.fit_transform(strokeinfo_drop_df[['age', 'avg_glucose_level', 'bmi']])
strokeinfo_drop_df.head()

#change index to id column
strokeinfo_drop_df.set_index('id', inplace=True)

#copy strokeinfo_drop_df into new dataframe strokeinfo2_df
strokeinfo2_df = strokeinfo_drop_df.copy()
strokeinfo2_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4909 entries, 9046 to 44679
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             4909 non-null   float64
 1   hypertension                    4909 non-null   int64  
 2   heart_disease                   4909 non-null   int64  
 3   avg_glucose_level               4909 non-null   float64
 4   bmi                             4909 non-null   float64
 5   stroke                          4909 non-null   int64  
 6   gender_Male                     4909 non-null   bool   
 7   gender_Other                    4909 non-null   bool   
 8   ever_married_Yes                4909 non-null   bool   
 9   work_type_Never_worked          4909 non-null   bool   
 10  work_type_Private               4909 non-null   bool   
 11  work_type_Self-employed         4909 non-null   bool   
 12  work_type_children              490

In [8]:
#split the strokeinfo2_df dataset into x_train and x_test and y_train and y_test
x_train, x_test, y_train, y_test = train_test_split(strokeinfo2_df.drop('stroke', axis=1), strokeinfo2_df['stroke'], test_size=0.2, random_state=42)

In [10]:
#pass the x_train and y_train data through smoteenn to fix imbalance
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=42)
x_resampled, y_resampled = smote_enn.fit_resample(x_train, y_train)
x_resampled.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
count,6844.0,6844.0,6844.0,6844.0,6844.0
mean,0.501669,0.111046,0.060053,0.340859,0.068987
std,0.995649,0.314212,0.237602,1.255779,0.857608
min,-1.897119,0.0,0.0,-1.129792,-2.240244
25%,-0.127052,0.0,0.0,-0.57946,-0.457578
50%,0.715415,0.0,0.0,-0.175376,-0.024606
75%,1.361892,0.0,0.0,1.082847,0.522937
max,1.735243,1.0,1.0,3.74686,8.035734


In [None]:
#copy the model strokeinfo2_df passed through smote into a new dataset
#strokeinfo2_df, strokeinfo2_df['stroke'] = smote.fit_resample(strokeinfo2_df, strokeinfo2_df['stroke'])
#strokeinfo2_df['stroke'].describe()


## Use PCA to reduce the number of factors

In [11]:
# Import the PCA module
from sklearn.decomposition import PCA

In [12]:
# Instantiate the PCA instance and declare the number of PCA variables
pca = PCA(n_components=4)

In [13]:
# Fit the x_train data into PCA model
X_train_pca = pca.fit_transform(x_resampled)

# Review the first 5 rows of list data
X_train_pca[:5]

array([[-1.17733904,  0.40080689,  0.64052088,  0.651347  ],
       [-0.52217008,  0.88505416,  0.33596884, -0.06974132],
       [ 2.09314527, -0.91805111,  0.58224826,  0.8398904 ],
       [-0.43525852,  1.53150096, -1.05877849, -1.00602921],
       [-0.20974116,  0.84191999, -0.36491159,  0.22442735]])

In [14]:
X_test_pca = pca.transform(x_test)

## PCA explained variance ratio

In [15]:
# Calculate the PCA explained variance ratio
pca.explained_variance_ratio_

array([0.38985364, 0.17051907, 0.12066552, 0.06790748])

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_pca, y_resampled)  # Train the model on the PCA-transformed training data

# Evaluate the model on the PCA-transformed test data
y_pred = model.predict(X_test_pca)
#Calculate accuracy, precision, recall
from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.7067209775967414
Precision: 0.13846153846153847
Recall: 0.8490566037735849


In [17]:
#print confusion matrix on prediction model
confusion_matrix(y_test, y_pred)

array([[649, 280],
       [  8,  45]], dtype=int64)

In [18]:
#run classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.70      0.82       929
           1       0.14      0.85      0.24        53

    accuracy                           0.71       982
   macro avg       0.56      0.77      0.53       982
weighted avg       0.94      0.71      0.79       982



## Dropping environmental variables to find better fit of model

In [19]:
#Copy new dataset from strokeinfo2_df but drop environmental variable columns: gender_Other, ever_married_Yes, work_type_Never_worked, work_type_Private, work_type_Self-employed, work_type_children, Residence_type_Urban
strokeinfo3_df = strokeinfo2_df.drop(columns=['gender_Other', 'ever_married_Yes', 'work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'Residence_type_Urban'])
strokeinfo3_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4909 entries, 9046 to 44679
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             4909 non-null   float64
 1   hypertension                    4909 non-null   int64  
 2   heart_disease                   4909 non-null   int64  
 3   avg_glucose_level               4909 non-null   float64
 4   bmi                             4909 non-null   float64
 5   stroke                          4909 non-null   int64  
 6   gender_Male                     4909 non-null   bool   
 7   smoking_status_formerly smoked  4909 non-null   bool   
 8   smoking_status_never smoked     4909 non-null   bool   
 9   smoking_status_smokes           4909 non-null   bool   
dtypes: bool(4), float64(3), int64(3)
memory usage: 287.6 KB


In [20]:
#split the strokeinfo2_df dataset into x_train and x_test and y_train and y_test
x_train2, x_test2, y_train2, y_test2 = train_test_split(strokeinfo3_df.drop('stroke', axis=1), strokeinfo3_df['stroke'], test_size=0.2, random_state=42)

In [21]:
#pass the x_train and y_train data through smoteenn to fix imbalance
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=42)
x_resampled2, y_resampled2 = smote_enn.fit_resample(x_train2, y_train2)
x_resampled.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
count,6844.0,6844.0,6844.0,6844.0,6844.0
mean,0.501669,0.111046,0.060053,0.340859,0.068987
std,0.995649,0.314212,0.237602,1.255779,0.857608
min,-1.897119,0.0,0.0,-1.129792,-2.240244
25%,-0.127052,0.0,0.0,-0.57946,-0.457578
50%,0.715415,0.0,0.0,-0.175376,-0.024606
75%,1.361892,0.0,0.0,1.082847,0.522937
max,1.735243,1.0,1.0,3.74686,8.035734


In [22]:
# Import the PCA module
from sklearn.decomposition import PCA

In [23]:
# Instantiate the PCA instance and declare the number of PCA variables
pca = PCA(n_components=4)

In [24]:
# Fit the x_train data into PCA model
X_train_pca2 = pca.fit_transform(x_resampled2)

# Review the first 5 rows of list data
X_train_pca2[:5]

array([[-1.31168514,  0.29604663,  0.52984318,  0.74668651],
       [-0.5929467 ,  0.73014314,  0.38799662,  0.02857227],
       [ 2.06722057, -0.76816542,  0.60781725,  0.9428743 ],
       [-0.26417017,  0.6825488 , -0.29657302,  0.3051145 ],
       [-1.49348787, -1.1985681 ,  0.55359797,  0.79547581]])

In [25]:
X_test_pca2 = pca.transform(x_test2)

In [26]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_pca2, y_resampled2)  # Train the model on the PCA-transformed training data

# Evaluate the model on the PCA-transformed test data
y_pred2 = model.predict(X_test_pca2)
#Calculate accuracy, precision, recall
from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy2 = accuracy_score(y_test2, y_pred2)
precision2 = precision_score(y_test2, y_pred2)
recall2 = recall_score(y_test2, y_pred2)

In [27]:
print(f"Accuracy: {accuracy2}")
print(f"Precision: {precision2}")
print(f"Recall: {recall2}")

Accuracy: 0.714867617107943
Precision: 0.14420062695924765
Recall: 0.8679245283018868


In [28]:
#print confusion matrix on prediction model
confusion_matrix(y_test2, y_pred2)

array([[656, 273],
       [  7,  46]], dtype=int64)

In [29]:
#run classification report
print(classification_report(y_test2, y_pred2))

              precision    recall  f1-score   support

           0       0.99      0.71      0.82       929
           1       0.14      0.87      0.25        53

    accuracy                           0.71       982
   macro avg       0.57      0.79      0.54       982
weighted avg       0.94      0.71      0.79       982

