In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Creating the modeling dataset
from sklearn.datasets import make_classification
# Data processing
import pandas as pd
import numpy as np
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Model and performance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
# Oversampling and under sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from collections import Counter

In [None]:
# loading the csv data to a Pandas DataFrame
df = pd.read_csv('/content/drive/My Drive/DS440/cleaned-stroke-data.csv')
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,31112,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,60182,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,1665,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,56669,Male,81,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3421,68398,Male,82,1,0,Yes,Self-employed,Rural,71.97,28.3,never smoked,0
3422,45010,Female,57,0,0,Yes,Private,Rural,77.93,21.7,never smoked,0
3423,44873,Female,81,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
3424,19723,Female,35,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0


In [None]:
# Find the number of data have stroke or not
StrokeNum = df.groupby(['stroke'])['stroke'].count()
print(StrokeNum)

stroke
0    3246
1     180
Name: stroke, dtype: int64


In [None]:
print(df['smoking_status'].unique())
print(df['work_type'].unique())
print(df['ever_married'].unique())
print(df['Residence_type'].unique())
print(df['gender'].unique())

['formerly smoked' 'never smoked' 'smokes']
['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
['Yes' 'No']
['Urban' 'Rural']
['Male' 'Female' 'Other']


In [None]:
one_hot_encoded_data = pd.get_dummies(df, columns = ['smoking_status', 'work_type'])
one_hot_encoded_data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,9046,Male,67,0,1,Yes,Urban,228.69,36.6,1,1,0,0,0,0,1,0,0
1,31112,Male,80,0,1,Yes,Rural,105.92,32.5,1,0,1,0,0,0,1,0,0
2,60182,Female,49,0,0,Yes,Urban,171.23,34.4,1,0,0,1,0,0,1,0,0
3,1665,Female,79,1,0,Yes,Rural,174.12,24.0,1,0,1,0,0,0,0,1,0
4,56669,Male,81,0,0,Yes,Urban,186.21,29.0,1,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3421,68398,Male,82,1,0,Yes,Rural,71.97,28.3,0,0,1,0,0,0,0,1,0
3422,45010,Female,57,0,0,Yes,Rural,77.93,21.7,0,0,1,0,0,0,1,0,0
3423,44873,Female,81,0,0,Yes,Urban,125.20,40.0,0,0,1,0,0,0,0,1,0
3424,19723,Female,35,0,0,Yes,Rural,82.99,30.6,0,0,1,0,0,0,0,1,0


In [None]:
df2 = one_hot_encoded_data
df2['gender'] = df2['gender'].replace({'Male': 1, 'Female': 0})
df2['ever_married'] = df2['ever_married'].replace({'Yes': 1, 'No': 0})
df2['Residence_type'] = df2['Residence_type'].replace({'Urban': 1, 'Rural': 0})
df2 = df2[df2['gender'] != 'Other']
df2

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,9046,1,67,0,1,1,1,228.69,36.6,1,1,0,0,0,0,1,0,0
1,31112,1,80,0,1,1,0,105.92,32.5,1,0,1,0,0,0,1,0,0
2,60182,0,49,0,0,1,1,171.23,34.4,1,0,0,1,0,0,1,0,0
3,1665,0,79,1,0,1,0,174.12,24.0,1,0,1,0,0,0,0,1,0
4,56669,1,81,0,0,1,1,186.21,29.0,1,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3421,68398,1,82,1,0,1,0,71.97,28.3,0,0,1,0,0,0,0,1,0
3422,45010,0,57,0,0,1,0,77.93,21.7,0,0,1,0,0,0,1,0,0
3423,44873,0,81,0,0,1,1,125.20,40.0,0,0,1,0,0,0,0,1,0
3424,19723,0,35,0,0,1,0,82.99,30.6,0,0,1,0,0,0,0,1,0


In [None]:
X = df2.drop(columns='stroke', axis=1)
y = df2['stroke']

In [None]:
X

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,9046,1,67,0,1,1,1,228.69,36.6,1,0,0,0,0,1,0,0
1,31112,1,80,0,1,1,0,105.92,32.5,0,1,0,0,0,1,0,0
2,60182,0,49,0,0,1,1,171.23,34.4,0,0,1,0,0,1,0,0
3,1665,0,79,1,0,1,0,174.12,24.0,0,1,0,0,0,0,1,0
4,56669,1,81,0,0,1,1,186.21,29.0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3421,68398,1,82,1,0,1,0,71.97,28.3,0,1,0,0,0,0,1,0
3422,45010,0,57,0,0,1,0,77.93,21.7,0,1,0,0,0,1,0,0
3423,44873,0,81,0,0,1,1,125.20,40.0,0,1,0,0,0,0,1,0
3424,19723,0,35,0,0,1,0,82.99,30.6,0,1,0,0,0,0,1,0


In [None]:
y

0       1
1       1
2       1
3       1
4       1
       ..
3421    0
3422    0
3423    0
3424    0
3425    0
Name: stroke, Length: 3425, dtype: int64

In [None]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the number of records
print('The number of records in the training dataset is', X_train.shape[0])
print('The number of records in the test dataset is', X_test.shape[0])
print(f"The training dataset has {sorted(Counter(y_train).items())[0][1]} records for the majority class and {sorted(Counter(y_train).items())[1][1]} records for the minority class.")

The number of records in the training dataset is 2740
The number of records in the test dataset is 685
The training dataset has 2604 records for the majority class and 136 records for the minority class.


In [None]:
from pandas._libs.hashtable import value_count
print(X_train['gender'].value_counts())
print(X_test['gender'].value_counts())

0    1683
1    1057
Name: gender, dtype: int64
0    403
1    282
Name: gender, dtype: int64


In [None]:
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# import SMOTE module from imblearn library
# pip install imblearn (if you don't have imblearn in your system)
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))


Before OverSampling, counts of label '1': 136
Before OverSampling, counts of label '0': 2604 

After OverSampling, the shape of train_X: (5208, 17)
After OverSampling, the shape of train_y: (5208,) 

After OverSampling, counts of label '1': 2604
After OverSampling, counts of label '0': 2604


In [None]:
lr1 = LogisticRegression()
lr1.fit(X_train_res, y_train_res.ravel())
predictions = lr1.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.97      0.66      0.78       641
           1       0.12      0.68      0.20        44

    accuracy                           0.66       685
   macro avg       0.54      0.67      0.49       685
weighted avg       0.91      0.66      0.75       685



In [None]:
# logistic regression object
lr = LogisticRegression()

# train the model on train set
lr.fit(X_train, y_train.ravel())

predictions = lr.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.94      1.00      0.97       641
           1       0.00      0.00      0.00        44

    accuracy                           0.94       685
   macro avg       0.47      0.50      0.48       685
weighted avg       0.88      0.94      0.90       685



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
