In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.ensemble import RandomForestClassifier

In [None]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [2]:
#Count NaNs in each column
column_nan_count = df.isnull().sum()
print("NaN count per column:")
print(column_nan_count)

#Summary
df.describe().transpose()

#Remove the ID column as it is not informative
df.drop('id', axis=1, inplace=True, errors='ignore')

df.head()

#Check number of patients with and without stroke
print(df.value_counts("stroke"))

#Remove rows with NaNs
df = df.dropna()

print(df.value_counts("stroke"))

#Check unique values
print(df['gender'].unique())
print(df['ever_married'].unique())
print(df['work_type'].unique())
print(df['Residence_type'].unique())
print(df['smoking_status'].unique())

NaN count per column:
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64
stroke
0    4861
1     249
Name: count, dtype: int64
stroke
0    4700
1     209
Name: count, dtype: int64
['Male' 'Female' 'Other']
['Yes' 'No']
['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
['Urban' 'Rural']
['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [3]:
#Check data types per column
df.dtypes

cat_columns = df.select_dtypes(['object']).columns
cat_columns

#Change object type to categorical 
for col in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
    df[col] = df[col].astype('category')

df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,1,81.0,0,0,1,2,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,0,13.0,0,0,0,4,0,103.08,18.6,0,0
5106,0,81.0,0,0,1,3,1,125.20,40.0,2,0
5107,0,35.0,0,0,1,3,0,82.99,30.6,2,0
5108,1,51.0,0,0,1,2,0,166.29,25.6,1,0


In [None]:
#Creating training and test datasets
features = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']
X = df.loc[:, features]
y = df.loc[:, ['stroke']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=True, stratify=y)

X_train.head()

y_train.head()
print(y_train.value_counts("stroke"))

X_test.head()

y_test.head()
print(y_test.value_counts("stroke"))

stroke
0    3149
1     140
Name: count, dtype: int64
stroke
0    1551
1      69
Name: count, dtype: int64


In [None]:
#Perform UpSampling
ros = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

#Perform DownSampling
rus = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_resampled_down, y_resampled_down = rus.fit_resample(X_train, y_train)

print(f"After Upsampling: {Counter(y_resampled['stroke'])}")

print(f"After Downsampling: {Counter(y_resampled_down['stroke'])}")

After Upsampling: Counter({0: 3149, 1: 3149})
After Downsampling: Counter({0: 140, 1: 140})


After trying logistic regression with upsampled and downsampled data, results are very similar. 

- Accuracy is the proportion of the predictions that resulted correct. In our case (heaviliy imbalanced data), this metric is not very informative, because just for mere probability, the model is going to correctly predict negative class. 

- Recall is the proportion of actual positives that were classified correctly as positives. This means, out of all patients with stroke, how many of them were correctly diagnosed? This focuses on avoiding missing positives. 

- Precision is the the proportion of predictions classified as positive that are actually positive. This focuses on avoiding false positives. 

In [67]:
#Logistic Regression
clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

for t in [0.2, 0.3]:
    y_pred_t = (y_proba > t).astype(int)
    print(f"\nThreshold: {t}")
    print(classification_report(y_test, y_pred_t))

  y = column_or_1d(y, warn=True)


ROC-AUC: 0.8150328446350649
              precision    recall  f1-score   support

           0       0.98      0.75      0.85      1551
           1       0.11      0.70      0.19        69

    accuracy                           0.75      1620
   macro avg       0.55      0.72      0.52      1620
weighted avg       0.95      0.75      0.82      1620


Threshold: 0.2
              precision    recall  f1-score   support

           0       0.99      0.51      0.67      1551
           1       0.08      0.94      0.15        69

    accuracy                           0.53      1620
   macro avg       0.54      0.73      0.41      1620
weighted avg       0.96      0.53      0.65      1620


Threshold: 0.3
              precision    recall  f1-score   support

           0       0.99      0.60      0.75      1551
           1       0.09      0.87      0.16        69

    accuracy                           0.61      1620
   macro avg       0.54      0.74      0.45      1620
weighted avg  

Random Forest with Upsampling resulted in better recall values, therefore improving diagnosis of stroke. The best value for threshold was 0.2. 

In [68]:
#Random Forest
rf = RandomForestClassifier()
rf.fit(X_resampled, y_resampled)

y_pred = rf.predict(X_test)

#Calculates probability of each sample being positive class
y_proba = rf.predict_proba(X_test)[:, 1]

#This probability (0.2) indices the threshold above which the model will predict as positive
y_pred_t = (y_proba > 0.2).astype(int)

print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred_t))

  return fit_method(estimator, *args, **kwargs)


ROC-AUC: 0.7677655369607266
              precision    recall  f1-score   support

           0       0.96      0.92      0.94      1551
           1       0.11      0.25      0.16        69

    accuracy                           0.89      1620
   macro avg       0.54      0.58      0.55      1620
weighted avg       0.93      0.89      0.91      1620



In [69]:
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

print("ROC-AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))
print(classification_report(y_test, y_pred))

  return fit_method(estimator, *args, **kwargs)


ROC-AUC: 0.7672189050542426
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1551
           1       0.00      0.00      0.00        69

    accuracy                           0.96      1620
   macro avg       0.48      0.50      0.49      1620
weighted avg       0.92      0.96      0.94      1620

