In [1]:
import warnings
# Enable warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Exploratory Data Analysis

In [2]:
import pandas as pd

# Load the dataset
data = pd.read_csv("stroke.csv")

In [3]:
# Basic information about the dataset
print("Basic information about the dataset:")
print(data.info())

Basic information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4600 non-null   int64  
 1   gender             4600 non-null   object 
 2   age                4600 non-null   float64
 3   hypertension       4600 non-null   int64  
 4   heart_disease      4600 non-null   int64  
 5   ever_married       4600 non-null   object 
 6   work_type          4600 non-null   object 
 7   Residence_type     4600 non-null   object 
 8   avg_glucose_level  4600 non-null   float64
 9   bmi                4416 non-null   float64
 10  smoking_status     4600 non-null   object 
 11  stroke             4600 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 431.4+ KB
None


In [4]:
# Explore the size of the dataset (number of rows and columns)
print("Size of the dataset:")
print(data.shape)

Size of the dataset:
(4600, 12)


In [5]:
# Check for missing values in the dataset
print("Missing Values:")
print(data.isnull().sum())

Missing Values:
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  184
smoking_status         0
stroke                 0
dtype: int64


In [6]:
# Handling missing values in the 'bmi' column by filling with the mean value
mean_bmi = data['bmi'].mean()
data['bmi'].fillna(mean_bmi, inplace=True)

In [7]:
# Verify if there are any missing values left
print("\nMissing Values after Imputation:")
print(data.isnull().sum())


Missing Values after Imputation:
id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


In [8]:
# Analyze the distribution of the target variable 'stroke'
print("Distribution of the target variable 'stroke':")
print(data['stroke'].value_counts())

Distribution of the target variable 'stroke':
0    4374
1     226
Name: stroke, dtype: int64


# Data Preprocessing

In [9]:
# Drop irrelevant columns
data.drop(columns=['id'], inplace=True)

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Encode categorical features using one-hot encoding
cat_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = pd.DataFrame(encoder.fit_transform(data[cat_columns]), columns=encoder.get_feature_names_out(cat_columns))
data.drop(columns=cat_columns, inplace=True)
data = pd.concat([data, encoded_features], axis=1)

print(data.head())  # View the preprocessed dataset

     age  hypertension  heart_disease  avg_glucose_level        bmi  stroke  \
0   1.72             0              0              75.79  17.600000       0   
1  79.00             0              0             105.93  25.200000       0   
2  28.00             0              0              87.43  55.700000       0   
3  80.00             1              0              83.75  28.872849       0   
4  72.00             0              0             219.91  28.872849       1   

   gender_Male  gender_Other  ever_married_Yes  work_type_Never_worked  \
0          1.0           0.0               0.0                     0.0   
1          1.0           0.0               1.0                     0.0   
2          1.0           0.0               1.0                     0.0   
3          0.0           0.0               1.0                     0.0   
4          0.0           0.0               1.0                     0.0   

   work_type_Private  work_type_Self-employed  work_type_children  \
0          

# Model Training and Evaluation ( without SMOTE )

In [11]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Split the data into features (X) and target (y)
X = data.drop(columns=['stroke'])
y = data['stroke']

# Model Training and Evaluation (Without SMOTE)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
report_rf = classification_report(y_test, y_pred_rf)
confusion_rf = confusion_matrix(y_test, y_pred_rf)
print("Training and Evaluating Random Forest (Without SMOTE)")
print(report_rf)
print("Confusion Matrix for Random Forest (Without SMOTE):")
print(confusion_rf)
print("-" * 60)

# Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
report_gb = classification_report(y_test, y_pred_gb)
confusion_gb = confusion_matrix(y_test, y_pred_gb)
print("Training and Evaluating Gradient Boosting (Without SMOTE)")
print(report_gb)
print("Confusion Matrix for Gradient Boosting (Without SMOTE):")
print(confusion_gb)
print("-" * 60)

# Logistic Regression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
report_lr = classification_report(y_test, y_pred_lr)
confusion_lr = confusion_matrix(y_test, y_pred_lr)
print("Training and Evaluating Logistic Regression (Without SMOTE)")
print(report_lr)
print("Confusion Matrix for Logistic Regression (Without SMOTE):")
print(confusion_lr)
print("-" * 60)

Training and Evaluating Random Forest (Without SMOTE)
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       874
           1       0.00      0.00      0.00        46

    accuracy                           0.95       920
   macro avg       0.47      0.50      0.49       920
weighted avg       0.90      0.95      0.92       920

Confusion Matrix for Random Forest (Without SMOTE):
[[871   3]
 [ 46   0]]
------------------------------------------------------------
Training and Evaluating Gradient Boosting (Without SMOTE)
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       874
           1       0.00      0.00      0.00        46

    accuracy                           0.94       920
   macro avg       0.47      0.50      0.49       920
weighted avg       0.90      0.94      0.92       920

Confusion Matrix for Gradient Boosting (Without SMOTE):
[[868   6]
 [ 46   0]]
-------------------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Applying SMOTE to Handle Class Imbalance:

In [12]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Handling ClasApplying SMOTE to Handle Class Imbalance:s Imbalance with SMOTE
X = data.drop(columns=['stroke'])
y = data['stroke']
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the data into features (X) and target (y) for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Model Training and Evaluation
# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
report_rf = classification_report(y_test, y_pred_rf)
confusion_rf = confusion_matrix(y_test, y_pred_rf)

print("Training and Evaluating Random Forest (With SMOTE)")
print(report_rf)
print("Confusion Matrix for Random Forest (With SMOTE):")
print(confusion_rf)
print("-" * 60)

# Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
report_gb = classification_report(y_test, y_pred_gb)
confusion_gb = confusion_matrix(y_test, y_pred_gb)

print("Training and Evaluating Gradient Boosting (With SMOTE)")
print(report_gb)
print("Confusion Matrix for Gradient Boosting (With SMOTE):")
print(confusion_gb)
print("-" * 60)

# Logistic Regression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
report_lr = classification_report(y_test, y_pred_lr)
confusion_lr = confusion_matrix(y_test, y_pred_lr)

print("Training and Evaluating Logistic Regression (With SMOTE)")
print(report_lr)
print("Confusion Matrix for Logistic Regression (With SMOTE):")
print(confusion_lr)
print("-" * 60)

Training and Evaluating Random Forest (With SMOTE)
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       874
           1       0.99      0.96      0.98       876

    accuracy                           0.98      1750
   macro avg       0.98      0.98      0.98      1750
weighted avg       0.98      0.98      0.98      1750

Confusion Matrix for Random Forest (With SMOTE):
[[868   6]
 [ 31 845]]
------------------------------------------------------------
Training and Evaluating Gradient Boosting (With SMOTE)
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       874
           1       0.99      0.94      0.97       876

    accuracy                           0.97      1750
   macro avg       0.97      0.97      0.97      1750
weighted avg       0.97      0.97      0.97      1750

Confusion Matrix for Gradient Boosting (With SMOTE):
[[868   6]
 [ 50 826]]
-------------------------------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
