In [3]:
# importing the dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [4]:
# Data Collection and pre processing
stroke_data = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [5]:
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
stroke_data.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


In [7]:
stroke_data.shape

(5110, 12)

In [8]:
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [9]:
# check for null values
stroke_data.isnull().sum()

Unnamed: 0,0
id,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,201


In [10]:
# Fill missing BMI with median
stroke_data['bmi'].fillna(stroke_data['bmi'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stroke_data['bmi'].fillna(stroke_data['bmi'].median(), inplace=True)


In [11]:
# columns to encode
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

#create encoder
encoder = OneHotEncoder(drop='first', sparse_output=False)

# fit + transform
encoded_data = encoder.fit_transform(stroke_data[categorical_cols])

# create a new dataframe with the encoded data
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))

# Concatenate bck to original data
stroke_data = pd.concat([stroke_data, encoded_df], axis=1)

# Drop the original categorical columns
stroke_data.drop(columns=categorical_cols, inplace=True)

In [12]:
stroke_data.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,51676,61.0,0,0,202.21,28.1,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,31112,80.0,0,1,105.92,32.5,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,60182,49.0,0,0,171.23,34.4,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1665,79.0,1,0,174.12,24.0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [13]:
stroke_data.shape

(5110, 18)

In [14]:
# splitting the data into features and targets
X = stroke_data.drop(columns='stroke', axis=1)
Y = stroke_data['stroke']

In [15]:
# splitting the data into training data and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [16]:
print(X.shape, X_train.shape, X_test.shape)

(5110, 17) (4088, 17) (1022, 17)


In [17]:
# scaling the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
# Model training (Logistic Regression)
model = LogisticRegression(max_iter=1000, class_weight='balanced')

In [19]:
# training the logistic regression model with training data
model.fit(X_train, Y_train)

In [20]:
# model evaluation
# Accuracy score
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy on training data : ', training_data_accuracy)

Accuracy on training data :  0.7363013698630136


In [21]:
# accuracy on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy on testing data : ', testing_data_accuracy)

Accuracy on testing data :  0.735812133072407


In [22]:
# building a predictive system

def predict_stroke(input_str):

# split string by comma and strip spaces
    input_list = [x.strip() for x in input_str.split(',')]

# Convert numeric fields to float or int (id, age, hypertension, heart_disease, avg_glucose_level, bmi)
    input_list[0] = int(input_list[0])       # id
    input_list[2] = int(input_list[2])       # age
    input_list[3] = int(input_list[3])       # hypertension
    input_list[4] = int(input_list[4])       # heart_disease
    input_list[8] = float(input_list[8])     # avg_glucose_level
    input_list[9] = float(input_list[9])     # bmi

# column names
    columns = ['id','gender','age','hypertension','heart_disease','ever_married',
           'work_type','Residence_type','avg_glucose_level','bmi','smoking_status']

# converting to dataframe
    input_data_df = pd.DataFrame([input_list], columns=columns)

# encode categorical columns
    encoded_input = pd.DataFrame(encoder.transform(input_data_df[categorical_cols]),
                             columns=encoder.get_feature_names_out(categorical_cols))

# Concatenate bck to original data
    input_data_df = pd.concat([input_data_df, encoded_input], axis=1)

# Drop the original categorical columns
    input_data_df.drop(columns=categorical_cols, inplace=True)
# scaling
    input_data_df = scaler.transform(input_data_df)

# predict
    prediction = model.predict(input_data_df)
    print(prediction)

    if (prediction == 0):
      print('Stroke not predicted')
    else:
      print('Stroke predicted')

user_input = input("Enter patient data:")
predict_stroke(user_input)


Enter patient data:42072,Female,50,1,0,Yes,Private,Rural,73.18,30.3,formerly smoked
[0]
Stroke not predicted
