In [101]:
"""
Libraries to be installed
pip install pandas
pip install scikit-learn
"""

'\nLibraries to be installed\npip install pandas\npip install scikit-learn\n'

In [102]:
# Import the libraries required
import pickle
import warnings 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.exceptions import ConvergenceWarning

In [103]:
# Read the stroke dataset CSV into a DataFrame
DATA_FILE = 'healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_FILE)

In [104]:
# Keep only the rows of df that have no missing value in any column
data = df.dropna(axis=0, how='any')

In [105]:
# Drop the 'id' column (presumably a record identifier, not a predictor).
# errors='ignore' keeps this cell safe to re-run once 'id' has already been removed;
# without it a second run raises KeyError.
data = data.drop(columns=['id'], errors='ignore')

In [106]:
# View the dataset after removing rows with missing values and the 'id' column
data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [107]:
# Names of the text-valued columns that will be label-encoded
categorical_cols = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

In [108]:
# Label-encode each categorical column and keep the fitted encoder per column so
# the integer codes can be mapped back to category names.
# Each encoder is fitted on the untouched strings in `df` (restricted to the rows
# kept in `data`) instead of on `data[col]`. On the first run the values are the
# same, but on a re-run `data[col]` already holds integers, and fitting on those
# would silently replace the category names in le_dict with 0, 1, 2, ...
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(df.loc[data.index, col])
    le_dict[col] = le

# Print the encoded values and their corresponding categories
for col, le in le_dict.items():
    classes = le.classes_
    # transform(classes) yields the integer code assigned to each category, in order
    encoded_values = le.transform(classes)
    print(col + ':')
    print('Encoded values:', encoded_values)
    print('Categories:', classes)

gender:
Encoded values: [0 1 2]
Categories: ['Female' 'Male' 'Other']
ever_married:
Encoded values: [0 1]
Categories: ['No' 'Yes']
work_type:
Encoded values: [0 1 2 3 4]
Categories: ['Govt_job' 'Never_worked' 'Private' 'Self-employed' 'children']
Residence_type:
Encoded values: [0 1]
Categories: ['Rural' 'Urban']
smoking_status:
Encoded values: [0 1 2 3]
Categories: ['Unknown' 'formerly smoked' 'never smoked' 'smokes']


In [109]:
# Optional: binarize the "age" column in place (no new column is created).
# To apply an age threshold of 55, uncomment the next line:
# data["age"] = data["age"].apply(lambda x: 1 if x >= 55 else 0)
# NB: age >= 55 ==> 1 and age < 55 ==> 0

In [110]:
# Optional: binarize the "bmi" column in place (no new column is created).
# To apply a BMI threshold of 30, uncomment the next line:
# data["bmi"] = data["bmi"].apply(lambda x: 1 if x >= 30 else 0)
# NB: BMI >= 30 ==> 1 and BMI < 30 ==> 0

In [111]:
# Build X from only the three columns named in numerical_cols.
# NOTE(review): the next cell reassigns X = data.drop(['stroke'], axis=1), so this
# X is never used and the model is trained on every column except 'stroke'.
# Confirm whether this cell is a leftover or an intended alternative feature set.
numerical_cols = ['age', 'hypertension', 'bmi']
X = pd.concat([data[numerical_cols]], axis=1)

In [112]:
# Split the frame into the target column and the remaining feature columns
target_col = 'stroke'
y = data[target_col]
X = data.drop(columns=[target_col])

In [113]:
# Print the column labels of the feature frame X
print(X.columns)

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status'],
      dtype='object')


In [114]:
# Display the processed data (categorical columns now hold their integer codes)
data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,1,81.0,0,0,1,2,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,0,13.0,0,0,0,4,0,103.08,18.6,0,0
5106,0,81.0,0,0,1,3,1,125.20,40.0,2,0
5107,0,35.0,0,0,1,3,0,82.99,30.6,2,0
5108,1,51.0,0,0,1,2,0,166.29,25.6,1,0


In [115]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [116]:
# Initialize logistic regression model with L1 regularization.
# The constructor only stores hyper-parameters, so the warnings filter that used
# to wrap this line could never catch a ConvergenceWarning: those are raised while
# fitting. If suppression is still wanted, wrap the model.fit(...) call instead.
# random_state pins liblinear's internal data shuffling so reruns give the same model.
model = LogisticRegression(
    max_iter=2000,
    penalty='l1',
    solver='liblinear',
    random_state=42,
)

In [117]:
# Fit the logistic regression model on the training split (X_train, y_train)
model.fit(X_train, y_train)

In [118]:
# Predict the stroke label for every row of the held-out test features
y_pred = model.predict(X_test)

In [119]:
# Evaluate model accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print('Model Accuracy:', accuracy)

# Accuracy alone is hard to interpret when one class dominates the target, so
# also report what always predicting the most frequent test label would score.
# A model accuracy close to this baseline means the classifier adds little.
baseline_accuracy = y_test.value_counts(normalize=True).max()
print('Majority-class baseline accuracy:', baseline_accuracy)

Model Accuracy: 0.9460285132382892


In [120]:
# Persist the fitted model to disk with pickle
MODEL_PATH = 'bestLR_model.pkl'
with open(MODEL_PATH, mode='wb') as model_file:
    pickle.dump(model, model_file)