In [331]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


In [332]:
# Read the stroke dataset from its CSV file and preview the first rows.
data = pd.read_csv(
    filepath_or_buffer='/Users/abhinavbanerjee/code/aban371818/strokesense/strokesenseapp/data/stroke_data.csv',
)
data.head()


Unnamed: 0,Age,Gender,SES,Hypertension,Heart_Disease,BMI,Avg_Glucose,Diabetes,Smoking_Status,Stroke
0,66.788137,Male,Medium,1,0,25.842521,96.428681,1,Current,1
1,86.393609,Female,Medium,1,1,32.531061,133.350068,0,Never,1
2,76.158579,Female,Medium,1,0,40.681244,111.489589,0,Never,0
3,72.47787,Female,Low,0,1,33.003193,125.692465,0,Former,0
4,59.881635,Male,Low,0,0,26.062443,123.218486,1,Never,1


In [333]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,Age,Hypertension,Heart_Disease,BMI,Avg_Glucose,Diabetes,Stroke
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,69.757847,0.6058,0.3028,28.042252,109.115918,0.196,0.2978
std,9.869468,0.488703,0.459492,4.901424,16.421142,0.396988,0.457314
min,27.706561,0.0,0.0,15.037988,45.569135,0.0,0.0
25%,63.128574,0.0,0.0,24.660786,98.188349,0.0,0.0
50%,69.796042,1.0,0.0,28.066932,109.022208,0.0,0.0
75%,76.402885,1.0,1.0,31.39017,120.341179,0.0,1.0
max,99.417448,1.0,1.0,47.495955,176.180688,1.0,1.0


# Features to be encoded

In [334]:
# Category frequencies for smoking status.
data['Smoking_Status'].value_counts()

Never      5951
Current    2062
Former     1987
Name: Smoking_Status, dtype: int64

In [335]:
# Category frequencies for socio-economic status (SES).
data['SES'].value_counts()

Medium    5106
Low       2870
High      2024
Name: SES, dtype: int64

In [336]:
# Category frequencies for gender.
data['Gender'].value_counts()

Female    5037
Male      4963
Name: Gender, dtype: int64

# Encoding Categorical Features

In [337]:
# One-hot encode the categorical columns into one indicator column per category.
categorical_features = ['Smoking_Status', 'SES', 'Gender']
ohe = OneHotEncoder()

# Select the columns through the shared list so the encoder input and the
# generated feature names cannot drift apart; .toarray() densifies the
# encoder output for DataFrame construction.
ohe_data = ohe.fit_transform(data[categorical_features]).toarray()
encoded_feature_names = ohe.get_feature_names_out(categorical_features)

# Reuse data's index so a column-wise concat with `data` aligns row-for-row.
encoded_df = pd.DataFrame(ohe_data, columns=encoded_feature_names, index=data.index)
encoded_df.head()


Unnamed: 0,Smoking_Status_Current,Smoking_Status_Former,Smoking_Status_Never,SES_High,SES_Low,SES_Medium,Gender_Female,Gender_Male
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [338]:
# Append the one-hot indicator columns to the original frame.
# Any previously appended copies are dropped first so that re-running this
# cell does not accumulate duplicate indicator columns in `data`.
data = pd.concat(
    [data.drop(columns=encoded_df.columns, errors='ignore'), encoded_df],
    axis=1,
)
data.head()

Unnamed: 0,Age,Gender,SES,Hypertension,Heart_Disease,BMI,Avg_Glucose,Diabetes,Smoking_Status,Stroke,Smoking_Status_Current,Smoking_Status_Former,Smoking_Status_Never,SES_High,SES_Low,SES_Medium,Gender_Female,Gender_Male
0,66.788137,Male,Medium,1,0,25.842521,96.428681,1,Current,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,86.393609,Female,Medium,1,1,32.531061,133.350068,0,Never,1,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,76.158579,Female,Medium,1,0,40.681244,111.489589,0,Never,0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,72.47787,Female,Low,0,1,33.003193,125.692465,0,Former,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,59.881635,Male,Low,0,0,26.062443,123.218486,1,Never,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


# Split features (independent variables) and target

In [339]:
# NOTE: disabled first iteration -- it used every encoded column as a feature.
# X = data.drop(['Stroke', 'Smoking_Status', 'SES', 'Gender'], axis=1)
# y = data['Stroke']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scaling of numeric features

In [340]:
# NOTE: disabled -- scaling step of the first (all-features) iteration.
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# Training the LR model

In [341]:
# Logistic regression classifier constructed with no arguments (library defaults).
lreg = LogisticRegression()

In [342]:
# NOTE: disabled -- fit for the first (all-features) iteration.
# lreg.fit(X_train_scaled, y_train)

In [343]:
# NOTE: disabled -- test-set score for the first (all-features) iteration.
# lreg.score(X_test_scaled, y_test)

In [344]:
# NOTE: disabled -- confusion matrix for the first (all-features) iteration.
# y_pred = lreg.predict(X_test_scaled)
# cm = confusion_matrix(y_test, y_pred)
# cm

# Next Iteration with fewer features

In [345]:
# Re-inspect the frame, which now also carries the one-hot indicator columns.
data.head()

Unnamed: 0,Age,Gender,SES,Hypertension,Heart_Disease,BMI,Avg_Glucose,Diabetes,Smoking_Status,Stroke,Smoking_Status_Current,Smoking_Status_Former,Smoking_Status_Never,SES_High,SES_Low,SES_Medium,Gender_Female,Gender_Male
0,66.788137,Male,Medium,1,0,25.842521,96.428681,1,Current,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,86.393609,Female,Medium,1,1,32.531061,133.350068,0,Never,1,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,76.158579,Female,Medium,1,0,40.681244,111.489589,0,Never,0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,72.47787,Female,Low,0,1,33.003193,125.692465,0,Former,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,59.881635,Male,Low,0,0,26.062443,123.218486,1,Never,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [346]:
# Reduced feature set: drop the target, the raw categorical columns and every
# one-hot indicator except Gender_Male.
X1 = data.drop(
    columns=[
        # raw categorical columns and the target
        'Smoking_Status', 'SES', 'Gender', 'Stroke',
        # socio-economic status indicators
        'SES_High', 'SES_Low', 'SES_Medium',
        # gender indicator not kept (Gender_Male stays in the feature set)
        'Gender_Female',
        # smoking status indicators
        'Smoking_Status_Former', 'Smoking_Status_Never',
        'Smoking_Status_Current',
    ]
)

In [347]:
# Target vector: the Stroke column of the frame.
y1 = data['Stroke']

In [348]:
# Confirm which columns made it into the reduced feature set.
# (Only the last expression of a cell is displayed, so the former
# `X1.head(1)` line before it had no visible effect and was removed.)
X1.columns

Index(['Age', 'Hypertension', 'Heart_Disease', 'BMI', 'Avg_Glucose',
       'Diabetes', 'Gender_Male'],
      dtype='object')

# Split the new feature set into train and test sets

In [349]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=42,
)

# Scale numeric features

In [350]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X1_train)
X1_train_scaled = scaler.transform(X1_train)
X1_test_scaled = scaler.transform(X1_test)

# Train the model with new features

In [351]:
# Fit on the scaled training split, then report accuracy on the held-out split.
lreg.fit(X=X1_train_scaled, y=y1_train)
accuracy_score(y1_test, lreg.predict(X1_test_scaled))

0.792

In [352]:
# Predict the held-out split and tabulate true labels against predicted labels.
y1_pred = lreg.predict(X1_test_scaled)
cm1 = confusion_matrix(y_true=y1_test, y_pred=y1_pred)
cm1

array([[1867,  261],
       [ 363,  509]])

In [353]:
# Persist the fitted classifier together with the scaler it was trained with
# (the model was fitted on scaler-transformed features).
MODELS_DIR = Path('/Users/abhinavbanerjee/code/aban371818/strokesense/strokesenseapp/models')
# Create the target directory up front so opening the files for writing
# cannot fail because the folder is missing.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

model_filename = str(MODELS_DIR / 'lreg_model.pkl')
scaler_filename = str(MODELS_DIR / 'scaler.pkl')

for artifact, artifact_path in ((lreg, model_filename), (scaler, scaler_filename)):
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)