# Interactive Stroke Prediction Function

To use this function, run all code, and in the stroke_prediction() fill in your personal information in this order: Age, Average Glucose Level, BMI, Hypertension, Heart Disease, Gender, Ever Married, Work Type, Residence Type, and Smoking status and example is provided for clarification

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [4]:
# filling NA in BMI
print('Replacing null BMI values with the median:', df['bmi'].median()) 
median_value = 28.1 # can see from info table that the median for bmi is 28.1
df['bmi'].fillna(value=median_value, inplace=True)

Replacing null BMI values with the median: 28.1


In [5]:
# One 'Other' value in gender, replacing with majority value, Female.
df['gender'] = df['gender'].replace('Other', 'Female')

In [8]:
df['hypertension'] = df['hypertension'].replace({0 : 'No', 1 : 'Yes'})
df['heart_disease'] = df['heart_disease'].replace({0: 'No', 1 : 'Yes'})
df = df[['age', 'hypertension','heart_disease', 'avg_glucose_level','bmi','stroke','gender','ever_married','work_type','Residence_type','smoking_status']]

In [7]:
# Using these variables for prediction function at end
std_age = df['age'].std()
mean_age = df['age'].mean()
mean_avgglu = df['avg_glucose_level'].mean()
std_avgglu = df['avg_glucose_level'].std()
mean_bmi = df['bmi'].mean()
std_bmi = df['bmi'].std()

In [9]:
import scipy.stats as stats
df['age'] = stats.zscore(df['age'])
df['avg_glucose_level'] = stats.zscore(df['avg_glucose_level'])
df['bmi'] = stats.zscore(df['bmi'])

In [13]:
df1 = pd.get_dummies(df, drop_first = True) 
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             5110 non-null   float64
 1   avg_glucose_level               5110 non-null   float64
 2   bmi                             5110 non-null   float64
 3   stroke                          5110 non-null   int64  
 4   hypertension_Yes                5110 non-null   uint8  
 5   heart_disease_Yes               5110 non-null   uint8  
 6   gender_Male                     5110 non-null   uint8  
 7   ever_married_Yes                5110 non-null   uint8  
 8   work_type_Never_worked          5110 non-null   uint8  
 9   work_type_Private               5110 non-null   uint8  
 10  work_type_Self-employed         5110 non-null   uint8  
 11  work_type_children              5110 non-null   uint8  
 12  Residence_type_Urban            51

In [14]:
X_d = df1.drop('stroke', axis = 1)
y_d = df1.stroke

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_d, y_d, test_size=0.25, random_state=42)

In [18]:
# best model
clf = RandomForestClassifier(class_weight = 'balanced', max_depth = 6, n_estimators = 10)
clf.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', max_depth=6, n_estimators=10)

In [19]:
X = df.drop('stroke', axis = 1)

In [20]:
#create function to determine prediction of new input
def predict_stroke(age, avg_glucose_level, bmi, hypertension, 
                   heart_disease, gender, ever_married, work_type, 
                   Residence_type, smoking_status, clf, mean_age, std_age, mean_avgglu, 
                   std_avgglu, mean_bmi, std_bmi):
    age  = (age - mean_age)/std_age
    
    avg_glucose_level = (avg_glucose_level - mean_avgglu)/std_avgglu
    
    bmi = (bmi - mean_bmi)/std_bmi
    
    X_new = {'age': age, 'avg_glucose_level': avg_glucose_level, 'bmi': bmi, 'hypertension':hypertension,
             'heart_disease': heart_disease, 'gender' : gender, 'ever_married': ever_married, 'work_type': work_type,
             'Residence_type':Residence_type, 'smoking_status': smoking_status}
    
    df2 = X.append(X_new, ignore_index = True)
    
    df3 = pd.get_dummies(df2, drop_first = True)
    
    df4 = pd.DataFrame(columns=list(X_test.columns))
    df4.loc[0] = df3.iloc[-1]
    
    y_pred = clf.predict(df4)
    
    proba = clf.predict_proba(df4)
    
    return y_pred, proba

In [21]:
#Here is where you fill in your information in this order: Age, Average Glucose Level, BMI, Hypertension, 
#Heart Disease, Gender, Ever Married, Work Type, Residence Type, and Smoking status and example is provided

X = predict_stroke(34, 350, 35, 'Yes', 'No', 'Female', 'Yes', 'Private', 'Urban', 'never smoked', clf, mean_age, std_age, mean_avgglu, std_avgglu, mean_bmi, std_bmi)

In [22]:
print(X)

(array([0]), array([[0.7700231, 0.2299769]]))


The first output will be a 1 or a 0. 1 indicates stroke, 0 indicates no stroke. The 