In [1]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, poisson
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import numpy as np
import pickle
from joblib import dump, load

In [2]:
# Read the raw stroke records and discard the 'id' column in a single chained step.
df = pd.read_csv("train_strokes.csv").drop(columns=['id'])


In [4]:
# Remove every row whose gender is recorded as 'Other'.
other_gender_rows = df.index[df['gender'] == 'Other']
df = df.drop(index=other_gender_rows)

In [5]:
# Both smoking status and BMI are missing (NaN) for a large subset of the stroke data, which is likely
# meaningful in itself - stroke patients aren't being measured for weight when they come in or asked about smoking habits.
# Additionally, they could have poor medical care.
# Whatever the reason, the missingness itself is informative, so we cannot impute. Instead, I unfortunately removed these features.

In [6]:
# Drop the 'bmi' feature column.
df = df.drop(columns=['bmi'])

In [7]:
# Drop the 'smoking_status' feature column.
df = df.drop(columns=['smoking_status'])

In [8]:
# Target vector: the 'stroke' column; feature matrix: every remaining column, as raw arrays.
y = df['stroke'].values
oldX = df.drop(columns=['stroke']).values

In [9]:
# One-hot encode the string-valued columns and pass every other column through unchanged.
# Positions 0, 4, 5 and 6 line up with gender, ever_married, work_type and Residence_type
# in the row layout assembled by gather_user_data_output_result.
# handle_unknown='ignore' means a category not seen during fit does not raise at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')
ct = ColumnTransformer(
    transformers=[('encoder', encoder, [0, 4, 5, 6])],
    remainder='passthrough',
)

X = ct.fit_transform(oldX)

In [10]:
# over and under sampling
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

# SMOTEENN is only constructed here (cheap) so the name stays available for
# comparison experiments. It is intentionally NOT fit: fitting it would be an
# expensive pass whose output is immediately replaced by the SMOTETomek result.
smote_enn = SMOTEENN(random_state=0)

# The resampler actually used downstream: SMOTETomek with a fixed seed.
smote_tomek = SMOTETomek(random_state=0)
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

In [11]:
# Hold out a test fold with a fixed seed so the split, and everything derived from it
# (fitted model, saved artifact, reported metrics), is reproducible on Restart & Run All.
# NOTE(review): X_resampled / y_resampled come from resampling the full dataset before
# this split, so the test fold contains resampled rows and the scores reported below are
# probably optimistic. Consider splitting first and resampling only the training fold.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, random_state=0
)

In [12]:
# Single-step pipeline around a linear SVM (the step keeps its original name 'lr').
# A probabilistic linear model would have been queried through predict_proba instead;
# here only the hard labels from predict are used downstream.
pipe = Pipeline(steps=[
    ('lr', LinearSVC(C=0.0001, class_weight={1: 1.5})),
])
pipe = pipe.fit(X_train, y_train)



In [13]:
# Persist the fitted pipeline to disk with joblib.
dump(pipe, filename='savedModel.joblib')

['savedModel.joblib']

In [14]:
# Persist the fitted column transformer alongside the model so new rows can be encoded the same way.
dump(ct, filename='savedColumnTransformer.joblib')

['savedColumnTransformer.joblib']

In [15]:
from sklearn.metrics import classification_report

# Score the held-out fold and print per-class precision / recall / f1.
predictions = pipe.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.88      0.61      0.72     10644
           1       0.70      0.92      0.79     10588

    accuracy                           0.76     21232
   macro avg       0.79      0.76      0.76     21232
weighted avg       0.79      0.76      0.76     21232



In [17]:
def _learned_categories(ct):
    """Return ``{column position: [categories]}`` for the fitted encoders inside ``ct``.

    Every fitted sub-transformer that exposes ``categories_`` contributes one
    entry per column it was fit on. Objects without ``transformers_`` or with
    a non-list column selector contribute nothing.
    """
    learned = {}
    for _name, transformer, columns in getattr(ct, 'transformers_', []):
        categories = getattr(transformer, 'categories_', None)
        if categories is None or not isinstance(columns, (list, tuple)):
            continue
        for column, values in zip(columns, categories):
            learned[column] = list(values)
    return learned


def _match_category(raw, categories, label):
    """Strip ``raw`` and map it onto the spelling the encoder learned, ignoring case.

    When no learned categories are available the stripped text is returned
    as-is. When categories are known but none matches, a warning is printed
    and the stripped text is returned so the caller can still proceed.
    """
    text = raw.strip()
    if not categories:
        return text
    folded = text.casefold()
    for category in categories:
        if isinstance(category, str) and category.casefold() == folded:
            return category
    print(f"Warning: {label} value {text!r} does not match any category seen "
          f"during training {categories}; the encoder will not recognise it.")
    return text


def gather_user_data_output_result(pipe, ct):
    """Interactively collect one patient record, encode it, and print the prediction.

    Parameters
    ----------
    pipe : fitted estimator exposing ``predict``.
    ct : fitted transformer exposing ``transform``; when it also exposes
        ``transformers_`` with fitted encoders, their learned categories are
        used to canonicalise the categorical answers.

    Returns
    -------
    The first element of ``pipe.predict`` for the entered row.

    Raises
    ------
    ValueError
        If a numeric answer cannot be parsed by ``float`` / ``int``.

    Notes
    -----
    Categorical answers are stripped and matched case-insensitively against
    the categories learned at fit time, so that e.g. ``' female'`` is sent to
    the encoder with its training-time spelling instead of being treated as
    an unknown category.
    """
    learned = _learned_categories(ct)

    # Prompts are asked in the same order as the feature columns of the row.
    gender = _match_category(
        input("please input gender as either: 'male' or 'female'"),
        learned.get(0), 'gender')
    age = float(input("please input age as float"))
    hypertension = int(input("please input if you have hypertension as either 1 for yes or 0 for no"))
    heartdisease = int(input("please input if you have heart disease as either 1 for yes or 0 for no"))
    ever_married = _match_category(
        input("please input if you have ever been married as either 'Yes' or 'No'"),
        learned.get(4), 'ever_married')
    work_type = _match_category(
        input("please input your current work type as one of the following: 'children', 'Private', 'Never_worked', 'Govt_job', 'Self-employed'"),
        learned.get(5), 'work_type')
    Residence_type = _match_category(
        input("please input your residence type as either 'Rural' or 'Urban'"),
        learned.get(6), 'Residence_type')
    avg_glucose_level = float(input("please input your average glucose level as a float"))

    row = [gender, age, hypertension, heartdisease, ever_married, work_type, Residence_type, avg_glucose_level]
    encoded_row = ct.transform([row])
    result = pipe.predict(encoded_row)[0]

    if result == 0:
        print("According to the model you will likely not have a stroke.")
    elif result == 1:
        print("According to the model you may likely  have a stroke. Please see a doctor")
    return result
    

               

In [18]:
# Run the interactive prompt against the fitted model and its column transformer.
gather_user_data_output_result(pipe=pipe, ct=ct)

please input gender as either: 'male' or 'female' female
please input age as float 70
please input if you have hypertension as either 1 for yes or 0 for no 1
please input if you have heart disease as either 1 for yes or 0 for no 1
please input if you have ever been married as either 'Yes' or 'No' Yes
please input your current work type as one of the following: 'children', 'Private', 'Never_worked', 'Govt_job', 'Self-employed' children
please input your residence type as either 'Rural' or 'Urban' Rural
please input your average glucose level as a float 13


According to the model you may likely  have a stroke. Please see a doctor


1