In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Install a global "ignore" filter: every warning raised after this point is
# suppressed, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from a CSV file; the path is relative, so it is resolved
# against the kernel's current working directory.
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Remove the `id` column before analysis. Rebinding (instead of mutating in
# place) and tolerating an already-missing column keeps this cell safe to
# re-run: the original raised KeyError on a second execution.
data = data.drop(columns=['id'], errors='ignore')

In [None]:
# Print column names, non-null counts and dtypes.
data.info()

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Fill missing BMI values with the column mean.
# The result is assigned back explicitly: an in-place `replace` on the column
# accessor (`data.bmi.replace(..., inplace=True)`) operates on an intermediate
# object and silently does nothing when pandas Copy-on-Write is active.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

In [None]:
# Summary statistics for the numeric columns.
data.describe()

In [None]:
# Partition the columns by cardinality: a column with at most 5 distinct
# values (NaN counted as a value) is treated as categorical, everything else
# as continuous. Column order is preserved in both lists.
all_columns = data.columns.tolist()
categorical_data_cols = []
continuous_data_cols = []
for column in all_columns:
    if data[column].nunique(dropna=False) <= 5:
        categorical_data_cols.append(column)
    else:
        continuous_data_cols.append(column)

In [None]:
# One count plot per categorical column, split by stroke outcome, laid out on
# a 4x2 grid. The last categorical column is skipped -- presumably that is
# `stroke` itself (the hue variable); verify against the column order.
plt.figure(figsize=(15, 15))
for slot, column in enumerate(categorical_data_cols[:-1], start=1):
    plt.subplot(4, 2, slot)
    sns.countplot(x=data[column], hue=data["stroke"])
plt.show()

In [None]:
# For each categorical column (all but the last one in the list), draw a
# horizontal bar chart of the summed `stroke` values per category and
# annotate every bar with its total.
plt.figure(figsize=(15, 15))
for slot, column in enumerate(categorical_data_cols[:-1], start=1):
    stroke_totals = data.groupby(column)["stroke"].sum()
    plt.subplot(4, 2, slot)
    plt.barh(stroke_totals.index, stroke_totals.values)
    plt.title(f"{column} vs Stroke")
    # Place the label at the bar tip: x = bar length, y = bar position.
    for row, total in enumerate(stroke_totals.values):
        plt.text(total, row, total)

plt.show()

In [None]:
# Distribution of each continuous column on a 2x2 grid.
# `sns.distplot` is deprecated; `histplot` with a density-normalised histogram
# and a KDE overlay (cut=3) is the equivalent seaborn recommends.
plt.figure(figsize=(15, 10))
for slot, column in enumerate(continuous_data_cols, start=1):
    plt.subplot(2, 2, slot)
    sns.histplot(data[column], kde=True, stat="density", kde_kws=dict(cut=3))
plt.show()

In [None]:
# Pairwise KDE plots coloured by stroke outcome.
# `sns.pairplot` always creates its own figure, so a preceding
# `plt.figure(...)` only produced an extra empty canvas; it is removed here.
# Use pairplot's `height=` argument to control the panel size if needed.
pair_columns = ['gender', 'age', 'hypertension', 'heart_disease',
                'avg_glucose_level', 'bmi', 'stroke']
sns.pairplot(data[pair_columns], hue='stroke', kind='kde')
plt.show()

In [None]:
# Horizontal bar chart of how many rows fall into each `stroke` class, with
# the count written at the tip of every bar.
plt.figure(figsize=(10, 5))
stroke_count = data["stroke"].value_counts()
plt.barh(stroke_count.index, stroke_count.values)
for row, total in enumerate(stroke_count.values):
    plt.text(total, row, total)

plt.title("Stroke Count")
plt.show()

In [None]:
# Pearson correlation heatmap.
# At this point the frame still holds string columns (they are label-encoded
# in a later cell); `DataFrame.corr` raises on non-numeric data in
# pandas >= 2.0, so restrict the computation to numeric columns explicitly.
plt.figure(figsize=(16, 10))
numeric_data = data.select_dtypes(include="number")
sns.heatmap(numeric_data.corr(method='pearson'), annot=True)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Keep only rows whose smoking status is known.
# `.copy()` makes the result an independent frame, so the column assignments
# in the encoding cell write to real data instead of a slice of the original
# (the source of SettingWithCopyWarning).
data = data[data['smoking_status'] != 'Unknown'].copy()

In [None]:
# Label-encode the string-valued categorical columns.
# Each column gets its own fitted encoder, stored in `encoders`, so the
# integer codes can later be mapped back with
# `encoders[col].inverse_transform(...)`. Reusing a single encoder (as before)
# discarded every mapping except the last one.
# `fit_transform` on the full column yields the same codes as fitting on the
# de-duplicated values, because the classes are the sorted unique values.
encoders = {}
for column in ["gender", "Residence_type", "ever_married", "smoking_status", "work_type"]:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    encoders[column] = le

In [None]:
# Display the frame as it stands after the encoding cell above.
data

In [None]:
# Target is the `stroke` column; features are every other column.
y = data["stroke"]
x = data.drop(columns=["stroke"])

In [None]:
# Count missing values per feature column before resampling.
x.isnull().sum()

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
x_smote, y_smote = smote.fit_resample(x, y)

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size=0.2, random_state=0)

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, KFold

In [None]:
# Candidate classifiers as [display name, estimator] pairs; the evaluation
# loop below iterates over this list in order.
models = [
    ['Logistic Regression', LogisticRegression(random_state=0)],
    ['Support Vector Machine (RBF Classifier)', SVC(kernel='rbf', random_state=0)],
    ['XGBClassifier', XGBClassifier(disable_default_eval_metric=True, random_state=0)],
    ['RandomForest', RandomForestClassifier(random_state=0)],
    # The base learner is passed positionally on purpose: the keyword was
    # renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
    # old name was later removed, while the first positional slot works in
    # both. AdaBoost clones the estimator it is given, so a fresh forest here
    # is equivalent to reusing the instance stored in the list.
    ['AdaBoostClassifier', AdaBoostClassifier(RandomForestClassifier(random_state=0), random_state=0)],
]

In [None]:
# Fit every candidate model, print its report, and collect one summary row
# per model in `arr1` as [name, k-fold mean accuracy, k-fold std, ROC AUC].
arr1 = []
# Integer seed + shuffle=True yields the same folds on every `split` call, so
# one splitter can be shared by all models.
cv_splitter = KFold(n_splits=10, random_state=2020, shuffle=True)
for name, model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # ROC AUC must be computed from continuous scores, not hard 0/1 labels:
    # with labels the curve collapses to a single operating point. Use the
    # positive-class probability when available, otherwise the decision
    # function (e.g. SVC without probability=True).
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(x_test)[:, 1]
    else:
        y_score = model.decision_function(x_test)

    train_accuracies = model.score(x_train, y_train) * 100
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)
    accuracies = cross_val_score(estimator=model, X=x_train, y=y_train,
                                 cv=cv_splitter, scoring="accuracy")
    kfold = accuracies.mean() * 100
    stdd = accuracies.std() * 100
    roc_auc = roc_auc_score(y_test, y_score)

    print(name + " Reports:")
    print("Training Accuracy: {}".format(train_accuracies))
    print(cm)
    print(cr)
    print('K-Fold Validation Mean Accuracy: {}'.format(kfold))
    print('Standard Deviation: {}'.format(stdd))
    print('ROC AUC Score: {}'.format(roc_auc))
    print("---" * 20)
    arr1.append([name, kfold, stdd, roc_auc])



In [None]:
# Tabulate the per-model summary rows, best ROC AUC first.
df2 = pd.DataFrame(
    arr1,
    columns=['Model', 'K-Fold Mean Accuracy', 'Std.Deviation', 'ROC_AUC'],
).sort_values(by="ROC_AUC", ascending=False)
df2

In [None]:
# Horizontal bars: one per model, bar length = ROC AUC from the results table.
sns.barplot(data=df2, y="Model", x="ROC_AUC")
plt.title("Model Compare");

In [None]:
# Select the XGBoost estimator from the candidate list by its display name
# (rather than by position) and fit it on the training data.
model = dict(models)['XGBClassifier']
model.fit(x_train, y_train)

In [None]:
# Read the fitted model's per-feature importances and print the raw array.
feature_importance = model.feature_importances_
print(feature_importance)

# Label each importance with its feature name (all columns except the
# target), keep the 10 largest, and draw them as horizontal bars.
feature_names = data.columns.drop("stroke")
feat_importances = pd.Series(feature_importance, index=feature_names).nlargest(10)
feat_importances.plot(kind='barh', figsize=(10, 10))

In [None]:
# Predict the class for one hand-written sample. The values must follow the
# same column order as the training features (every column except `stroke`).
values = np.array([[1, 22.0, 1, 1, 1, 0, 0, 200.0, 30.0, 0]])

# The model was trained on standardized features, so the raw sample has to go
# through the same fitted scaler first; feeding unscaled values (as before)
# puts the sample on a completely different scale from the training data.
# Wrapping it in a named frame lets the scaler match columns by name.
sample = pd.DataFrame(values, columns=data.columns.drop("stroke"))
predict = model.predict(sc.transform(sample))
print(predict)