In [142]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [143]:
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv')
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [144]:
only_valid_bmi = df[~df.isna().apply(lambda x: any(x), axis=1)]
df = only_valid_bmi

In [145]:
one_hot = pd.get_dummies(df)
X = one_hot.drop(['id', 'stroke'], axis=1)
X.shape

(4909, 21)

In [146]:
X.dtypes

age                               float64
hypertension                        int64
heart_disease                       int64
avg_glucose_level                 float64
bmi                               float64
gender_Female                       uint8
gender_Male                         uint8
gender_Other                        uint8
ever_married_No                     uint8
ever_married_Yes                    uint8
work_type_Govt_job                  uint8
work_type_Never_worked              uint8
work_type_Private                   uint8
work_type_Self-employed             uint8
work_type_children                  uint8
Residence_type_Rural                uint8
Residence_type_Urban                uint8
smoking_status_Unknown              uint8
smoking_status_formerly smoked      uint8
smoking_status_never smoked         uint8
smoking_status_smokes               uint8
dtype: object

In [147]:
y = one_hot['stroke'].values
y

array([1, 1, 1, ..., 0, 0, 0])

In [148]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [156]:
pipe = Pipeline([('scaling', StandardScaler()), ('dim_reduction', PCA()), ('clf', RandomForestClassifier())])

param_grid = {'clf': [RandomForestClassifier(), SVC()], 
              'dim_reduction__n_components': [20, 15,10]
             # ,'clf__n_components': [5,10,20]
             }

grid = GridSearchCV(pipe, param_grid)
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaling', StandardScaler()),
                                       ('dim_reduction', PCA()),
                                       ('clf', RandomForestClassifier())]),
             param_grid={'clf': [RandomForestClassifier(), SVC()],
                         'dim_reduction__n_components': [20, 15, 10]})

In [150]:
pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_dim_reduction__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.088866,0.012297,0.041374,0.001495,SVC(),20,"{'clf': SVC(), 'dim_reduction__n_components': 20}",0.957938,0.959239,0.95788,0.95788,0.95788,0.958164,0.000538,1
4,0.107549,0.006528,0.035414,0.001788,SVC(),15,"{'clf': SVC(), 'dim_reduction__n_components': 15}",0.957938,0.959239,0.95788,0.95788,0.95788,0.958164,0.000538,1
5,0.099769,0.007887,0.035175,0.001916,SVC(),10,"{'clf': SVC(), 'dim_reduction__n_components': 10}",0.957938,0.959239,0.95788,0.95788,0.95788,0.958164,0.000538,1
0,0.628539,0.0371,0.015737,0.000273,RandomForestClassifier(),20,"{'clf': RandomForestClassifier(), 'dim_reducti...",0.955224,0.959239,0.95788,0.95788,0.956522,0.957349,0.001367,4
1,0.561115,0.007953,0.013722,0.000397,RandomForestClassifier(),15,"{'clf': RandomForestClassifier(), 'dim_reducti...",0.953867,0.959239,0.95788,0.956522,0.956522,0.956806,0.001782,5
2,0.550224,0.01362,0.013669,0.000344,RandomForestClassifier(),10,"{'clf': RandomForestClassifier(), 'dim_reducti...",0.953867,0.959239,0.955163,0.955163,0.955163,0.955719,0.00183,6


In [151]:
grid.score(X_test, y_test)

0.9552117263843648