In [1]:
from matplotlib import pyplot as plt
from sklearn import tree
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# get and clean data
strokedata = pd.read_csv("stroke_data.csv")

# replacing null BMI values with median value
strokedata["bmi"].fillna(strokedata["bmi"].median(), inplace=True)
# drop instance with Other gender
strokedata = strokedata.drop(strokedata[strokedata["gender"] == "Other"].index)
# drop useless columns
strokedata = strokedata.drop(
    [
        "id"
    ],
    axis=1,
)
# create one-hot encoding
strokedata = pd.get_dummies(strokedata, columns=["smoking_status", 'work_type', 'Residence_type', 'ever_married', 'gender'])
strokedata = strokedata.drop(
    [ 
        'work_type_Govt_job',
    ],
    axis=1,
)


len(strokedata.columns)
X = strokedata.iloc[:, 0:19]

In [3]:
#Splitting into training and testing sets
train_set, test_set = train_test_split(strokedata, test_size=.15, stratify=strokedata["stroke"], random_state=42)


#Creating input and label data
target = 'stroke'
X_train = train_set.drop(target, axis=1)
X_test = test_set.drop(target, axis=1)
y_train = train_set[target]
y_test = test_set[target]

count = Counter(y_train)

In [4]:
# preprocess
# scale
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# oversample
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [5]:
# Cross-Validation
param_grid = {
    "criterion": ['gini', 'entropy', 'log_loss'],
    "ccp_alpha": [0.1, 0.01, 0.001, 0.0001],
    "n_estimators": [10, 50, 100, 500, 1000],
}

grid = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, verbose=3, scoring="roc_auc")

grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_estimator_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5] END ccp_alpha=0.1, criterion=gini, n_estimators=10;, score=0.821 total time=   0.1s
[CV 2/5] END ccp_alpha=0.1, criterion=gini, n_estimators=10;, score=0.773 total time=   0.1s
[CV 3/5] END ccp_alpha=0.1, criterion=gini, n_estimators=10;, score=0.500 total time=   0.0s
[CV 4/5] END ccp_alpha=0.1, criterion=gini, n_estimators=10;, score=0.812 total time=   0.0s
[CV 5/5] END ccp_alpha=0.1, criterion=gini, n_estimators=10;, score=0.798 total time=   0.0s
[CV 1/5] END ccp_alpha=0.1, criterion=gini, n_estimators=50;, score=0.812 total time=   0.2s
[CV 2/5] END ccp_alpha=0.1, criterion=gini, n_estimators=50;, score=0.815 total time=   0.2s
[CV 3/5] END ccp_alpha=0.1, criterion=gini, n_estimators=50;, score=0.808 total time=   0.2s
[CV 4/5] END ccp_alpha=0.1, criterion=gini, n_estimators=50;, score=0.828 total time=   0.2s
[CV 5/5] END ccp_alpha=0.1, criterion=gini, n_estimators=50;, score=0.822 total time=   0.2s
[CV 1/5]

KeyboardInterrupt: 

In [None]:
rf = grid.best_estimator_
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(classification_report(y_test, y_pred))