# Import Libraries

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import seaborn as sns

# Import Dataset

In [None]:
# Read the stroke-prediction CSV into a DataFrame and display it.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)
data

# Preprocessing

In [None]:
# Remove the 'id' column from the working frame.
data = data.drop(columns='id')

In [None]:
# Number of missing values in each column.
data.isnull().sum()

# Dealing with Missing Values

In [None]:
# Replace missing BMI values with the column median.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated in recent pandas
# and leaves `data` unchanged when Copy-on-Write is enabled.
data['bmi'] = data['bmi'].fillna(data['bmi'].median())

In [None]:
# Re-count missing values per column after the BMI imputation.
data.isnull().sum()

# One Hot Encoding

In [None]:
# Select the object-typed columns and show their names.
cat_col = data.select_dtypes(include=['object'])
cat_col.columns

In [None]:
# One-hot encode the frame, dropping the first level of every encoded column.
data = pd.get_dummies(data, drop_first=True)
data

# Split Dependent & Independent Features

In [None]:
# 'stroke' is the target; every remaining column is a feature.
Y = data['stroke']
X = data.drop('stroke', axis=1)

# Train Test Split

In [None]:
# Three stratified hold-out splits of the same data (60-40, 70-30, 80-20),
# all using the same seed.
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(
    X, Y, stratify=Y, random_state=42, test_size=0.4
)
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
    X, Y, stratify=Y, random_state=42, test_size=0.3
)
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(
    X, Y, stratify=Y, random_state=42, test_size=0.2
)

# Over Sampling

In [None]:
# Randomly over-sample each training partition; the test partitions are
# left untouched.
over_sampler = RandomOverSampler(random_state=42)
(X_train1, Y_train1), (X_train2, Y_train2), (X_train3, Y_train3) = (
    over_sampler.fit_resample(features, target)
    for features, target in (
        (X_train1, Y_train1),
        (X_train2, Y_train2),
        (X_train3, Y_train3),
    )
)

# Under Sampling

In [None]:
# Disabled alternative: random under-sampling of the three training
# partitions. The lines are left commented out, so only the over-sampling
# cell is in effect.
#under_sampler = RandomUnderSampler(random_state=42)
#X_train1, Y_train1 = under_sampler.fit_resample(X_train1, Y_train1)
#X_train2, Y_train2 = under_sampler.fit_resample(X_train2, Y_train2)
#X_train3, Y_train3 = under_sampler.fit_resample(X_train3, Y_train3)

# Hyperparameter - GridSearchCV

In [None]:
# Base estimator for the search. Seeded with the same value as the
# train/test split and the sampler so repeated runs give the same forests.
rfc_clf = RandomForestClassifier(random_state=42)

classifiers = ['grid_rf']
# Search space: forest size, split criterion and bootstrap sample fraction.
param_rf = {'n_estimators':[100, 200, 300], 'criterion':['gini','entropy'], 'bootstrap':[True], 'max_samples':[0.4,0.5, 0.6]}

# One 3-fold grid search per train/test split (60-40, 70-30, 80-20).
grid_rfc1 = GridSearchCV(rfc_clf, param_grid=param_rf, cv=3, return_train_score=True, verbose=0)
grid_rfc2 = GridSearchCV(rfc_clf, param_grid=param_rf, cv=3, return_train_score=True, verbose=0)
grid_rfc3 = GridSearchCV(rfc_clf, param_grid=param_rf, cv=3, return_train_score=True, verbose=0)

# Train 60-40 Split

In [None]:
# Tune the forest on the 60-40 training partition.
grid_rfc1.fit(X_train1, Y_train1)
print('Best Parameters :', grid_rfc1.best_params_)

# GridSearchCV refits the winning configuration on the whole training
# partition by default, so reuse that model instead of fitting a second,
# unseeded forest with the same parameters.
best_1 = grid_rfc1.best_estimator_
pred1 = best_1.predict(X_test1)
print('Accuracy is ', accuracy_score(Y_test1, pred1)*100, end='\n\n')

# Evaluate on the held-out test partition.
cm = confusion_matrix(Y_test1, pred1)
print(cm)
print(classification_report(Y_test1, pred1))
# fmt='d' annotates the cells with full integer counts rather than the
# default '.2g' format, which switches to scientific notation.
sns.heatmap(cm, annot=True, fmt='d');

In [None]:
# scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]]
# (rows = true label, columns = predicted label), so unpack in that order.
tn, fp, fn, tp = cm.ravel()

# Precision, recall and F1 for the class at index 1 (the larger label).
# Each ratio falls back to 0.0 when its denominator is zero.
p = tp / (tp + fp) if (tp + fp) else 0.0
r = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * p * r / (p + r) if (p + r) else 0.0
print(p, r, f1)

# Train 70-30 Split

In [None]:
# Tune the forest on the 70-30 training partition.
grid_rfc2.fit(X_train2, Y_train2)
print('Best Parameters :', grid_rfc2.best_params_)

# GridSearchCV refits the winning configuration on the whole training
# partition by default, so reuse that model instead of fitting a second,
# unseeded forest with the same parameters.
best_2 = grid_rfc2.best_estimator_
pred2 = best_2.predict(X_test2)
print('Accuracy is ', accuracy_score(Y_test2, pred2)*100, end='\n\n')

# Evaluate on the held-out test partition (the matrix is printed once).
cm = confusion_matrix(Y_test2, pred2)
print(cm)
print(classification_report(Y_test2, pred2))
# fmt='d' annotates the cells with full integer counts rather than the
# default '.2g' format, which switches to scientific notation.
sns.heatmap(cm, annot=True, fmt='d');

In [None]:
# scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]]
# (rows = true label, columns = predicted label), so unpack in that order.
tn, fp, fn, tp = cm.ravel()

# Precision, recall and F1 for the class at index 1 (the larger label).
# Each ratio falls back to 0.0 when its denominator is zero.
p = tp / (tp + fp) if (tp + fp) else 0.0
r = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * p * r / (p + r) if (p + r) else 0.0
print(p, r, f1)

# Train 80-20 Split

In [None]:
# Tune the forest on the 80-20 training partition.
grid_rfc3.fit(X_train3, Y_train3)
print('Best Parameters :', grid_rfc3.best_params_)

# GridSearchCV refits the winning configuration on the whole training
# partition by default, so reuse that model instead of fitting a second,
# unseeded forest with the same parameters.
best_3 = grid_rfc3.best_estimator_
pred3 = best_3.predict(X_test3)
print('Accuracy is ', accuracy_score(Y_test3, pred3)*100, end='\n\n')

# Evaluate on the held-out test partition (the matrix is printed once).
cm = confusion_matrix(Y_test3, pred3)
print(cm)
print(classification_report(Y_test3, pred3))
# fmt='d' annotates the cells with full integer counts rather than the
# default '.2g' format, which switches to scientific notation.
sns.heatmap(cm, annot=True, fmt='d');

In [None]:
# scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]]
# (rows = true label, columns = predicted label), so unpack in that order.
tn, fp, fn, tp = cm.ravel()

# Precision, recall and F1 for the class at index 1 (the larger label).
# Each ratio falls back to 0.0 when its denominator is zero.
p = tp / (tp + fp) if (tp + fp) else 0.0
r = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * p * r / (p + r) if (p + r) else 0.0
print(p, r, f1)

# Results

In [None]:
# One single-row frame of model parameters per split, labelled by the split.
out = pd.DataFrame(best_1.get_params(), index=['60-40 Split'])
out2 = pd.DataFrame(best_2.get_params(), index=['70-30 Split'])
out3 = pd.DataFrame(best_3.get_params(), index=['80-20 Split'])

# Stack the rows with pd.concat; DataFrame.append was removed in pandas 2.0.
out = pd.concat([out, out2, out3])
# Keep only the hyper-parameters that were part of the grid search.
out = out[['criterion', 'max_samples', 'n_estimators']]
out

In [None]:
# Turn the split labels held in the index into a leading 'Split' column
# and replace the index with a plain 0..n-1 range.
out = (
    out.assign(Split=out.index)
    [['Split', 'criterion', 'max_samples', 'n_estimators']]
    .reset_index(drop=True)
)
out

In [None]:
# Build a table with one row of per-class test metrics for each split.
metric_columns = ['Split', 'Precision-Class-0','Precision-Class-1','Recall-Class-0','Recall-Class-1','FScore-Class-0','FScore-Class-1','Support-Class-0','Support-Class-1']

split_results = [
    ('60-40 Split', Y_test1, pred1),
    ('70-30 Split', Y_test2, pred2),
    ('80-20 Split', Y_test3, pred3),
]

rows = []
for split, y_true, y_pred in split_results:
    # The result is a 4-tuple (precision, recall, fscore, support); each
    # element is an array with one entry per class. Unpack by metric rather
    # than indexing the tuple with the loop counter, which put the same
    # metric into every column of a row.
    res = precision_recall_fscore_support(y_true, y_pred)
    precision, recall, fscore, support = res

    score = {'Split': split,
             'Precision-Class-0': precision[0],
             'Precision-Class-1': precision[1],
             'Recall-Class-0': recall[0],
             'Recall-Class-1': recall[1],
             'FScore-Class-0': fscore[0],
             'FScore-Class-1': fscore[1],
             'Support-Class-0': support[0],
             'Support-Class-1': support[1]}
    rows.append(score)

# Build the frame in one go; DataFrame.append was removed in pandas 2.0.
p = pd.DataFrame(rows, columns=metric_columns)

In [None]:
# Join the hyper-parameter table with the metrics table on the split label.
out = out.merge(p, how='inner', on='Split')
out

In [None]:
# Write the combined results table to a CSV file in the working directory.
out.to_csv(path_or_buf='output.csv')