In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Choose the dataset to work with
df = pd.read_pickle('./dataframes/df_e.pkl')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,stroke,bmi,smoking_status
0,0,67.0,0,1,1,2,1,228.69,1,36.6,1.0
1,1,61.0,0,0,1,3,0,202.21,1,30.046602,2.0
2,0,80.0,0,1,1,2,0,105.92,1,32.5,2.0
3,1,49.0,0,0,1,2,1,171.23,1,34.4,3.0
4,1,79.0,1,0,1,3,0,174.12,1,24.0,2.0


In [5]:
# Split into X and y
X = df.drop('stroke', axis=1)
y = df['stroke']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit the model to the data
model = RandomForestClassifier()
model.fit(X_train, y_train);

In [6]:
model.score(X_train, y_train)

1.0

In [7]:
def evaluation_metrics(model, X, y):
    
    cv_f1 = cross_val_score(model, X, y, cv=5, scoring='f1')
    cv_recall = cross_val_score(model, X, y, cv=5, scoring='recall')
    cv_pre = cross_val_score(model, X, y, cv=5, scoring='precision')
    
    # Evaluate the classifier
    print('Classifier metrics on the test set')
    print(f'F1 score: {np.mean(cv_f1)*100:.2f}%')
    print(f'Recall: {np.mean(cv_recall)*100:.2f}%')
    print(f'Precision: {np.mean(cv_pre)*100:.2f}%')

In [13]:
evaluation_metrics(model,X_test,y_test)

Classifier metrics on the test set
F1 score: 6.81%
Recall: 1.25%
Precision: 40.00%


In [16]:
y_preds = model.predict(X_test)
confusion_matrix(y_test, y_preds)

array([[1198,    0],
       [  80,    0]], dtype=int64)

In [17]:
pd.crosstab(y_test, y_preds, rownames=['Actual Labels'], colnames=['Predicted Labels'])

Predicted Labels,0
Actual Labels,Unnamed: 1_level_1
0,1198
1,80


In [15]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1198
           1       0.00      0.00      0.00        80

    accuracy                           0.94      1278
   macro avg       0.47      0.50      0.48      1278
weighted avg       0.88      0.94      0.91      1278



In [10]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [11]:
grid = {'n_estimators': [10, 100, 200, 500, 1000, 2000],
        'max_depth' : [None, 5, 10, 20, 30],
        'max_features': ['auto', 'sqrt'],
        'min_samples_split' : [2, 4, 6],
        'min_samples_leaf' : [1, 2, 4]}

np.random.seed(42)


# Split into x & y
X = df.drop('stroke', axis=1)
y = df['stroke']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

# Instantiate RandomForestClassifier
model = RandomForestClassifier(n_jobs=1)

# Setup RandomedSearchCV
rs_model = RandomizedSearchCV(estimator=model,
                           param_distributions=grid,
                           n_iter=10, # number of models to try
                           cv=5,
                           verbose=2)

# Fit the RandomizedSearch version of clf
rs_model.fit(X_train, y_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10, total=   1.0s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10, total=   1.0s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10, total=   1.0s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10, total=   1.0s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10, total=   1.1s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None 
[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None, total=   2.1s
[CV] n_estimators=1000, min_samples_split=6, m

[CV]  n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, total=   4.5s
[CV] n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30 
[CV]  n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, total=   4.2s
[CV] n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30 
[CV]  n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, total=   4.4s
[CV] n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30 
[CV]  n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, total=   4.5s
[CV] n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30 
[CV]  n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, total=   4.4s


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   58.7s finished


In [12]:
rs_model.best_params_

{'n_estimators': 500,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10}

In [13]:
rs_model = RandomForestClassifier()
rs_model.fit(X_train, y_train)

RandomForestClassifier()

In [16]:
evaluation_metrics(rs_model,X_test,y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classifier metrics on the test set
F1 score: 6.93%
Recall: 2.50%
Precision: 40.00%
