### Configure Notebook

#### Load Python Libraries

In [8]:
import logging
import os
import shutil
import sys

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#### Define Global Variables

In [9]:
data_dir = "../data/raw/"  # The directory where the dataset is located.
feature_columns = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "avg_glucose_level",
    "bmi",
    "smoking_status",
]  # Feature columns.
target_column = "stroke"  # Prediction column.
random_state = 42  # Ensure that pipeline is reproducible.

#### Configure Logger

In [10]:
log_format = (
    "[%(asctime)s] - p%(process)s %(name)s %(lineno)d - %(levelname)s:%(message)s"
)
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format=log_format,
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger()

### Load Raw Dataset

In [11]:
df_raw = pd.read_csv(
    os.path.join(data_dir, "raw_stroke_records.csv")
)  # Load raw dataset as Pandas DataFrame.
logger.info(f"Raw Dataset Number of Records: {len(df_raw)}")

df_raw.head()

[2022-06-03 22:35:35] - p62524 root 4 - INFO:Raw Dataset Number of Records: 5110


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


### Prepare Dataset For Modelling

In [12]:
df_processed = df_raw.drop("id", axis=1).reset_index(drop=True)  # Drop id column.

df_processed = df_processed[
    df_processed["gender"].isin(["Male", "Female"])
].reset_index(
    drop=True
)  # Ensure gender only corresponds to Male and Female.
logger.info(f"Processed Dataset Number of Rows: {len(df_processed)}")

df_processed.head()

[2022-06-03 22:35:35] - p62524 root 8 - INFO:Processed Dataset Number of Rows: 5109


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [13]:
df_processed.to_csv(
    "../data/processed/stroke_records.csv", index=False
)  # Export processed Pandas DataFrame as .csv file.

### Define Features and Targets

In [14]:
X = df_processed[feature_columns]  # Select training features.
y = df_processed[target_column]  # Select predictor variable.

#### Evaluate Numeric and Categorical Columns

In [15]:
numeric_columns = X.select_dtypes(
    include=["int64", "float64"]
).columns  # Evaluate numeric columns using dtype.
logger.info(f"Numeric Columns: {numeric_columns}")

categorical_columns = X.select_dtypes(
    include=["object", "bool"]
).columns  # Evaluate categorical columns using dtype.
logger.info(f"Categorical Columns: {numeric_columns}")

[2022-06-03 22:35:35] - p62524 root 4 - INFO:Numeric Columns: Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi'], dtype='object')
[2022-06-03 22:35:35] - p62524 root 9 - INFO:Categorical Columns: Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi'], dtype='object')


### Define Pipeline

In [16]:
preprocess_pipeline = ColumnTransformer(
    [
        (
            "num_imputer",
            SimpleImputer(missing_values=np.nan, strategy="median"),
            numeric_columns,
        ),
        ("categorical_encoder", OneHotEncoder(), categorical_columns),
    ],
    remainder="passthrough",
)

In [17]:
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess_pipeline),
        ("classifier", RandomForestClassifier()),
    ]
)

#### Define Hyperparameter Dictionary

In [18]:
param_grid = {
    "classifier__max_depth": [1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, None],
    "classifier__criterion": ["gini", "entropy"],
}

#### Optimize Hyperparamters Using K-Fold Cross-Validation

In [19]:
search = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=20,
    cv=KFold(n_splits=5, shuffle=True, random_state=random_state),
)

In [20]:
logger.info(f"Starting Pipeline Training.")

train_acc, val_acc = [], []  # Define empty lists.
for train_ind, val_ind in KFold(
    n_splits=5, shuffle=True, random_state=random_state
).split(X, y):
    X_train, y_train = X.iloc[train_ind], y[train_ind]  # Select fold training data.
    X_val, y_val = X.iloc[val_ind], y[val_ind]  # Select fold validation data.

    search.fit(X_train, y_train)  # Fit model using training data.

    y_hat_train = search.predict(X_train)
    train_acc.append(
        accuracy_score(y_train, y_hat_train)
    )  # Evaluate fold train accuracy.

    y_hat_val = search.predict(X_val)
    val_acc.append(
        accuracy_score(y_val, y_hat_val)
    )  # Evaluate fold validation accuracy.

[2022-06-03 22:35:35] - p62524 root 1 - INFO:Starting Pipeline Training.


#### Assess Performance Metrics

In [23]:
mean_train_acc = np.round(np.mean(train_acc), 4)  # Evaluate average training accuracy.
logger.info(f"Training Accuracy: {mean_train_acc}")

mean_val_acc = np.round(np.mean(val_acc), 4)  # Evaluate average validation accuracy.
logger.info(f"Validation Accuracy: {mean_val_acc}")

logger.info(f"Optimized Hyperparameters: {search.best_params_}")

[2022-06-03 22:39:33] - p62524 root 2 - INFO:Training Accuracy: 0.9586
[2022-06-03 22:39:33] - p62524 root 5 - INFO:Validation Accuracy: 0.9509
[2022-06-03 22:39:33] - p62524 root 7 - INFO:Optimized Hyperparameters: {'classifier__max_depth': 4, 'classifier__criterion': 'gini'}


#### Export Pipeline

In [24]:
if not os.path.exists("pipelines"):
    os.makedirs("pipelines")
joblib.dump(
    search.best_estimator_, f"pipelines/RF_A_{mean_val_acc}.joblib"
)  # NOTE: GridSearchCV returns model fitted to full dataset (see: https://stackoverflow.com/questions/34143829/sklearn-how-to-save-a-model-created-from-a-pipeline-and-gridsearchcv-using-jobli).

logger.info(f"Exported Pipeline: pipelines/RF_A_{mean_val_acc}.joblib")

[2022-06-03 22:39:35] - p62524 root 7 - INFO:Exported Pipeline: pipelines/RF_A_0.9509.joblib
