<a href="https://colab.research.google.com/github/Zfeng0207/FIT3199-FYP/blob/dev%2Fzfeng/02_baseline_model_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Baseline Model Evaluation

In [2]:
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/FIT3199-FYP/python-lib')

In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

from scipy.sparse import hstack
from pyngrok import ngrok

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

import shap
import lime
from lime.lime_tabular import LimeTabularExplainer

### Initialize Mlflow server

In [4]:
from getpass import getpass
import os

os.environ['MLFLOW_TRACKING_USERNAME'] = "Zfeng0207"
os.environ['MLFLOW_TRACKING_PASSWORD'] = ""
os.environ['MLFLOW_TRACKING_PROJECTNAME'] = "stroke-prediction-dagshub-repo"

mlflow.set_tracking_uri(f'https://dagshub.com/' + os.environ['MLFLOW_TRACKING_USERNAME'] + '/' + os.environ['MLFLOW_TRACKING_PROJECTNAME'] + '.mlflow')
print(f'https://dagshub.com/' + os.environ['MLFLOW_TRACKING_USERNAME'] + '/' + os.environ['MLFLOW_TRACKING_PROJECTNAME'] + '.mlflow')

https://dagshub.com/Zfeng0207/stroke-prediction-dagshub-repo.mlflow


### Import Recurrent Stroke Patient Data

In [36]:
recurrent_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/FIT3199-FYP/input/00_recurrent_stroke_patient.csv')

In [37]:
recurrent_df.drop(columns=['dod','numeric_part',"icd_code", "icd_version", "numeric_part", "dod", "icd_title"],inplace=True)
recurrent_df = recurrent_df.dropna(subset=['charttime'])

In [22]:
recurrent_df.columns

Index(['subject_id', 'stay_id', 'charttime', 'temperature', 'heartrate',
       'resprate', 'o2sat', 'sbp', 'dbp', 'rhythm', 'pain', 'seq_num',
       'icd_code', 'icd_version', 'icd_title', 'Stroke_Y/N', 'gender',
       'anchor_age', 'anchor_year', 'anchor_year_group'],
      dtype='object')

## Preprocessing Pipeline

In [40]:
recurrent_df.isna().sum()

Unnamed: 0,0
subject_id,0
stay_id,0
charttime,0
temperature,32373
heartrate,3100
resprate,4090
o2sat,5341
sbp,3324
dbp,3324
rhythm,65248


In [45]:
recurrent_df.dtypes

Unnamed: 0,0
subject_id,int64
stay_id,int64
charttime,object
temperature,float64
heartrate,float64
resprate,float64
o2sat,float64
sbp,float64
dbp,float64
rhythm,object


In [43]:
import pandas as pd
import numpy as np
import pickle
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from fuzzywuzzy import process, fuzz

# Define column groups
right_skewed_features = ["temperature", "heartrate", "resprate", "o2sat", "sbp", "dbp"]
date_features = ["anchor_age", "anchor_year","anchor_year_group"]
text_feature = "pain"  # Text column for TF-IDF
categorical_features = ["gender", "rhythm","subject_id", "stay_id", "seq_num"]  # Include grouped rhythm
target_column = "Stroke_Y/N"
untouched_features = ["charttime"]

# ✅ **2️⃣ Function to Interpolate Missing Values**
def interpolate_features(df):
    df = df.copy()
    for col in ["heartrate", "resprate", "o2sat", "sbp", "dbp"]:
        df[col] = df[col].interpolate(method="linear")
    return df

# ✅ **3️⃣ Function to Fuzzy Match 'rhythm' Column**
reference_groups = [
    "Normal Sinus Rhythm", "Sinus Arrhythmia", "Sinus Bradycardia", "Sinus Tachycardia",
    "Atrial Fibrillation", "Atrial Flutter", "Paced Rhythm", "Junctional Rhythm",
    "First Degree AV Block", "Bundle Branch Block (BBB)", "Supraventricular Tachycardia (SVT)",
    "Ventricular Ectopy (PVCs, Bigeminy, Trigeminy)", "Unknown / Invalid Entry"
]

def match_rhythm_column(df):
    """Performs fuzzy matching on the 'rhythm' column and modifies the DataFrame in place."""

    def match_rhythm(rhythm):
        if pd.isna(rhythm):
            return "Unknown / Invalid Entry"
        rhythm = rhythm.lower().strip()
        best_match, score = process.extractOne(rhythm, reference_groups, scorer=fuzz.token_sort_ratio)
        return best_match if score > 80 else "Unknown / Invalid Entry"

    df["rhythm"] = df["rhythm"].apply(match_rhythm)


# ✅ **4️⃣ Function to Replace Nulls in 'pain' Column with "null"**
def replace_null_pain(df):
    df = df.copy()
    df["pain"] = df["pain"].fillna("null")
    return df

# ✅ **5️⃣ TF-IDF for Text Column**
vectorizer = TfidfVectorizer(stop_words="english", max_features=500)

# ✅ **6️⃣ Preprocessing for Numerical & Categorical Features**
log_transformer = FunctionTransformer(np.log1p, validate=True)
mm_scaler = MinMaxScaler()
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# ✅ **7️⃣ ColumnTransformer to Process All Features**
preprocessor = ColumnTransformer([
    ("log", log_transformer, right_skewed_features),  # Log transform
    ("scaler", mm_scaler, date_features),  # Scale date-related features
    ("tfidf", vectorizer, text_feature),  # TF-IDF for text column
    ("ohe", ohe, categorical_features),  # One-hot encode categorical features
], sparse_threshold=0)  # Ensure dense output

# ✅ **8️⃣ Pipeline to Apply Preprocessing**
pipeline = Pipeline([
    ("replace_null_pain", FunctionTransformer(replace_null_pain)),  # Step 1: Replace NaN in pain column
    ("interpolation", FunctionTransformer(interpolate_features)),  # Step 2: Interpolation
    ("fuzzy_matching", FunctionTransformer(match_rhythm_column)),  # Step 3: Fuzzy matching
    ("preprocessor", preprocessor)  # Step 4: Feature transformations
])

# ✅ **9️⃣ Save Pipeline to a Pickle File**
with open("preprocessing_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

print("✅ Preprocessing pipeline saved successfully!")


✅ Preprocessing pipeline saved successfully!


# Time Series Splitting

In [44]:
pipeline.fit(recurrent_df.drop(columns=["Stroke_Y/N"]))



ValueError: Expected 2D array, got scalar array instead:
array=None.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# Assume 'df' is your dataset
X = recurrent_df.drop(columns=["Stroke_Y/N"])
y = recurrent_df["Stroke_Y/N"]

# TimeSeriesSplit with 3 folds
tscv = TimeSeriesSplit(n_splits=3)

fold_results = []

for fold, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
    print(f"Processing Fold {fold}...")

    # Split train and test data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Fit pipeline on train data
    pipeline.fit(X_train)

    # Transform train and test data
    X_train_transformed = pipeline.transform(X_train)
    X_test_transformed = pipeline.transform(X_test)

    # Store results
    fold_results.append({
        "fold": fold,
        "X_train": X_train_transformed,
        "y_train": y_train,
        "X_test": X_test_transformed,
        "y_test": y_test
    })

    print(f"Fold {fold} | Train Size: {X_train.shape[0]} | Test Size: {X_test.shape[0]}")


Processing Fold 1...


ValueError: Input X contains NaN.
FunctionTransformer does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
split_df

In [None]:
split_df.shape

In [None]:
split_df

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit

# Define TimeSeriesSplit with enough splits for train-validation-test
n_splits = 3
tscv = TimeSeriesSplit(n_splits=n_splits)

# Select a few example patients for visualization
example_patients = df["subject_id"].unique()[:3]  # Select first 3 patients

# Plot each patient’s fold splits
fig, axes = plt.subplots(len(example_patients), 1, figsize=(12, 8), sharex=True)

if len(example_patients) == 1:
    axes = [axes]  # Ensure axes is iterable even with one patient

for ax, subject_id in zip(axes, example_patients):
    group = df[df["subject_id"] == subject_id]
    X = group[features].values

    ax.set_title(f"Patient {subject_id} - Time Series CV Folds")

    for fold, (train_val_idx, test_idx) in enumerate(tscv.split(X)):
        # Further split train_val into train and validation
        train_idx = train_val_idx[:-len(test_idx)]  # Majority for training
        val_idx = train_val_idx[-len(test_idx):]  # Last portion for validation

        # Plot train, validation, and test as connected sequences
        ax.plot(train_idx, [fold] * len(train_idx), color="blue", linestyle="-", label="Train" if fold == 0 else "")
        ax.plot(val_idx, [fold] * len(val_idx), color="orange", linestyle="-", label="Validation" if fold == 0 else "")
        ax.plot(test_idx, [fold] * len(test_idx), color="red", linestyle="-", label="Test" if fold == 0 else "")

    ax.set_ylabel("Fold")
    ax.legend()

plt.xlabel("Time Index (Sequential Admissions)")
plt.suptitle("Time Series Train-Validation-Test Splits for Selected Patients")
plt.show()


For each patient:

The first row (Fold 0) → Uses early visits as training, then selects a later visit for validation, and even later for testing.
The second row (Fold 1) → Expands training with more past visits, shifts validation forward, and chooses a new later visit for testing.
The third row (Fold 2) → The pattern repeats, progressively including more past visits for training and delaying the test set.

# Data Preprocessing

In [None]:
# Load dataset
df = recurrent_df.copy()  # Ensure `processed_df` is preloaded

# Convert `charttime` to datetime for correct time-based sorting
df["charttime"] = pd.to_datetime(df["charttime"])

# Sort data by patient, stay_id, and time
df = df.sort_values(by=["subject_id", "stay_id", "charttime"])

#### Impute Missing Health Reading Values

In [None]:
df.loc[:, 'heartrate'] = df['heartrate'].interpolate(method='linear')
df.loc[:, 'resprate'] = df['resprate'].interpolate(method='linear')
df.loc[:, 'o2sat'] = df['o2sat'].interpolate(method='linear')
df.loc[:, 'sbp'] = df['sbp'].interpolate(method='linear')
df.loc[:, 'dbp'] = df['dbp'].interpolate(method='linear')

#### Grouping Rhythm Groups

In [None]:
import pandas as pd
from fuzzywuzzy import process, fuzz

# Define reference groups (manually curated categories)
reference_groups = [
    "Normal Sinus Rhythm",
    "Sinus Arrhythmia",
    "Sinus Bradycardia",
    "Sinus Tachycardia",
    "Atrial Fibrillation",
    "Atrial Flutter",
    "Paced Rhythm",
    "Junctional Rhythm",
    "First Degree AV Block",
    "Bundle Branch Block (BBB)",
    "Supraventricular Tachycardia (SVT)",
    "Ventricular Ectopy (PVCs, Bigeminy, Trigeminy)",  # Includes PVCs, Bigeminy, Trigeminy
    "Unknown / Invalid Entry"  # For noise or unclear values
]


# Function to normalize and fuzzy match rhythms
def match_rhythm(rhythm):
    if pd.isna(rhythm):
        return None
    rhythm = rhythm.lower().strip()  # Normalize text

    best_match, score = process.extractOne(rhythm, reference_groups, scorer=fuzz.token_sort_ratio)

    if score > 80:  # Only group if confidence is high
        return best_match
    return rhythm  # Keep original if no good match

# Apply fuzzy matching to the DataFrame column
df['grouped_rhythm'] = df['rhythm'].apply(match_rhythm)


In [None]:
print(f"Before running fuzzyword match: {df['rhythm'].nunique()} rhythm groups")
print(f"After running fuzzyword match: {df['grouped_rhythm'].nunique()} rhythm groups")

### Baseline Model Training

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Naïve Bayes": GaussianNB()
    # "Support Vector Machine": SVC(probability=True)
}

# Start MLflow experiment
mlflow.set_experiment("Stroke Prediction Experiment")

# Train and evaluate each model
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        mlflow.sklearn.autolog()

        # Train model on all training patients
        model.fit(X_train, y_train)

        # Evaluate on test patients
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None  # Some models don't support predict_proba

        # Calculate evaluation metrics
        acc = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        auc_roc = roc_auc_score(y_test, y_prob) if y_prob is not None else None

        # Log metrics in MLflow
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        if auc_roc is not None:
            mlflow.log_metric("auc_roc", auc_roc)

        # Print model performance immediately after logging
        print(f"{model_name} - Accuracy: {acc:.4f}, "
              f"Precision: {precision:.4f}, "
              f"Recall: {recall:.4f}, "
              f"F1 Score: {f1:.4f}, "
              f"AUC-ROC: {auc_roc:.4f}" if auc_roc is not None else "")

        print("-" * 80)  # Separator for readability


In [None]:
# # Save X_test and y_test
# X_test.to_csv("X_test.csv", index=False)
# y_test.to_csv("y_test.csv", index=False)

# # Save X_val and y_val
# X_val.to_csv("X_val.csv", index=False)
# y_val.to_csv("y_val.csv", index=False)

# print("Test and validation sets saved successfully.")


### SHAP and LIME Analysis

#### SHAP

In [None]:
#  create a SHAP explainer and calculate SHAP values for the test set
explainer_shap = shap.TreeExplainer(model)
shap_values = explainer_shap.shap_values(X_test)


In [None]:
import shap
import matplotlib.pyplot as plt

# Create SHAP summary plot
shap.summary_plot(shap_values, X_test)


##### Summary Plot Insights

Notes:

**Observations**


**Key Takeaway**


In [None]:
X_test

##### Dependence Plot Insights

Notes:


**Observations**


**Key Takeaway**



#### LIME

In [None]:
# lime_explanation = explainer_lime.explain_instance(X_test.iloc[0].values, model.predict_proba, num_features=10)
# lime_explanation.as_pyplot_figure()
# plt.show()

##### Insights
**Observations**



**Key Takeaway**
