In [7]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import optuna
from typing import Any, Dict, List, Optional, Tuple

In [2]:
# Read the preprocessed dataset from its project-relative parquet file.
full_dataset = pd.read_parquet(path='./data/preprocess_data/heart_2022.parquet')

In [20]:
# Name of the label column; every other column is treated as a feature.
target = "HadHeartAttack"

In [12]:
def create_train_validation_test_sets(
    X: pd.DataFrame,
    y: pd.Series,
    val_size: float = 0.2,
    test_size: float = 0.2,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]:
    """
    Split the dataset into stratified training, validation and testing sets.

    With the default sizes the result is 60% train / 20% validation / 20% test.

    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Labels; also used to stratify both splits.
        val_size (float): Proportion of the dataset to include in the validation split.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int): Seed passed to both splits so the partition is reproducible.
    Returns:
        Tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
        The feature sets keep the type of ``X``; the label sets are the
        ``.values`` of the corresponding label Series.
    Raises:
        ValueError: If a size is not positive or the two sizes together
            leave no data for training.
    """
    holdout_size = val_size + test_size
    if val_size <= 0 or test_size <= 0 or holdout_size >= 1:
        raise ValueError(
            "val_size and test_size must be positive and sum to less than 1, "
            f"got val_size={val_size} and test_size={test_size}"
        )

    # First carve off the combined validation + test portion ...
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=holdout_size, random_state=random_state, stratify=y
    )
    # ... then divide that portion; the fraction is relative to the holdout, not the full dataset.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp,
        y_temp,
        test_size=test_size / holdout_size,
        random_state=random_state,
        stratify=y_temp,
    )
    return X_train, X_val, X_test, y_train.values, y_val.values, y_test.values

In [22]:
# Features are every column except the label; the label column alone becomes y.
X = full_dataset.drop(columns=[target])
y = full_dataset[target]

In [23]:
# Partition the features and labels into train / validation / test splits.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = create_train_validation_test_sets(X=X, y=y)

In [47]:
def train_model(
    X_train: pd.DataFrame, y_train: np.ndarray, categorical_features: list[str]
) -> lgb.LGBMClassifier:
    """
    Fit a LightGBM binary classifier on the given training data.

    Args:
        X_train (pd.DataFrame): Training features; its column names are passed
            to LightGBM as the feature names.
        y_train (np.ndarray): Training labels.
        categorical_features (list[str]): Names of the columns LightGBM should
            handle as categorical.

    Returns:
        lgb.LGBMClassifier: The fitted classifier.
    """
    model_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "importance_type": "split",
        "is_unbalance": True,
    }
    classifier = lgb.LGBMClassifier(**model_params)

    fit_options = {
        "feature_name": X_train.columns.to_list(),
        "categorical_feature": categorical_features,
        "eval_metric": "binary_logloss",
    }
    classifier.fit(X_train, y_train, **fit_options)
    return classifier

In [44]:
def validation(predicted_prob: np.ndarray, y_true: np.ndarray) -> Dict[str, Any]:
    """
    Evaluate predicted probabilities against the true labels.

    Args:
        predicted_prob (np.ndarray): Predicted probabilities of the positive class.
        y_true (np.ndarray): Ground truth binary labels.

    Returns:
        dict: A dictionary with the keys ``log_loss``, ``brier_score`` and
        ``auc`` (area under the ROC curve).
    """
    # Imported here under an alias so that a notebook variable named `metrics`
    # (or the result dict below) can never shadow the scikit-learn module.
    from sklearn import metrics as sk_metrics

    fpr, tpr, _ = sk_metrics.roc_curve(y_true, predicted_prob)
    scores = {
        "log_loss": sk_metrics.log_loss(y_true, predicted_prob),
        "brier_score": sk_metrics.brier_score_loss(y_true, predicted_prob),
        "auc": sk_metrics.auc(fpr, tpr),
    }
    return scores

# Experiment 1:
Use LightGBM.

1. Train the model on the combined train + validation sets.
2. Evaluate it on the test set.

In [41]:
# Category-dtype columns to hand to LightGBM as categorical features.
# The label is filtered out so that every listed name is an actual feature column.
categorical_features = [
    column
    for column in full_dataset.select_dtypes(include=['category']).columns
    if column != target
]

In [48]:
# Fit on the train and validation rows together; the test split is not used here.
lgbm = train_model(
    X_train=pd.concat([X_train, X_val], axis=0),
    y_train=np.concatenate((y_train, y_val), axis=0),
    categorical_features=categorical_features,
)

[LightGBM] [Info] Number of positive: 10748, number of negative: 186069
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 834
[LightGBM] [Info] Number of data points in the train set: 196817, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.054609 -> initscore=-2.851398
[LightGBM] [Info] Start training from score -2.851398


# Experiment 3:
Use LightGBM and Optuna.

1. Train the model on the train set and tune its hyperparameters with Optuna.
2. Evaluate the model on the test set.
3. Refit the model using the validation set.
4. Evaluate the refitted model on the test set.
5. Compare the evaluations before and after the refit.

In [None]:
def objective(trial, X_train, y_train, X_test, y_test, categorical_features):
    """
    Objective function for Optuna hyperparameter tuning.

    Samples a set of LightGBM hyperparameters from the trial, fits a binary
    classifier on the training data and scores it on the evaluation data.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features; its column names are passed to LightGBM
            as the feature names.
        y_train: Training labels.
        X_test: Features the fitted model is scored on.
        y_test: Labels matching ``X_test``.
        categorical_features: Names of the columns LightGBM should handle
            as categorical.

    Returns:
        float: Brier score of the predicted positive-class probabilities
        (lower is better).

    NOTE(review): the split passed as ``X_test``/``y_test`` steers the tuning,
    so it should not also serve as the final held-out test set — confirm
    against the caller.
    """
    # Function-scope import: the notebook's import cell never binds the bare
    # name `sklearn`, so `sklearn.metrics...` would raise NameError here.
    from sklearn.metrics import brier_score_loss

    params = {
        # Fixed settings.
        "objective": "binary",
        "metric": "binary_logloss",
        "importance_type": "split",
        "verbosity": -1,
        # Search space.
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05),
    }

    lgbm = lgb.LGBMClassifier(**params)
    lgbm.fit(
        X_train,
        y_train,
        feature_name=X_train.columns.to_list(),
        categorical_feature=categorical_features,
        eval_metric="binary_logloss",
    )

    # Column 1 of predict_proba holds the positive-class probability.
    predictions_prob = lgbm.predict_proba(X_test)[:, 1]
    return brier_score_loss(y_test, predictions_prob)