In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, average_precision_score, roc_curve
from typing import Any, Dict, List, Optional, Tuple

In [2]:
random_seed = 42

In [3]:
# Generate a dataset with 2 classes
# flip_y=0.1, class_sep=0.6 make the dataset more realistic
dataset, target = make_classification(n_samples=250000, n_features=20, n_informative=15, n_classes=2, random_state=random_seed,
                                      flip_y=0.1, class_sep=0.6)
X = pd.DataFrame(dataset)
X.columns = X.columns.astype(str)
y = target

In [4]:
def create_train_validation_test_sets(
    X: pd.DataFrame, y: np.array
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]:
    """
    Split the dataset into training, validation and testing sets.
    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Labels.
        test_size (float): Proportion of the dataset to include in the test split.
    Returns:
        Tuple: 
        X_train, y_train: Training set (60%)
        X_val, y_val: Validation set (20%)
        X_test, y_test: Test set (20%)
    """
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [5]:
X_train, X_val, X_test, y_train, y_val, y_test = create_train_validation_test_sets(X, y)

In [6]:
def train_model(
    X_train: pd.DataFrame, y_train: np.ndarray
) -> xgb.XGBClassifier:
    """
    Train a XGBoost classifier.
    """
    xgb_model = xgb.XGBClassifier(n_estimators=100, objective="binary:logistic", eval_metric="logloss", seed=random_seed)
    xgb_model.fit(
        X_train,
        y_train,
    )
    return xgb_model

In [7]:
def train_with_incremental_learning(
    X_train: pd.DataFrame, y_train: np.ndarray, init_model: xgb.XGBClassifier
) -> xgb.XGBClassifier:
    xgb_refit = xgb.XGBClassifier(n_estimators=100, objective="binary:logistic", eval_metric="logloss", seed=random_seed)
    xgb_refit.fit(
        X_train,
        y_train,
        xgb_model=init_model
    )
    return xgb_refit

In [8]:
def validation(predicted_y: np.ndarray, y_true: np.ndarray) -> Dict[str, Any]:
    """
    Evaluate classification performance with various metrics.

    Args:
        predicted_y (np.ndarray): Predicted target
        y_true (np.ndarray): Ground truth binary labels (bool).
    Returns:
        dict: A dictionary with metrics
    """
    fpr, tpr, _ = roc_curve(y_true, predicted_y)
    metrics = {
        # "log_loss": metrics.log_loss(y, predictions_prob),
        # "brier_score": metrics.brier_score_loss(y, predictions_prob),
        "auc": auc(fpr, tpr),
        "pr_auc": average_precision_score(y_true, predicted_y)
    }
    return metrics

# Experiment 1: Training from scratch
1. Train a Classification model using train + validation sets.
2. Evaluate using test set.

In [9]:
model = train_model(X_train=pd.concat([X_train, X_val]), y_train=np.concatenate([y_train, y_val]))

In [10]:
predicted_target_train_val= model.predict(X_test)

In [11]:
validation(predicted_target_train_val, y_test)

{'auc': 0.9114589961252385, 'pr_auc': 0.8754824512362412}

# Experiment 2: Incremental learning
1. Train initial model using train set.
2. Train model with incremental learning using validation set.
3. Evaluate using test set.

In [12]:
model_train = train_model(X_train=X_train, y_train=y_train)

In [13]:
predicted_target_train = model_train.predict(X_test)

In [14]:
validation(predicted_target_train, y_test)

{'auc': 0.9103395957222544, 'pr_auc': 0.8736598481435579}

In [15]:
model_refit_val = train_with_incremental_learning(X_train=X_val, y_train=y_val, init_model=model_train)

In [16]:
predicted_target_refit_val= model_refit_val.predict(X_test)

In [17]:
validation(predicted_target_refit_val, y_test)

{'auc': 0.9090790072684426, 'pr_auc': 0.8723241331596789}