# XGBoost Model Calibration

This notebook contains cells extracted from `xgboost_evaluation.ipynb`, focusing on model calibration techniques and their evaluation.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.calibration import CalibrationDisplay, CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc, roc_auc_score, precision_recall_fscore_support
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold

In [None]:
# Location of the preprocessed dataset (relative to the notebook's working directory)
input_parquet_path = 'data/preprocessed_data.parquet'

# Load the preprocessed data.
# Errors are reported AND re-raised: swallowing them would let later cells run
# with `df` undefined, or with a stale `df` left over from a previous kernel run.
print(f"Loading preprocessed data from {input_parquet_path}...")
try:
    df = pd.read_parquet(input_parquet_path)
except FileNotFoundError:
    print(f"Error: File not found at {input_parquet_path}. Please ensure '1_consolidate_data.ipynb' has been run.")
    raise
except Exception as e:
    print(f"\nAn error occurred while loading the Parquet file: {e}")
    raise
else:
    # Only reached when the read succeeded
    print("Data loaded successfully.")
    df.info()

In [None]:
# Partition rows by the value of the 'split' marker column
train_df = df.loc[df['split'] == 'train']
test_df = df.loc[df['split'] == 'test']

# Target (y) is the 'Class' column; features (X) are every column except 'Class' and 'split'
y_train = train_df.loc[:, 'Class']
X_train_scaled = train_df.drop(columns=['Class', 'split'])

y_test = test_df.loc[:, 'Class']
X_test_scaled = test_df.drop(columns=['Class', 'split'])

# Quick shape check of both partitions
print(f"Training features shape: {X_train_scaled.shape}, Training target shape: {y_train.shape}")
print(f"Test features shape: {X_test_scaled.shape}, Test target shape: {y_test.shape}")

In [None]:
# Convert target variable 'Class' from labels ('n'/'y') to numeric (0/1) if necessary.
# Any non-numeric dtype is converted (object, pandas string, categorical), not only
# 'object', so the encoding does not depend on how the Parquet reader typed the column.
if not pd.api.types.is_numeric_dtype(y_train):
    print("\nConverting target variable 'Class' to numeric (n=0, y=1)...")
    class_label_to_int = {'n': 0, 'y': 1}

    # Cast to object first so categorical/string columns map to plain Python ints
    y_train_encoded = y_train.astype(object).map(class_label_to_int)
    y_test_encoded = y_test.astype(object).map(class_label_to_int)

    # `.map` turns unknown labels into NaN; fail loudly instead of training on NaN targets
    for split_name, raw_labels, encoded_labels in (
        ('train', y_train, y_train_encoded),
        ('test', y_test, y_test_encoded),
    ):
        unmapped_mask = encoded_labels.isna()
        if unmapped_mask.any():
            unexpected = list(raw_labels[unmapped_mask].unique())
            raise ValueError(
                f"Unexpected 'Class' labels in {split_name} split (expected 'n'/'y'): {unexpected}"
            )

    # Fix the dtype explicitly so the later integer-dtype check on y_train succeeds
    y_train = y_train_encoded.astype('int64')
    y_test = y_test_encoded.astype('int64')
    print("Target variable converted.")
else:
    print("\nTarget variable 'Class' is already numeric or in an unexpected format.")

In [None]:
# Derive scale_pos_weight (negatives / positives) from the training class distribution.
scale_pos_weight_val = 1 # Default, used whenever the ratio cannot be computed

# Accept every integer dtype (int8, uint8, nullable Int64, ...), not just int64/int32;
# otherwise a differently-typed 0/1 target silently falls back to a weight of 1.
if pd.api.types.is_integer_dtype(y_train):
    count_class_0 = (y_train == 0).sum()
    count_class_1 = (y_train == 1).sum()
    print(f"\nTraining data class distribution: Class 0 (n): {count_class_0}, Class 1 (y): {count_class_1}")
    if count_class_1 > 0:
        scale_pos_weight_val = count_class_0 / count_class_1
        print(f"Calculated scale_pos_weight: {scale_pos_weight_val:.4f}")
    else:
        # Guard against division by zero when the positive class is absent
        print("Warning: No positive class (1) instances in y_train. scale_pos_weight set to 1.")
else:
    print(f"Warning: y_train is not numeric (dtype: {y_train.dtype}). scale_pos_weight set to 1.")

In [None]:
# Build the uncalibrated XGBoost classifier and fit it on the training split
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,  # number of boosting rounds; can be tuned
    random_state=42,
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight_val,
    # NOTE(review): use_label_encoder is deprecated in recent XGBoost releases —
    # confirm against the installed version before removing it.
    use_label_encoder=False,
)
xgb_clf.fit(X_train_scaled, y_train)
print("Model training complete.")

# Cross-validation configuration consumed by the outer-CV cell further down
n_splits_cv = 3
strat_k_fold = StratifiedKFold(n_splits=n_splits_cv, shuffle=True, random_state=42)
scoring_metrics = dict(
    accuracy='accuracy',
    roc_auc='roc_auc',
    f1='f1_weighted',
)

# Summary of the objects the calibration cells rely on
print("--- Placeholder variables defined ---")
print(f"X_train_scaled shape: {X_train_scaled.shape}, y_train shape: {y_train.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}, y_test shape: {y_test.shape}")
print(f"xgb_clf: {type(xgb_clf)}")
print(f"scale_pos_weight_val: {scale_pos_weight_val}")

In [None]:
def evaluate_model(y_true, y_pred, X_features_for_proba, model, model_name):
    """Print and plot evaluation metrics for a binary classifier.

    Reports accuracy, the classification report and the confusion matrix
    (printed and as a heatmap). When ``model`` exposes ``predict_proba`` and
    ``X_features_for_proba`` is provided, also prints the ROC AUC and plots
    the ROC curve.

    Args:
        y_true: True labels (pandas Series, NumPy array or list).
        y_pred: Predicted labels aligned with ``y_true``.
        X_features_for_proba: Features passed to ``model.predict_proba`` for
            the ROC section, or ``None`` to skip it.
        model: Fitted classifier. ``model.classes_`` (when present) fixes the
            confusion-matrix label order; otherwise ``[0, 1]`` is used.
        model_name: Label used in printed headers and plot titles.

    Returns:
        None. All results are printed or shown via matplotlib.
    """
    print(f"\n--- {model_name} Evaluation ---")

    # Accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    # Classification Report
    # np.unique works for Series, arrays and lists (Series.unique() does not).
    # For 0/1 targets the label set is pinned to [0, 1] so the two target names
    # always line up, even if one class is missing from y_true/y_pred.
    is_binary_01 = bool(np.all(np.isin(np.unique(y_true), [0, 1])))
    report_labels = [0, 1] if is_binary_01 else None
    target_names = ['Class n (0)', 'Class y (1)'] if is_binary_01 else None
    print("\nClassification Report:")
    print(classification_report(
        y_true,
        y_pred,
        labels=report_labels,
        target_names=target_names,
        zero_division=0,
    ))

    # Confusion Matrix
    # Use the same label order for the matrix and the heatmap tick labels so the
    # matrix shape cannot disagree with the ticks when a class is absent.
    classes = model.classes_ if hasattr(model, 'classes_') else [0, 1]
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    print(cm)

    # Plot Confusion Matrix
    fig_cm, ax_cm = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
                xticklabels=classes,
                yticklabels=classes,
                ax=ax_cm)
    ax_cm.set_xlabel('Predicted Label')
    ax_cm.set_ylabel('True Label')
    ax_cm.set_title(f'Confusion Matrix - {model_name}')
    plt.show()

    # ROC Curve and AUC
    if hasattr(model, "predict_proba") and X_features_for_proba is not None:
        # Column 1 of predict_proba is used as the positive-class score
        y_pred_proba = model.predict_proba(X_features_for_proba)[:, 1]
        fpr, tpr, thresholds_roc = roc_curve(y_true, y_pred_proba)
        roc_auc = auc(fpr, tpr)
        print(f"\nROC AUC Score: {roc_auc:.4f}")

        fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
        ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
        # Diagonal reference line
        ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax_roc.set_xlim([0.0, 1.0])
        ax_roc.set_ylim([0.0, 1.05])
        ax_roc.set_xlabel('False Positive Rate')
        ax_roc.set_ylabel('True Positive Rate')
        ax_roc.set_title(f'Receiver Operating Characteristic (ROC) - {model_name}')
        ax_roc.legend(loc="lower right")
        plt.show()
    else:
        print("\nROC Curve not available: Model lacks predict_proba or features for probabilities not provided.")

## Note on Dependencies

The cells below depend on variables that were originally defined in the earlier parts of the `xgboost_evaluation.ipynb` notebook. Most of them are recreated by the setup cells above, so this notebook can be run top to bottom on its own; if you change or remove those cells, make sure the following (and their own dependencies) are still defined and available in the kernel:

- `X_train_scaled`: Scaled training features.
- `y_train`: Training target.
- `X_test_scaled`: Scaled test features.
- `y_test`: Test target.
- `xgb_clf`: The initially trained (uncalibrated) XGBoost model.
- `scale_pos_weight_val`: Value for `scale_pos_weight` used in XGBoost.
- `threshold_values`: NumPy array of thresholds for tuning (defined in the threshold tuning cell near the end of this notebook).
- `n_splits_cv`: Integer, number of CV splits.
- `strat_k_fold`: An instance of `StratifiedKFold`.
- `scoring_metrics`: Dictionary defining scoring for CV.

These are typically prepared during data loading, preprocessing, and initial model training phases.

## Model Calibration Analysis

A calibration curve (also known as a reliability diagram) helps to assess how well the probabilistic predictions of a classifier are calibrated.
Ideally, if a model predicts a class with a probability of `p`, then among all instances where it predicts `p`, approximately `p * 100%` of them should actually belong to that class.
A perfectly calibrated model will have a curve that lies along the diagonal.


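Class imbalance in the training data (a skew in the class distribution) biases the predicted probabilities: on average, the model assigns a higher probability to the majority class than to the minority class. Re-weighting the positive class with `scale_pos_weight`, as done when training the model above, also shifts the predicted probabilities, which is a further reason to check calibration.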

In [None]:
from sklearn.calibration import CalibrationDisplay


print("\n--- Model Calibration Analysis ---")

# Reliability diagram of the uncalibrated model, evaluated on the test split
fig_calibration, ax_calibration = plt.subplots(figsize=(8, 7))
calibration_disp = CalibrationDisplay.from_estimator(
    xgb_clf,
    X_test_scaled,
    y_test,
    name='XGBoost',
    ax=ax_calibration,
    n_bins=10,  # the [0, 1] probability range is split into 10 bins
)
ax_calibration.set_title('Calibration Curve (Reliability Diagram)')
ax_calibration.grid(True)
plt.show()

### Applying Platt Scaling (Sigmoid Calibration)


Since the initial calibration curve might not be perfectly diagonal, we can attempt to improve it using Platt Scaling. This method trains a logistic regression model on the outputs of the original XGBoost classifier to produce better-calibrated probabilities.


We will use `CalibratedClassifierCV` with `method='sigmoid'` to apply Platt Scaling. The calibrator will be trained using cross-validation on the training data.

### Applying Isotonic Regression Calibration


Isotonic Regression is another method to calibrate probabilities. Unlike Platt Scaling, which assumes a sigmoid relationship, Isotonic Regression is non-parametric and fits a non-decreasing function. It can be more powerful if the distortion is not sigmoid-shaped, but may require more data to avoid overfitting.


We will use `CalibratedClassifierCV` with `method='isotonic'`.

In [None]:
from sklearn.calibration import CalibratedClassifierCV


print("\n--- Applying Platt Scaling and Isotonic Regression ---")

# Hyperparameters shared by both unfitted base estimators
base_xgb_kwargs = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    n_estimators=100,
    scale_pos_weight=scale_pos_weight_val,
)

# --- Platt Scaling (Sigmoid) ---
# A fresh base estimator is wrapped; CalibratedClassifierCV fits it with 5-fold CV
base_clf_for_sigmoid = xgb.XGBClassifier(**base_xgb_kwargs)
calibrated_xgb_clf_sigmoid = CalibratedClassifierCV(
    estimator=base_clf_for_sigmoid,
    cv=5,
    method='sigmoid',
)
print("Fitting CalibratedClassifierCV with Platt Scaling...")
calibrated_xgb_clf_sigmoid.fit(X_train_scaled, y_train)
print("Platt Scaling fitting complete.")

# --- Isotonic Regression ---
base_clf_for_isotonic = xgb.XGBClassifier(**base_xgb_kwargs)
calibrated_xgb_clf_isotonic = CalibratedClassifierCV(
    estimator=base_clf_for_isotonic,
    cv=5,
    method='isotonic',
)
print("\nFitting CalibratedClassifierCV with Isotonic Regression...")
calibrated_xgb_clf_isotonic.fit(X_train_scaled, y_train)
print("Isotonic Regression fitting complete.")

# --- Display comparative calibration curves ---
# All three curves share one axes and are computed on the test split
fig_calibrated, ax_calibrated = plt.subplots(figsize=(10, 9))
print("\nPlotting calibration curves...")
for fitted_model, curve_label in (
    (xgb_clf, 'XGBoost (Uncalibrated)'),
    (calibrated_xgb_clf_sigmoid, 'XGBoost (Platt Scaled)'),
    (calibrated_xgb_clf_isotonic, 'XGBoost (Isotonic)'),
):
    CalibrationDisplay.from_estimator(
        fitted_model,
        X_test_scaled,
        y_test,
        ax=ax_calibrated,
        name=curve_label,
        n_bins=10,
    )
ax_calibrated.set_title('Calibration Curve: Uncalibrated vs. Platt Scaled vs. Isotonic XGBoost')
ax_calibrated.set_xlabel('Mean Predicted Probability (Positive Class)')
ax_calibrated.set_ylabel('Fraction of Positives')
ax_calibrated.grid(True)
ax_calibrated.legend(loc='lower right')
plt.show()

print("Calibration plot displayed.")

In [None]:
print("\n--- Evaluating Platt Scaled Model ---")
# Class predictions of the Platt-scaled model on the test features
y_pred_platt_calibrated = calibrated_xgb_clf_sigmoid.predict(X_test_scaled)
evaluate_model(
    y_true=y_test,
    y_pred=y_pred_platt_calibrated,
    X_features_for_proba=X_test_scaled,
    model=calibrated_xgb_clf_sigmoid,
    model_name="XGBoost (Platt Scaled)",
)

In [None]:
print("\n--- Evaluating Isotonic Regression Calibrated Model ---")
# Class predictions of the isotonic-calibrated model on the test features
y_pred_isotonic_calibrated = calibrated_xgb_clf_isotonic.predict(X_test_scaled)
evaluate_model(
    y_true=y_test,
    y_pred=y_pred_isotonic_calibrated,
    X_features_for_proba=X_test_scaled,
    model=calibrated_xgb_clf_isotonic,
    model_name="XGBoost (Isotonic)",
)

### Cross-Validation of Isotonic Regression Calibrated Model


To get a more robust estimate of the Isotonic Regression calibrated model's performance and ensure the improvements are not due to a specific train-test split during the calibration phase, we perform an outer cross-validation on the entire `CalibratedClassifierCV` process. This involves fitting the `CalibratedClassifierCV` (which itself uses internal CV for calibration) on different folds of the training data.

In [None]:
print("\n--- Cross-Validation for Isotonic Regression Calibrated XGBoost ---")

# Fresh, unfitted base XGBoost classifier: CalibratedClassifierCV fits it
# internally during its own CV, so an already-fitted model must not be reused.
base_clf_for_cv_isotonic = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    n_estimators=100,
    scale_pos_weight=scale_pos_weight_val
)

# Two nested CV levels:
#   - cv=5 inside CalibratedClassifierCV is the internal calibration CV;
#   - cv=strat_k_fold passed to cross_validate is the outer evaluation loop.
cv_calibrated_xgb_clf_isotonic = CalibratedClassifierCV(
    estimator=base_clf_for_cv_isotonic,
    method='isotonic',
    cv=5 # Internal CV for calibration. Could be different from outer CV folds.
)

print(f"Performing {n_splits_cv}-fold outer cross-validation on the Isotonic Calibrated model...")
print("(Note: CalibratedClassifierCV also performs internal CV for calibration on each outer fold)")

# Score every metric from ONE pass over the outer folds. Calling cross_val_score
# once per metric would refit the whole nested CV for each metric on the same folds.
cv_results_isotonic_calibrated = {}
try:
    outer_cv_output = cross_validate(
        cv_calibrated_xgb_clf_isotonic,
        X_train_scaled,
        y_train,
        cv=strat_k_fold,
        scoring=scoring_metrics,
    )
except Exception as e:
    # Best-effort reporting, one line per requested metric
    for metric_name in scoring_metrics:
        print(f"Could not calculate outer CV {metric_name.upper()} (Isotonic Calibrated). Error: {e}")
        print("-" * 30)
else:
    for metric_name in scoring_metrics:
        # cross_validate stores per-fold test scores under 'test_<metric name>'
        scores = outer_cv_output[f"test_{metric_name}"]
        cv_results_isotonic_calibrated[metric_name] = scores
        print(f"Outer CV {metric_name.upper()} scores (Isotonic Calibrated): {scores}")
        print(f"Mean Outer CV {metric_name.upper()} (Isotonic Calibrated): {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")
        print("-" * 30)

### Threshold Tuning for Isotonic Calibrated Model (Class n (0))


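Since Isotonic calibration alters the model's probability outputs, it's beneficial to re-evaluate the optimal classification threshold for our specific needs (e.g., maximizing F1-score for Class 0) using these new, calibrated probabilities. Note that the thresholds below are scored on the test set, so the metrics reported at the selected threshold are optimistic; for an unbiased estimate, select the threshold on a separate validation split.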

In [None]:
# Positive-class (Class 1, 'y') probabilities from the isotonic-calibrated model.
# calibrated_xgb_clf_isotonic must already be fitted by the calibration cell.
y_pred_proba_isotonic_calibrated = calibrated_xgb_clf_isotonic.predict_proba(X_test_scaled)[:, 1]

# Candidate thresholds, defined here: start at 0.05, step 0.05, stop before 1.0
threshold_values = np.arange(0.05, 1.0, 0.05)

# Per-threshold Class 0 metrics, filled in the loop below (one entry per threshold)
precisions_class0_isotonic = []
recalls_class0_isotonic = []
f1s_class0_isotonic = []

print("\n--- Threshold Tuning for Class n (0) (Isotonic Calibrated Model) ---")
print(f"{'Threshold':<10} | {'Precision (0)':<15} | {'Recall (0)':<12} | {'F1-score (0)':<12} | {'TP (0)':<7} | {'FP (0)':<7} | {'FN (0)':<7}")
print("-" * 80)

for thresh_iso in threshold_values:
    # Predict 1 (y) when the Class 1 probability reaches the threshold, else 0 (n)
    y_pred_at_threshold_isotonic = (y_pred_proba_isotonic_calibrated >= thresh_iso).astype(int)

    # Per-class precision/recall/F1; index 0 of each array belongs to label 0
    prec_by_class, rec_by_class, f1_by_class, support_by_class = precision_recall_fscore_support(
        y_test, y_pred_at_threshold_isotonic, labels=[0, 1], zero_division=0
    )
    prec_c0 = prec_by_class[0]
    rec_c0 = rec_by_class[0]
    f1_c0 = f1_by_class[0]

    # With labels=[0, 1] the matrix is always 2x2 (rows = true, columns = predicted).
    # Treating Class 0 as the class of interest: TP = [0,0], FN = [0,1], FP = [1,0].
    cm_thresh_isotonic = confusion_matrix(y_test, y_pred_at_threshold_isotonic, labels=[0, 1])
    tp_c0_iso, fn_c0_iso, fp_c0_iso, other_c0_iso = cm_thresh_isotonic.ravel()

    precisions_class0_isotonic.append(prec_c0)
    recalls_class0_isotonic.append(rec_c0)
    f1s_class0_isotonic.append(f1_c0)

    print(f"{thresh_iso:<10.2f} | {prec_c0:<15.4f} | {rec_c0:<12.4f} | {f1_c0:<12.4f} | {tp_c0_iso:<7} | {fp_c0_iso:<7} | {fn_c0_iso:<7}")

# Plot the three Class 0 metrics against the threshold
fig_thresholds, ax_thresholds = plt.subplots(figsize=(12, 7))
for metric_values, metric_label, metric_marker in (
    (precisions_class0_isotonic, 'Precision (Class 0 - Isotonic Cal.)', 'o'),
    (recalls_class0_isotonic, 'Recall (Class 0 - Isotonic Cal.)', 'x'),
    (f1s_class0_isotonic, 'F1-score (Class 0 - Isotonic Cal.)', 's'),
):
    ax_thresholds.plot(threshold_values, metric_values, label=metric_label, marker=metric_marker)
ax_thresholds.set_title('Precision, Recall, and F1-score for Class n (0) vs. Threshold (Isotonic Calibrated Model)')
ax_thresholds.set_xlabel('Threshold (Probability for Class y (1))')
ax_thresholds.set_ylabel('Score')
ax_thresholds.set_xticks(np.round(threshold_values, 2))
ax_thresholds.legend()
ax_thresholds.grid(True)
plt.show()

# Report the threshold with the highest Class 0 F1-score, if any score is non-zero
if f1s_class0_isotonic and any(v != 0 for v in f1s_class0_isotonic):
    optimal_idx_f1_class0_isotonic = np.argmax(f1s_class0_isotonic)
    optimal_threshold_f1_class0_isotonic = threshold_values[optimal_idx_f1_class0_isotonic]
    print(f"\nOptimal threshold for maximizing F1-score for Class n (0) (Isotonic Calibrated Model): {optimal_threshold_f1_class0_isotonic:.2f}")
    print(f"  Precision (Class 0) at this threshold: {precisions_class0_isotonic[optimal_idx_f1_class0_isotonic]:.4f}")
    print(f"  Recall (Class 0) at this threshold: {recalls_class0_isotonic[optimal_idx_f1_class0_isotonic]:.4f}")
    print(f"  F1-score (Class 0) at this threshold: {f1s_class0_isotonic[optimal_idx_f1_class0_isotonic]:.4f}")
else:
    print("\nCould not determine optimal threshold for F1-score (Class 0) (Isotonic Calibrated Model) as no valid F1 scores were calculated or all were zero.")