In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample


In [2]:
# Load the merged per-model predictions; the path is relative to the
# notebook's working directory.
PREDICTIONS_CSV = "../data/merged_predictions.csv"
df = pd.read_csv(PREDICTIONS_CSV)

# Show the first rows as a quick look at the columns that were read.
df.head()

Unnamed: 0,varity_id,varity_label,varity_score,mvp_id,mvp_label,mvp_score,mutscore_id,mutscore_label,mutscore_score
0,867544,0,0.007357,867544,0,9e-06,867544,0,0.004
1,54418,0,0.008473,54418,0,0.000279,54418,0,0.133
2,220258,1,0.906004,220258,1,0.998743,220258,1,0.886
3,37852,0,0.004501,37852,0,0.0,37852,0,0.001
4,37628,1,0.992058,37628,1,0.999998,37628,1,0.985


In [3]:
# Check consistency of ids and labels
# The models are compared row by row further down (same y_true, paired
# scores), so every row must describe the same variant in all three models:
# identical id AND identical ground-truth label. Checking labels alone would
# let shuffled rows with coincidentally equal labels slip through.
for other_model in ("mvp", "mutscore"):
    assert (df["varity_id"] == df[f"{other_model}_id"]).all(), (
        f"Row ids differ between varity and {other_model}"
    )
    assert (df["varity_label"] == df[f"{other_model}_label"]).all(), (
        f"Labels differ between varity and {other_model}"
    )

print("All labels are identical across models.")

All labels are identical across models.


In [4]:
# Pull the shared labels and each model's scores out of the frame in one
# unpacking. The labels come from the varity column (same for all models).
y_true, score_varity, score_mvp, score_mutscore = (
    df[column].values
    for column in ("varity_label", "varity_score", "mvp_score", "mutscore_score")
)

In [5]:
def bootstrap_auc_ci(y_true, y_score, n_boot=1000, alpha=0.05, random_state=42):
    """
    Estimate the ROC AUC with a percentile confidence interval from a
    stratified bootstrap.

    Positives and negatives are resampled with replacement separately, each
    to its original size, so every replicate keeps the class counts of the
    input.

    y_true: array-like of 0/1 labels
    y_score: array-like of model scores
    n_boot: number of bootstrap replicates (default 1000)
    alpha: 1 - confidence level (0.05 -> 95% CI)
    random_state: seed passed to np.random.RandomState

    Returns (auc_hat, ci_low, ci_high, aucs): the AUC on the full data, the
    alpha/2 and 1 - alpha/2 quantiles of the replicate AUCs, and the
    replicate AUCs sorted in ascending order.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_score)
    rng = np.random.RandomState(random_state)

    positives = np.nonzero(labels == 1)[0]
    negatives = np.nonzero(labels == 0)[0]

    def _replicate_auc():
        # Draw positives first, then negatives, from the one shared generator.
        pos_draw = resample(positives, replace=True, n_samples=len(positives), random_state=rng)
        neg_draw = resample(negatives, replace=True, n_samples=len(negatives), random_state=rng)
        picked = np.concatenate([pos_draw, neg_draw])
        return roc_auc_score(labels[picked], scores[picked])

    replicate_aucs = np.sort([_replicate_auc() for _ in range(n_boot)])

    lower_q = alpha / 2
    ci_low = np.quantile(replicate_aucs, lower_q)
    ci_high = np.quantile(replicate_aucs, 1 - lower_q)

    # Point estimate on the original, un-resampled data.
    auc_hat = roc_auc_score(labels, scores)

    return auc_hat, ci_low, ci_high, replicate_aucs

In [7]:
# Bootstrap the AUC and its CI for each model, in the order
# VARITY, MVP, MutScore, and unpack the three result tuples at once.
(
    (auc_v, lo_v, hi_v, aucs_varity),
    (auc_m, lo_m, hi_m, aucs_mvp),
    (auc_ms, lo_ms, hi_ms, aucs_mut),
) = (
    bootstrap_auc_ci(y_true, model_scores)
    for model_scores in (score_varity, score_mvp, score_mutscore)
)

print(f"VARITY   AUC = {auc_v:.3f}  [95% CI: {lo_v:.3f} – {hi_v:.3f}]")
print(f"MVP      AUC = {auc_m:.3f}  [95% CI: {lo_m:.3f} – {hi_m:.3f}]")
print(f"MutScore AUC = {auc_ms:.3f} [95% CI: {lo_ms:.3f} – {hi_ms:.3f}]")


VARITY   AUC = 1.000  [95% CI: 1.000 – 1.000]
MVP      AUC = 0.999  [95% CI: 0.996 – 1.000]
MutScore AUC = 0.996 [95% CI: 0.988 – 1.000]


In [11]:
from math import sqrt
from scipy.stats import norm

In [12]:
def _compute_midrank(x):
    """Compute midranks used in DeLong covariance estimate."""
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5*(i + j - 1) + 1
        i = j
    T2 = np.empty(N, dtype=float)
    T2[J] = T
    return T2

def _fast_delong(y_true, scores):
    """Compute DeLong covariance for correlated ROC AUCs."""
    y_true = np.asarray(y_true)
    assert set(np.unique(y_true)) <= {0,1}
    n = y_true.size
    order = np.argsort(-scores[0, :])
    y = y_true[order]
    n1 = int(np.sum(y))
    n0 = int(n - n1)

    aucs = []
    v01 = []
    v10 = []

    for k in range(scores.shape[0]):
        s = scores[k, order]
        tx = _compute_midrank(s)
        ty1 = _compute_midrank(s[y == 1])
        ty0 = _compute_midrank(-s[y == 0])

        auc = (np.sum(tx[y == 1]) - n1*(n1+1)/2) / (n1*n0)
        aucs.append(auc)
        v01_k = (ty1 - (n1+1)/2) / n0
        v10_k = 1 - (ty0 - (n0+1)/2) / n1
        v01.append(v01_k)
        v10.append(v10_k)

    aucs = np.array(aucs, dtype=float)
    v01 = np.array(v01, dtype=float)
    v10 = np.array(v10, dtype=float)

    s01 = np.cov(v01)
    s10 = np.cov(v10)
    auc_cov = s01 / n0 + s10 / n1
    return aucs, auc_cov

def delong_test(y_true, score1, score2):
    """
    DeLong test for correlated ROC AUCs (two models on the same test set).

    y_true: array-like of 0/1 labels
    score1, score2: array-likes of scores from the two models, aligned with
        y_true sample by sample

    Returns: auc1, auc2, delta, var, z, p
        delta = auc1 - auc2, var = estimated variance of delta,
        z = delta / sqrt(var), p = two-sided normal p-value.
        When var is not strictly positive (zero or NaN), z is reported as
        0.0 and p as 1.0 because the statistic cannot be formed.
    """
    stacked = np.vstack([np.asarray(score1), np.asarray(score2)])
    aucs, auc_cov = _fast_delong(np.asarray(y_true), stacked)
    delta = float(aucs[0] - aucs[1])
    # Var(A1 - A2) = Var(A1) + Var(A2) - 2 Cov(A1, A2)
    var = float(auc_cov[0, 0] + auc_cov[1, 1] - 2 * auc_cov[0, 1])
    z = delta / sqrt(var) if var > 0 else 0.0
    # Use the survival function rather than 1 - cdf: the subtraction cancels
    # to exactly 0.0 for large |z|, while sf keeps the small tail probability.
    p = 2 * norm.sf(abs(z))
    return float(aucs[0]), float(aucs[1]), delta, var, float(z), float(p)


In [13]:
# Pairwise DeLong comparisons between the three models.
scores_by_model = {
    "VARITY": score_varity,
    "MVP": score_mvp,
    "MutScore": score_mutscore,
}
pairs = [
    (first, scores_by_model[first], second, scores_by_model[second])
    for first, second in [("VARITY", "MVP"), ("VARITY", "MutScore"), ("MVP", "MutScore")]
]

for name1, s1, name2, s2 in pairs:
    auc1, auc2, delta, var, z, p = delong_test(y_true, s1, s2)
    print(f"{name1} vs {name2}: ΔAUC={delta:.4f}, z={z:.3f}, p={p:.3f}")


VARITY vs MVP: ΔAUC=0.0008, z=0.009, p=0.993
VARITY vs MutScore: ΔAUC=0.0039, z=0.070, p=0.944
MVP vs MutScore: ΔAUC=0.0032, z=0.040, p=0.968
