## Pairwise McNemar's test

In [None]:
import numpy as np
from statsmodels.stats.contingency_tables import mcnemar

# Read the ground-truth labels, then the saved predictions of each classifier
y_true = np.load("y_test.npy")
y_pred_lr, y_pred_rf, y_pred_svm = (
    np.load("y_pred_lr.npy"),
    np.load("y_pred_rf.npy"),
    np.load("y_pred_svm.npy"),
)

# Reference "model" that labels every sample as class 1
y_pred_baseline = np.full_like(y_true, 1)

# Define the McNemar test function
def run_mcnemar(y_true, pred_a, pred_b, name_a="Model A", name_b="Model B"):
    """Print McNemar's test for two classifiers evaluated on the same samples.

    Each sample is classified by whether model A and model B predicted it
    correctly, giving the 2x2 table
    ``[[both correct, A correct only], [B correct only, both wrong]]``.
    The test itself is the chi-square variant with continuity correction
    (``exact=False, correction=True``).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    pred_a, pred_b : array-like
        Predictions of the two models; must have the same shape as ``y_true``.
    name_a, name_b : str
        Labels used in the printed report.

    Raises
    ------
    ValueError
        If either prediction array does not have the shape of ``y_true``.
    """
    y_true = np.asarray(y_true)
    pred_a = np.asarray(pred_a)
    pred_b = np.asarray(pred_b)
    # Without this check a length-1 array would silently broadcast against
    # y_true and produce a meaningless table.
    if pred_a.shape != y_true.shape or pred_b.shape != y_true.shape:
        raise ValueError(
            "y_true, pred_a and pred_b must have the same shape; got "
            f"{y_true.shape}, {pred_a.shape} and {pred_b.shape}"
        )

    a_correct = pred_a == y_true
    b_correct = pred_b == y_true

    # Plain Python ints keep the printed table readable whatever repr the
    # installed NumPy uses for its scalar types.
    both_correct = int(np.count_nonzero(a_correct & b_correct))
    a_only = int(np.count_nonzero(a_correct & ~b_correct))
    b_only = int(np.count_nonzero(~a_correct & b_correct))
    both_wrong = int(np.count_nonzero(~a_correct & ~b_correct))

    table = [[both_correct, a_only],
             [b_only, both_wrong]]

    print(f"\nMcNemar's Test: {name_a} vs {name_b}")
    print("Contingency Table:", table)

    if a_only + b_only == 0:
        # No discordant pairs: the models are right/wrong on exactly the same
        # samples. The chi-square statistic divides by the discordant count,
        # so it is undefined here (and would otherwise be reported with a
        # p-value of 0). Report "no evidence of a difference" instead.
        statistic, pvalue = 0.0, 1.0
    else:
        result = mcnemar(table, exact=False, correction=True)
        statistic, pvalue = result.statistic, result.pvalue

    print(f"Statistic = {statistic:.4f}, p-value = {pvalue:.40f}")

# Pairwise tests between the three trained models
for first, second, first_label, second_label in (
    (y_pred_lr, y_pred_rf, "Logistic Regression", "Random Forest"),
    (y_pred_lr, y_pred_svm, "Logistic Regression", "SVM"),
    (y_pred_rf, y_pred_svm, "Random Forest", "SVM"),
):
    run_mcnemar(y_true, first, second, first_label, second_label)

# Each trained model against the always-class-1 baseline
for candidate, candidate_label in (
    (y_pred_lr, "Logistic Regression"),
    (y_pred_rf, "Random Forest"),
    (y_pred_svm, "SVM"),
):
    run_mcnemar(y_true, candidate, y_pred_baseline, candidate_label, "Baseline")


# Bootstrapped confidence intervals for precision

In [None]:
import numpy as np
from sklearn.metrics import precision_score
from sklearn.utils import resample

# Read the ground-truth labels, then the saved predictions of each classifier
y_true = np.load("y_test.npy")
y_pred_lr, y_pred_rf, y_pred_svm = (
    np.load("y_pred_lr.npy"),
    np.load("y_pred_rf.npy"),
    np.load("y_pred_svm.npy"),
)

# Baseline labels every sample as class 1
y_pred_baseline = np.full_like(y_true, 1)

# Bootstrapping function
def bootstrap_precision(y_true, y_pred, n_bootstraps=1000, random_state=42):
    """Estimate precision for class 1 with a percentile bootstrap interval.

    The samples are redrawn with replacement ``n_bootstraps`` times (each
    redraw has the size of the original data) and precision is recomputed
    on every redraw.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    n_bootstraps : int
        Number of bootstrap redraws.
    random_state : int
        Seed for the resampling, so repeated calls give identical results.

    Returns
    -------
    tuple
        ``(mean, ci)``: the mean of the bootstrap precision scores and an
        array holding their 2.5th and 97.5th percentiles.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length. Errors raised by
        ``precision_score`` itself are propagated rather than being turned
        into a precision of 0.
    """
    # Coerce to arrays so that fancy indexing below also works for lists.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)
    if len(y_pred) != n_samples:
        raise ValueError(
            "y_true and y_pred must have the same length; got "
            f"{n_samples} and {len(y_pred)}"
        )

    rng = np.random.RandomState(random_state)
    precision_scores = []

    for _ in range(n_bootstraps):
        indices = rng.choice(n_samples, size=n_samples, replace=True)
        # zero_division=0 scores a redraw without any positive prediction as
        # 0.0 explicitly, instead of relying on a catch-all exception handler.
        score = precision_score(
            y_true[indices], y_pred[indices], pos_label=1, zero_division=0
        )
        precision_scores.append(score)

    return np.mean(precision_scores), np.percentile(precision_scores, [2.5, 97.5])

# Predictions to evaluate, keyed by the label shown in the report
models = {}
models["Logistic Regression"] = y_pred_lr
models["Support Vector Machine"] = y_pred_svm
models["Random Forest"] = y_pred_rf
models["Baseline"] = y_pred_baseline

# Bootstrap every model once and keep its (mean precision, interval) pair
results = {
    label: bootstrap_precision(y_true, predictions)
    for label, predictions in models.items()
}

# Report: one row per model, in the order the models were registered
print("Model\t\t\tMean Precision\t\t95% CI")
for label, (average, bounds) in results.items():
    lower, upper = bounds
    print(f"{label:25s} {average:.3f} \t\t [{lower:.3f}, {upper:.3f}]")
