In [1]:
import os, time, copy
import sys
sys.path.insert(1, '../../')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
from tqdm import tqdm
from scipy.optimize import brentq
from scipy.stats import norm, binom
from concentration import linfty_dkw, linfty_binom, wsr_iid

In [2]:
# Data source (2006-2014): https://darchive.mblwhoilibrary.org/handle/1912/7341
# Download, unzip and merge the datasets; the files read below are expected
# one directory above this notebook.
calib_data = np.load('../calib-outputs.npz')
test_data = np.load('../test-outputs.npz')

# Class indices (as integers) for the model predictions and the labels.
calib_preds = calib_data['preds'].astype(int)
calib_labels = calib_data['labels'].astype(int)
test_preds = test_data['preds'].astype(int)
test_labels = test_data['labels'].astype(int)

classes = np.load('../classes.npy')
num_classes = classes.shape[0]

# A class counts as plankton unless its name is in this exclusion list.
plankton_classes = np.isin(
    classes,
    ['mix','mix_elongated','detritus','bad', 'bead', 'bubble', 'other_interaction', 'pollen', 'spore'],
    invert=True,
)
plankton_classes_list = np.array(np.where(plankton_classes)[0])

# Collapse the class indices to booleans: True means "plankton".
calib_preds = np.isin(calib_preds, plankton_classes_list)
calib_labels = np.isin(calib_labels, plankton_classes_list)
test_preds = np.isin(test_preds, plankton_classes_list)
test_labels = np.isin(test_labels, plankton_classes_list)

# Ground-truth and uncorrected (prediction-only) plankton counts on the test set.
N = test_labels.shape[0]
true_count = test_labels.sum()
true_frac = true_count/N
uncorrected_est = test_preds.sum()

print(f"Calib acc: {(calib_preds == calib_labels).astype(int).mean()}")
print(f"Test acc: {(test_preds == test_labels).astype(int).mean()}")

Calib acc: 0.962313466496375
Test acc: 0.9789407941012395


In [3]:
# Empirical frequencies of the distinct calibration label values,
# ordered from most to least frequent.
calib_uq, calib_uq_counts = np.unique(calib_labels, return_counts=True)
calib_uq_freq = calib_uq_counts / calib_uq_counts.sum()

# Descending-frequency permutation, applied to both the frequencies and the values.
calib_uq_sort = np.argsort(calib_uq_freq)[::-1]
calib_uq_freq = calib_uq_freq[calib_uq_sort]
calib_uq = calib_uq[calib_uq_sort]

# Running total of the sorted frequencies.
calib_uq_cumsum = calib_uq_freq.cumsum()

In [4]:
# Problem setup
K = 2                      # number of classes after binarization (non-plankton / plankton)
nu = np.array([0,1])       # weights applied to the class distribution in ppi_label_shift

# Error levels: `delta` is handed to the concentration bounds on the class
# frequency; the estimators use `alpha - delta` for the binomial quantile step.
alpha = 0.05
delta = 0.04

# Calibration-size sweep and repetition count (not used by the cells visible here).
n_max = calib_preds.shape[0]
ns = np.linspace(1000, n_max, 100)
num_trials = 1

In [46]:
def form_qfhat(test_preds, K=2):
    """Return the empirical distribution of predicted classes, indexed by class.

    Entry ``k`` of the result is the fraction of ``test_preds`` equal to class
    ``k``. Callers index the result by class (``qfhat[1]`` for the positive
    class, and a matrix product with a class-indexed confusion matrix), so the
    entries must stay in class order. Ordering by frequency, as was done
    previously, is only correct when class 1 happens to be the rarer class.

    Parameters
    ----------
    test_preds : array-like of bool or non-negative int
        Predicted class for each test point.
    K : int, optional
        Minimum number of classes in the output. Classes that never occur in
        ``test_preds`` get frequency 0 instead of being dropped.

    Returns
    -------
    numpy.ndarray
        Float array of length ``max(K, max(test_preds) + 1)`` summing to 1.

    Raises
    ------
    ValueError
        If ``test_preds`` is empty.
    """
    preds = np.asarray(test_preds).astype(int).ravel()
    if preds.size == 0:
        raise ValueError("test_preds must contain at least one prediction")
    # bincount keeps class order and pads missing classes with zero counts.
    counts = np.bincount(preds, minlength=K)
    qfhat = counts / counts.sum()
    return qfhat

In [47]:
def naiveML(test_preds, alpha, delta):
    """Interval for the positive-class count built from the predictions alone.

    The predicted positive fraction ``form_qfhat(test_preds)[1]`` is widened by
    ``sqrt(log(1/delta) / (2N))`` on each side, and the two resulting
    probabilities are turned into counts with binomial quantiles at levels
    ``alpha - delta`` and ``1 - (alpha - delta)``.

    Parameters
    ----------
    test_preds : numpy.ndarray
        Binary predictions on the test set.
    alpha : float
        Overall error level; must exceed ``delta``.
    delta : float
        Error level used for the band around the predicted fraction.

    Returns
    -------
    list of int
        ``[count_lower, count_upper]``.
    """
    N = test_preds.shape[0]
    qfhat = form_qfhat(test_preds)
    naive_epsilon = np.sqrt(1/(2*N) * np.log(1/delta))
    # Keep both bounds valid probabilities. Without clipping, a small N gives
    # binom.ppf a p outside [0, 1], it returns nan, and int(nan) raises.
    naive_lb = np.clip(qfhat[1] - naive_epsilon, 0, 1)
    naive_ub = np.clip(qfhat[1] + naive_epsilon, 0, 1)
    naive_count_lb = int(binom.ppf(alpha-delta, N, naive_lb))
    naive_count_ub = int(binom.ppf(1-(alpha-delta), N, naive_ub))
    return [naive_count_lb, naive_count_ub]

In [48]:
def ppi_iid(calib_preds, test_preds, calib_labels, alpha, delta):
    """Count interval that shifts the predicted fraction by a calibration bias band.

    The per-example error ``calib_preds - calib_labels`` is rescaled to [0, 1]
    and passed to ``wsr_iid`` together with a grid of candidate means. The
    values it returns are mapped back with ``2*x - 1`` and offset by the mean
    error. The smallest and largest resulting values are added to the predicted
    positive fraction, and the two bounds are converted to counts with binomial
    quantiles at levels ``alpha - delta`` and ``1 - (alpha - delta)``.

    Parameters
    ----------
    calib_preds, calib_labels : numpy.ndarray
        Binary predictions and labels on the calibration set.
    test_preds : numpy.ndarray
        Binary predictions on the test set.
    alpha : float
        Overall error level; must exceed ``delta``.
    delta : float
        Error level passed to ``wsr_iid``.

    Returns
    -------
    list of int
        ``[count_lower, count_upper]``.

    Raises
    ------
    ValueError
        If ``wsr_iid`` returns no values, leaving nothing to take min/max over.
    """
    N = test_preds.shape[0]
    qfhat = form_qfhat(test_preds)

    # Signed prediction error per calibration point, computed once and reused.
    errors = calib_preds.astype(float) - calib_labels.astype(float)
    bias_estimate = errors.mean()
    # Candidate means for the rescaled error (errors+1)/2; after 2*x-1 this
    # grid spans [-0.04, 0.04].
    grid = np.linspace(0.48,0.52,1000)
    # NOTE(review): presumably wsr_iid returns the grid values it does not
    # reject, so 2*(...)-1 would already be a band for the mean error. If so,
    # adding bias_estimate counts the bias twice, and the band should probably
    # be subtracted from (not added to) the predicted fraction. Confirm against
    # concentration.wsr_iid before changing; arithmetic left as-is here.
    possible_biases = bias_estimate + 2*wsr_iid((errors+1)/2, delta, grid, intersection=False)-1
    if np.size(possible_biases) == 0:
        raise ValueError("wsr_iid returned no candidate values; widen the grid of candidate means")
    # Clip so binom.ppf always receives a valid probability.
    iid_lb = np.clip(qfhat[1] + possible_biases.min(), 0, 1)
    iid_ub = np.clip(qfhat[1] + possible_biases.max(), 0, 1)
    iid_count_lb = int(binom.ppf(alpha-delta, N, iid_lb))
    iid_count_ub = int(binom.ppf(1-(alpha-delta), N, iid_ub))
    return [iid_count_lb, iid_count_ub]

In [49]:
def ppi_label_shift(calib_preds, test_preds, calib_labels, K, alpha, delta):
    """Count interval that corrects the predicted class distribution via the confusion matrix.

    Steps performed:
      1. Build the K x K calibration confusion matrix ``C[j, l]`` (predicted
         ``j``, labelled ``l``) and normalize each column to get ``Ahat``.
      2. Form the point estimate ``nu @ inv(Ahat) @ qfhat``, where ``qfhat`` is
         the predicted class distribution on the test set and ``nu`` is read
         from the enclosing (module-level) scope.
      3. Split ``delta`` between two deviation terms with a fraction ``theta``
         found by root-finding on ``invert_theta``, then widen the point
         estimate by ``epsilon1 + epsilon2`` and clip to [0, 1].
      4. Convert both bounds to counts with binomial quantiles at levels
         ``alpha - delta`` and ``1 - (alpha - delta)``.

    Parameters
    ----------
    calib_preds, calib_labels : numpy.ndarray
        Predicted and true classes on the calibration set, comparable to
        ``0 .. K-1`` with ``==``.
    test_preds : numpy.ndarray
        Predicted classes on the test set.
    K : int
        Number of classes.
    alpha : float
        Overall error level; must exceed ``delta``.
    delta : float
        Error level shared by the two deviation terms.

    Returns
    -------
    list of int
        ``[count_lower, count_upper]``.

    Raises
    ------
    ValueError
        If some class has no calibration labels, since its column of ``C``
        cannot be normalized.
    """
    N = test_preds.shape[0]

    # Confusion counts: rows are predicted class, columns are true class.
    C = np.array(
        [[np.count_nonzero((calib_preds == j) & (calib_labels == l)) for l in range(K)]
         for j in range(K)],
        dtype=int,
    )
    # Calibration points per true class; reused for normalization, nmin and epsilon1.
    label_counts = C.sum(axis=0)
    nmin = label_counts.min()
    if nmin == 0:
        raise ValueError("every class needs at least one calibration label to normalize the confusion matrix")

    # Column-normalized confusion matrix and its inverse.
    Ahat = C / label_counts
    Ahatinv = np.linalg.inv(Ahat)
    qfhat = form_qfhat(test_preds)
    # Point estimate of the target quantity.
    point_estimate = nu@Ahatinv@qfhat

    def invert_theta(theta):
        # Difference between a term that depends on nmin and one that depends
        # on N; its root is the theta at which the two are equal.
        calib_term = np.sqrt(1/(4*nmin))*(norm.ppf(1-(theta*delta)/(2*K**2)) - norm.ppf((theta*delta)/(2*K**2)))
        test_term = np.sqrt(2/N*np.log(2/((1-theta)*delta)))
        return calib_term - test_term

    theta = brentq(invert_theta,1e-9,1-1e-9)
    # NOTE(review): linfty_binom / linfty_dkw come from the project's
    # `concentration` module; presumably they are deviation bounds for the
    # columns of Ahat and for qfhat respectively. Confirm there.
    epsilon1 = max([linfty_binom(label_counts[k], K, theta*delta, Ahat[:,k]) for k in range(K)])
    epsilon2 = linfty_dkw(N,K,(1-theta)*delta)

    qyhat_lb = np.clip(point_estimate - epsilon1 - epsilon2, 0, 1)
    qyhat_ub = np.clip(point_estimate + epsilon1 + epsilon2, 0, 1)

    count_plankton_lb = int(binom.ppf(alpha-delta, N, qyhat_lb))
    count_plankton_ub = int(binom.ppf(1-(alpha-delta), N, qyhat_ub))
    return [count_plankton_lb, count_plankton_ub]

In [50]:
# Naive: interval from the test predictions alone
naive_lb, naive_ub = naiveML(test_preds, alpha, delta)
naive_fraclb = naive_lb/N
naive_fracub = naive_ub/N

# Without label shift: predictions plus a calibration-set bias band
ppi_iid_lb, ppi_iid_ub = ppi_iid(calib_preds, test_preds, calib_labels, alpha, delta)
ppi_iid_fraclb = ppi_iid_lb/N
ppi_iid_fracub = ppi_iid_ub/N

# With label shift: predictions corrected through the confusion matrix
ppi_ls_lb, ppi_ls_ub = ppi_label_shift(calib_preds, test_preds, calib_labels, K, alpha, delta)
ppi_ls_fraclb = ppi_ls_lb/N
ppi_ls_fracub = ppi_ls_ub/N

In [51]:
# Report the ground-truth count next to the three intervals computed above,
# each as raw counts and as a percentage of the N test points.
print(f"The true number of plankton observed in 2014 was {true_count} ({true_count/N*100:.1f}%).")
print(f"The naive baseline for the number of plankton observed in 2014 is [{naive_lb},{naive_ub}] ([{100*naive_fraclb:.1f}%,{100*naive_fracub:.1f}%]).")
print(f"The prediction-powered baseline (no label shift) for the number of plankton observed in 2014 is [{ppi_iid_lb},{ppi_iid_ub}] ([{100*ppi_iid_fraclb:.1f}%,{100*ppi_iid_fracub:.1f}%]).")
print(f"The prediction-powered confidence interval for the number of plankton observed in 2014 is [{ppi_ls_lb},{ppi_ls_ub}] ([{100*ppi_ls_fraclb:.1f}%,{100*ppi_ls_fracub:.1f}%]).")

The true number of plankton observed in 2014 was 23538 (7.1%).
The naive baseline for the number of plankton observed in 2014 is [21011,23136] ([6.4%,7.0%]).
The prediction-powered baseline (no label shift) for the number of plankton observed in 2014 is [19497,20637] ([5.9%,6.3%]).
The prediction-powered confidence interval for the number of plankton observed in 2014 is [13670,23785] ([4.1%,7.2%]).
