In [81]:
import os, time, copy
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
from scipy.optimize import brentq
from scipy.stats import binom
from tqdm import tqdm
from utils import get_train_val_split

In [2]:
# Get data 2006-2014 from the following link: https://darchive.mblwhoilibrary.org/handle/1912/7341
# Unzip and merge the datasets in the following directory
data = np.load('./test-outputs.npz')
n = 100000
shuffle_idx = np.array([1,]*n + [0,]*(data['preds'].shape[0]-n)).astype(bool)
np.random.shuffle(shuffle_idx)
val_preds = data['preds'][shuffle_idx]
val_labels = data['labels'][shuffle_idx]
test_preds = data['preds'][np.bitwise_not(shuffle_idx)]
test_labels = data['labels'][np.bitwise_not(shuffle_idx)]
classes = np.load('./classes.npy')

In [3]:
# Convert to binary
plankton_classes = np.array(np.where(np.isin(classes,['mix','mix_elongated','detritus','bad'],invert=True))[0])
val_preds = np.isin(val_preds, plankton_classes)
val_labels = np.isin(val_labels, plankton_classes)
test_preds = np.isin(test_preds, plankton_classes)
test_labels = np.isin(test_labels, plankton_classes)

In [63]:
test_pred_counts = np.array([test_preds.shape[0]-test_preds.sum(), test_preds.sum()])

In [149]:
n_y1 = val_labels.sum()
n_y0 = val_labels.shape[0]-val_labels.sum()
N = test_preds.shape[0]

n_f0y0 = ((val_labels == 0) & (val_preds == 0)).astype(int).sum()
n_f1y1 = ((val_labels == 1) & (val_preds == 1)).astype(int).sum()
N_f1 = test_preds.sum()
N_f0 = N-N_f1

print(f"val acc among non-plankton: {n_f0y0/n_y0:.3f} | val acc among plankton: {n_f1y1/n_y1:.3f} | predicted fraction plankton: {N_f1/N:.3f}")

val acc among non-plankton: 0.991 | val acc among plankton: 0.655 | predicted fraction plankton: 0.056


In [157]:
# Run MAI, estimating confusion matrix
delta = 0.5

def invert_for_lb_0(r): return binom.cdf(n_f0y0,n_y0,r)-(1-delta/8)
def invert_for_lb_1(r): return binom.cdf(n_f1y1,n_y1,r)-(1-delta/8)
def invert_for_ub_0(r): return binom.cdf(n_f0y0,n_y0,r)-(delta/8)
def invert_for_ub_1(r): return binom.cdf(n_f1y1,n_y1,r)-(delta/8)

def invert_for_lb_f(r): return binom.cdf(N_f1,N,r)-(1-delta/8)
def invert_for_ub_f(r): return binom.cdf(N_f1,N,r)-(delta/8)

c0_lb = brentq(invert_for_lb_0,0,1)
c0_ub = brentq(invert_for_ub_0,0,1)

c1_lb = brentq(invert_for_lb_1,0,1)
c1_ub = brentq(invert_for_ub_1,0,1)

f1_lb = brentq(invert_for_lb_f,0,1)
f1_ub = brentq(invert_for_ub_f,0,1)

A_lb = np.array([[c0_lb, 1-c1_ub], [1-c0_lb, c1_ub]])
A_ub = np.array([[c0_ub, 1-c1_lb], [1-c0_ub, c1_lb]])

qyhat_lb = (np.linalg.inv(A_lb)@np.array([1-f1_lb, f1_lb]))[1]
qyhat_ub = (np.linalg.inv(A_ub)@np.array([1-f1_ub, f1_ub]))[1]

count_plankton_lb = binom.ppf(delta/8, N, qyhat_lb)
count_plankton_ub = binom.ppf(1-delta/8, N, qyhat_ub)

In [161]:
print(f"The model-assisted confidence interval for the number of plankton observed in 2014 is [{int(count_plankton_lb)},{int(count_plankton_ub)}].")
print(f"The true number of plankton observed in 2014 was {test_labels.sum()}, which lies in the interval.")
print(f"The point estimate was {test_preds.sum()}, which does not lie in the interval.")

The model-assisted confidence interval for the number of plankton observed in 2014 is [15823,17486].
The true number of plankton observed in 2014 was 16528, which lies in the interval.
The point estimate was 12860, which does not lie in the interval.
