In [403]:
import os, time, copy
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
from scipy.optimize import brentq
from scipy.stats import binom
from tqdm import tqdm

In [404]:
# Get data 2006-2014 from the following link: https://darchive.mblwhoilibrary.org/handle/1912/7341
# Unzip and merge the datasets in the following directory
calib_data = np.load('../calib-outputs.npz')
test_data = np.load('../test-outputs.npz')
calib_preds = calib_data['preds'].astype(int)
calib_labels = calib_data['labels'].astype(int)
test_preds = test_data['preds'].astype(int)
test_labels = test_data['labels'].astype(int)
classes = np.load('../classes.npy')
num_classes = classes.shape[0]

plankton_classes = np.isin(classes,['mix','mix_elongated','detritus','bad', 'bead', 'bubble', 'other_interaction', 'pollen', 'spore'],invert=True)

In [405]:
# Look at the unique classes 
uq, uq_counts = np.unique(calib_labels, return_counts=True)
uq_freq = uq_counts/uq_counts.sum()
uq_sort = np.argsort(uq_freq)[::-1]
uq_freq = uq_freq[uq_sort]; uq = uq[uq_sort];
uq_cumsum = np.cumsum(uq_freq)
print(np.searchsorted(uq_cumsum,0.97))

12


In [406]:
# Problem setup
alpha = 0.02
K = 10
nu = plankton_classes.astype(int)
nu_trunc = nu[uq]
uq = uq[:K]

In [407]:
# Construct the confusion matrix
n = calib_preds.shape[0]
N = test_preds.shape[0]

C = np.zeros((num_classes,num_classes)).astype(int)
for j in range(num_classes):
    for l in range(num_classes):
        C[j,l] = ((calib_preds == j) & (calib_labels == l)).sum()
Ahat = C / (C.sum(axis=1)[:,None])
Ahat = np.nan_to_num(Ahat)
Ahat_small = Ahat[uq]
Ahat_small = Ahat_small[:,uq]
print(np.array_str(Ahat_small, precision=3))

[[0.903 0.048 0.001 0.015 0.005 0.009 0.001 0.005 0.001 0.   ]
 [0.053 0.749 0.005 0.032 0.022 0.01  0.046 0.016 0.001 0.002]
 [0.014 0.024 0.492 0.008 0.142 0.    0.049 0.061 0.056 0.003]
 [0.014 0.043 0.036 0.643 0.04  0.    0.018 0.035 0.    0.002]
 [0.045 0.042 0.183 0.031 0.41  0.    0.047 0.058 0.091 0.001]
 [0.171 0.009 0.    0.001 0.    0.612 0.    0.    0.    0.   ]
 [0.    0.016 0.134 0.004 0.119 0.    0.458 0.023 0.    0.001]
 [0.013 0.033 0.04  0.033 0.02  0.    0.107 0.633 0.    0.   ]
 [0.011 0.006 0.131 0.    0.177 0.    0.    0.004 0.619 0.   ]
 [0.005 0.02  0.1   0.028 0.063 0.001 0.043 0.004 0.003 0.467]]


  Ahat = C / (C.sum(axis=1)[:,None])


In [408]:
# Construct the point estimate
uq, uq_counts = np.unique(test_preds, return_counts=True)
uq_freq = uq_counts/uq_counts.sum()
uq_sort = np.argsort(uq_freq)[::-1]
uq_freq = uq_freq[uq_sort]; uq = uq[uq_sort];
uq = uq[:K]
qf = uq_freq[:K]
nu1 = nu_trunc[:K]
nu2 = nu_trunc[K:]

In [409]:
# Run MAI
estimate = nu1@np.linalg.inv(Ahat_small)@qf
print(np.linalg.inv(Ahat_small)@qf)
print(estimate)
print(nu1)
#count_plankton_lb = binom.ppf(delta/8, N, qyhat_lb)
#count_plankton_ub = binom.ppf(1-delta/8, N, qyhat_ub)

#counts = N*(np.linalg.pinv(Ahat)@fhat)[plankton_classes].sum()
#print(Ahat.sum(axis=0))

[ 0.938  0.063  0.039 -0.006 -0.109 -0.253  0.025 -0.02   0.009 -0.004]
-0.2098040032301526
[0 0 1 1 0 1 1 1 1 1]


In [385]:
print(f"The model-assisted estimate for the number of plankton observed in 2014 is {counts}.")
print(f"The true number of plankton observed in 2014 was {test_labels.mean()}, which lies in the interval.")
print(f"The point estimate was {test_preds.sum()}, which does not lie in the interval.")

NameError: name 'counts' is not defined