In [1]:
import numpy as np
import json
import pickle
from scipy.special import softmax

In [2]:
# Load the evaluation questions; every metrics section below iterates over `data`.
# The context manager closes the file handle as soon as parsing finishes.
with open('data/temporal_test.json') as data_file:
    data = json.load(data_file)

In [3]:
def brier_score(probabilities, answer_idx):
    """Return the halved Brier score of a single prediction.

    Args:
        probabilities: Per-class scores for one question (array-like).
        answer_idx: Index of the correct class.

    Returns:
        The sum of squared differences between `probabilities` and the
        one-hot encoding of `answer_idx`, divided by 2.
    """
    one_hot = np.zeros_like(probabilities)
    one_hot[answer_idx] = 1
    residual = probabilities - one_hot
    return np.square(residual).sum() / 2

def calib_err(confidence, correct, p='2', beta=50):
    """Calibration error over equal-size bins of confidence-sorted examples.

    Examples are sorted by confidence and cut into consecutive bins of `beta`
    examples; the final bin also absorbs the remainder. For each bin the
    absolute gap between mean confidence and mean correctness is computed, and
    the gaps are combined according to `p`.

    Every bin contributes to the result, including the final one, and inputs
    with fewer than `beta` examples are scored as a single bin.

    Args:
        confidence: Per-example confidence values (sequence or array).
        correct: Per-example correctness indicators aligned with `confidence`.
        p: '1' for the size-weighted mean gap, '2' for the size-weighted
            root-mean-square gap, 'infty' / 'infinity' / 'max' for the
            largest gap.
        beta: Target number of examples per bin.

    Returns:
        The calibration error, or NaN when there are no examples.

    Raises:
        AssertionError: If `p` is not one of the supported values.
        ValueError: If `beta` is smaller than 1.
    """
    assert p in ('1', '2', 'infty', 'infinity', 'max'), "p must be '1', '2', or 'infty'"
    if beta < 1:
        raise ValueError("beta must be a positive bin size")

    confidence = np.asarray(confidence, dtype=float)
    correct = np.asarray(correct, dtype=float)
    total_examples = len(confidence)
    if total_examples == 0:
        # Nothing to bin: the error is undefined rather than zero.
        return float('nan')

    order = np.argsort(confidence)
    confidence = confidence[order]
    correct = correct[order]

    # At least one bin, so short inputs are still scored; the last edge is the
    # total count so the final bin picks up the remainder.
    num_bins = max(total_examples // beta, 1)
    edges = [i * beta for i in range(num_bins)] + [total_examples]

    cerr = 0.0
    for start, stop in zip(edges[:-1], edges[1:]):
        difference = np.abs(
            np.nanmean(confidence[start:stop]) - np.nanmean(correct[start:stop])
        )
        weight = (stop - start) / total_examples

        if p == '2':
            cerr += weight * np.square(difference)
        elif p == '1':
            cerr += weight * difference
        else:
            cerr = np.maximum(cerr, difference)

    if p == '2':
        cerr = np.sqrt(cerr)

    return cerr

## Static metrics

In [11]:
# Checkpoint folder and results file for the static model.
folder = 't5_large_top10_linear_wdecay1e-2_lr5e-5_bs8_ep10_retrbm25ce'
result_file = f'checkpoint/{folder}/results_epoch10.obj'

# NOTE: pickle.load can execute arbitrary code; only load result files from a trusted source.
# The context manager closes the file handle once the object is deserialised.
with open(result_file, 'rb') as results_fh:
    raw_logits = pickle.load(results_fh)

In [6]:
# Index the saved predictions by question id + answer. Each entry of
# `raw_logits` is unpacked as (question id, answer, logits).
# The answer is passed through str() so the key is built the same way as the
# lookup in the static metrics loop (`obj['question_id'] + str(obj['answers'][0])`).
# Distinct names keep the builtin `id` usable in later cells.
id2logits = {
    question_id + str(answer): question_logits
    for question_id, answer, question_logits in raw_logits
}

In [12]:
# Accumulators for the static model. `reg` holds absolute errors on numerical
# questions; the *_conf / *_correct / *_brier lists hold per-question confidence,
# correctness and Brier score for true/false (tf) and multiple-choice (mc) questions.
tf, mc, reg = [], [], []  # `tf` and `mc` are not read in this cell
reg_answers = []
tf_brier, mc_brier = [], []
tf_conf, tf_correct, mc_conf, mc_correct = [], [], [], []

assert len(data) == len(raw_logits)

for i, obj in enumerate(data):
    gold = obj['answers'][0]
    # Predictions are fetched by key rather than by position in `raw_logits`.
    logits = id2logits[obj['question_id'] + str(gold)]

    if gold in ['yes', 'no']:
        # True/false question: two logits, index 0 scored as 'no'.
        # NOTE(review): the temporal cell uses ['yes', 'no'] for the same lookup;
        # confirm which class order each model actually emits.
        assert len(logits) == 2, (len(logits), i)
        probabilities = softmax(logits)
        answer_idx = ['no', 'yes'].index(gold)

        tf_conf.append(probabilities.max())
        tf_correct.append(probabilities.argmax() == answer_idx)
        tf_brier.append(brier_score(probabilities, answer_idx))

    elif isinstance(obj['choices'], dict):
        # Numerical question: one predicted value, scored by absolute error.
        assert len(logits) == 1, len(logits)

        prediction = logits[0]
        reg.append(np.abs(prediction - float(gold)))
        reg_answers.append(prediction)

    else:
        # Multiple-choice question: 12 logits; the answer is a letter counted from 'A'.
        assert len(logits) == 12, len(logits)
        probabilities = softmax(logits)
        answer_idx = ord(gold) - ord('A')

        mc_conf.append(probabilities.max())
        mc_correct.append(probabilities.argmax() == answer_idx)
        mc_brier.append(brier_score(probabilities, answer_idx))



In [5]:
# Summary of the static-model accumulators filled by the loop above.
tf_acc = np.mean(tf_correct)
mc_acc = np.mean(mc_correct)
reg_abs_err = np.mean(reg)

# T/F accuracy, MC accuracy and mean absolute numerical error, each scaled by 100.
print(f"{tf_acc*100:.2f}, {mc_acc*100:.2f}, {reg_abs_err*100:.2f}")
# Combined score: accuracies count positively, numerical error negatively.
print(f"{(tf_acc + mc_acc - reg_abs_err) * 50:.2f}")
# Calibration error for T/F and MC separately, then pooled over both.
print(calib_err(tf_conf, tf_correct), calib_err(mc_conf, mc_correct))
print(f"{calib_err(tf_conf + mc_conf, tf_correct + mc_correct):.2f}")

tf_bs = np.mean(tf_brier)
mc_bs = np.mean(mc_brier)
# Mean Brier scores, then the three-way average with the numerical error.
print(f"{tf_bs*100:.2f}, {mc_bs*100:.2f}")
print(f"{(tf_bs + mc_bs + reg_abs_err) * 100/3:.2f}")

## Temporal metrics

In [7]:
# Checkpoint folder and results file for the temporal model.
folder = 'temporal_t5_large_top1_seqlen64_fixed_wdecay1e-2_lr5e-5_bs8_ep5_retrbm25ce_finetune0_adjusttarget1'
result_file = f'checkpoint/{folder}/results_epoch5.obj'

# NOTE: pickle.load can execute arbitrary code; only load result files from a trusted source.
# The context manager closes the file handle once the object is deserialised.
with open(result_file, 'rb') as results_fh:
    raw_logits = pickle.load(results_fh)

In [9]:
# Fresh accumulators for the temporal model.
tf, mc, reg = [], [], []
tf_brier, mc_brier = [], []
tf_conf, tf_correct = [], []
mc_conf, mc_correct = [], []

assert len(data) == len(raw_logits)

for obj, seq_logits in zip(data, raw_logits):
    # seq_logits is annotated as [SEQ, C]; only the last entry is scored.
    logits = seq_logits[-1]
    gold = obj['answers'][0]

    if gold in ['yes', 'no']:
        # True/false question: index 0 is scored as 'yes' in this cell.
        # NOTE(review): the static cell uses ['no', 'yes'] for the same lookup;
        # confirm which class order each model actually emits.
        assert len(logits) == 2, len(logits)
        probs = softmax(logits)
        gold_idx = ['yes', 'no'].index(gold)

        tf_conf.append(np.max(probs))
        tf_correct.append(np.argmax(probs) == gold_idx)
        tf_brier.append(brier_score(probs, gold_idx))
        continue

    if type(obj['choices']) is dict:
        # Numerical question: absolute error between the prediction and the answer.
        reg.append(np.abs(logits - float(gold)))
        continue

    # Multiple-choice question: 12 logits; the answer is a letter counted from 'A'.
    assert len(logits) == 12, len(logits)
    probs = softmax(logits)
    gold_idx = ord(gold) - ord('A')

    mc_conf.append(np.max(probs))
    mc_correct.append(np.argmax(probs) == gold_idx)
    mc_brier.append(brier_score(probs, gold_idx))


In [4]:
# Summary of the temporal-model accumulators filled by the loop above.
tf_acc = np.mean(tf_correct)
mc_acc = np.mean(mc_correct)
reg_abs_err = np.mean(reg)

# T/F accuracy, MC accuracy and mean absolute numerical error, each scaled by 100.
print(f"{tf_acc*100:.2f}, {mc_acc*100:.2f}, {reg_abs_err*100:.2f}")
# Combined score: accuracies count positively, numerical error negatively.
print(f"{(tf_acc + mc_acc - reg_abs_err) * 50:.2f}")
# Calibration error for T/F and MC separately, then pooled over both.
print(calib_err(tf_conf, tf_correct), calib_err(mc_conf, mc_correct))
print(f"{calib_err(tf_conf + mc_conf, tf_correct + mc_correct):.2f}")

tf_bs = np.mean(tf_brier)
mc_bs = np.mean(mc_brier)
# Mean Brier scores, then the three-way average with the numerical error.
print(f"{tf_bs*100:.2f}, {mc_bs*100:.2f}")
print(f"{(tf_bs + mc_bs + reg_abs_err) * 100/3:.2f}")

## Zero-shot metrics

In [24]:
# Results file for the zero-shot UnifiedQA model of the chosen size.
model_size = '3b'
result_file = f'../results/unifiedqa_{model_size}_results.obj'

# NOTE: pickle.load can execute arbitrary code; only load result files from a trusted source.
# The context manager closes the file handle once the object is deserialised.
with open(result_file, 'rb') as results_fh:
    raw_logits = pickle.load(results_fh)

In [25]:
# Fresh accumulators for the zero-shot model.
tf, mc, reg = [], [], []  # `tf` and `mc` are not read in this cell
tf_brier, mc_brier = [], []
tf_conf, tf_correct, mc_conf, mc_correct = [], [], [], []

# Seeded generator so the random numerical baseline below gives the same
# numbers on every run of this cell.
rng = np.random.default_rng(0)

assert len(data) == len(raw_logits)

for obj, logits in zip(data, raw_logits):
    gold = obj['answers'][0]

    if gold in ['yes', 'no']:
        # True/false question: two logits, index 0 scored as 'no'.
        assert len(logits) == 2, len(logits)
        probabilities = softmax(logits)
        answer_idx = ['no', 'yes'].index(gold)

        tf_conf.append(probabilities.max())
        tf_correct.append(probabilities.argmax() == answer_idx)
        tf_brier.append(brier_score(probabilities, answer_idx))

    elif isinstance(obj['choices'], dict):
        # Numerical question: the model provides no logits here, so a uniform
        # random draw from [0, 1) stands in for the prediction.
        assert len(logits) == 0, len(logits)

        reg.append(np.abs(rng.uniform() - float(gold)))

    else:
        # Multiple-choice question. The 12-logit length check used in the other
        # sections is disabled here (presumably the zero-shot outputs vary in
        # length; TODO confirm).
        probabilities = softmax(logits)
        answer_idx = ord(gold) - ord('A')

        mc_conf.append(probabilities.max())
        mc_correct.append(probabilities.argmax() == answer_idx)
        mc_brier.append(brier_score(probabilities, answer_idx))


In [3]:
# Summary of the zero-shot accumulators filled by the loop above.
tf_acc = np.mean(tf_correct)
mc_acc = np.mean(mc_correct)
reg_abs_err = np.mean(reg)

# T/F accuracy, MC accuracy and mean absolute numerical error, each scaled by 100.
print(f"{tf_acc*100:.2f}, {mc_acc*100:.2f}, {reg_abs_err*100:.2f}")
# Combined score: accuracies count positively, numerical error negatively.
print(f"{(tf_acc + mc_acc - reg_abs_err) * 50:.2f}")
# Calibration error for T/F and MC separately, then pooled over both.
print(calib_err(tf_conf, tf_correct), calib_err(mc_conf, mc_correct))
print(f"{calib_err(tf_conf + mc_conf, tf_correct + mc_correct):.2f}")

tf_bs = np.mean(tf_brier)
mc_bs = np.mean(mc_brier)
# Mean Brier scores, then the three-way average with the numerical error.
print(f"{tf_bs*100:.2f}, {mc_bs*100:.2f}")
print(f"{(tf_bs + mc_bs + reg_abs_err) * 100/3:.2f}")

## Get crowd metrics

In [33]:
# Fresh accumulators for the crowd forecasts stored in the dataset itself.
tf, mc, reg = [], [], []
tf_brier, mc_brier = [], []
tf_conf, tf_correct = [], []
mc_conf, mc_correct = [], []
temp = []

for obj in data:
    gold = obj['answers'][0]

    if gold in ['yes', 'no']:
        # True/false question: score the last recorded target as [1 - p, p].
        # NOTE(review): index 0 is scored as 'yes' but holds 1 - p; confirm
        # whether the stored target is the probability of 'yes' or of 'no'.
        gold_idx = ['yes', 'no'].index(gold)
        last_target = float(obj['targets'][-1]['target'])
        probs = np.array([1 - last_target, last_target])

        tf_conf.append(np.max(probs))
        tf_correct.append(np.argmax(probs) == gold_idx)
        tf_brier.append(brier_score(probs, gold_idx))
        continue

    if type(obj['choices']) is dict:
        # Numerical question: clamp the last target into [0, 1] before scoring.
        answer = float(gold)
        last_target = float(obj['targets'][-1]['target'])
        clamped = max(min(last_target, 1), 0)

        reg.append(np.abs(clamped - answer))
        continue

    # Multiple-choice question: keep one target entry per listed choice.
    gold_idx = ord(gold) - ord('A')
    num_choices = len(obj['choices'])
    probs = np.array(list(map(float, obj['targets'][-1]['target'][:num_choices])))

    mc_conf.append(np.max(probs))
    mc_correct.append(np.argmax(probs) == gold_idx)
    mc_brier.append(brier_score(probs, gold_idx))


In [2]:
# Summary of the crowd-forecast accumulators filled by the loop above.
tf_acc = np.mean(tf_correct)
mc_acc = np.mean(mc_correct)
reg_abs_err = np.mean(reg)

# T/F accuracy, MC accuracy and mean absolute numerical error, each scaled by 100.
print(f"{tf_acc*100:.2f}, {mc_acc*100:.2f}, {reg_abs_err*100:.2f}")
# Combined score: accuracies count positively, numerical error negatively.
print(f"{(tf_acc + mc_acc - reg_abs_err) * 50:.2f}")
# Calibration error for T/F and MC separately, then pooled over both.
print(calib_err(tf_conf, tf_correct), calib_err(mc_conf, mc_correct))
print(f"{calib_err(tf_conf + mc_conf, tf_correct + mc_correct):.2f}")

tf_bs = np.mean(tf_brier)
mc_bs = np.mean(mc_brier)
# Mean Brier scores, then the three-way average with the numerical error.
print(f"{tf_bs*100:.2f}, {mc_bs*100:.2f}")
print(f"{(tf_bs + mc_bs + reg_abs_err) * 100/3:.2f}")

## Random Baseline

In [8]:
# Fresh accumulators for the random baseline.
tf, mc, reg = [], [], []  # `tf` and `mc` are not read in this cell
tf_brier, mc_brier = [], []
tf_conf, tf_correct, mc_conf, mc_correct = [], [], [], []
temp = []  # not read in this cell

# Seeded generator so the baseline numbers are the same on every run.
rng = np.random.default_rng(0)

for obj in data:
    gold = obj['answers'][0]

    if gold in ['yes', 'no']:
        # True/false question: a single uniform draw p scored as [1 - p, p].
        answer_idx = ['yes', 'no'].index(gold)

        p = rng.random()
        probabilities = np.array([1 - p, p])

        tf_conf.append(probabilities.max())
        tf_correct.append(probabilities.argmax() == answer_idx)
        tf_brier.append(brier_score(probabilities, answer_idx))

    elif isinstance(obj['choices'], dict):
        # Numerical question: the draw is already in [0, 1), so no clamping is needed.
        reg.append(np.abs(rng.random() - float(gold)))

    else:
        # Multiple-choice question: one uniform draw per choice, normalised to sum to 1.
        answer_idx = ord(gold) - ord('A')

        probabilities = rng.uniform(size=len(obj['choices']))
        probabilities /= probabilities.sum()

        mc_conf.append(probabilities.max())
        mc_correct.append(probabilities.argmax() == answer_idx)
        mc_brier.append(brier_score(probabilities, answer_idx))


In [1]:
# Summary of the random-baseline accumulators filled by the loop above.
tf_acc = np.mean(tf_correct)
mc_acc = np.mean(mc_correct)
reg_abs_err = np.mean(reg)

# T/F accuracy, MC accuracy and mean absolute numerical error, each scaled by 100.
print(f"{tf_acc*100:.2f}, {mc_acc*100:.2f}, {reg_abs_err*100:.2f}")
# Combined score: accuracies count positively, numerical error negatively.
print(f"{(tf_acc + mc_acc - reg_abs_err) * 50:.2f}")
# Calibration error for T/F and MC separately, then pooled over both.
print(calib_err(tf_conf, tf_correct), calib_err(mc_conf, mc_correct))
print(f"{calib_err(tf_conf + mc_conf, tf_correct + mc_correct):.2f}")

tf_bs = np.mean(tf_brier)
mc_bs = np.mean(mc_brier)
# Mean Brier scores; the final average here covers T/F and MC only.
print(f"{tf_bs*100:.2f}, {mc_bs*100:.2f}")
print(f"{(tf_bs + mc_bs) * 100/2:.2f}")