# Check results of baseline uncertainty estimation methods

In [None]:
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2025 Apple Inc. All Rights Reserved.
#

In [34]:
import pandas as pd
import json
import os
import numpy as np
from sklearn.metrics import roc_auc_score

def readjsonl(datapath):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        datapath: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(datapath, "r", encoding="utf-8") as f:
        # Iterate the handle lazily instead of materialising every line, and
        # skip whitespace-only lines (e.g. a trailing blank line), which
        # json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]


In [None]:
def compute_perplexity_auroc(
    _response_df,
    uncertainty_type='perplexity',
    task_type='Controlled_Easy',
    verbose=True,
):
    """Build the aligned label/score arrays used to compute AUROC for one score column.

    Despite its name, this function does not compute the AUROC itself: it
    returns the binary ground-truth labels and the matching score values so
    that the caller can pass them to an AUROC routine.

    Args:
        _response_df: DataFrame with a 'following_label' column and a column
            named after ``uncertainty_type`` whose values can be converted
            with ``float``.
        uncertainty_type: Name of the score column. Supported values are
            'verbalized_confidence', 'normalized_p_true', 'p_true' (higher is
            better) and 'perplexity', 'entropy', 'maximum_seq_prob' (lower is
            better).
        task_type: One of 'Controlled_Easy', 'Controlled_Hard', 'Realistic'.
            It selects which two 'following_label' values are compared; rows
            carrying any other label are ignored.
        verbose: When true (the default), print the per-label scores and the
            two returned arrays.

    Returns:
        A ``(gt_labels_list, binary_uncertainty_list)`` tuple of equally long
        NumPy arrays with NaN scores removed. For higher-is-better scores the
        label is 1 for 'correct' rows and 0 otherwise; for lower-is-better
        scores the label is 0 for 'correct' rows and 1 otherwise.

    Raises:
        ValueError: If ``task_type`` or ``uncertainty_type`` is not supported.
    """
    labels_by_task = {
        'Controlled_Easy': ['correct', 'incorrect'],
        'Controlled_Hard': ['correct', 'subtle_off'],
        'Realistic': ['correct', 'incorrect'],
    }
    higher_is_better = ('verbalized_confidence', 'normalized_p_true', 'p_true')
    lower_is_better = ('perplexity', 'entropy', 'maximum_seq_prob')

    if task_type not in labels_by_task:
        raise ValueError('task_type should be one of [Controlled_Easy, Controlled_Hard, Realistic]')
    label_list = labels_by_task[task_type]

    # Fail early with a clear message: an unknown score type would otherwise
    # produce no labels at all and only fail later inside np.concatenate.
    if uncertainty_type in higher_is_better:
        correct_is_positive = True
    elif uncertainty_type in lower_is_better:
        correct_is_positive = False
    else:
        raise ValueError(
            'uncertainty_type should be one of '
            f'{list(higher_is_better + lower_is_better)}, got {uncertainty_type!r}'
        )

    # Scores of the selected column, grouped by following label.
    dic = {}
    for following_label in label_list:
        rows = _response_df[_response_df['following_label'] == following_label]
        dic[following_label] = rows[uncertainty_type].apply(float)

    # Orient the positive class so that a larger score predicts label 1.
    gt_chunks = []
    score_chunks = []
    for label in label_list:
        is_correct = label == 'correct'
        positive = int(is_correct) if correct_is_positive else int(not is_correct)
        gt_chunks.append(np.full(len(dic[label]), positive, dtype=int))
        score_chunks.append(dic[label].to_numpy(dtype=float))
    gt_labels_list = np.concatenate(gt_chunks)
    binary_uncertainty_list = np.concatenate(score_chunks)

    # Drop entries whose score is NaN, keeping labels and scores aligned.
    valid_mask = ~np.isnan(binary_uncertainty_list)
    gt_labels_list = gt_labels_list[valid_mask]
    binary_uncertainty_list = binary_uncertainty_list[valid_mask]

    if verbose:
        print(dic, gt_labels_list, binary_uncertainty_list)

    return gt_labels_list, binary_uncertainty_list

## Check all uncertainty types

In [None]:
# Evaluation sweep: for every baseline uncertainty score, report the AUROC
# separately for each instruction category.
MODEL = 'Llama-2-7b-chat-hf'  # Llama-2-7b-chat-hf Phi-3-mini-128k-instruct Mistral-7B-Instruct-v0.3
task_type = 'Controlled_Easy'
data_path = ''  # Root directory holding the per-model evaluation outputs.

data_type = 'controlled_ver' if 'Cont' in task_type else 'reality_ver'
# os.path.join keeps the path relative when data_path is left empty instead
# of silently pointing at the filesystem root.
task_path = os.path.join(data_path, MODEL, data_type)
response_path = os.path.join(task_path, "all_eval_response_and_baseline.jsonl")
_response_df = pd.DataFrame(readjsonl(response_path))

all_inst = ['startend', 'detectable_content', 'detectable_format', 'language', 'change_case', 'keywords', 'length_constraints', 'punctuation']

# Category of each response = prefix (before ':') of its first instruction id.
# It does not depend on the uncertainty type, so compute it once up front.
instruction_categories = _response_df['instruction_id_list'].apply(
    lambda ids: ids[0].split(':')[0]
)

for uncertainty_type in ['perplexity', 'entropy', 'normalized_p_true', 'maximum_seq_prob', 'verbalized_confidence', 'p_true']:
    print()
    print('------', uncertainty_type, '------')
    for inst in all_inst:
        # Select only the rows of the current category. The selection is
        # rebuilt for every category so that rows from previously processed
        # categories do not leak into this category's AUROC.
        _response_df_inst = _response_df[instruction_categories == inst]

        gt_labels_list, binary_uncertainty_list = compute_perplexity_auroc(_response_df_inst, uncertainty_type, task_type)

        # Mask inf scores, which roc_auc_score cannot rank.
        inf_mask = ~np.isinf(binary_uncertainty_list)
        gt_labels_list = gt_labels_list[inf_mask]
        binary_uncertainty_list = binary_uncertainty_list[inf_mask]

        print(inst)
        if np.unique(gt_labels_list).size < 2:
            # AUROC is not defined unless both classes are present; report
            # that instead of letting one category abort the whole sweep.
            print('AUROC undefined: fewer than two classes present')
        else:
            roc = roc_auc_score(gt_labels_list, binary_uncertainty_list)
            print(roc)
        print()