In [2]:
import pandas as pd
import os
from typing import Optional, List, Tuple
from huggingface_hub import snapshot_download

from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
from tqdm.notebook import tqdm
from vllm.lora.request import LoRARequest
tqdm.pandas(desc='pandas bar')
import random


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"

### Merging Specific LoRA

In [6]:
import os
import shutil

def delete_checkpoints(directory):
    """遍历指定目录，删除所有名字包含'checkpoint'的子文件夹。"""
    for root, dirs, files in os.walk(directory, topdown=False):
        # 遍历目录
        for name in dirs:
            if 'checkpoint' in name:
                # 构建完整的文件夹路径
                full_path = os.path.join(root, name)
                # 删除文件夹
                shutil.rmtree(full_path)
                print(f"已删除: {full_path}")

# 使用示例
directory_path = 'lora_weight'
delete_checkpoints(directory_path)

已删除: lora_weight/expert/CTA_SimTab_train_init/checkpoint-50
已删除: lora_weight/expert/CTA_SimTab_train_init/checkpoint-100
已删除: lora_weight/expert/CTA_SimTab_train_init/checkpoint-150
已删除: lora_weight/expert/CTA_WebTable_train_init/checkpoint-50
已删除: lora_weight/expert/CTA_WebTable_train_init/checkpoint-100
已删除: lora_weight/expert/CTA_WebTable_train_init/checkpoint-150
已删除: lora_weight/expert/CTA_WebTable_train_init/checkpoint-200
已删除: lora_weight/expert/CTA_WebTable_train_init/checkpoint-250
已删除: lora_weight/expert/CTA_WebTable_train_init/checkpoint-300
已删除: lora_weight/expert/CTA_WebTable_train_init/checkpoint-350
已删除: lora_weight/expert/CTA_WebTable_train_init/checkpoint-400
已删除: lora_weight/expert/CTA_WebTable_train_init/checkpoint-450
已删除: lora_weight/expert/CTA_WebTable_train_init/checkpoint-500


In [3]:
from transformers import AutoModelForCausalLM
import torch
# 加载基础模型
from peft import PeftModel
base_model = AutoModelForCausalLM.from_pretrained("/data/home/wangys/LLAMA-backup/LLaMA-Factory-main/Mistral-7B-Instruct-v0.2",device_map='cpu') 

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
peft_model_id = "lora_weight/expert/amazon_google-MoE-Add"
model = PeftModel.from_pretrained(base_model, model_id=peft_model_id,adapter_name='amazon-google')

In [None]:
model.load_adapter("lora_weight/expert/amazon_train-MoE-Add", adapter_name="amazon")
model.load_adapter("lora_weight/expert/ant_buy-MoE-Add", adapter_name="ant-buy")
model.load_adapter("lora_weight/expert/CMS_train-MoE-Add", adapter_name="CMS")
model.load_adapter("lora_weight/expert/restaurant_train-MoE-Add", adapter_name="restaurant")
model.load_adapter("lora_weight/expert/semi_text_c-MoE-Add", adapter_name="semi-text-c")
model.load_adapter("lora_weight/expert/semi_text_w-MoE-Add", adapter_name="semi-text-w")
model.load_adapter("lora_weight/expert/walmart_amazon-MoE-Add", adapter_name="walmart-amazon")
model.load_adapter("lora_weight/expert/walmart_train-MoE-Add", adapter_name="walmart")
model.load_adapter("lora_weight/expert/wdc_all-MoE-Add", adapter_name="wdc-all")
model.load_adapter("lora_weight/expert/CTA_SimTab_train_init", adapter_name="SimTab")
model.load_adapter("lora_weight/expert/CTA_WebTable_train_init", adapter_name="WebTable")
model.load_adapter("lora_weight/expert/hospital_train-MoE-Add", adapter_name="hospital")
model.load_adapter("lora_weight/expert/beer_train-MoE-Add", adapter_name="beer")
model.load_adapter("lora_weight/expert/rayyan_train-MoE-Add", adapter_name="rayyan")
model.load_adapter("lora_weight/expert/RE-MoE-Add", adapter_name="RE")
model.load_adapter("lora_weight/expert/synthea_train-MoE-Add", adapter_name="synthea")

In [6]:
model.add_weighted_adapter(
    adapters=["WebTable", "semi-text-w"],
    weights=[1, 1],
    adapter_name="Mistral|webtable-MoE-CT#Mistral|SimTab-MoE-CT",
    combination_type="cat"
)
model_id_merge = 'Mistral|webtable-MoE-CT#Mistral|SimTab-MoE-CT'
model.save_pretrained(model_id = model_id_merge,save_directory='lora_weight/merge/%s' % model_id_merge) ## Save the Merged File

In [None]:
CUDA_VISIBLE_DEVICES=7 python semantic_uncertainty/generate_answers.py --model_name=/data/home/wangys/model/llama3-8b-instruct --dataset=trivia_qa

### Initialize vLLM multi-lora


In [3]:
from typing import Optional, List, Tuple

from huggingface_hub import snapshot_download

from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
from vllm.lora.request import LoRARequest
def initialize_engine() -> LLMEngine:
    """Initialize the LLMEngine."""
    # max_loras: controls the number of LoRAs that can be used in the same
    #   batch. Larger numbers will cause higher memory usage, as each LoRA
    #   slot requires its own preallocated tensor.
    # max_lora_rank: controls the maximum supported rank of all LoRAs. Larger
    #   numbers will cause higher memory usage. If you know that all LoRAs will
    #   use the same rank, it is recommended to set this as low as possible.
    # max_cpu_loras: controls the size of the CPU LoRA cache.
    engine_args = EngineArgs(model="/data/home/wangys/LLAMA-backup/LLaMA-Factory-main/Mistral-7B-Instruct-v0.2",
                             enable_lora=True,
                             max_loras=32,
                             max_lora_rank=64,
                             max_cpu_loras=32,
                             max_num_seqs=256,enforce_eager=True,tensor_parallel_size=4,
                             disable_log_stats=True)
    return LLMEngine.from_engine_args(engine_args)
model = initialize_engine()

INFO 09-02 00:33:29 config.py:813] Defaulting to use mp for distributed inference
INFO 09-02 00:33:29 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='/data/home/wangys/LLAMA-backup/LLaMA-Factory-main/Mistral-7B-Instruct-v0.2', speculative_config=None, tokenizer='/data/home/wangys/LLAMA-backup/LLaMA-Factory-main/Mistral-7B-Instruct-v0.2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False

[W902 00:33:30.780543017 socket.cpp:697] [c10d] The client socket cannot be initialized to connect to [localhost]:54223 (errno: 97 - Address family not supported by protocol).
[W902 00:33:30.800237946 socket.cpp:697] [c10d] The client socket cannot be initialized to connect to [localhost]:54223 (errno: 97 - Address family not supported by protocol).
[W902 00:33:30.848398666 socket.cpp:697] [c10d] The client socket cannot be initialized to connect to [localhost]:54223 (errno: 97 - Address family not supported by protocol).
[W902 00:33:30.900539567 socket.cpp:697] [c10d] The client socket cannot be initialized to connect to [localhost]:54223 (errno: 97 - Address family not supported by protocol).


INFO 09-02 00:33:30 custom_all_reduce_utils.py:234] reading GPU P2P access cache from /home/wangys/.cache/vllm/gpu_p2p_access_cache_for_4,5,6,7.json
[1;36m(VllmWorkerProcess pid=3314750)[0;0m [1;36m(VllmWorkerProcess pid=3314749)[0;0m [1;36m(VllmWorkerProcess pid=3314751)[0;0m INFO 09-02 00:33:30 custom_all_reduce_utils.py:234] reading GPU P2P access cache from /home/wangys/.cache/vllm/gpu_p2p_access_cache_for_4,5,6,7.json
INFO 09-02 00:33:30 custom_all_reduce_utils.py:234] reading GPU P2P access cache from /home/wangys/.cache/vllm/gpu_p2p_access_cache_for_4,5,6,7.json
INFO 09-02 00:33:30 custom_all_reduce_utils.py:234] reading GPU P2P access cache from /home/wangys/.cache/vllm/gpu_p2p_access_cache_for_4,5,6,7.json
INFO 09-02 00:33:30 shm_broadcast.py:235] vLLM message queue communication handle: Handle(connect_ip='127.0.0.1', local_reader_ranks=[1, 2, 3], buffer=<vllm.distributed.device_communicators.shm_broadcast.ShmRingBuffer object at 0x7ff0ad796180>, local_subscribe_port=603

[1;36m(VllmWorkerProcess pid=3314750)[0;0m   state = torch.load(bin_file, map_location="cpu")
[1;36m(VllmWorkerProcess pid=3314751)[0;0m   state = torch.load(bin_file, map_location="cpu")
  state = torch.load(bin_file, map_location="cpu")
[1;36m(VllmWorkerProcess pid=3314749)[0;0m 

Loading pt checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


  state = torch.load(bin_file, map_location="cpu")


[1;36m(VllmWorkerProcess pid=3314750)[0;0m INFO 09-02 00:33:39 model_runner.py:890] Loading model weights took 3.3975 GB
INFO 09-02 00:33:39 model_runner.py:890] Loading model weights took 3.3975 GB
[1;36m(VllmWorkerProcess pid=3314751)[0;0m INFO 09-02 00:33:39 model_runner.py:890] Loading model weights took 3.3975 GB
[1;36m(VllmWorkerProcess pid=3314749)[0;0m INFO 09-02 00:33:39 model_runner.py:890] Loading model weights took 3.3975 GB
INFO 09-02 00:33:44 distributed_gpu_executor.py:56] # GPU blocks: 121115, # CPU blocks: 8192


In [1]:
MoE_list_update_top_2 = pd.read_csv('/data/home/wangys/MoE-Example/Router/MoE_list_update_top_2.csv',index_col=0)

NameError: name 'pd' is not defined

In [25]:
MoE_list_update_top_2.iloc[1]

pos               ['Mistral|amazon_google-MoE-CT', 'Mistral|semi...
neg                                                              []
expert                                 Mistral|amazon_google-MoE-CT
query             You are an expert in detecting if two text des...
expert_predict    ['Mistral|amazon_google-MoE-CT', 'Mistral|SimT...
domain                                 Mistral|amazon_google-MoE-CT
cross-dataset     [Mistral|SimTab-MoE-CT, Mistral|semi_text_w-Mo...
cross-task          [Mistral|SimTab-MoE-CT, Mistral|walmart-MoE-CT]
Name: 1, dtype: object

In [None]:
def AST(row):
    CD = row['cross-dataset']
    row['cross-dataset'] = eval(CD)
    CT = row['cross-task']
    row['cross-task'] = eval(CT)
    expert = row['expert_predict']
    row['expert_predict'] = eval(expert)
    return row
MoE_list_update_top_2 = MoE_list_update_top_2.progress_apply(AST,axis=1)
expert_list = []
for index,row in MoE_list_update_top_2.iterrows():
    # expert_list.append(set(row['cross-dataset'])) if set(row['cross-dataset']) not in expert_list else None
    # expert_list.append(set(row['cross-task'])) if set(row['cross-task']) not in expert_list else None
    expert_list.append(set(row['expert_predict'])) if set(row['expert_predict']) not in expert_list else None

In [18]:
lora_path_dict = {}
evaluate_task = 'expert_predict'
for e in expert_list:
    expert_0,expert_1 = list(e)
    folder_path = 'lora_weight/merge/%s#%s' % (expert_0,expert_1)
    folder_path_rev = 'lora_weight/merge/%s#%s' % (expert_1,expert_0)

    if(os.path.exists(folder_path)):
        lora_path_dict['%s#%s' % (expert_0,expert_1)] = folder_path
        lora_path_dict['%s#%s' % (expert_1,expert_0)] = folder_path
    elif(os.path.exists(folder_path_rev)):
        lora_path_dict['%s#%s' % (expert_0,expert_1)] = folder_path_rev
        lora_path_dict['%s#%s' % (expert_1,expert_0)] = folder_path_rev
    else:
        print(folder_path)
def create_multi_lora_call(df,lora_id_list=['']):
    multi_lora_call = []
    for index,row in df.iterrows():
        # lora_id = '#'.join(row['cross-task'])
        lora_id = '#'.join(row[evaluate_task])
        multi_lora_call.append([row['query'],lora_id,index])
    return multi_lora_call
def create_test_prompts(multi_lora_call: list,lora_path: dict)-> List[Tuple[str, SamplingParams]]:
    output_list = []
    lora_all = list(lora_path.keys())
    for m in multi_lora_call:
        m_output = ("[INST] %s [/INST]" % m[0],
         SamplingParams(temperature=0.0,
                        top_p=1,
                        # prompt_logprobs=1,
                        max_tokens=512),
         LoRARequest(m[1], lora_all.index(m[1]) + 1, lora_path[m[1]]),
         m[2])
        output_list.append(m_output)
    return output_list
lora_id_list=list(lora_path_dict.keys())


In [19]:
from datetime import datetime
from tqdm import tqdm

def process_requests(engine: LLMEngine,
                     test_prompts: List[Tuple[str, SamplingParams,
                                              Optional[LoRARequest]]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0
    output_list = []
    output_request = []
    pbar = tqdm(total=len(test_prompts))  # 初始化tqdm进度条
    while test_prompts or engine.has_unfinished_requests():
        
        if test_prompts:
            prompt, sampling_params, lora_request, index = test_prompts.pop(0)
            engine.add_request(str(index),
                               prompt,
                               sampling_params,
                               lora_request=lora_request)
            request_id += 1
            pbar.update(1)  # 更新进度条
        request_outputs: List[RequestOutput] = engine.step()
        for request_output in request_outputs:
            if request_output.finished:
                output_list.append(request_output)
    pbar.close()  # 关闭进度条
    return output_list

# 示例使用
lora_id_list = list(lora_path_dict.keys())
multi_lora_call = create_multi_lora_call(MoE_list_update_top_2.iloc[:2000], lora_id_list=lora_id_list)
test_prompts_input = create_test_prompts(multi_lora_call=multi_lora_call, lora_path=lora_path_dict)
start_time = datetime.now()
result_all = process_requests(model, test_prompts=test_prompts_input)
end_time = datetime.now()
print((end_time - start_time).total_seconds())

  LoRARequest(m[1], lora_all.index(m[1]) + 1, lora_path[m[1]]),
100%|██████████| 2000/2000 [03:17<00:00, 10.15it/s]

197.08821





In [20]:
# print(result_all[0].prompt)
# result_all[33]
output_ins = {}
output_predict = {}
for lora_id in lora_id_list:
    # output_ins[lora_id] = [''] * int(len(result_all) / len(lora_path_dict))
    # output_predict[lora_id] = [''] * int(len(result_all) / len(lora_path_dict))
    output_ins[lora_id] = [''] * int(len(result_all) )
    output_predict[lora_id] = [''] * int(len(result_all) )
# output_lora_id = [''] * len(result_all)
for request in result_all:
    request_id = int(request.request_id)
    request_ins = request.prompt.strip()
    request_lora = request.lora_request.lora_name
    request_output = request.outputs[0].text.strip()
    output_ins[request_lora][request_id] = request_ins
    output_predict[request_lora][request_id] = request_output
    # output_lora_id[request_id] = request_lora

In [21]:
dict_output_MoE = {}
for result in result_all:
    prompt = result.prompt.replace('[INST] ','').replace(' [/INST]','')
    output = result.outputs[0].text.strip()
    dict_output_MoE[prompt] = output

In [31]:
import numpy as np
np.save('tests/MoE_result_2000.npy',dict_output_MoE)

In [22]:
MoE_list_update_top_2_few = MoE_list_update_top_2.iloc[:2000]

In [23]:
MoE_list_update_top_2_few['prediction'] = MoE_list_update_top_2_few['query'].map(dict_output_MoE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MoE_list_update_top_2_few['prediction'] = MoE_list_update_top_2_few['query'].map(dict_output_MoE)


In [None]:
{'performance': {'accuracy': {'mean': 0.66, 'bootstrap': {'std_err': 0.024141193860101266, 'low': 0.62, 'high': 0.6975}}}, 'uncertainty': {'p_false': {'AUROC': {'mean': 0.7035984848484848, 'bootstrap': {'std_err': 0.02868401073143026, 'low': 0.653203966827957, 'high': 0.7460714715683334}}, 'area_under_thresholded_accuracy': {'mean': 0.7292793838384062, 'bootstrap': {'std_err': 0.02544652776737242, 'low': 0.6854210115670789, 'high': 0.7696289034003679}}, 'mean_uncertainty': {'mean': 1.369315084692098, 'bootstrap': {'std_err': 0.042032147321903794, 'low': 1.3005711938199878, 'high': 1.4418123183850153}}, 'accuracy_at_0.8_answer_fraction': {'mean': 0.734375, 'bootstrap': {'std_err': 0.025600775278244323, 'low': 0.6934984520123839, 'high': 0.7770833249833019}}, 'accuracy_at_0.9_answer_fraction': {'mean': 0.6805555555555556, 'bootstrap': {'std_err': 0.02436561822303607, 'low': 0.6371191135734072, 'high': 0.7166666666666667}}, 'accuracy_at_0.95_answer_fraction': {'mean': 0.6763157894736842, 'bootstrap': {'std_err': 0.02443786481709613, 'low': 0.6394736842105263, 'high': 0.718421052631579}}, 'accuracy_at_1.0_answer_fraction': {'mean': 0.66, 'bootstrap': {'std_err': 0.02298985378655391, 'low': 0.6225, 'high': 0.6975}}}, 'p_false_UNANSWERABLE': {'AUROC': {'mean': nan, 'bootstrap': {'std_err': nan, 'low': nan, 'high': nan}}, 'area_under_thresholded_accuracy': {'mean': 0.9473684210526319, 'bootstrap': {'std_err': 0.0, 'low': 0.9473684210526319, 'high': 0.9473684210526319}}, 'mean_uncertainty': {'mean': 1.369315084692098, 'bootstrap': {'std_err': 0.04231973671562589, 'low': 1.3071708831006759, 'high': 1.450182662198471}}, 'accuracy_at_0.8_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_0.9_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_0.95_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_1.0_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}}, 'p_false_fixed': {'AUROC': {'mean': 0.7035984848484848, 'bootstrap': {'std_err': 0.029221055564121294, 'low': 0.6516528019465964, 'high': 0.7483264771615535}}, 'area_under_thresholded_accuracy': {'mean': 0.7292793838384062, 'bootstrap': {'std_err': 0.024442589921248144, 'low': 0.6877122458301619, 'high': 0.7656358315815655}}, 'mean_uncertainty': {'mean': 0.1643655148000058, 'bootstrap': {'std_err': 0.014291699531727502, 'low': 0.14193933071971335, 'high': 0.19023172928351054}}, 'accuracy_at_0.8_answer_fraction': {'mean': 0.734375, 'bootstrap': {'std_err': 0.02662937236491397, 'low': 0.6930013488514927, 'high': 0.78125}}, 'accuracy_at_0.9_answer_fraction': {'mean': 0.6805555555555556, 'bootstrap': {'std_err': 0.026021578899907926, 'low': 0.631578947368421, 'high': 0.7194444444444444}}, 'accuracy_at_0.95_answer_fraction': {'mean': 0.6763157894736842, 'bootstrap': {'std_err': 0.024189868065330418, 'low': 0.6377952755905512, 'high': 0.7165354330708661}}, 'accuracy_at_1.0_answer_fraction': {'mean': 0.66, 'bootstrap': {'std_err': 0.024109943786524875, 'low': 0.6197431891104098, 'high': 0.6975}}}, 'p_false_fixed_UNANSWERABLE': {'AUROC': {'mean': nan, 'bootstrap': {'std_err': nan, 'low': nan, 'high': nan}}, 'area_under_thresholded_accuracy': {'mean': 0.9473684210526319, 'bootstrap': {'std_err': 0.0, 'low': 0.9473684210526319, 'high': 0.9473684210526319}}, 'mean_uncertainty': {'mean': 0.1643655148000058, 'bootstrap': {'std_err': 0.015471778412932095, 'low': 0.1410452565055509, 'high': 0.19063765082904466}}, 'accuracy_at_0.8_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_0.9_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_0.95_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_1.0_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}}, 'cluster_assignment_entropy': {'AUROC': {'mean': 0.7458082664884135, 'bootstrap': {'std_err': 0.02695053899385819, 'low': 0.696066896826084, 'high': 0.7849903830459627}}, 'area_under_thresholded_accuracy': {'mean': 0.7418208412716618, 'bootstrap': {'std_err': 0.02330995878459444, 'low': 0.6998433386690897, 'high': 0.7764516132607315}}, 'mean_uncertainty': {'mean': 0.64406744955496, 'bootstrap': {'std_err': 0.03349239526642996, 'low': 0.5921619059734258, 'high': 0.7047643441835325}}, 'accuracy_at_0.8_answer_fraction': {'mean': 0.7507788161993769, 'bootstrap': {'std_err': 0.026008505698620125, 'low': 0.7046530970893384, 'high': 0.7907925647520161}}, 'accuracy_at_0.9_answer_fraction': {'mean': 0.7154696132596685, 'bootstrap': {'std_err': 0.024463068328288616, 'low': 0.6721775870328311, 'high': 0.7527777777777778}}, 'accuracy_at_0.95_answer_fraction': {'mean': 0.6839378238341969, 'bootstrap': {'std_err': 0.025364921189658284, 'low': 0.6371294633724262, 'high': 0.7193063301221577}}, 'accuracy_at_1.0_answer_fraction': {'mean': 0.66, 'bootstrap': {'std_err': 0.02413476925696799, 'low': 0.6175, 'high': 0.6975}}}, 'cluster_assignment_entropy_UNANSWERABLE': {'AUROC': {'mean': nan, 'bootstrap': {'std_err': nan, 'low': nan, 'high': nan}}, 'area_under_thresholded_accuracy': {'mean': 0.9473684210526319, 'bootstrap': {'std_err': 0.0, 'low': 0.9473684210526319, 'high': 0.9473684210526319}}, 'mean_uncertainty': {'mean': 0.64406744955496, 'bootstrap': {'std_err': 0.03412283819837339, 'low': 0.5920744891489007, 'high': 0.7046586903017125}}, 'accuracy_at_0.8_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_0.9_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_0.95_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_1.0_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}}, 'regular_entropy': {'AUROC': {'mean': 0.7593025846702317, 'bootstrap': {'std_err': 0.026247063554649504, 'low': 0.7124753402150099, 'high': 0.7993677564270243}}, 'area_under_thresholded_accuracy': {'mean': 0.7574959306284539, 'bootstrap': {'std_err': 0.022247095521513518, 'low': 0.7181114931645975, 'high': 0.7908064226078415}}, 'mean_uncertainty': {'mean': 0.6669306986157606, 'bootstrap': {'std_err': 0.0315462424011832, 'low': 0.6144860698546385, 'high': 0.7185335210467023}}, 'accuracy_at_0.8_answer_fraction': {'mean': 0.7625, 'bootstrap': {'std_err': 0.027558846067912316, 'low': 0.715625, 'high': 0.80625}}, 'accuracy_at_0.9_answer_fraction': {'mean': 0.7111111111111111, 'bootstrap': {'std_err': 0.02476664936675551, 'low': 0.6638888888888889, 'high': 0.7451523545706371}}, 'accuracy_at_0.95_answer_fraction': {'mean': 0.6947368421052632, 'bootstrap': {'std_err': 0.024990425713854226, 'low': 0.6526315789473685, 'high': 0.7322834645669292}}, 'accuracy_at_1.0_answer_fraction': {'mean': 0.66, 'bootstrap': {'std_err': 0.023594578781955356, 'low': 0.62, 'high': 0.695}}}, 'regular_entropy_UNANSWERABLE': {'AUROC': {'mean': nan, 'bootstrap': {'std_err': nan, 'low': nan, 'high': nan}}, 'area_under_thresholded_accuracy': {'mean': 0.9473684210526319, 'bootstrap': {'std_err': 0.0, 'low': 0.9473684210526319, 'high': 0.9473684210526319}}, 'mean_uncertainty': {'mean': 0.6669306986157606, 'bootstrap': {'std_err': 0.031238346732170397, 'low': 0.6147131369942241, 'high': 0.7174470980494095}}, 'accuracy_at_0.8_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_0.9_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_0.95_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_1.0_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}}, 'semantic_entropy': {'AUROC': {'mean': 0.7442067736185384, 'bootstrap': {'std_err': 0.028247504655536767, 'low': 0.6915618337648043, 'high': 0.7882459476310489}}, 'area_under_thresholded_accuracy': {'mean': 0.7444770381737311, 'bootstrap': {'std_err': 0.02397198041436991, 'low': 0.702229990822112, 'high': 0.7805534193313592}}, 'mean_uncertainty': {'mean': 0.44979299889069735, 'bootstrap': {'std_err': 0.028296649135451272, 'low': 0.40567772053218304, 'high': 0.5005747356440328}}, 'accuracy_at_0.8_answer_fraction': {'mean': 0.75, 'bootstrap': {'std_err': 0.02688927514531295, 'low': 0.7071651090342679, 'high': 0.794392523364486}}, 'accuracy_at_0.9_answer_fraction': {'mean': 0.7166666666666667, 'bootstrap': {'std_err': 0.026028887657561714, 'low': 0.675, 'high': 0.7555555555555555}}, 'accuracy_at_0.95_answer_fraction': {'mean': 0.6894736842105263, 'bootstrap': {'std_err': 0.024552704636588544, 'low': 0.6492146596858639, 'high': 0.7313725308312533}}, 'accuracy_at_1.0_answer_fraction': {'mean': 0.66, 'bootstrap': {'std_err': 0.02398854505892436, 'low': 0.62, 'high': 0.6975}}}, 'semantic_entropy_UNANSWERABLE': {'AUROC': {'mean': nan, 'bootstrap': {'std_err': nan, 'low': nan, 'high': nan}}, 'area_under_thresholded_accuracy': {'mean': 0.9473684210526319, 'bootstrap': {'std_err': 0.0, 'low': 0.9473684210526319, 'high': 0.9473684210526319}}, 'mean_uncertainty': {'mean': 0.44979299889069735, 'bootstrap': {'std_err': 0.03007733158812221, 'low': 0.4028665346303029, 'high': 0.5017891613519951}}, 'accuracy_at_0.8_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_0.9_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_0.95_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_1.0_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}}, 'p_ik': {'AUROC': {'mean': 0.6278966131907308, 'bootstrap': {'std_err': 0.029611489562178066, 'low': 0.5749703439229479, 'high': 0.673213816603144}}, 'area_under_thresholded_accuracy': {'mean': 0.6986251550698442, 'bootstrap': {'std_err': 0.0258544578124125, 'low': 0.6549891784009043, 'high': 0.7377444578308109}}, 'mean_uncertainty': {'mean': 0.25540223396559353, 'bootstrap': {'std_err': 0.017929461172182942, 'low': 0.22589726309512342, 'high': 0.28597168929048955}}, 'accuracy_at_0.8_answer_fraction': {'mean': 0.70625, 'bootstrap': {'std_err': 0.02638966540639941, 'low': 0.6625, 'high': 0.746875}}, 'accuracy_at_0.9_answer_fraction': {'mean': 0.6777777777777778, 'bootstrap': {'std_err': 0.025501545493097837, 'low': 0.631914336072809, 'high': 0.7146814404432132}}, 'accuracy_at_0.95_answer_fraction': {'mean': 0.6710526315789473, 'bootstrap': {'std_err': 0.025096668349502298, 'low': 0.6289473684210526, 'high': 0.7114211256149086}}, 'accuracy_at_1.0_answer_fraction': {'mean': 0.66, 'bootstrap': {'std_err': 0.023036104398440633, 'low': 0.62, 'high': 0.6975}}}, 'p_ik_UNANSWERABLE': {'AUROC': {'mean': nan, 'bootstrap': {'std_err': nan, 'low': nan, 'high': nan}}, 'area_under_thresholded_accuracy': {'mean': 0.9473684210526319, 'bootstrap': {'std_err': 0.0, 'low': 0.9473684210526319, 'high': 0.9473684210526319}}, 'mean_uncertainty': {'mean': 0.25540223396559353, 'bootstrap': {'std_err': 0.01767447719530058, 'low': 0.2248553087358517, 'high': 0.2831552998184846}}, 'accuracy_at_0.8_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_0.9_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_0.95_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}, 'accuracy_at_1.0_answer_fraction': {'mean': 1.0, 'bootstrap': {'std_err': 0.0, 'low': nan, 'high': nan}}}}}