In [1]:
import json
from tqdm import tqdm
import os
import glob

import torch

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Zephyr-7B is used as an automatic judge of answer consistency.
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Batched generation with a decoder-only model needs left padding so the new
# tokens directly follow each prompt. Set it explicitly rather than relying on
# whatever the checkpoint's tokenizer config happens to specify.
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Fall back to CPU so the notebook still runs (slowly) on a machine without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Instruction text placed at the start of every evaluation prompt.
question_structure = "I will use you as an evaluator. I will give you ground truth, and a model generated answer. I want you to tell me if the ground truth and model generated answer are consistent."

Downloading config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [3]:
def performEvaluation(path):
    """Build one judge prompt per QA record in a JSON file and evaluate them.

    Args:
        path: Path to a JSON file containing a sequence of records, each
            indexable by the keys 'Ground truth' and 'Model generated answer'.

    Returns:
        Whatever `evalZephyr` returns for the prompts built from the file.

    Raises:
        KeyError: If a record lacks one of the two expected keys.
    """
    # Context manager so the handle is closed even when json.load raises.
    with open(path, 'r') as qa_file:
        qa_list = json.load(qa_file)
    print(len(qa_list))

    # NOTE: the leading spaces on the continuation lines are part of the prompt
    # text sent to the model; they are kept unchanged so that results stay
    # comparable with earlier runs.
    promptList = [
        f"""{question_structure}
            Ground truth: {qa['Ground truth']}
            Model generated answer: {qa['Model generated answer']}
            Consistent:"""
        for qa in qa_list
    ]

    return evalZephyr(promptList)
    

In [4]:
def evalZephyr(promptList, batch_size=128, max_new_tokens=3):
    """Run the judge model over a list of prompts with greedy decoding.

    Args:
        promptList: List of prompt strings to evaluate.
        batch_size: Number of prompts tokenized and generated together.
        max_new_tokens: Maximum number of tokens generated per prompt.

    Returns:
        A list with one decoded string per prompt, in input order. The strings
        are decoded from the full sequences returned by `model.generate`;
        NOTE(review): for a decoder-only model those sequences include the
        prompt tokens, so callers should expect the prompt text to precede
        the generated verdict - confirm against downstream parsing.

    Raises:
        ValueError: If `batch_size` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    evals = []
    batches = [promptList[i:i + batch_size] for i in range(0, len(promptList), batch_size)]
    for batch in tqdm(batches):
        model_inputs = tokenizer(batch, return_tensors="pt", padding=True).to(device)

        # Greedy decoding: sampling-only options (temperature, top_p, top_k)
        # and the beam-only length_penalty are not used when do_sample=False,
        # and `max_padding_length` is not a generate() option, so none of them
        # are passed.
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            use_cache=True,
            repetition_penalty=1.0,
        )

        evals += tokenizer.batch_decode(
            generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
    print(len(evals))
    return evals

In [None]:
# Input folder with one JSON file of QA records per difficulty bucket, and the
# folder that receives the judge outputs under the same file names.
QA_PATH = '/home/averma90/CSE576/github/MLLM_Hallucinations/CLEVR_v1/answers/val/num_objects_based_difficulty_vanilla'
SAVE_FOLDER = '/home/averma90/CSE576/github/MLLM_Hallucinations/CLEVR_v1/eval/val/num_objects_based_difficulty_vanilla'

# The output folder may not exist yet; create it up front so a long evaluation
# run is not lost at the moment of saving.
os.makedirs(SAVE_FOLDER, exist_ok=True)

# Sorted so the files are processed in the same order on every run.
files = sorted(glob.glob(os.path.join(QA_PATH, '*.json')))

for qa_path in files:
    evals = performEvaluation(qa_path)

    # Distinct names for the input path and the output handle avoid the
    # loop variable being overwritten by the open file object.
    save_path = os.path.join(SAVE_FOLDER, os.path.basename(qa_path))
    with open(save_path, 'w') as out_file:
        json.dump(evals, out_file)

9992


100%|██████████| 79/79 [41:57<00:00, 31.87s/it]


9992
10000


100%|██████████| 79/79 [41:56<00:00, 31.85s/it]


10000
10000


 65%|██████▍   | 51/79 [27:23<15:06, 32.37s/it]