In [1]:
# !pip install transformers
import json
from tqdm import tqdm
import os
import glob

import torch

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint used as the automatic judge; named once so tokenizer and model
# cannot drift apart.
MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Prompts are padded into batches before generation. A decoder-only model
# continues from the last position, so the pad tokens must sit on the left;
# set this explicitly instead of relying on the tokenizer's default.
tokenizer.padding_side = "left"

# NOTE(review): the weights are loaded with the library's default dtype; pass an
# explicit torch_dtype here if GPU memory becomes a problem (may change outputs).
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
# Fall back to CPU so the notebook still runs (slowly) on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Instruction text placed at the start of every evaluation prompt.
question_structure = "I will use you as an evaluator. I will give you ground truth, and a model generated answer. I want you to tell me if the ground truth and model generated answer are consistent."

Downloading tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [3]:
def performEvaluation(path):
    """Build one judge prompt per QA record in a JSON file and evaluate them.

    Args:
        path: Path to a JSON file containing a list of records. Every record
            must provide the keys 'Ground truth' and 'Model generated answer'.

    Returns:
        The value returned by evalZephyr for the list of prompts built from
        the records, in file order.

    Side effects:
        Prints the number of records that were loaded.
    """
    # Context manager so the handle is closed even when JSON parsing fails
    # (the file used to be opened and never closed).
    with open(path, 'r') as qa_file:
        qa_list = json.load(qa_file)
    print(len(qa_list))

    promptList = []
    for qa in qa_list:
        # The exact layout of this literal (including the leading spaces on the
        # continuation lines) is what the model sees; do not reformat it.
        prompt = f"""{question_structure}
            Ground truth: {qa['Ground truth']}
            Model generated answer: {qa['Model generated answer']}
            Consistent:"""
        promptList.append(prompt)

    return evalZephyr(promptList)
    

In [4]:
def evalZephyr(promptList, batch_size=128, max_new_tokens=3):
    """Run the judge model over a list of prompts using greedy decoding.

    Args:
        promptList: List of prompt strings.
        batch_size: Number of prompts tokenized and generated together.
            Defaults to 128, the value that used to be hard-coded.
        max_new_tokens: Upper bound on the tokens generated per prompt.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        A list with one decoded string per prompt, in input order, with
        special tokens removed.
        NOTE(review): for a decoder-only model generate() returns the prompt
        tokens as well, so each string presumably holds the prompt followed by
        the continuation -- confirm that downstream parsing expects this.

    Raises:
        ValueError: If batch_size is smaller than 1.

    Side effects:
        Shows a tqdm progress bar and prints the number of results.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    evals = []
    batches = [promptList[i:i + batch_size] for i in range(0, len(promptList), batch_size)]

    for batch in tqdm(batches):
        # padding=True pads every prompt in the batch to the longest one.
        model_inputs = tokenizer(batch, return_tensors="pt", padding=True).to(device)

        # Greedy decoding. The sampling-only knobs (temperature, top_p, top_k),
        # the beam-search-only length_penalty and the unknown
        # `max_padding_length` keyword were removed: none of them affects
        # greedy search, and the last one is not a generate() parameter.
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            min_length=None,
            use_cache=True,
            repetition_penalty=1.0,
        )

        evals += tokenizer.batch_decode(
            generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    print(len(evals))
    return evals

In [6]:
# Directory holding the model answers to grade (one JSON file per split).
QA_PATH = '/scratch/nmachav1/MLLM_Hallucinations/OpenFlamingo/answers_difficulty_based_val'
# Results are written to an 'eval' sub-directory, under the input file's name.
EVAL_DIR = os.path.join(QA_PATH, 'eval')
os.makedirs(EVAL_DIR, exist_ok=True)

# The pattern has no '**', so recursive=True had no effect and is dropped.
# sorted() makes the processing order deterministic across runs.
files = sorted(glob.glob(os.path.join(QA_PATH, '*.json')))

for qa_path in files:
    evals = performEvaluation(qa_path)

    out_path = os.path.join(EVAL_DIR, os.path.basename(qa_path))
    # 'w' instead of 'a': appending on a re-run would concatenate a second JSON
    # array to the file and make it unparseable.
    with open(out_path, 'w') as out_file:
        json.dump(evals, out_file)
    

1000


100%|██████████| 8/8 [01:43<00:00, 12.99s/it]


1000
1000


100%|██████████| 8/8 [01:47<00:00, 13.42s/it]


1000
1000


100%|██████████| 8/8 [01:27<00:00, 10.92s/it]


1000
1000


100%|██████████| 8/8 [01:40<00:00, 12.52s/it]


1000
1000


100%|██████████| 8/8 [01:34<00:00, 11.86s/it]


1000
1000


100%|██████████| 8/8 [01:35<00:00, 11.94s/it]


1000
1000


100%|██████████| 8/8 [01:43<00:00, 12.98s/it]


1000
1000


100%|██████████| 8/8 [01:37<00:00, 12.17s/it]


1000
