In [1]:
import os
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

!nvidia-smi

print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("#GPUs:", torch.cuda.device_count())

Tue Nov  7 22:30:40 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX 6000 Ada Gene...    Off | 00000000:3F:00.0 Off |                  Off |
| 30%   30C    P5              32W / 300W |      4MiB / 49140MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX 6000 Ada Gene...    Off | 00000000:40:00.0 Off |  

In [3]:
# MODEL = "lmsys/vicuna-13b-v1.5"
# MODEL = "meta-llama/Llama-2-7b-chat-hf"
MODEL = "mistralai/Mistral-7B-Instruct-v0.1"
# MODEL = "mistralai/Mistral-7B-v0.1"
# MODEL = "HuggingFaceH4/zephyr-7b-beta"

CONVERSATIONAL = True

PROMPT_FILE = "../prompts/eval-v0.2-few-shot_chat.txt"
# PROMPT_FILE = "../prompts/eval-v0.2-zero-shot_chat.txt"

prompt_template = "".join(open(PROMPT_FILE).readlines())

In [8]:
def gen(example, model, tokenizer, max_new_tokens=256, do_sample=True, num_beams=1, top_p=0.9, num_returns=1):
    model.eval()
    
    if isinstance(example, dict):
        prompt = prepare(example, model, tokenizer)
    else:
        prompt = example
    
    # Run inference
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            do_sample=do_sample,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            num_return_sequences=num_returns,
            num_beams=num_beams,
        )

    texts = []
    for n in range(output_ids.shape[0]):
        texts.append(tokenizer.decode(output_ids[n, inputs["input_ids"].shape[-1]:], skip_special_tokens=True).strip())

    if num_returns == 1:
        return texts[0]
    else:
        return texts
    

def prepare(example, model, tokenizer, raw_chat=False):
    gold_answers = ", ".join([f'"{a}"' for a in example["answers"]])
    text = prompt_template.format(q=example["question"], answers=gold_answers, candidate_answer=example["candidate"])
    
    if CONVERSATIONAL:
        if "Mistral" in model.config.name_or_path:
            instructions = None
            prompt = text
        else:
            sections = text.split("###")
            instructions = "###".join(sections[:-1]) if len(sections) > 1 else None
            prompt = sections[-1].strip()
        
        chat = []
        if instructions:
            chat.append({"role": "system", "content": instructions})

        chat.append({"role": "user", "content": prompt})
        if raw_chat:
            return chat
        else:
            return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    else:
        return text

## Load

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
tokenizer.use_default_system_prompt = False
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token

config = AutoConfig.from_pretrained(MODEL, return_dict=True)
model = AutoModelForCausalLM.from_pretrained(MODEL, config=config, device_map="auto", low_cpu_mem_usage=True)
print(model.generation_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}



## Demo 1

In [9]:
example = {
    "question": "how much money did it cost to make gta v?",
    "answers": ['137'],
    "candidate": "approximately $100 million",
}

responses = []
for text in gen(example, model, tokenizer, num_returns=10):
    responses.append(text)
    print(text)
    print("===" * 10)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Explanation: The cost to make Grand Theft Auto V (GTA V) is estimated to be around $100 million. Therefore, the candidate answer is correct.
Judgment: yes.
Explanation: The ground-truth answer is $137 million. The candidate answer is approximately $100 million which is close but not exact.

Judgment: no.
Explanation: The cost of making Grand Theft Auto V (GTA V) was $137 million. However, the candidate answer says approximately $100 million which is less than the ground-truth answer. Therefore, the candidate answer is incorrect.

Judgment: no.
The ground-truth answer for the question "how much money did it cost to make GTA V" is "137", while the candidate answer is "approximately $100 million". The candidate answer is correct based on the fact that $100 million is approximately the same as 137 in the context of money, given that each unit of money represents a significant amount. Therefore, the judgment is "yes".
Explanation: The actual cost to make GTA V was $137 million as per the gr

In [16]:
chat = prepare(example, model, tokenizer, raw_chat=True)

next_responses = []
for response in responses:
    c = list(chat)
    c.append({"role": "assistant", "content": response})
    c.append({"role": "user", "content": "Tell me your final judgment in only 'yes' or 'no'"})
    conv = tokenizer.apply_chat_template(c, tokenize=False, add_generation_prompt=True)
    out = gen(conv, model, tokenizer, do_sample=False)
    print(">>", response)
    print(">>>>", out)
    print("===" * 10)
    next_responses.append(out)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>> Explanation: The cost to make Grand Theft Auto V (GTA V) is estimated to be around $100 million. Therefore, the candidate answer is correct.
Judgment: yes.
>>>> yes


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>> Explanation: The ground-truth answer is $137 million. The candidate answer is approximately $100 million which is close but not exact.

Judgment: no.
>>>> no


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>> Explanation: The cost of making Grand Theft Auto V (GTA V) was $137 million. However, the candidate answer says approximately $100 million which is less than the ground-truth answer. Therefore, the candidate answer is incorrect.

Judgment: no.
>>>> no


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>> The ground-truth answer for the question "how much money did it cost to make GTA V" is "137", while the candidate answer is "approximately $100 million". The candidate answer is correct based on the fact that $100 million is approximately the same as 137 in the context of money, given that each unit of money represents a significant amount. Therefore, the judgment is "yes".
>>>> Yes.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>> Explanation: The actual cost to make GTA V was $137 million as per the ground-truth answer. However, the candidate answer provides an approximate value of $100 million which can be considered close enough and acceptable in general terms.

Judgment: yes.
>>>> yes


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>> The candidate answer is incorrect as the ground-truth answer is $137 million.
>>>> no


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>> The ground-truth answer for how much money it cost to make GTA V is $137 million. The candidate answer of approximately $100 million is incorrect, so the judgment is no.
>>>> no


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>> The actual cost of developing Grand Theft Auto V (GTA V) was $137 million, which is less than the candidate answer of "approximately $100 million". Therefore, the candidate answer cannot be considered correct and the judgment is no.
>>>> no


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


>> The cost to make Grand Theft Auto V is estimated to be around $100 million. However, this figure is an estimate, and the actual cost could be slightly more or less. Therefore, the candidate answer is approximately correct.

Judgment: yes.
>>>> yes
>> The ground-truth answer for the cost of making Grand Theft Auto V is $137 million. However, the candidate answer is based on an approximation of $100 million, which is not entirely incorrect, but not the exact amount. Therefore, the candidate answer is not fully correct.

Judgment: no.
>>>> no
