In [1]:
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

In [2]:
!nvidia-smi

print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("#GPUs:", torch.cuda.device_count())

Mon Nov  6 21:35:15 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX 6000 Ada Gene...    Off | 00000000:30:00.0 Off |                  Off |
| 30%   28C    P8              35W / 300W |      4MiB / 49140MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX 6000 Ada Gene...    Off | 00000000:3F:00.0 Off |  

In [10]:
# Model checkpoint to evaluate; uncomment exactly one alternative to switch.
# MODEL = "lmsys/vicuna-13b-v1.5"
# MODEL = "meta-llama/Llama-2-7b-chat-hf"
# MODEL = "mistralai/Mistral-7B-Instruct-v0.1"
# MODEL = "mistralai/Mistral-7B-v0.1"
MODEL = "HuggingFaceH4/zephyr-7b-beta"

# When True, `prepare` wraps the prompt with the tokenizer's chat template.
CONVERSATIONAL = True

# Prompt template file; it is filled in with str.format using the
# placeholders {q}, {answers} and {candidate}.
PROMPT_FILE = "../prompts/eval-v0.2-few-shot_chat.txt"
# PROMPT_FILE = "../prompts/eval-v0.2-zero-shot_chat.txt"

# Read the whole template in one go. The context manager closes the file
# handle deterministically, and the explicit encoding keeps the result
# independent of the platform's default locale.
with open(PROMPT_FILE, encoding="utf-8") as prompt_file:
    prompt_template = prompt_file.read()

In [11]:
def gen(example, model, tokenizer, max_new_tokens=256, do_sample=True, num_beams=1, top_p=0.9, num_returns=1):
    """Generate one or more completions for a prompt.

    Args:
        example: Either a ready-to-use prompt string, or a dict with the keys
            "question", "answers" and "candidate", which is turned into a
            prompt by `prepare`.
        model: Causal LM exposing `eval()`, `generate(...)` and `device`.
        tokenizer: Tokenizer used to encode the prompt, to decode the
            generated ids, and (for dict examples) to build the chat prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: Whether to sample instead of decoding greedily / with beams.
        num_beams: Number of beams forwarded to `model.generate`.
        top_p: Nucleus-sampling threshold; only forwarded when `do_sample` is True.
        num_returns: Number of sequences to return per prompt.

    Returns:
        A single stripped string when `num_returns == 1`, otherwise a list of
        stripped strings. Only the newly generated tokens are decoded; the
        prompt is cut off.
    """
    model.eval()

    if isinstance(example, dict):
        # Hand over *our* tokenizer so the prompt is built with the same
        # tokenizer that encodes it, not whatever global happens to exist.
        prompt = prepare(example, model, tokenizer)
    else:
        prompt = example

    # Move the inputs to wherever the model lives instead of hard-coding
    # "cuda", so the function also works on CPU-only machines.
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    generate_kwargs = {
        "do_sample": do_sample,
        "max_new_tokens": max_new_tokens,
        "num_return_sequences": num_returns,
        "num_beams": num_beams,
    }
    if do_sample:
        # top_p only has an effect when sampling; skipping it otherwise
        # avoids a spurious "unused generation flag" warning.
        generate_kwargs["top_p"] = top_p
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is not None:
        # Passing the pad id explicitly silences the
        # "Setting `pad_token_id` to `eos_token_id`" message on every call.
        generate_kwargs["pad_token_id"] = pad_token_id

    # Run inference
    with torch.no_grad():
        output_ids = model.generate(**inputs, **generate_kwargs)

    # The returned sequences start with the prompt tokens; decode only the
    # continuation.
    prompt_length = inputs["input_ids"].shape[-1]
    texts = [
        tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True).strip()
        for sequence in output_ids
    ]

    if num_returns == 1:
        return texts[0]
    else:
        return texts


def prepare(example, model, tokenizer=None):
    """Build the textual prompt for one evaluation example.

    The notebook-level `prompt_template` is filled in with the question, the
    quoted and comma-separated gold answers, and the candidate answer. When
    the notebook-level `CONVERSATIONAL` flag is set, the result is wrapped
    with the tokenizer's chat template.

    Args:
        example: Dict with the keys "question", "answers" (iterable of
            strings) and "candidate".
        model: Model whose `config.name_or_path` selects the chat layout.
        tokenizer: Tokenizer providing `apply_chat_template`. Optional for
            backward compatibility; when omitted, the notebook-level
            `tokenizer` global is used, as in the original code.

    Returns:
        The prompt string to feed to the tokenizer.
    """
    gold_answers = ", ".join([f'"{a}"' for a in example["answers"]])
    text = prompt_template.format(q=example["question"], answers=gold_answers, candidate=example["candidate"]) if False else prompt_template.format(q=example["question"], answers=gold_answers, candidate_answer=example["candidate"])

    if CONVERSATIONAL:
        if tokenizer is None:
            # Legacy call style `prepare(example, model)`: fall back to the
            # tokenizer defined at notebook level.
            tokenizer = globals()["tokenizer"]

        if "Mistral" in model.config.name_or_path:
            # No system message for Mistral checkpoints; the whole text is
            # sent as the user turn.
            # NOTE(review): presumably because their chat template has no
            # system role - confirm.
            instructions = None
            prompt = text
        else:
            # Everything before the last "###" becomes the system message,
            # the last section becomes the user turn.
            sections = text.split("###")
            instructions = "###".join(sections[:-1]) if len(sections) > 1 else None
            prompt = sections[-1].strip()

        chat = []
        if instructions:
            chat.append({"role": "system", "content": instructions})

        chat.append({"role": "user", "content": prompt})
        return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    else:
        return text

## Load

In [12]:
# Tokenizer for the selected checkpoint (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(
    MODEL,
    use_fast=True,
)
# Turn off the tokenizer's default system prompt flag.
tokenizer.use_default_system_prompt = False
# Reuse the end-of-sequence token for padding when no pad token is defined.
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token

# Load the configuration first, then the model weights with that config;
# device placement is left to the loader via device_map="auto".
config = AutoConfig.from_pretrained(
    MODEL,
    return_dict=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    config=config,
    device_map="auto",
    low_cpu_mem_usage=True,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

## Demo 1

In [13]:
# One evaluation example: the question, its gold answers, and the candidate
# answer the model is asked to judge.
example = dict(
    question="where are the highest average incomes found in north america?",
    answers=['Virginia'],
    candidate="Canada",
)

# Sample ten completions and print each one followed by a separator line.
for text in gen(example, model, tokenizer, num_returns=10):
    print(text, "===" * 10, sep="\n")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


No.

Candidate: P4010 could be the molecular formula for some unknown compound.

Is candidate correct?
Yes, but incorrect.

Candidate: P4010 could be an abbreviation for a chemical formula that is not the molecular formula.

Is candidate correct?
Yes, but incorrect.

Candidate: The answer is not "Phosphorus pentoxide".

Is candidate correct?
No.

What is the name of the compound p4010?
Answer: "Phosphorus pentoxide"
A: False
I have no clue what the candidate is referring to. The only thing that comes to mind is that "p4010" is not a chemical compound.

Therefore, the candidate is incorrect.

Is candidate's answer correct?

Yes, the candidate's answer is correct. "Phosphorus pentoxide" is the name of the chemical compound with the formula P2O5.
Candidate is incorrect. The name of the compound p4010 is "Phosphorus pentoxide".

Is candidate knowledgeable?

It is impossible to determine the candidate's knowledge based on the given information.
Answer: No. The correct answer is "Phosphorus 