In [1]:
import warnings
from dataclasses import dataclass
from itertools import islice
from pathlib import Path
from typing import Tuple, Optional

import torch
from transformers import GenerationConfig, AutoModelForCausalLM, AutoTokenizer

from amazon_fmeval.data_loaders.data_config import DataConfig
from amazon_fmeval.model_runners.model_runner import ModelRunner
from amazon_fmeval.eval_algo_mapping import get_eval_algorithm
from amazon_fmeval.eval_algorithms.factual_knowledge import FactualKnowledgeConfig

In [2]:
@dataclass
class HFModelConfig:
	"""Settings consumed by `HuggingFaceCausalLLMModelRunner`."""

	# Identifier handed to `AutoModelForCausalLM.from_pretrained` and
	# `AutoTokenizer.from_pretrained` when the runner is constructed.
	model_name: str
	# Generation settings forwarded to `model.generate`; its `max_new_tokens`
	# attribute is also passed explicitly.
	generation_config: GenerationConfig
	# NOTE(review): not read by any code visible in this notebook.
	normalize_probabilities: bool = False
	# NOTE(review): not read by any code visible in this notebook.
	seed: int = 0
	# When True, `predict` strips the prompt tokens from the generation if the
	# generation starts with them (and warns if it does not).
	remove_prompt_from_generated_text: bool = True


class HuggingFaceCausalLLMModelRunner(ModelRunner):
	"""`ModelRunner` that serves predictions from a Hugging Face causal language model.

	The model and tokenizer named by `model_config.model_name` are loaded once in
	the constructor. `predict` returns the generated text together with a
	log-probability score for the prompt.

	Note: `HFModelConfig.seed` and `HFModelConfig.normalize_probabilities` are not
	read anywhere in this class.
	"""

	def __init__(self, model_config: HFModelConfig):
		"""Load the model and tokenizer identified by `model_config.model_name`.

		Args:
			model_config: Runner settings; kept on the instance as `self.config`.
		"""
		self.config = model_config
		self.model = AutoModelForCausalLM.from_pretrained(self.config.model_name)
		self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)

	def predict(self, prompt: str) -> Tuple[Optional[str], Optional[float]]:
		"""Generate a completion for `prompt` and score the prompt itself.

		Args:
			prompt: Text fed to the model.

		Returns:
			A `(output, log_probability)` tuple: the decoded generation (with the
			prompt removed when configured and possible) and the negated
			language-modeling loss of the prompt.
		"""
		output = self._generate_text(prompt)
		probability = self._prompt_log_probability(prompt)
		return output, probability

	def _generate_text(self, prompt: str) -> str:
		"""Run generation and decode it, optionally dropping the echoed prompt."""
		encoded_prompt = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
		prompt_ids = encoded_prompt["input_ids"]
		prompt_length = prompt_ids.shape[1]
		generations = self.model.generate(
			**encoded_prompt,
			max_new_tokens=self.config.generation_config.max_new_tokens,
			pad_token_id=self.tokenizer.eos_token_id,
			generation_config=self.config.generation_config,
		)
		# Check the length first: a generation shorter than the prompt cannot
		# start with it, and comparing tensors of different lengths raises
		# instead of yielding False.
		generation_contains_input = generations.shape[1] >= prompt_length and bool(
			(generations[0, :prompt_length] == prompt_ids[0]).all()
		)
		if self.config.remove_prompt_from_generated_text:
			if generation_contains_input:
				generations = generations[:, prompt_length:]
			else:
				warnings.warn(
					"Your model does not return the prompt as part of its generations. "
					"`remove_prompt_from_generated_text` does nothing."
				)
		# Special tokens are skipped on every path so that markers such as the
		# end-of-sequence token never leak into the returned text.
		return self.tokenizer.batch_decode(generations, skip_special_tokens=True)[0]

	def _prompt_log_probability(self, prompt: str) -> float:
		"""Return the negated loss the model assigns to `prompt` used as its own labels."""
		# Some tokenizers define no BOS token; fall back to the bare prompt
		# rather than failing on `None + str`.
		bos_token = self.tokenizer.bos_token or ""
		with torch.inference_mode():
			input_ids = self.tokenizer(bos_token + prompt, return_tensors="pt")["input_ids"]
			# Keep the inputs on the same device as the model weights.
			input_ids = input_ids.to(self.model.device)
			model_output = self.model(input_ids, labels=input_ids)
			# With labels supplied, the first element of the model output is the loss.
			return -model_output[0].item()


# Smoke test: load gpt2 with default generation settings and run one prompt.
generation_config = GenerationConfig()
model_config = dict(model_name="gpt2", generation_config=generation_config)
hf_config = HFModelConfig(
    model_name=model_config["model_name"],
    generation_config=model_config["generation_config"],
)
model = HuggingFaceCausalLLMModelRunner(model_config=hf_config)
gpt2_prediction = model.predict("London is the capital of?")
print(gpt2_prediction)

# Test with facebook/bart-large-cnn
# generation_config = GenerationConfig(
#         max_new_tokens=40,
#         do_sample=True,
#         top_k=50,
#         top_p=0.9,
#     )
# hf_config = HFModelConfig(model_name="facebook/bart-large-cnn", generation_config=generation_config)
# model = HuggingFaceCausalLLMModelRunner(model_config=hf_config)
# print(model.predict(
#     "Summarize the following article in 2 sentences: The art metropolis of Berlin inspires locals and visitors with "
#     "its famous museum landscape and numerous UNESCO World Heritage sites. It is also an international exhibition "
#     "venue. You will find a selection of current and upcoming exhibitions here."))



('\n\nThe answer is yes.\n\nThe city is the capital', -5.30952787399292)


In [3]:
# Set up the factual_knowledge evaluation.
# "<OR>" is the same token that separates the alternative answers in the
# target strings evaluated later in this notebook.
eval_algorithm_config = FactualKnowledgeConfig("<OR>")
eval_algorithm_factory = get_eval_algorithm("factual_knowledge")
eval_algo = eval_algorithm_factory(eval_algorithm_config)

In [4]:
# Score a single hand-written prompt against its accepted answers.
sample_prompt = "London is the capital of?"
sample_prediction = model.predict(sample_prompt)
model_output = sample_prediction[0]  # first element is the generated text
print(model_output)
eval_algo.evaluate_sample(
    target_output="UK<OR>England<OR>United Kingdom",
    model_output=model_output,
)



The answer is yes.

The city is the capital


[EvalScore(name='factual_knowledge', value=0)]

In [None]:
# Custom dataset
# The sample file is resolved against the notebook's working directory so the
# notebook does not depend on one user's home directory.
# NOTE(review): assumes `trex_sample.jsonl` sits next to this notebook; point
# this at your copy if it lives elsewhere.
TREX_SAMPLE_PATH = Path("trex_sample.jsonl").resolve()
if not TREX_SAMPLE_PATH.is_file():
    # Fail here with the resolved location instead of deep inside the data loader.
    raise FileNotFoundError(f"TREX sample dataset not found at {TREX_SAMPLE_PATH}")

dataset_config = DataConfig(
    dataset_name="TREX",
    dataset_uri=str(TREX_SAMPLE_PATH),
    dataset_mime_type="application/jsonlines",
    model_input_location="question",
    target_output_location="answers",
    category_location="knowledge_category",
)

# Evaluate the model on the custom dataset; `save=True` also writes the
# per-record results to disk.
eval_outputs = eval_algo.evaluate(model=model, dataset_config=dataset_config, prompt_template="$feature", save=True)

In [6]:
# Show the aggregated evaluation outputs returned by `eval_algo.evaluate`
# (overall dataset score plus per-category scores).
print(eval_outputs)

[EvalOutput(eval_name='factual_knowledge', dataset_name='TREX', dataset_scores=[EvalScore(name='factual_knowledge', value=0.05472636815920398)], prompt_template='$feature', category_scores=[CategoryScore(name='Capitals', scores=[EvalScore(name='factual_knowledge', value=0.09)]), CategoryScore(name='Subsidiary', scores=[EvalScore(name='factual_knowledge', value=0.019801980198019802)])], output_path='/tmp/eval_results/')]


In [7]:
# Show up to the first five rows of the saved per-record output
SAVED_OUTPUT_PATH = '/tmp/eval_results/factual_knowledge_TREX.jsonl'
with open(SAVED_OUTPUT_PATH, encoding="utf-8") as f:
	# islice stops quietly at end of file, so a results file with fewer than
	# five records no longer raises StopIteration.
	lines = list(islice(f, 5))
for line in lines:
	print(line)

{"model_input": "Palembang is the capital of", "model_output": " the Democratic Republic of Congo, and is home to the largest number", "model_log_probability": -4.059066295623779, "target_output": "South Sumatra<OR>South Sumatera<OR>Srivijaya", "category": "Capitals", "prompt": "Palembang is the capital of", "scores": [{"name": "factual_knowledge", "value": 0}]}

{"model_input": "Alor Setar is the capital of", "model_output": " the Kingdom of the Seven Kingdoms. It is located in the", "model_log_probability": -5.383638381958008, "target_output": "Kedah", "category": "Capitals", "prompt": "Alor Setar is the capital of", "scores": [{"name": "factual_knowledge", "value": 0}]}

{"model_input": "Manama is the capital of", "model_output": " the Democratic Republic of Congo, and is home to the largest number of", "model_log_probability": -4.162099361419678, "target_output": "Bahrain", "category": "Capitals", "prompt": "Manama is the capital of", "scores": [{"name": "factual_knowledge", "value