In [1]:
import os
import warnings
from dataclasses import dataclass
from typing import Tuple, Optional

import torch
from transformers import GenerationConfig, AutoModelForCausalLM, AutoTokenizer

from amazon_fmeval.data_loaders.data_config import DataConfig
from amazon_fmeval.model_runners.model_runner import ModelRunner
from amazon_fmeval.eval_algo_mapping import get_eval_algorithm
from amazon_fmeval.eval_algorithms.factual_knowledge import FactualKnowledgeConfig

In [2]:
@dataclass
class HFModelConfig:
	"""Settings for the Hugging Face causal-LM model runner defined in this notebook."""
	# Name handed to `from_pretrained` for both the model and the tokenizer.
	model_name: str
	# Generation settings forwarded to `generate`; its `max_new_tokens` is also read directly.
	generation_config: GenerationConfig
	# NOTE(review): never read by the runner in this notebook; presumably reserved for
	# probability normalization. Confirm before relying on it.
	normalize_probabilities: bool = False
	# NOTE(review): never read by the runner in this notebook, so no RNG is seeded from it.
	# Confirm intended use.
	seed: int = 0
	# When True, the runner drops the prompt tokens from the decoded text if the
	# generation starts with them, and warns if it does not.
	remove_prompt_from_generated_text: bool = True


class HuggingFaceCausalLLMModelRunner(ModelRunner):
	"""ModelRunner that serves predictions from a Hugging Face causal language model.

	Both the model and its tokenizer are loaded from `model_config.model_name`.
	"""

	def __init__(self, model_config: HFModelConfig):
		"""Load the model and tokenizer named by `model_config.model_name`.

		:param model_config: model name, generation settings and output options.
		"""
		self.config = model_config
		self.model = AutoModelForCausalLM.from_pretrained(self.config.model_name)
		self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)

	def predict(self, prompt: str) -> Tuple[Optional[str], Optional[float]]:
		"""Generate a completion for `prompt` and score the prompt under the model.

		:param prompt: text to complete.
		:return: `(output, probability)`. `output` is the decoded generation with
			special tokens removed; the prompt is stripped from it when
			`remove_prompt_from_generated_text` is set and the generation starts
			with the prompt tokens. `probability` is the negated loss the model
			returns when the BOS-prefixed prompt is used as both input and labels.
			For Hugging Face causal LMs that loss is normally the mean per-token
			cross-entropy, so the value behaves like an average log-likelihood of
			the prompt, not of the generated text.
		"""
		device = self.model.device
		encoded_prompt = self.tokenizer(prompt, return_tensors="pt").to(device)
		prompt_ids = encoded_prompt["input_ids"]
		prompt_length = prompt_ids.shape[1]

		generations = self.model.generate(
			**encoded_prompt,
			max_new_tokens=self.config.generation_config.max_new_tokens,
			pad_token_id=self.tokenizer.eos_token_id,
			generation_config=self.config.generation_config,
		)

		# Check the length before comparing: a generation shorter than the prompt
		# cannot contain it, and comparing tensors of different lengths would raise.
		generation_contains_input = generations.shape[1] >= prompt_length and bool(
			(generations[0, :prompt_length] == prompt_ids[0]).all()
		)
		strip_prompt = self.config.remove_prompt_from_generated_text and generation_contains_input
		if self.config.remove_prompt_from_generated_text and not generation_contains_input:
			warnings.warn(
				"Your model does not return the prompt as part of its generations. "
				"`remove_prompt_from_generated_text` does nothing."
			)

		# Decode both cases the same way so special tokens (e.g. EOS) never leak
		# into the returned text, whether or not the prompt was stripped.
		generated_ids = generations[:, prompt_length:] if strip_prompt else generations
		output = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

		with torch.inference_mode():
			# Some tokenizers define no BOS token; fall back to no prefix rather
			# than failing on `None + str`.
			bos_token = self.tokenizer.bos_token or ""
			# The scoring inputs must live on the same device as the model.
			scoring_ids = self.tokenizer(bos_token + prompt, return_tensors="pt")["input_ids"].to(device)
			model_output = self.model(scoring_ids, labels=scoring_ids)
			# With labels supplied, the first element of the model output is the loss.
			probability = -model_output[0].item()

		return output, probability


# Smoke test: load gpt2 with a default GenerationConfig and run a single prompt.
generation_config = GenerationConfig()
model_config = dict(model_name="gpt2", generation_config=generation_config)
hf_config = HFModelConfig(
	model_name=model_config["model_name"],
	generation_config=model_config["generation_config"],
)
model = HuggingFaceCausalLLMModelRunner(model_config=hf_config)
gpt2_prediction = model.predict("London is the capital of?")
print(gpt2_prediction)

# Test with facebook/bart-large-cnn
# generation_config = GenerationConfig(
#         max_new_tokens=40,
#         do_sample=True,
#         top_k=50,
#         top_p=0.9,
#     )
# hf_config = HFModelConfig(model_name="facebook/bart-large-cnn", generation_config=generation_config)
# model = HuggingFaceCausalLLMModelRunner(model_config=hf_config)
# print(model.predict(
#     "Summarize the following article in 2 sentences: The art metropolis of Berlin inspires locals and visitors with "
#     "its famous museum landscape and numerous UNESCO World Heritage sites. It is also an international exhibition "
#     "venue. You will find a selection of current and upcoming exhibitions here."))



('\n\nThe answer is yes.\n\nThe city is the capital', -5.30952787399292)


In [3]:
# Build the factual_knowledge evaluator.
# "<OR>" is the sole config argument; presumably the delimiter between acceptable answers.
eval_algorithm_config = FactualKnowledgeConfig("<OR>")
factual_knowledge_cls = get_eval_algorithm("factual_knowledge")
eval_algo = factual_knowledge_cls(eval_algorithm_config)

In [4]:
# Score one hand-written sample: keep only the generated text from the
# (text, score) pair returned by predict, show it, then evaluate it.
sample_prediction = model.predict("London is the capital of?")
model_output = sample_prediction[0]
print(model_output)
eval_algo.evaluate_sample(
	model_output=model_output,
	target_output="UK<OR>England<OR>United Kingdom",
)



The answer is yes.

The city is the capital


[EvalScore(name='factual_knowledge', value=0)]

In [5]:
# Custom dataset
# The sample file lives at a machine-specific location. Set the TREX_SAMPLE_PATH
# environment variable to point at your own copy; the original path is kept as
# the fallback so the notebook behaves as before when the variable is unset.
trex_sample_uri = os.environ.get(
    "TREX_SAMPLE_PATH",
    "/Users/xiayche/workplace/amazon-fmeval/src/examples/trex_sample.jsonl",
)
dataset_config = DataConfig(
    dataset_name="TREX",
    dataset_uri=trex_sample_uri,
    dataset_mime_type="application/jsonlines",
    model_input_location="question",
    target_output_location="answers",
    category_location="knowledge_category",
)

# Evaluate the model on the custom dataset. "$feature" is the prompt template, and
# save=True asks evaluate() to persist results (this run reported
# output_path='/tmp/eval_results/').
eval_algo.evaluate(model=model, dataset_config=dataset_config, prompt_template="$feature", save=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-10-10 20:02:36,335	INFO worker.py:1621 -- Started a local Ray instance.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2023-10-10 20:02:42,760	INFO read_api.py:374 -- To satisfy the requested parallelism of 20, each read task output will be split into 20 smaller blocks.
2023-10-10 20:02:43,675	INFO dataset.py:2180 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2023-10-10 20:02:43,677	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCustomJSON->SplitBlocks(20)] -> TaskPoolMapOperator[MapBatches(process_batch)->Map(_generate_model_predict_response_columns)->MapBatches(process_batch)] -> AllToAllOperator[Aggregate]
2023-10-10 20:02:43,677	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-10 20:02:43,678	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataC

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


- Aggregate 1:   0%|          | 0/20 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/20 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/20 [00:00<?, ?it/s]

Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-10-10 20:04:25,482	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCustomJSON->SplitBlocks(20)] -> TaskPoolMapOperator[MapBatches(process_batch)->Map(_generate_model_predict_response_columns)->MapBatches(process_batch)] -> LimitOperator[limit=1]
2023-10-10 20:04:25,484	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-10 20:04:25,485	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

2023-10-10 20:05:13,442	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCustomJSON->SplitBlocks(20)] -> TaskPoolMapOperator[MapBatches(process_batch)->Map(_generate_model_predict_response_columns)->MapBatches(process_batch)] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches(<lambda>)]
2023-10-10 20:05:13,444	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-10 20:05:13,445	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- Aggregate 1:   0%|          | 0/20 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/20 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/20 [00:00<?, ?it/s]

Running 0:   0%|          | 0/20 [00:00<?, ?it/s]



Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-10-10 20:06:42,124	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCustomJSON->SplitBlocks(20)] -> TaskPoolMapOperator[MapBatches(process_batch)->Map(_generate_model_predict_response_columns)->MapBatches(process_batch)] -> AllToAllOperator[Aggregate]
2023-10-10 20:06:42,126	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-10 20:06:42,126	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- Aggregate 1:   0%|          | 0/20 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/20 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/20 [00:00<?, ?it/s]

Running 0:   0%|          | 0/20 [00:00<?, ?it/s]



Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-10-10 20:08:26,766	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCustomJSON->SplitBlocks(20)] -> TaskPoolMapOperator[MapBatches(process_batch)->Map(_generate_model_predict_response_columns)->MapBatches(process_batch)->Map(<lambda>)]
2023-10-10 20:08:26,768	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-10 20:08:26,769	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-10-10 20:10:00,330	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCustomJSON->SplitBlocks(20)] -> TaskPoolMapOperator[MapBatches(process_batch)->Map(_generate_model_predict_response_columns)->MapBatches(process_batch)->Map(<lambda>)]
2023-10-10 20:10:00,334	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-10 20:10:00,337	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/20 [00:00<?, ?it/s]



[EvalOutput(eval_name='factual_knowledge', dataset_name='TREX', dataset_scores=[EvalScore(name='factual_knowledge', value=0.05472636815920398)], prompt_template='$feature', category_scores=[CategoryScore(name='Capitals', scores=[EvalScore(name='factual_knowledge', value=0.09)]), CategoryScore(name='Subsidiary', scores=[EvalScore(name='factual_knowledge', value=0.019801980198019802)])], output_path='/tmp/eval_results/')]

In [11]:
# Print every record of the saved evaluation results, one JSON line at a time.
with open('/tmp/eval_results/factual_knowledge_TREX.jsonl') as results_file:
	for record in results_file:
		print(record)

{"model_input": "East Berlin is the capital of", "model_output": " Germany, and the city is home to the largest concentration of Jews in", "model_log_probability": -3.797389030456543, "target_output": "German Democratic Republic<OR>German Democratic Republic (GDR)<OR>East Germany", "category": "Capitals", "prompt": "East Berlin is the capital of", "scores": [{"name": "factual_knowledge", "value": 0}]}

{"model_input": "Mogadishu is the capital of", "model_output": " Somalia, and the capital of the Somali state of Somal", "model_log_probability": -2.824249744415283, "target_output": "Somalia<OR>Banaadir<OR>Somali", "category": "Capitals", "prompt": "Mogadishu is the capital of", "scores": [{"name": "factual_knowledge", "value": 1}]}

{"model_input": "Naples is the capital of", "model_output": " the United States, and the capital of the United States.\n\n", "model_log_probability": -4.232937812805176, "target_output": "Campania region<OR>Campania", "category": "Capitals", "prompt": "Napl