In [1]:
import getpass
import json
import os
from dataclasses import dataclass, field
from itertools import islice
from typing import Tuple, Optional

import requests

from fmeval.data_loaders.data_config import DataConfig
from fmeval.model_runners.model_runner import ModelRunner
from fmeval.fmeval import get_eval_algorithm
from fmeval.eval_algorithms.factual_knowledge import FactualKnowledgeConfig

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/kvasist/Library/Application Support/sagemaker/config.yaml


In [2]:
# Create ChatGPT Custom ModelRunner
@dataclass
class ChatGPTModelConfig:
	"""Sampling parameters and credentials for the ChatGPT model runner.

	The first three values are copied into the request body built by
	``ChatGPTModelRunner.predict``; ``api_key`` is sent in the request headers.
	"""

	# Forwarded as the request's "temperature" field.
	temperature: float
	# Forwarded as the request's "top_p" field.
	top_p: float
	# Forwarded as the request's "max_tokens" field.
	max_tokens: int
	# Secret used for the Authorization header. Excluded from repr() so that
	# displaying or logging the config in a notebook does not leak the key.
	api_key: str = field(repr=False)


class ChatGPTModelRunner(ModelRunner):
	"""ModelRunner that sends prompts to the OpenAI chat completions endpoint.

	The class attributes below can be overridden on a subclass or an instance
	to target a different endpoint, model, or request timeout.
	"""

	url = "https://api.openai.com/v1/chat/completions"
	# Model requested in every payload.
	model_name = "gpt-3.5-turbo"
	# Seconds to wait for the API before failing instead of hanging forever.
	request_timeout = 60

	def __init__(self, model_config: ChatGPTModelConfig):
		"""Keep the sampling parameters and API key used for every request."""
		self.config = model_config

	def predict(self, prompt: str) -> Tuple[Optional[str], Optional[float]]:
		"""Request a single chat completion for ``prompt``.

		:param prompt: Text sent as the only user message of the conversation.
		:return: ``(completion_text, None)``; the second element is always None.
		:raises RuntimeError: if the API answers with a non-success HTTP status
			or with a body that contains no completion choices.
		"""
		payload = json.dumps({
			"model": self.model_name,
			"messages": [
				{
					"role": "user",
					"content": prompt
				}
			],
			"temperature": self.config.temperature,
			"top_p": self.config.top_p,
			"n": 1,
			"stream": False,
			"max_tokens": self.config.max_tokens,
			"presence_penalty": 0,
			"frequency_penalty": 0
		})

		# OpenAI expects "Authorization: Bearer <key>". Accept both a bare key
		# and a value that already carries the scheme.
		authorization = self.config.api_key
		if not authorization.lower().startswith("bearer "):
			authorization = f"Bearer {authorization}"
		headers = {
			'Content-Type': 'application/json',
			'Accept': 'application/json',
			'Authorization': authorization
		}

		response = requests.request(
			"POST", self.url, headers=headers, data=payload, timeout=self.request_timeout
		)
		# Surface API failures (bad key, rate limit, ...) with the server's own
		# message rather than an opaque KeyError on the missing "choices" key.
		if not response.ok:
			raise RuntimeError(
				f"OpenAI request failed with HTTP {response.status_code}: {response.text}"
			)

		choices = response.json().get("choices")
		if not choices:
			raise RuntimeError(f"OpenAI response contained no choices: {response.text}")
		return choices[0]["message"]["content"], None

In [3]:
# Validate Custom Model Runner
# The key is read from the OPENAI_API_KEY environment variable, or prompted for
# without echo, so it never has to be pasted into (and saved with) the notebook.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
if not openai_api_key.startswith("Bearer "):
	# api_key is used as the Authorization header value, which OpenAI expects
	# in the form "Bearer <key>".
	openai_api_key = f"Bearer {openai_api_key}"

config = ChatGPTModelConfig(
	api_key=openai_api_key,
	temperature=1.0,
	top_p=1.0,
	max_tokens=250
)
model_runner = ChatGPTModelRunner(config)
print(model_runner.predict("London is the capital of?"))

('London is the capital of England.', None)


In [4]:
# Build the factual_knowledge evaluator.
# The delimiter separates the alternative acceptable answers inside a target string.
TARGET_OUTPUT_DELIMITER = "<OR>"
eval_algorithm_config = FactualKnowledgeConfig(TARGET_OUTPUT_DELIMITER)
factual_knowledge_algorithm = get_eval_algorithm("factual_knowledge")
eval_algo = factual_knowledge_algorithm(eval_algorithm_config)

In [5]:
# Score one hand-written prompt against a set of acceptable answers
sample_prompt = "London is the capital of?"
sample_targets = "UK<OR>England<OR>United Kingdom"

# predict returns a tuple; its first element is the generated text
sample_prediction = model_runner.predict(sample_prompt)
model_output = sample_prediction[0]
print(model_output)
eval_algo.evaluate_sample(target_output=sample_targets, model_output=model_output)

London is the capital of England and the United Kingdom.


[EvalScore(name='factual_knowledge', value=1)]

In [7]:
# Evaluate model with amazon-fmeval built-in dataset
# Each *_location names the record field that holds the prompt, the acceptable
# answers and the category used for the per-category scores.
dataset_config = DataConfig(
    dataset_name="TREX",
    dataset_uri="trex_sample.jsonl",
    dataset_mime_type="application/jsonlines",
    model_input_location="question",
    target_output_location="answers",
    category_location="knowledge_category",
)

# With the "$feature" template the saved "prompt" equals the raw model input;
# save=True writes per-record results to the output path reported in eval_outputs.
eval_outputs = eval_algo.evaluate(
    model=model_runner,
    dataset_config=dataset_config,
    prompt_template="$feature",
    save=True,
)

2023-10-11 04:16:04,625	INFO worker.py:1621 -- Started a local Ray instance.
2023-10-11 04:16:05,812	INFO read_api.py:374 -- To satisfy the requested parallelism of 20, each read task output will be split into 20 smaller blocks.
2023-10-11 04:16:06,447	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCustomJSON->SplitBlocks(20)] -> ActorPoolMapOperator[MapBatches(process_batch)->Map(ModelRunnerWrapper)]
2023-10-11 04:16:06,447	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-11 04:16:06,448	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
2023-10-11 04:16:06,483	INFO actor_pool_map_operator.py:117 -- MapBatches(process_batch)->

[2m[36m(_MapWorker pid=11603)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
[2m[36m(_MapWorker pid=11603)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/kvasist/Library/Application Support/sagemaker/config.yaml


Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-10-11 04:17:43,357	INFO dataset.py:2180 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2023-10-11 04:17:43,359	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(process_batch)] -> AllToAllOperator[Aggregate]
2023-10-11 04:17:43,360	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-11 04:17:43,360	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- Aggregate 1:   0%|          | 0/20 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/20 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/20 [00:00<?, ?it/s]

Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(MapBatches(process_batch) pid=11591)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml[32m [repeated 18x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[2m[36m(MapBatches(process_batch) pid=11591)[0m sagemaker.config INFO - Not applying SDK defaults from location: /Users/kvasist/Library/Application Support/sagemaker/config.yaml[32m [repeated 18x across cluster][0m


2023-10-11 04:17:44,111	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(process_batch)] -> LimitOperator[limit=1]
2023-10-11 04:17:44,112	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-11 04:17:44,112	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

2023-10-11 04:17:45,298	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(process_batch)] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches(<lambda>)]
2023-10-11 04:17:45,298	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-11 04:17:45,299	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- Aggregate 1:   0%|          | 0/20 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/20 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/20 [00:00<?, ?it/s]

Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-10-11 04:17:46,112	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(process_batch)] -> AllToAllOperator[Aggregate]
2023-10-11 04:17:46,112	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-11 04:17:46,112	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- Aggregate 1:   0%|          | 0/20 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/20 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/20 [00:00<?, ?it/s]

Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-10-11 04:17:46,283	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(process_batch)->Map(<lambda>)]
2023-10-11 04:17:46,284	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-11 04:17:46,284	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-10-11 04:17:46,337	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(process_batch)->Map(<lambda>)]
2023-10-11 04:17:46,337	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-11 04:17:46,337	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

In [8]:
# Show Evaluation outputs: the list returned by eval_algo.evaluate, holding the
# dataset-level score, the per-category scores and the path of the saved results
print(eval_outputs)

[EvalOutput(eval_name='factual_knowledge', dataset_name='TREX', dataset_scores=[EvalScore(name='factual_knowledge', value=0.7611940298507462)], prompt_template='$feature', category_scores=[CategoryScore(name='Capitals', scores=[EvalScore(name='factual_knowledge', value=0.8)]), CategoryScore(name='Subsidiary', scores=[EvalScore(name='factual_knowledge', value=0.7227722772277227)])], output_path='/tmp/eval_results/')]


In [9]:
# Show first five rows of saved output
with open('/tmp/eval_results/factual_knowledge_TREX.jsonl') as f:
	# islice simply stops at end-of-file, so a results file with fewer than
	# five records is shown in full instead of raising StopIteration as
	# repeated next(f) calls would.
	lines = list(islice(f, 5))
for line in lines:
	print(line)

{"model_input": "Lichinga is the capital of", "model_output": "Niassa province in Mozambique.", "target_output": "Niassa province<OR>Niassa Province", "category": "Capitals", "prompt": "Lichinga is the capital of", "scores": [{"name": "factual_knowledge", "value": 1}]}

{"model_input": "Thimphu is the capital of", "model_output": "Thimphu is the capital of Bhutan.", "target_output": "Thimphu District<OR>Bhutan", "category": "Capitals", "prompt": "Thimphu is the capital of", "scores": [{"name": "factual_knowledge", "value": 1}]}

{"model_input": "Vientiane is the capital of", "model_output": "Laos.", "target_output": "Laotian<OR>Laos", "category": "Capitals", "prompt": "Vientiane is the capital of", "scores": [{"name": "factual_knowledge", "value": 1}]}

{"model_input": "Sapporo is the capital of", "model_output": "Hokkaido, Japan.", "target_output": "Hokkaido<OR>Hokkaido Prefecture<OR>Hokkaid\u014d<OR>Ishikari", "category": "Capitals", "prompt": "Sapporo is the capital of", "scores": [