In [None]:
#
#
# This example runs evaluations against a vLLM API Server:
#
#   https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html#api-server
#
# This example requires the following files:
#
#   1. tiny_dataset.jsonl
#
# This example will write its output to:
#
#   /tmp/eval_results/factual_knowledge_tiny_dataset.jsonl
#
#

In [None]:
#
# Pre-flight check: confirm that the input file this example reads is present
# (the path is relative, so it is resolved against the current working directory):
#
#   1. tiny_dataset.jsonl
#

import glob

# glob.glob returns the list of paths matching the pattern; an empty list
# means the required file was not found.
matching_paths = glob.glob("tiny_dataset.jsonl")
if len(matching_paths) == 0:
    print("ERROR - please make sure file exists: tiny_dataset.jsonl")

In [None]:
from amazon_fmeval.data_loaders.data_config import DataConfig
from amazon_fmeval.model_runners.vllm_model_runner import VllmModelRunner
from amazon_fmeval.eval_algo_mapping import get_eval_algorithm
from amazon_fmeval.constants import MIME_TYPE_JSONLINES
from amazon_fmeval.eval_algorithms.factual_knowledge import FactualKnowledge, FactualKnowledgeConfig

In [None]:
#
# Describe the dataset to evaluate on. A DataConfig tells the evaluation which
# data to use and how to read it. Building one by hand is only necessary for
# custom datasets such as the one used here.
#
# NOTE(review): "question" and "answer" are presumably the keys of each JSON
# line in tiny_dataset.jsonl -- confirm against the dataset file.
#
dataset_name = "tiny_dataset"
dataset_uri = "tiny_dataset.jsonl"

config = DataConfig(
    dataset_name=dataset_name,
    dataset_uri=dataset_uri,
    dataset_mime_type=MIME_TYPE_JSONLINES,
    model_input_location="question",
    target_output_location="answer",
)

In [None]:
#
# We also create a VllmModelRunner, which can perform invocations against the vLLM API Server.
#
# The temperature is non-zero, so generations (and therefore scores) may vary between runs.
#
# NOTE(review): the vLLM demo API server's /generate endpoint takes a "prompt"
# request field, while the template below sends "content" -- confirm that
# VllmModelRunner maps this template onto the payload the server expects.
#

vllm_model_runner = VllmModelRunner(
    remote_uri="http://localhost:8000/generate",
    content_template='{"content": "$prompt"}',
    output="text[0]",
    num_completions=1,
    temperature=0.8,
    top_p=0.5,
)

In [None]:
#
# Optional: choose where the evaluation results are written.
# The output path is set using the EVAL_RESULTS_PATH environment variable.
# To use a custom directory, uncomment the three lines at the end of this cell
# and run it before the evaluation cell below.
#

import os

# eval_results_path = "/tmp/custom_dir_eval_results/"
# os.environ["EVAL_RESULTS_PATH"] = eval_results_path
# os.makedirs(eval_results_path, exist_ok=True)  # unlike os.mkdir, safe to re-run when the directory already exists

In [None]:
#
# Run the FactualKnowledge evaluation algorithm against the vLLM model runner
# on the custom dataset configured above.
#
# NOTE(review): "<OR>" is presumably the separator between alternative accepted
# answers in the target output field -- confirm against the dataset.
#

factual_knowledge_config = FactualKnowledgeConfig(target_output_delimiter="<OR>")
eval_algo = FactualKnowledge(factual_knowledge_config)

eval_output = eval_algo.evaluate(
    model=vllm_model_runner,
    dataset_config=config,
    prompt_template="$feature",
    save=True,
)

In [None]:
#
# Print the evaluation output.
# (A bare expression on the last line of a cell is displayed as the cell's output.)
#

eval_output

In [None]:
#
# Pretty-print the evaluation output (notice the score).
#

import json

# `default=vars` lets json.dumps handle objects it cannot serialize natively
# by falling back to each object's attribute dictionary.
pretty_eval_output = json.dumps(eval_output, indent=4, default=vars)
print(pretty_eval_output)

In [None]:
#
# See the raw evaluation results.
#

!cat /tmp/eval_results/factual_knowledge_tiny_dataset.jsonl