In [None]:
#
#
# *** Image: Base Python 3.0
# *** Kernel: Python 3
# *** Instance type: ml.m5.large
# *** Start-up script: No script 
#
#
# This example requires two files in the environment:
#
#   1. new_dist/amazon_fmeval-*-py3-none-any.whl
#   2. tiny_dataset.jsonl
#
# This example will write its output to:
#
#   /tmp/eval_results/factual_knowledge_tiny_dataset.jsonl
#
#

!pip3 install boto3==1.28.65

!pip3 install -U pyarrow
!pip3 install -U accelerate
!pip3 install "ipywidgets>=8"

In [None]:
import boto3
# Display the boto3 version in use; the install cell above pins boto3==1.28.65.
boto3.__version__

In [None]:
import json

import boto3

# Client for the "bedrock" service; used below to look up foundation-model details.
bedrock = boto3.client("bedrock")
# Client for the "bedrock-runtime" service; used below to invoke the model.
bedrock_runtime = boto3.client("bedrock-runtime")

In [None]:
#
# Verify that the files this example depends on are present in the
# current working directory:
#
#   1. new_dist/amazon_fmeval-*-py3-none-any.whl
#   2. tiny_dataset.jsonl
#
# A missing file is only reported (printed); execution is not stopped.
#

import glob

_required_patterns = (
    "new_dist/amazon_fmeval-*-py3-none-any.whl",
    "tiny_dataset.jsonl",
)

for _pattern in _required_patterns:
    # glob.glob returns an empty list when nothing matches the pattern.
    if len(glob.glob(_pattern)) == 0:
        print(f"ERROR - please make sure file exists: {_pattern}")

In [None]:
# Bedrock model used throughout this notebook, plus the MIME types sent with
# each invocation (request body and accepted response are both JSON).
modelId = "amazon.titan-tg1-large"
accept = contentType = "application/json"

# Look up the model through the Bedrock client and print its details.
model_info = bedrock.get_foundation_model(modelIdentifier=modelId)
print(model_info.get('modelDetails'))

# Build a single test request and send it to the model.
generation_config = {
    "maxTokenCount": 4096,
    "stopSequences": [],
    "temperature": 1.0,
    "topP": 1.0,
}
body = json.dumps({
    "inputText": "Hello how are you?",
    "textGenerationConfig": generation_config,
})

response = bedrock_runtime.invoke_model(
    modelId=modelId,
    body=body,
    accept=accept,
    contentType=contentType,
)

# The response body is a stream; read it fully and decode the JSON payload,
# then print the generated text of the first result.
response_body = json.loads(response.get('body').read())
first_result = response_body.get('results')[0]
print(first_result.get('outputText'))

In [None]:
# Optional alternative (disabled): the same smoke test against
# "anthropic.claude-v2". This model takes a different request body
# ("prompt" / "max_tokens_to_sample") and returns its text under "completion".
# NOTE: uncommenting this cell reassigns `modelId`, which the BedrockModelRunner
# cell further down also uses; that cell's content_template and output path
# are written for the Titan request/response format shown in the cell above.

# modelId = "anthropic.claude-v2"
# accept = "application/json"
# contentType = "application/json"

# print(bedrock.get_foundation_model(modelIdentifier=modelId).get('modelDetails'))

# body = json.dumps({
#     "prompt": "\n\nHuman:Hello how are you?\n\nAssistant:",
#     "max_tokens_to_sample": 300,
#     "temperature": 0.1,
#     "top_p": 0.9,
# })

# response = bedrock_runtime.invoke_model(body=body, modelId=modelId, accept=accept, contentType=contentType)
# response_body = json.loads(response.get('body').read())
# print(response_body.get('completion'))

In [None]:
#
# Install the amazon_fmeval-*-py3-none-any.whl distribution.
#

!rm -Rf ~/.cache/pip/*

!pip3 install new_dist/*.whl --upgrade --upgrade-strategy only-if-needed --force-reinstall
!pip3 install boto3==1.28.65

In [None]:
from amazon_fmeval.data_loaders.data_config import DataConfig
from amazon_fmeval.model_runners.bedrock_model_runner import BedrockModelRunner
from amazon_fmeval import get_eval_algorithm
from amazon_fmeval.constants import MIME_TYPE_JSONLINES
from amazon_fmeval.eval_algorithms.factual_knowledge import FactualKnowledge, FactualKnowledgeConfig

In [None]:
# Describe the dataset to evaluate on with a DataConfig instance.
# This step is only necessary for custom datasets.
#
# The dataset is the local JSON Lines file tiny_dataset.jsonl; the
# "question" and "answer" locations name where the model input and the
# target output are found in that file.

config = DataConfig(
    dataset_uri="tiny_dataset.jsonl",
    dataset_name="tiny_dataset",
    dataset_mime_type=MIME_TYPE_JSONLINES,
    target_output_location="answer",
    model_input_location="question",
)

In [None]:
# Create a BedrockModelRunner, which performs invocations on Bedrock models
# and represents the model being evaluated.

# Request body template: "$prompt" is the placeholder for the prompt text.
# The generation settings match the direct invoke_model call made earlier.
titan_content_template = (
    '{"inputText": "$prompt", '
    '"textGenerationConfig": {"maxTokenCount": 4096, "stopSequences": [], '
    '"temperature": 1.0, "topP": 1.0}}'
)

bedrock_model_runner = BedrockModelRunner(
    model_id=modelId,
    content_template=titan_content_template,
    # Path of the generated text within the model's JSON response.
    output='results[0].outputText',
)

prompt_template_txt = "$feature"

In [None]:
#
# Optional: choose where evaluation results are written by setting the
# EVAL_RESULTS_PATH environment variable. Uncomment the lines below to do so.
# Note: os.mkdir raises FileExistsError if the directory already exists, so
# re-running the cell after uncommenting will fail unless the directory is
# removed first (or os.makedirs(eval_results_path, exist_ok=True) is used).
#

import os

# eval_results_path = "/tmp/custom_dir_eval_results/"
# os.environ["EVAL_RESULTS_PATH"] = eval_results_path
# os.mkdir(eval_results_path)

# NOTE(review): presumably read by amazon_fmeval to limit parallelism to one
# worker - confirm against the library documentation.
os.environ.update({"PARALLELIZATION_FACTOR": "1"})

In [None]:
#
# Here, we run the FactualKnowledge evaluation algorithm.
#
# The prompt template comes from `prompt_template_txt`, defined in the same
# cell as `bedrock_model_runner`, instead of repeating the "$feature" literal.
# NOTE(review): "<OR>" is presumably the separator between alternative
# accepted answers in the dataset's target output - confirm.
#

eval_algo = FactualKnowledge(FactualKnowledgeConfig(target_output_delimiter="<OR>"))
eval_output = eval_algo.evaluate(
    model=bedrock_model_runner,
    dataset_config=config,
    prompt_template=prompt_template_txt,
    save=True,
)

In [None]:
#
# Print the evaluation output.
#

eval_output

In [None]:
#
# Pretty-print the evaluation output as indented JSON (notice the score).
#

import json

# `vars` is the fallback used for objects json cannot serialize natively.
pretty_eval_output = json.dumps(eval_output, indent=4, default=vars)
print(pretty_eval_output)

In [None]:
#
# See the raw evaluation results.
#
# This is the default output location named at the top of the notebook.
# NOTE: if EVAL_RESULTS_PATH was set in the earlier cell, the results are
# presumably written under that directory instead - adjust the path below.
#

!cat /tmp/eval_results/factual_knowledge_tiny_dataset.jsonl