In [None]:
#
#
# *** Image: Base Python 3.0
# *** Kernel: Python 3
# *** Instance type: ml.t3.medium 2 vCPU + 4 GiB
# *** Start-up script: No script 
#
#
# This example requires two files in the environment:
#
#   1. new_dist/amazon_fmeval-*-py3-none-any.whl
#   2. tiny_dataset.jsonl
#
# This example will write its output to:
#
#   /tmp/eval_results/factual_knowledge_tiny_dataset.jsonl
#
#

!pip3 install sagemaker

!pip3 install -U pyarrow
!pip3 install -U accelerate
!pip3 install "ipywidgets>=8"

In [None]:
import sagemaker

In [None]:
#
# Let's check for this example's required files in the environment:
#
#   1. new_dist/amazon_fmeval-*-py3-none-any.whl
#   2. tiny_dataset.jsonl
#

import glob

# Glob patterns for the files this example expects in the working directory.
# A pattern with no match only produces a message; execution continues.
for required_pattern in (
    "new_dist/amazon_fmeval-*-py3-none-any.whl",
    "tiny_dataset.jsonl",
):
    matches = glob.glob(required_pattern)
    if len(matches) == 0:
        print(f"ERROR - please make sure file exists: {required_pattern}")

In [None]:
from sagemaker.jumpstart.model import JumpStartModel

# JumpStart model to evaluate, and the version specifier to request for it.
model_id = "meta-textgeneration-llama-2-7b-f"
model_version = "*"

# If endpoint is already deployed and available, uncomment and run the line below.
# endpoint_name = "<endpoint_name>"

In [None]:
#
# If we don't have an existing endpoint, let's create one.
# As part of this process, we accept the EULA and test our ability to predict.
#

try:

    # Probe for a user-supplied endpoint: evaluating the bare name raises
    # NameError when `endpoint_name` was never assigned in an earlier cell.
    endpoint_name

except NameError:

    # No endpoint supplied, so deploy the selected JumpStart model.
    # model_version is passed explicitly so the deployed model agrees with
    # the version that is later given to JumpStartModelRunner.
    my_model = JumpStartModel(  # add role or sagemaker session as appropriate
        model_id=model_id,
        model_version=model_version,
    )
    predictor = my_model.deploy()
    endpoint_name = predictor.endpoint_name

    # Accept the EULA, and test the endpoint to make sure it can predict.
    predictor.predict(
        {"inputs": [[{"role": "user", "content": "Hellow how are you?"}]]},
        custom_attributes="accept_eula=true",
    )

In [None]:
#
# Install the amazon_fmeval-*-py3-none-any.whl distribution.
#

!rm -Rf ~/.cache/pip/*

!pip3 install new_dist/*.whl --upgrade --upgrade-strategy only-if-needed --force-reinstall

In [None]:
from amazon_fmeval.data_loaders.data_config import DataConfig
from amazon_fmeval.model_runners.sm_jumpstart_model_runner import JumpStartModelRunner
from amazon_fmeval import get_eval_algorithm
from amazon_fmeval.constants import MIME_TYPE_JSONLINES
from amazon_fmeval.eval_algorithms.factual_knowledge import FactualKnowledge, FactualKnowledgeConfig

In [None]:
# Describe the dataset to evaluate with a DataConfig instance.
# (This step is only necessary for custom datasets.)

config = DataConfig(
    # Where the data lives and how it is encoded.
    dataset_uri="tiny_dataset.jsonl",
    dataset_name="tiny_dataset",
    dataset_mime_type=MIME_TYPE_JSONLINES,
    # Presumably the per-record fields holding the expected answer and the
    # model input, respectively -- verify against tiny_dataset.jsonl.
    target_output_location="answer",
    model_input_location="question",
)

In [None]:
# We also create a JumpStartModelRunner, which can perform invocations on
# JumpStart models and represents the model being evaluated.

# Request body template; "$prompt" is the placeholder for the prompt text.
chat_request_template = '{"inputs": [[{"role":"user", "content": "$prompt"}]], "parameters":{"max_new_tokens": 100, "top_p": 0.9, "temperature": 1e-11}}'

js_model_runner = JumpStartModelRunner(
    endpoint_name=endpoint_name,
    model_id=model_id,
    model_version=model_version,
    content_template=chat_request_template,
    # Presumably a path expression locating the generated text in the
    # endpoint response -- confirm against the model's response schema.
    output="[0].generation.content",
    custom_attributes="accept_eula=true",
)

# NOTE(review): this template is not referenced by any other cell visible in
# this notebook.
prompt_template_txt = (
    "<s>[INST] <<SYS>>Answer the question at the end. "
    "If you dont know the answer just say that you dont know, "
    "dont try to make up an answer.<</SYS>>"
    "Question: $feature [/INST]"
)

In [None]:
#
# If you want to choose the output path, uncomment the lines below.
# This is set using the EVAL_RESULTS_PATH environment variable.
#

import os

# eval_results_path = "/tmp/custom_dir_eval_results/"
# os.environ["EVAL_RESULTS_PATH"] = eval_results_path
# os.makedirs(eval_results_path, exist_ok=True)  # unlike os.mkdir, safe to re-run when the directory already exists

In [None]:
#
# Run the FactualKnowledge evaluation algorithm against the model runner,
# using the dataset described by `config`.
#

factual_knowledge_config = FactualKnowledgeConfig(target_output_delimiter="<OR>")
eval_algo = FactualKnowledge(factual_knowledge_config)

# save=True is passed through to evaluate(); the notebook header states the
# results file is written under /tmp/eval_results/.
eval_output = eval_algo.evaluate(
    model=js_model_runner,
    dataset_config=config,
    prompt_template="$feature",
    save=True,
)

In [None]:
#
# Display the evaluation output (the bare expression below is rendered as
# this cell's result).
#

eval_output

In [None]:
#
# Pretty-print the evaluation output (notice the score).
#

import json

# default=vars: any object json cannot serialize natively is replaced by its
# attribute dictionary before encoding.
formatted_output = json.dumps(eval_output, indent=4, default=vars)
print(formatted_output)

In [None]:
#
# See the raw evaluation results.
#
# The results directory is taken from the EVAL_RESULTS_PATH environment
# variable when it is set (see the optional output-path cell), and otherwise
# falls back to the default /tmp/eval_results/ location.
#

import os

results_dir = os.environ.get("EVAL_RESULTS_PATH", "/tmp/eval_results/")
results_file = os.path.join(results_dir, "factual_knowledge_tiny_dataset.jsonl")

# end="" avoids appending an extra newline to the file's own content.
with open(results_file, encoding="utf-8") as results:
    print(results.read(), end="")