### Evaluate Model using the fmeval library

**Environment**

- Base Python 3.0 kernel
- Studio Notebook instance type: ml.m5.xlarge

### Setup

In [None]:
import sagemaker

from sagemaker.jumpstart.model import JumpStartModel

# JumpStart model identifier and version. A later cell in this notebook needs
# both values, even when you connect to an endpoint that already exists.
model_id = "meta-textgeneration-llama-2-7b-f"
model_version = "*"

### Connect to Finetuned Endpoint

Set the endpoint name of the instruction fine-tuned model that you deployed earlier in this journey.



In [None]:
# Placeholder value: replace it with the name of the instruction fine-tuned (IFT)
# endpoint that you deployed.
ift_endpoint_name = "IFT_ENDPOINT"

In [None]:
import boto3

client = boto3.client("sagemaker")

# Get the inference component name. There is only one in this example.
# Components are listed newest first and restricted to those that are InService
# on the instruction fine-tuned endpoint.
response = client.list_inference_components(
    SortBy="CreationTime",
    SortOrder="Descending",
    StatusEquals="InService",
    EndpointNameEquals=ift_endpoint_name,
)

# Fail with an actionable message instead of a bare IndexError when nothing is
# returned (for example a wrong endpoint name, or a deployment still in progress).
inference_components = response["InferenceComponents"]
if not inference_components:
    raise RuntimeError(
        f"No InService inference component found for endpoint '{ift_endpoint_name}'. "
        "Check the endpoint name and that its deployment has finished."
    )

# NOTE: the variable keeps its original spelling ("conponent") because later
# cells in this notebook reference it by that name.
inference_conponent_name = inference_components[0]["InferenceComponentName"]
print(inference_conponent_name)

### FMEval Setup

In [None]:
from fmeval.data_loaders.data_config import DataConfig
from fmeval.constants import MIME_TYPE_JSONLINES
from fmeval.model_runners.sm_jumpstart_model_runner import JumpStartModelRunner
from fmeval.eval_algorithms.factual_knowledge import FactualKnowledge, FactualKnowledgeConfig

### Evaluate the model using a dataset

## Data Config Setup

Below, we create a DataConfig for the evaluation dataset referenced by `dataset_uri` (a JSON Lines file stored in S3).

- `dataset_name` is just an identifier for your own reference
- `dataset_uri` is either a local path to a file or an S3 URI
- `dataset_mime_type` is the MIME type of the dataset. Currently, JSON and JSON Lines are supported.
- `model_input_location` and `target_output_location` are JMESPath queries used to find the model inputs and target outputs within the dataset. `category_location` can similarly be used to find the category that a sample belongs to (it is not set in this notebook). The values that you specify here depend on the structure of the dataset itself; in the evaluation dataset used here, each record stores them under the `model_input` and `target_output` keys.

In [None]:
# S3 URI of the evaluation dataset. The <region> and <account_id> parts are
# placeholders: substitute the values for the bucket that holds your dataset.
dataset_uri="s3://sagemaker-<region>-<account_id>/datasets/sciq/evaluation/automatic/dataset_evaluation.jsonl"

In [None]:
# Describe the evaluation dataset for fmeval: where it lives, its format, and
# the JMESPath locations of the model input and the expected answer per record.
config = DataConfig(
    dataset_name="evaluation_dataset_small",
    dataset_uri=dataset_uri,
    dataset_mime_type=MIME_TYPE_JSONLINES,
    model_input_location="model_input",
    target_output_location="target_output",
)

### Model Runner setup

In [None]:
# Model runner that sends prompts to the instruction fine-tuned endpoint.
js_model_runner = JumpStartModelRunner(
    # Which model to call, and where it is hosted.
    model_id=model_id,
    model_version=model_version,
    endpoint_name=ift_endpoint_name,
    component_name=inference_conponent_name,
    # Request payload template ($prompt is substituted per record) and the
    # JMESPath expression that extracts the generated text from the response.
    content_template='{"inputs": $prompt, "parameters": { "top_p": 0.9, "temperature": 0.8, "max_new_tokens": 200}}',
    output='[0].generated_text',
)

### Run instruction finetuned evaluation

In [None]:
# Factual-knowledge evaluation; "<OR>" is the delimiter that separates the
# alternative acceptable answers inside a record's target output.
eval_algo = FactualKnowledge(FactualKnowledgeConfig(target_output_delimiter="<OR>"))

# Run the evaluation against the instruction fine-tuned endpoint.
ift_eval_output = eval_algo.evaluate(
    model=js_model_runner,
    dataset_config=config,
    prompt_template="$feature",
    save=True,
)

### Parse Evaluation Results

In [None]:
import json

# Pretty-print the evaluation output (notice the score). `default=vars` tells
# json to serialise any object it does not know by using its attribute dict.
print(
    json.dumps(
        ift_eval_output,
        indent=4,
        default=vars,
    )
)

### Domain finetuned evaluation

In [None]:
# Placeholder value: replace it with the name of the domain fine-tuned (DFT)
# endpoint that you deployed.
dft_endpoint_name = "DFT_ENDPOINT"

In [None]:
import boto3

# An inference component belongs to a single endpoint, so the component name
# resolved earlier for the instruction fine-tuned endpoint cannot be reused
# here. Look up the newest InService component on the domain fine-tuned endpoint.
dft_inference_components = boto3.client("sagemaker").list_inference_components(
    SortBy="CreationTime",
    SortOrder="Descending",
    StatusEquals="InService",
    EndpointNameEquals=dft_endpoint_name,
)["InferenceComponents"]

# Fail with an actionable message instead of a bare IndexError when the
# endpoint has no InService component.
if not dft_inference_components:
    raise RuntimeError(
        f"No InService inference component found for endpoint '{dft_endpoint_name}'. "
        "Check the endpoint name and that its deployment has finished."
    )

dft_inference_component_name = dft_inference_components[0]["InferenceComponentName"]
print(dft_inference_component_name)

# Model runner that sends prompts to the domain fine-tuned endpoint, using the
# same payload template and output location as the instruction fine-tuned run.
js_model_runner = JumpStartModelRunner(
    endpoint_name=dft_endpoint_name,
    model_id=model_id,
    model_version=model_version,
    output='[0].generated_text',
    content_template='{"inputs": $prompt, "parameters": { "top_p": 0.9, "temperature": 0.8, "max_new_tokens": 200}}',
    component_name=dft_inference_component_name,
)

In [None]:
import json

# Evaluate the domain fine-tuned endpoint with the same algorithm settings,
# dataset and prompt template as the instruction fine-tuned run, so that the
# two results are comparable.
eval_algo = FactualKnowledge(FactualKnowledgeConfig("<OR>"))
dft_eval_output = eval_algo.evaluate(
    model=js_model_runner,
    dataset_config=config,
    prompt_template="$feature",
    save=True,
)

# Pretty-print the evaluation output (notice the score), mirroring what is done
# for the instruction fine-tuned results. `default=vars` serialises objects
# that json does not know by using their attribute dict.
print(json.dumps(dft_eval_output, default=vars, indent=4))

### You now have the evaluation results for domain and instruction fine-tuning. These results can be compared to Journey 3's base model evaluation.