In [60]:
# Used Base Python 3.0 and m5.xlarge

!pip3 install --upgrade pip
!pip3 install sagemaker
!pip3 install new_dist/*.whl --upgrade --upgrade-strategy only-if-needed --force-reinstall;
!pip3 install -U ipywidgets

Collecting pip
  Obtaining dependency information for pip from https://files.pythonhosted.org/packages/47/6a/453160888fab7c6a432a6e25f8afe6256d0d9f2cbd25971021da6491d899/pip-23.3.1-py3-none-any.whl.metadata
  Downloading pip-23.3.1-py3-none-any.whl.metadata (3.5 kB)
Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.2.1
    Uninstalling pip-23.2.1:
      Successfully uninstalled pip-23.2.1
Successfully installed pip-23.3.1
[0mProcessing ./new_dist/amazon_fmeval-0.1.0-py3-none-any.whl
Collecting IPython (from amazon-fmeval==0.1.0)
  Downloading ipython-8.16.1-py3-none-any.whl.metadata (5.9 kB)
Collecting bert-score<0.4.0,>=0.3.13 (from amazon-fmeval==0.1.0)
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [70]:
import sagemaker

from amazon_fmeval.data_loaders.data_config import DataConfig
from amazon_fmeval.model_runners.sm_jumpstart_model_runner import JumpStartModelRunner
from amazon_fmeval.eval_algo_mapping import get_eval_algorithm
from amazon_fmeval.constants import MIME_TYPE_JSONLINES
from amazon_fmeval.eval_algorithms.factual_knowledge import FactualKnowledge, FactualKnowledgeConfig

## Evaluate Llama 2 Model

### Step 1: Deploy the model

In [71]:
from sagemaker.jumpstart.model import JumpStartModel

model_id_llama, model_version_llama = "meta-textgeneration-llama-2-7b-f", "*"

# Deploy a new JumpStart endpoint for the model and record its generated name.
# To reuse an endpoint that is already deployed, skip the deployment below and
# set endpoint_name_llama to that endpoint's name instead
# (e.g. "llama-test-endpoint-110123").
# The version is passed explicitly so the deployed model matches the
# model_version later given to the model runner.
llama_model = JumpStartModel(model_id=model_id_llama, model_version=model_version_llama)
predictor = llama_model.deploy()
endpoint_name_llama = predictor.endpoint_name

----------------------!

In [72]:
# Show the name of the endpoint that the evaluation will invoke.
endpoint_name_llama

'meta-textgeneration-llama-2-7b-f-2023-10-27-13-55-07-323'

### Step 2: Create data configuration

In [73]:
# DataConfig describes the dataset an evaluation should run on.
# Defining one is required only when evaluating on a custom dataset.
config = DataConfig(
    dataset_name="tiny_dataset",
    dataset_mime_type=MIME_TYPE_JSONLINES,
    dataset_uri="tiny_dataset.jsonl",
    # Names of the record fields holding the model input and the expected answer.
    target_output_location="answer",
    model_input_location="question",
)

### Step 3: Configure evaluation

In [74]:
# JumpStartModelRunner performs invocations against a JumpStart endpoint and
# represents the model being evaluated.
js_model_runner_llama = JumpStartModelRunner(
    model_id=model_id_llama,
    model_version=model_version_llama,
    endpoint_name=endpoint_name_llama,
    # Request payload sent to the endpoint.
    # NOTE(review): "$prompt" is presumably replaced with each prompt — confirm.
    content_template='{"inputs": [[{"role":"user", "content": "$prompt"}]], "parameters":{"max_new_tokens": 100, "top_p": 0.9, "temperature": 1e-11}}',
    # NOTE(review): presumably the location of the generated text within the
    # endpoint response — confirm against the model runner documentation.
    output='[0].generation.content',
    custom_attributes="accept_eula=true",
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


### Step 4: Run evaluation based on a specific algorithm

In [75]:
# Build the factual-knowledge evaluator and run it against the Llama 2 runner
# on the custom dataset described by `config`.
# NOTE(review): "<OR>" presumably separates alternative accepted answers in the
# target output — confirm against the dataset.
eval_algo_llama = FactualKnowledge(FactualKnowledgeConfig(target_output_delimiter="<OR>"))
# Call evaluate() on the evaluator created above; the previous code referenced
# `eval_algo`, which is never defined in this notebook.
eval_output_llama = eval_algo_llama.evaluate(
    model=js_model_runner_llama,
    dataset_config=config,
    prompt_template="$feature",
)

2023-10-27 14:06:41,885	INFO read_api.py:374 -- To satisfy the requested parallelism of 8, each read task output will be split into 8 smaller blocks.
2023-10-27 14:06:41,913	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCustomJSON->SplitBlocks(8)] -> ActorPoolMapOperator[MapBatches(process_batch)->Map(ModelRunnerWrapper)]
2023-10-27 14:06:41,914	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-27 14:06:41,915	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
2023-10-27 14:06:41,948	INFO actor_pool_map_operator.py:117 -- MapBatches(process_batch)->Map(ModelRunnerWrapper): Waiting for 3 pool actors to start...


[2m[36m(_MapWorker pid=15001)[0m sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
[2m[36m(_MapWorker pid=15001)[0m sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


Running 0:   0%|          | 0/8 [00:00<?, ?it/s]

2023-10-27 14:07:56,678	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(process_batch)] -> AllToAllOperator[Aggregate]
2023-10-27 14:07:56,679	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-27 14:07:56,679	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- Aggregate 1:   0%|          | 0/8 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/8 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/8 [00:00<?, ?it/s]

Running 0:   0%|          | 0/8 [00:00<?, ?it/s]

2023-10-27 14:07:58,460	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(process_batch)] -> LimitOperator[limit=1]
2023-10-27 14:07:58,461	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-27 14:07:58,461	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(MapBatches(process_batch) pid=15091)[0m sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml[32m [repeated 6x across cluster][0m
[2m[36m(MapBatches(process_batch) pid=15091)[0m sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml[32m [repeated 6x across cluster][0m


In [76]:
# Display the result returned by evaluate() for the Llama 2 model.
eval_output_llama

[EvalOutput(eval_name='factual_knowledge', dataset_name='tiny_dataset', dataset_scores=[EvalScore(name='factual_knowledge', value=0.59375)], prompt_template='$feature', category_scores=None, output_path='/tmp/eval_results/')]

## Evaluate Falcon Model

In [77]:
model_id_falcon, model_version_falcon = "huggingface-llm-falcon-7b-bf16", "*"

# Deploy a new JumpStart endpoint for the model and record its generated name.
# To reuse an endpoint that is already deployed, skip the deployment below and
# set endpoint_name_falcon to that endpoint's name instead
# (e.g. "huggingface-llm-falcon-7b-bf16-110123").
# The version is passed explicitly so the deployed model matches the
# model_version later given to the model runner.
falcon_model = JumpStartModel(model_id=model_id_falcon, model_version=model_version_falcon)
predictor_falcon = falcon_model.deploy()
endpoint_name_falcon = predictor_falcon.endpoint_name

------------!

In [79]:
# The dataset configuration (`config`) created for the Llama 2 evaluation is
# reused here, so both models are scored on the same data.

# Model runner that invokes the Falcon endpoint.
js_model_runner_falcon = JumpStartModelRunner(
    endpoint_name=endpoint_name_falcon,
    model_id=model_id_falcon,
    model_version=model_version_falcon,
    output='[0].generated_text',
    content_template='{"inputs": "$prompt", "parameters":{"max_new_tokens": 100, "top_p": 0.8, "temperature": 0.01, "return_full_text": false, "repetition_penalty": 1.03}}',
    custom_attributes="accept_eula=true"
)

# Build the factual-knowledge evaluator and run it against the Falcon runner.
eval_algo_falcon = FactualKnowledge(FactualKnowledgeConfig(target_output_delimiter="<OR>"))
# Call evaluate() on the evaluator created above; the previous code referenced
# `eval_algo`, which is never defined in this notebook.
eval_output_falcon = eval_algo_falcon.evaluate(
    model=js_model_runner_falcon,
    dataset_config=config,
    prompt_template="$feature",
)

eval_output_falcon

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


2023-10-27 14:47:19,231	INFO read_api.py:374 -- To satisfy the requested parallelism of 8, each read task output will be split into 8 smaller blocks.
2023-10-27 14:47:19,250	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCustomJSON->SplitBlocks(8)] -> ActorPoolMapOperator[MapBatches(process_batch)->Map(ModelRunnerWrapper)]
2023-10-27 14:47:19,251	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-27 14:47:19,252	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
2023-10-27 14:47:19,287	INFO actor_pool_map_operator.py:117 -- MapBatches(process_batch)->Map(ModelRunnerWrapper): Waiting for 3 pool actors to start...


[2m[36m(_MapWorker pid=15440)[0m sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
[2m[36m(_MapWorker pid=15440)[0m sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
[2m[36m(_MapWorker pid=15440)[0m sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
[2m[36m(_MapWorker pid=15440)[0m sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


Running 0:   0%|          | 0/8 [00:00<?, ?it/s]

2023-10-27 14:48:03,684	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(process_batch)] -> AllToAllOperator[Aggregate]
2023-10-27 14:48:03,685	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-27 14:48:03,686	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- Aggregate 1:   0%|          | 0/8 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/8 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/8 [00:00<?, ?it/s]

Running 0:   0%|          | 0/8 [00:00<?, ?it/s]

2023-10-27 14:48:05,565	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(process_batch)] -> LimitOperator[limit=1]
2023-10-27 14:48:05,566	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-10-27 14:48:05,567	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(MapBatches(process_batch) pid=15523)[0m sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml[32m [repeated 5x across cluster][0m
[2m[36m(MapBatches(process_batch) pid=15523)[0m sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml[32m [repeated 5x across cluster][0m


[EvalOutput(eval_name='factual_knowledge', dataset_name='tiny_dataset', dataset_scores=[EvalScore(name='factual_knowledge', value=0.1875)], prompt_template='$feature', category_scores=None, output_path='/tmp/eval_results/')]