In [None]:
# AWS infrastructure parameters.
# Every value containing a <...> placeholder must be replaced with a real
# value before the notebook is run.
aws_region: str = 'us-west-2'

# Execution role ARN (placeholder).
# NOTE(review): not referenced by any other cell visible here - confirm
# where aws_region, write_bucket and this role are consumed.
sagemaker_execution_role: str = '<sagemaker-execution-role-arn>'

# read_bucket is the bucket the later cells upload the prompt template,
# the experiment configuration and the dataset to.
read_bucket: str = 'sagemaker-fmbench-read-<region>-<account>'
write_bucket: str = 'sagemaker-fmbench-write-<region>-<account>'

In [None]:
import os
import textwrap
from fmbench.utils import write_to_s3

prompt_template_dir = 'prompt_template'
prompt_template_filename = 'prompt_template_mistral_with_system_prompt.txt'

# Prompt template with a {system_prompt} slot (the system key used by ORCA)
# in addition to the {input} slot that carries the question.
# textwrap.dedent strips the common leading indentation, so the template can
# be indented here without that indentation ending up in the file.
content: str = textwrap.dedent(
    """
    <s>[INST]
    <<SYS>>
    {system_prompt}
    <</SYS>>
    
    Question: {input}
    
    [/INST]
    Answer:
    """
)

# Write the prompt template locally. The target directory is created first so
# the write does not fail with FileNotFoundError on a fresh checkout, and the
# encoding is pinned so the file does not depend on the platform default.
os.makedirs(prompt_template_dir, exist_ok=True)
with open(f'{prompt_template_dir}/{prompt_template_filename}', 'w', encoding='utf-8') as prompt_template_file:
    prompt_template_file.write(content)

# Upload the same content to the read bucket under the same directory and
# file name; the value returned by write_to_s3 is kept for later reference.
prompt_template_s3_uri = write_to_s3(content, read_bucket, prompt_template_dir, '', prompt_template_filename)

In [None]:
import os
import yaml
from fmbench.utils import write_to_s3

config_dir = 'configs'
baseline_config_filename = 'config-mistral-7b-instruct-g5-orca.yml'
experiment_config_filename = 'config-mistral-7b-instruct-g5-orca-exp.yml'

baseline_config_path = f'{config_dir}/{baseline_config_filename}'
experiment_config_path = f'{config_dir}/{experiment_config_filename}'

# Parse the baseline configuration into plain Python objects.
with open(baseline_config_path, 'r') as baseline_config_file:
    config_as_dict = yaml.safe_load(baseline_config_file)

# update configuration file with parameters
# NOTE(review): no overrides are applied at this point, so the experiment
# configuration is a re-serialised copy of the baseline. Confirm whether the
# infrastructure parameters defined at the top of the notebook are meant to
# be written into config_as_dict here.

# Serialise once and reuse the same YAML text for the local file and for S3.
config_as_yaml = yaml.safe_dump(config_as_dict)

# Write the experiment configuration next to the baseline.
with open(experiment_config_path, 'w') as experiment_config_file:
    experiment_config_file.write(config_as_yaml)

# Upload the configuration to the read bucket and expose the returned
# location through the FM_BENCH_CONFIG environment variable.
config_s3_uri = write_to_s3(
    config_as_yaml,
    read_bucket,
    config_dir,
    '',
    experiment_config_filename,
)
os.environ['FM_BENCH_CONFIG'] = config_s3_uri

In [None]:
# setup FM bench
import json
import pandas as pd
from fmbench.utils import *
from fmbench.globals import *
from datasets import load_dataset
# Read the experiment configuration back from the location returned by the
# upload step.
config = load_config(config_s3_uri)

# Pretty-print the loaded configuration as indented JSON for inspection.
config_json = json.dumps(config, indent=2)
print(config_json)

## Convert HuggingFace dataset to jsonl format

`fmbench` works with datasets in the [`JSON Lines`](https://jsonlines.org/) format. So here we show how to convert a HuggingFace dataset into JSON lines format.

Set `ds_id` to the HuggingFace dataset id, for example [`THUDM/LongBench`](https://huggingface.co/datasets/THUDM/LongBench), [`rajpurkar/squad_v2`](https://huggingface.co/datasets/rajpurkar/squad_v2), [`banking77`](https://huggingface.co/datasets/banking77) or other text datasets. Set `ds_name` to the dataset configuration name and `ds_split` to the split to convert.

In [None]:
# HuggingFace dataset coordinates: dataset id, configuration name and split.
ds_id: str = "Open-Orca/OpenOrca"
ds_name: str = "default"
ds_split: str = "train"

# Number of rows kept when a random subset of the dataframe is taken;
# adjust the value of `ds_N` as appropriate.
ds_N: int = 100

In [None]:
# Fetch the dataset from the HuggingFace hub by id and configuration name.
# No split is requested here; the split is selected by key when the data is
# converted to a dataframe.
dataset = load_dataset(path=ds_id, name=ds_name)

In [None]:
# preview the dataset: as the last expression in the cell, the notebook
# displays its representation
dataset

In [None]:
# Convert the selected split to a pandas DataFrame, which is easy to print
# and to convert to JSON Lines.
# Some datasets name the prompt field `question`; it is renamed to `input`
# so that it matches the {input} key of the prompt template. With pandas'
# default error handling the rename is a no-op when the column is absent.
df = pd.DataFrame(dataset[ds_split]).rename(columns={"question": "input"})

In [None]:
# show the first rows of the dataframe
df.head()

## Subset the dataset

In [None]:
# Report the shape before and after sampling so the effect of the subset
# step is visible in the cell output.
print(f"dataset shape before random subset = {df.shape}")

# Keep ds_N randomly chosen rows. The count is capped at the number of rows
# available because DataFrame.sample raises ValueError when asked for more
# rows than exist (sampling is without replacement).
# NOTE: no random_state is passed, so every run selects a different subset.
df = df.sample(n=min(ds_N, len(df)))
print(f"dataset shape after random subset = {df.shape}")

## Convert to JSON lines format

In [None]:
# Serialise the dataframe as JSON Lines: one JSON record per row, one row
# per line.
jsonl_content = df.to_json(lines=True, orient='records')

# Print only the first 1000 characters as a preview.
jsonl_preview = jsonl_content[:1000]
print(jsonl_preview)

## Upload the dataset to S3

In [None]:
# NOTE(review): with the dataset id used in this notebook, ds_id contains a
# '/', so file_name is "Open-Orca/OpenOrca.jsonl" and is passed together with
# the 'source_data/Open-Orca' directory. Confirm that the resulting object
# key matches the source data path the benchmark configuration expects.
file_name: str = ds_id + ".jsonl"

# Upload the JSON Lines content to the read bucket and keep the returned
# location.
dataset_s3_uri = write_to_s3(
    jsonl_content,
    read_bucket,
    'source_data/Open-Orca',
    '',
    file_name,
)

## Run the experiment

In [None]:
# Run the `main` module in a subprocess via the IPython shell escape.
# FM_BENCH_CONFIG is set through os.environ in the configuration cell, so the
# cells above must have been run first.
# NOTE(review): ${FM_BENCH_CONFIG} is expected to be expanded by the shell
# from that environment variable - confirm, and confirm that `main` is
# importable from the directory the notebook runs in.
!python -m main "--config-file" "${FM_BENCH_CONFIG}"