# Bring your own dataset

---------
*This notebook works best with the conda_python3 kernel on a ml.t3.medium machine*.

### This part of our solution design includes 

- Creating your own `fmbench` compatible dataset from a [HuggingFace dataset](https://huggingface.co/docs/datasets/en/index).

- Creating a prompt payload template compatible with your dataset.

- Upload the dataset and the prompt payload to Amazon S3 from where it can be used by `fmbench`.

In [1]:
# if interactive mode is set to no -> pickup fmbench from Python installation path
# if interactive mode is set to yes -> pickup fmbench from the current path (one level above this notebook)
# if interactive mode is not defined -> pickup fmbench from the current path (one level above this notebook)
# the premise is that if run non-interactively then it can only be run through main.py which will set interactive mode to no
import os
import sys
if os.environ.get("INTERACTIVE_MODE_SET", "yes") == "yes":
    sys.path.append(os.path.dirname(os.getcwd()))

In [2]:
import pandas as pd
from fmbench.utils import *
from fmbench.globals import *
from datasets import load_dataset
config = load_config(CONFIG_FILE)

region_name=us-west-2
role_arn_from_env=None, using current sts caller identity to set arn_string
the sts role is an assumed role, setting arn_string to arn:aws:iam::988564344122:role/fmbench-stack-us-west-2-role
config file current -> configs/llama3.1/8b/config-llama3.1-8b-g5.yml, None
loaded config: {'general': {'name': 'Llama3-1-8b-g5', 'model_name': 'Llama3-1-8b'}, 'aws': {'region': 'us-west-2', 'sagemaker_execution_role': 'arn:aws:iam::988564344122:role/fmbench-stack-us-west-2-role', 'bucket': 'sagemaker-fmbench-write-us-west-2-988564344122'}, 'dir_paths': {'data_prefix': 'data', 'prompts_prefix': 'prompts', 'all_prompts_file': 'all_prompts.csv', 'metrics_dir': 'metrics', 'models_dir': 'models', 'metadata_dir': 'metadata'}, 's3_read_data': {'read_bucket': 'sagemaker-fmbench-read-us-west-2-988564344122', 'scripts_prefix': 'scripts', 'script_files': ['hf_token.txt'], 'configs_prefix': 'configs', 'config_files': ['pricing.yml'], 'source_data_prefix': 'source_data', 'source_data_files

## Convert HuggingFace dataset to jsonl format

`fmbench` works with datasets in the [`JSON Lines`](https://jsonlines.org/) format. So here we show how to convert a HuggingFace dataset into JSON lines format.

Set the `ds_name` to the HuggingFace dataset id, for example [`THUDM/LongBench`](https://huggingface.co/datasets/THUDM/LongBench), [`rajpurkar/squad_v2`](https://huggingface.co/datasets/rajpurkar/squad_v2), [`banking77`](https://huggingface.co/datasets/banking77) or other text datasets.

In [15]:
ds_id: str = "rajpurkar/squad"
ds_name: str = "plain_text"
ds_split: str = "train"
# Take a random subset of the dataframe, adjust the value of `N` below as appropriate.
# size of random subset of the data
ds_N: int = 100

# another example
# ds_id: str = "THUDM/LongBench"
# ds_name: str = "2wikimqa"
# ds_split: str = "test"
# Take a random subset of the dataframe, adjust the value of `N` below as appropriate.
# size of random subset of the data
# ds_N: int = 200

# another example
# ds_id: str = "banking77"
# ds_name: str = "default"
# ds_split: str = "train"
# Take a random subset of the dataframe, adjust the value of `N` below as appropriate.
# size of random subset of the data
# ds_N: int = 10000

ds_id: str = "Open-Orca/OpenOrca"
ds_name: str = "default"
ds_split: str = "train"
# Take a random subset of the dataframe, adjust the value of `N` below as appropriate.
# size of random subset of the data
ds_N: int = 100

In [16]:
# Load the dataset from huggingface
dataset = load_dataset(ds_id, name=ds_name)

### For image datasets
---

In this section of the notebook, we will use an image dataset, convert the images into `base64` and then send the relevant data to s3/locally that will be used during the benchmarking test.

In [19]:
from datasets import load_dataset, Dataset
import itertools

ds_id: str = "HuggingFaceM4/WebSight"
ds_name: str = "v0.2"
ds_split: str = "train"
ds_N: int = 100

# Load the dataset in streaming mode so you don't have to load the entire dataset
dataset = load_dataset(ds_id, name=ds_name, split=ds_split, streaming=True)

# Take only the first ds_N examples
dataset_iter = itertools.islice(dataset, ds_N)

# Convert to a list and then to a regular dataset
dataset_list = list(dataset_iter)
dataset = Dataset.from_list(dataset_list)

logger.info(f"Loaded {len(dataset)} examples")

Resolving data files:   0%|          | 0/738 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/738 [00:00<?, ?it/s]

In [21]:
dataset

Dataset({
    features: ['image', 'text', 'llm_generated_idea'],
    num_rows: 100
})

In [35]:
# convert the dataset to a dataframe, for print it out and easy conversion to jsonl
df = pd.DataFrame(dataset)

In [36]:
df.image

0     <PIL.JpegImagePlugin.JpegImageFile image mode=...
1     <PIL.JpegImagePlugin.JpegImageFile image mode=...
2     <PIL.JpegImagePlugin.JpegImageFile image mode=...
3     <PIL.JpegImagePlugin.JpegImageFile image mode=...
4     <PIL.JpegImagePlugin.JpegImageFile image mode=...
                            ...                        
95    <PIL.JpegImagePlugin.JpegImageFile image mode=...
96    <PIL.JpegImagePlugin.JpegImageFile image mode=...
97    <PIL.JpegImagePlugin.JpegImageFile image mode=...
98    <PIL.JpegImagePlugin.JpegImageFile image mode=...
99    <PIL.JpegImagePlugin.JpegImageFile image mode=...
Name: image, Length: 100, dtype: object

In [45]:
def image_to_base64(img):
    buffered = io.BytesIO()
    img.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

# Create the new base64 column
df['image_base64'] = df['image'].apply(image_to_base64)

# Now df has a new column 'image_base64' with the base64 encoded images
df.head()

Unnamed: 0,image,text,llm_generated_idea,image_base64
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"<html>\n<link href=""https://cdn.jsdelivr.net/n...",Fashion Brand: A visually stunning layout with...,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"<html>\n<link href=""https://cdn.jsdelivr.net/n...",Restaurant Chain: A design with a mouth-wateri...,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"<html>\n<link href=""https://cdn.jsdelivr.net/n...","Consulting Firm: A clean, professional design ...",/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"<html>\n<link href=""https://cdn.jsdelivr.net/n...",Real Estate Agency: A user-friendly design wit...,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"<html>\n<link href=""https://cdn.jsdelivr.net/n...","Education Platform: A design with a wide, hero...",/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [46]:
# some datasets contain a field called column, we would like to call it
# input to match it to the prompt template
df.rename(columns={"image_base64": "input"}, inplace=True)
df.head()

Unnamed: 0,image,text,llm_generated_idea,input
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"<html>\n<link href=""https://cdn.jsdelivr.net/n...",Fashion Brand: A visually stunning layout with...,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"<html>\n<link href=""https://cdn.jsdelivr.net/n...",Restaurant Chain: A design with a mouth-wateri...,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"<html>\n<link href=""https://cdn.jsdelivr.net/n...","Consulting Firm: A clean, professional design ...",/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"<html>\n<link href=""https://cdn.jsdelivr.net/n...",Real Estate Agency: A user-friendly design wit...,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"<html>\n<link href=""https://cdn.jsdelivr.net/n...","Education Platform: A design with a wide, hero...",/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


### Subset the data

In [47]:
print(f"dataset shape before random subset = {df.shape}")
df = df.sample(n=ds_N)
print(f"dataset shape before random subset = {df.shape}")

dataset shape before random subset = (100, 4)
dataset shape before random subset = (100, 4)


Convert to json lines format

In [48]:
jsonl_content = df.to_json(orient='records', lines=True)
print(jsonl_content[:1000])

{"image":{"app":{"APP0":"JFIF\u0000\u0001\u0001\u0000\u0000\u0001\u0000\u0001\u0000\u0000"},"applist":[["APP0","JFIF\u0000\u0001\u0001\u0000\u0000\u0001\u0000\u0001\u0000\u0000"]],"bits":8,"custom_mimetype":null,"decoderconfig":[],"decodermaxblock":65536,"encoderconfig":[-1,false,0,false,false,0,0,0,-1,0,0,null,null,"",""],"encoderinfo":{},"filename":"","format":"JPEG","format_description":"JPEG (ISO 10918)","fp":null,"has_transparency_data":false,"height":1440,"huffman_ac":{},"huffman_dc":{},"icclist":[],"im":{"bands":3,"id":94323468030944,"mode":"RGB","ptr":{},"size":[2560,1440],"unsafe_ptrs":[["image8",0],["image32",94331769226096],["image",94331769226096]]},"info":{"jfif":257,"jfif_version":[1,1],"jfif_unit":0,"jfif_density":[1,1]},"layer":[[1,2,2,0],[2,1,1,1],[3,1,1,1]],"layers":3,"map":null,"mode":"RGB","palette":null,"pyaccess":null,"quantization":{"0":[8,6,5,8,12,20,26,31,6,6,7,10,13,29,30,28,7,7,8,12,20,29,35,28,7,9,11,15,26,44,40,31,9,11,19,28,34,55,52,39,12,18,28,32,41,52,57

## Upload the dataset to s3

In [49]:
bucket: str = config['s3_read_data']['read_bucket']
prefix: str = config['s3_read_data']['source_data_prefix']
file_name: str = f"{ds_id}.jsonl"
write_to_s3(jsonl_content, bucket, prefix, "", file_name)

's3://sagemaker-fmbench-read-us-west-2-988564344122/source_data/HuggingFaceM4/WebSight.jsonl'

## Create a prompt template and upload it to S3
The prompt template is specific to the model under test and also the dataset being used. The variables used in the template, such as `context` and `input` must exist in the dataset being used so that this prompt template can be converted into an actual prompt.

In [50]:
# dictionary containing the prompt template, it has a key by the name
# of the dataset id which forces you to explicitly add your dataset here
# otherwise no new prompt template will be uploaded and it wont accidently
# end up overwriting an existing prompt template
prompt_template = {}

In [None]:
# LongBench
prompt_template['THUDM-LongBench-llama2-mistral'] = """<s>[INST] <<SYS>>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context in the section demarcated by "```" to answer the question. If you don't know the answer just say that you don't know. Use three sentences maximum and keep the answer concise.
<</SYS>>

```
{context}
```

Question: {input}

[/INST]
Answer:
"""

In [None]:
# Open Orca
prompt_template['Open-Orca-OpenOrca-llama2-mistral'] = """<s>[INST] <<SYS>>

{system_prompt}

<</SYS>>

Context and task: {input}

[/INST]
"""

In [None]:
prompt_template['Open-Orca-OpenOrca-llama3'] = """<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{system_prompt}

Context and task: {input} 

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

In [None]:
bucket: str = config['s3_read_data']['read_bucket']
prefix: str = config['s3_read_data']['prompt_template_dir']
for k in prompt_template.keys():
    file_name: str = f"prompt_template_{k}.txt"
    print(f"writing {file_name} to s3://{bucket}/{prefix}/{file_name}")
    write_to_s3(prompt_template[k], bucket, prefix, "", file_name)

## Scratchpad

### Utility function for converting a line from container log to JSON format

The following is a line from CW log from a model container that provides all the information about the model that is not available anywhere else (not in Model or EndpointConfig or Endpoint description). This information is often necessary to know the low level settings about the model which may have been set while compiling the model.

In [3]:
line="""model_id_or_path='/tmp/.djl.ai/download/ae03dd100c208acd82b5dbed563c971de864c408' rolling_batch=<RollingBatchEnum.auto: 'auto'> tensor_parallel_degree=8 trust_remote_code=False enable_streaming=<StreamingEnum.false: 'false'> batch_size=4 max_rolling_batch_size=4 dtype=<Dtype.f16: 'fp16'> revision=None output_formatter=None waiting_steps=None is_mpi=False draft_model_id=None spec_length=0 neuron_optimize_level=None enable_mixed_precision_accumulation=False enable_saturate_infinity=False n_positions=4096 unroll=None load_in_8bit=False low_cpu_mem_usage=False load_split_model=True context_length_estimate=None amp='f16' quantize=None compiled_graph_path=None task=None save_mp_checkpoint_path=None group_query_attention=None model_loader=<TnXModelLoaders.tnx: 'tnx'> rolling_batch_strategy=<TnXGenerationStrategy.continuous_batching: 'continuous_batching'> fuse_qkv=False on_device_embedding=False attention_layout=None collectives_layout=None cache_layout=None partition_schema=None all_reduce_dtype=None cast_logits_dtype=None"""
import re
import json
pattern = r' (?=[^\'"])'


# Split the string using the pattern
result = re.split(pattern, line)
print("\n".join([r for r in result]))
params= {}
for kv in result:
    #print(kv.split('='))
    k,v = kv.split('=')
    params[k] = v
print(json.dumps(params, indent=2, default=str))

model_id_or_path='/tmp/.djl.ai/download/ae03dd100c208acd82b5dbed563c971de864c408'
rolling_batch=<RollingBatchEnum.auto: 'auto'>
tensor_parallel_degree=8
trust_remote_code=False
enable_streaming=<StreamingEnum.false: 'false'>
batch_size=4
max_rolling_batch_size=4
dtype=<Dtype.f16: 'fp16'>
revision=None
output_formatter=None
waiting_steps=None
is_mpi=False
draft_model_id=None
spec_length=0
neuron_optimize_level=None
enable_mixed_precision_accumulation=False
enable_saturate_infinity=False
n_positions=4096
unroll=None
load_in_8bit=False
low_cpu_mem_usage=False
load_split_model=True
context_length_estimate=None
amp='f16'
quantize=None
compiled_graph_path=None
task=None
save_mp_checkpoint_path=None
group_query_attention=None
model_loader=<TnXModelLoaders.tnx: 'tnx'>
rolling_batch_strategy=<TnXGenerationStrategy.continuous_batching: 'continuous_batching'>
fuse_qkv=False
on_device_embedding=False
attention_layout=None
collectives_layout=None
cache_layout=None
partition_schema=None
all_reduce_d