# Generate Data: Gather data, create prompts/payloads of different sizes
---------
*This notebook works best with the conda_python3 kernel on a ml.t3.medium machine*.

### This part of our solution design includes 

- running and downloading our specific dataset

- generating prompts as payloads of different sizes that we will send to our different model endpoints with different combinations of concurrency levels that we will later use to run inference and generate benchmarking metrics and visualizations.

#### This file will generate all data on wikiqa (english version) with prompt sizes 300 - 4000 token lengths in different payload sizes to send to the model endpoint during the inference pipeline. You will also be able to generate the normal wikiqa dataset from the actual 'long bench dataset'. This notebook then focuses on 3 main deliverables:

1. Loading the dataset that is stored within the dataset in the data directory.


2. Generating payloads: This notebook also converts the loaded datasets into payloads based on the input question and records teh context length of the prompt to send as a part of the payload during running inferences on the deployed endpoints.

    - All of the prompts are saved in this data directory in a file named all_prompts.csv.
    

3. Constructing different sized payloads

In [1]:
## auto reload all of the changes made in the config/globals.py file 
%load_ext autoreload
%autoreload 2
!touch globals.py

#### Import all of the necessary libraries below to run this notebook

In [2]:
import glob
import json
import io
import copy
import logging
import itertools
import pandas as pd
from globals import *
from typing import Dict, List
from utils import process_item, load_config, count_tokens, write_to_s3, read_from_s3

CONFIG_FILE=configs/config-mistral-7b-tgi-g5.yml


#### Pygmentize globals.py to view and use any of the globally initialized variables 

In [3]:
# global constants
!pygmentize globals.py

[34mimport[39;49;00m [04m[36mos[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36myaml[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36menum[39;49;00m [34mimport[39;49;00m Enum[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mpathlib[39;49;00m [34mimport[39;49;00m Path[37m[39;49;00m
[34mimport[39;49;00m [04m[36mboto3[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mdatetime[39;49;00m [34mimport[39;49;00m datetime[37m[39;49;00m
[37m[39;49;00m
CONFIG_FILEPATH_FILE: [36mstr[39;49;00m = [33m"[39;49;00m[33mconfig_filepath.txt[39;49;00m[33m"[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# S3 client initialization[39;49;00m[37m[39;49;00m
s3_client = boto3.client([33m'[39;49;00m[33ms3[39;49;00m[33m'[39;49;00m)[37m[39;49;00m
[37m[39;49;00m
[37m## Configuring the role ARN -- extract the role name[39;49;00m[37m[39;49;00m
arn_string = boto3.client([33m'[39;49;00m[33msts[39;49;00m[33m'[39;49;00m).get_caller_identity()

#### Set up a logger to log all messages while the code runs

In [4]:
logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

In [5]:
## config.yml file contains information that is used across this benchmarking environment, 
## such as information about the aws account, prompts, payloads to be used for invocations
config = load_config(CONFIG_FILE)
logger.info(json.dumps(config, indent=2))

[2024-01-30 20:43:17,950] p62972 {2001288519.py:4} INFO - {
  "general": {
    "name": "mistral-7b-tgi-g5-v1",
    "model_name": "mistral7b"
  },
  "aws": {
    "region": "us-east-1",
    "sagemaker_execution_role": "arn:aws:iam::015469603702:role/service-role/AmazonSageMaker-ExecutionRole-20220504T122644",
    "bucket": "fmbttest"
  },
  "dir_paths": {
    "data_prefix": "data",
    "prompts_prefix": "prompts",
    "all_prompts_file": "all_prompts.csv",
    "metrics_dir": "metrics_{datetime}",
    "models_dir": "models_{datetime}",
    "metadata_dir": "metadata"
  },
  "s3_read_data": {
    "read_bucket": "fmbt-read",
    "scripts_prefix": "scripts",
    "source_data_prefix": "source_data",
    "tokenizer_prefix": "tokenizer",
    "prompt_template_dir": "prompt_template",
    "prompt_template_file": "prompt_template.txt"
  },
  "run_steps": {
    "0_setup.ipynb": false,
    "1_deploy_model.ipynb": false,
    "2_generate_data.ipynb": false,
    "3_run_inference.ipynb": false,
    "4_mo

#### Define the file path for the prompt template

In [16]:
s3_file_path = os.path.join(config['s3_read_data']['prompt_template_dir'], config['s3_read_data']['prompt_template_file'])

# Check if the prompt_template directory and file exist
if os.path.exists('prompt_template/prompt_template.txt'):
    with open('prompt_template/prompt_template.txt', 'r') as file:
        prompt_template = file.read()
        logger.info(f"prompt template extracted locally")
else:
    # Read the prompt template from S3
    prompt_template = read_from_s3(config['s3_read_data']['read_bucket'], s3_file_path)
    logger.info(f"prompt template extracted from s3")

if prompt_template:
    prompt_template = prompt_template.strip()
    logger.info(f"prompt template from s3://{config['s3_read_data']['read_bucket']}/{config['s3_read_data']['prompt_template_dir']}/{config['s3_read_data']['prompt_template_file']} ->\n{prompt_template}")

    # Calculate the number of tokens in the prompt template
    empty_prompt_len_in_tokens = count_tokens(prompt_template.format(context="", question=""))

    # Log the number of tokens
    logger.info(f"prompt template length={empty_prompt_len_in_tokens} tokens")
else:
    logger.error("Failed to read the prompt template from S3")

[2024-01-30 20:53:34,364] p62972 {2037602823.py:7} INFO - prompt template extracted locally
[2024-01-30 20:53:34,365] p62972 {2037602823.py:15} INFO - prompt template from s3://fmbt-read/prompt_template/prompt_template.txt ->
<s>[INST] <<SYS>>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context in the section demarcated by "```" to answer the question. If you don't know the answer just say that you don't know. Use three sentences maximum and keep the answer concise.
<</SYS>>

```
{context}
```

Question: {question}

[/INST]
Answer:
[2024-01-30 20:53:34,368] p62972 {2037602823.py:21} INFO - prompt template length=97 tokens


In [7]:
def list_files():
    response = s3_client.list_objects_v2(Bucket=config['s3_read_data']['read_bucket'], Prefix=config['s3_read_data']['source_data_prefix'])
    return [obj['Key'] for obj in response['Contents']]

# List all files in the bucket and prefix
s3_files = list_files()
logger.info(f"s3 paths of the data set -> {s3_files}")

# Log the files you're going to read
logger.info(f"dataset files = {s3_files}")

# Read and concatenate DataFrames
df = pd.concat([pd.read_json(io.BytesIO(s3_client.get_object(Bucket=config['s3_read_data']['read_bucket'], Key=file_key)['Body'].read()), lines=True) for file_key in s3_files])

# Log the source of the dataset and its shape
logger.info(f"dataset read from {s3_files}\nhas shape {df.shape}")

[2024-01-30 20:43:20,515] p62972 {2174902878.py:7} INFO - s3 paths of the data set -> ['source_data/', 'source_data/2wikimqa.jsonl', 'source_data/2wikimqa_e.jsonl', 'source_data/hotpotqa.jsonl', 'source_data/hotpotqa_e.jsonl', 'source_data/narrativeqa.jsonl', 'source_data/triviaqa.jsonl', 'source_data/triviaqa_e.jsonl']
[2024-01-30 20:43:20,515] p62972 {2174902878.py:10} INFO - dataset files = ['source_data/', 'source_data/2wikimqa.jsonl', 'source_data/2wikimqa_e.jsonl', 'source_data/hotpotqa.jsonl', 'source_data/hotpotqa_e.jsonl', 'source_data/narrativeqa.jsonl', 'source_data/triviaqa.jsonl', 'source_data/triviaqa_e.jsonl']
[2024-01-30 20:43:33,009] p62972 {2174902878.py:16} INFO - dataset read from ['source_data/', 'source_data/2wikimqa.jsonl', 'source_data/2wikimqa_e.jsonl', 'source_data/hotpotqa.jsonl', 'source_data/hotpotqa_e.jsonl', 'source_data/narrativeqa.jsonl', 'source_data/triviaqa.jsonl', 'source_data/triviaqa_e.jsonl']
has shape (1700, 8)


#### View a portion of the df to view inputs, contexts, and more information on the data

In [8]:
df.head()

Unnamed: 0,input,context,answers,length,dataset,language,all_classes,_id
0,Where was the wife of Francis I Rákóczi born?,Passage 1:\nWaldrada of Lotharingia\nWaldrada ...,[Ozalj],4696,2wikimqa,en,,41ac2a4beb0af8f58d01863a62b90692f7c7d74b5e3a58d9
1,Who is Sobe (Sister Of Saint Anne)'s grandchild?,Passage 1:\nJim Ramel Kjellgren\nJim Love Rame...,[John the Baptist],4776,2wikimqa,en,,3924e4ac5039ce3fadda49604bfcb0f5238af81774616e53
2,Where does the director of film Man At Bath wo...,Passage 1:\nJason Moore (director)\nJason Moor...,[Cahiers du cinéma],4274,2wikimqa,en,,2c952e3e1ca394df975103b3135b3c38e0ee16e25d860258
3,Do both Beauty And The Bad Man and Wild Child ...,Passage 1:\nBetty Hall\nBeatrice Perin Barker ...,[no],8125,2wikimqa,en,,aec83da1f2faf6ec8badfd53d632f525c9ef2090d99d1c6c
4,"What is the date of birth of William Paulet, 3...","Passage 1:\nHenry, Lord Paulet\nLord Henry Pau...",[1510],4621,2wikimqa,en,,4b28d517ce1c1e3cfec9282ca7b212c1cb87c254781d7c86


#### Display basic statistics on the existing dataset: including count, mean, std, min, etc.

In [9]:
logger.info(f"distribution of the length field in the dataset is as follows ->\n{df.length.describe()}")

[2024-01-30 20:43:33,282] p62972 {903950042.py:1} INFO - distribution of the length field in the dataset is as follows ->
count     1700.000000
mean      8221.461176
std       5876.876131
min        111.000000
25%       3892.500000
50%       7131.500000
75%      10760.000000
max      36418.000000
Name: length, dtype: float64


### Concert the dataset elements into prompts as payloads for inference purposes

Now, we will focus on converting the existing data within our datasets, and extract the information to convert it into prompts to be able to send to our deployed model endpoints during the process of testing and benchmarking for results and various metrics

In [10]:
%%time
df['prompt'] = df.apply(lambda row: process_item(row, prompt_template), axis=1)
df['prompt_len'] = df.prompt.map(lambda x: x['prompt_len'])


CPU times: user 6min 26s, sys: 1.41 s, total: 6min 28s
Wall time: 6min 34s


In [11]:
# Convert DataFrame to a CSV format string
csv_buffer = io.StringIO()
df.to_csv(csv_buffer, index=False)
csv_data = csv_buffer.getvalue()
all_prompts_file = config['dir_paths']['all_prompts_file']

# Write to S3 using the write_to_s3 function
write_to_s3(csv_data, config['aws']['bucket'], DATA_DIR, config['dir_paths']['prompts_prefix'], all_prompts_file)

# Log where the prompts are saved
logger.info(f"all prompts dataframe of shape {df.shape} saved to s3://{config['aws']['bucket']}/{DATA_DIR}/{os.path.join(config['dir_paths']['prompts_prefix'], all_prompts_file)}")

[2024-01-30 20:50:42,556] p62972 {1138732359.py:11} INFO - all prompts dataframe of shape (1700, 10) saved to s3://fmbttest/mistral-7b-tgi-g5-v1-madhurusertest/data/prompts/all_prompts.csv


In [12]:
## View some of the prompts 
df.head()

Unnamed: 0,input,context,answers,length,dataset,language,all_classes,_id,prompt,prompt_len
0,Where was the wife of Francis I Rákóczi born?,Passage 1:\nWaldrada of Lotharingia\nWaldrada ...,[Ozalj],4696,2wikimqa,en,,41ac2a4beb0af8f58d01863a62b90692f7c7d74b5e3a58d9,{'question': 'Where was the wife of Francis I ...,8199
1,Who is Sobe (Sister Of Saint Anne)'s grandchild?,Passage 1:\nJim Ramel Kjellgren\nJim Love Rame...,[John the Baptist],4776,2wikimqa,en,,3924e4ac5039ce3fadda49604bfcb0f5238af81774616e53,{'question': 'Who is Sobe (Sister Of Saint Ann...,8068
2,Where does the director of film Man At Bath wo...,Passage 1:\nJason Moore (director)\nJason Moor...,[Cahiers du cinéma],4274,2wikimqa,en,,2c952e3e1ca394df975103b3135b3c38e0ee16e25d860258,{'question': 'Where does the director of film ...,7801
3,Do both Beauty And The Bad Man and Wild Child ...,Passage 1:\nBetty Hall\nBeatrice Perin Barker ...,[no],8125,2wikimqa,en,,aec83da1f2faf6ec8badfd53d632f525c9ef2090d99d1c6c,{'question': 'Do both Beauty And The Bad Man a...,13248
4,"What is the date of birth of William Paulet, 3...","Passage 1:\nHenry, Lord Paulet\nLord Henry Pau...",[1510],4621,2wikimqa,en,,4b28d517ce1c1e3cfec9282ca7b212c1cb87c254781d7c86,{'question': 'What is the date of birth of Wil...,8727


### Convert Prompts into Payloads for inference purposes
------
Now we will prepare data for model inference. It involves converting prompts, created and stored in a specific format, into payloads for inference. We will utilize the prompt file for our model and incorporate the prompt into a payload using that. 

These payloads are tailored to the needs of deployed model endpoints. The conversion considers prompt sizes and specific configurations to further make our benchmarking more detailed and comprehensive. 

The goal is to have a set of well-formatted and parameterized payload requests of various sizes ready to be sent to the model endpoints for inference, with the responses to be used for further analysis

In [13]:
# Function to construct a single request payload based on row prompt data and configuration
def construct_request_payload(row, config: Dict) -> Dict:
    
    # Deep copy inference parameters from the config.yml file - feel free to change this based on the model type you are using
    parameters = copy.deepcopy(config['inference_parameters'])
    if parameters['truncate'] == TRUNCATE_POLICY.AT_PROMPT_TOKEN_LENGTH:
        parameters['truncate'] = row['prompt_len']
        
    # Return the constructed payload
    return dict(inputs=row['prompt']['prompt'], parameters=parameters)

# Function to create a dataset payload files from the given dataset file we have
def create_dataset_payload_file(df: pd.DataFrame, dataset_info: Dict, config: Dict) -> str:
    
    # First, log the dataset existing information
    logger.info(f"going to create a payload file as dataset_info={json.dumps(dataset_info, indent=2)}")
    
    # Filter the DataFrame based on prompt length and language given below for constructing payloads of various sizes
    df['prompt_len_in_range'] = df.prompt.map(lambda x: x['prompt_len'] >= dataset_info['min_length_in_tokens'] and \
                                                        x['prompt_len'] <= dataset_info['max_length_in_tokens'])
    
    # select prompts between pre-configured threshold lengths and are in the selected language
    df_filtered = df[(df.language == dataset_info['language']) & (df.prompt_len_in_range)]
    logger.info(f"after filtering for {json.dumps(dataset_info, indent=2)}, shape of dataframe is {df_filtered.shape}")
    # df_filtered.head()

    # Here, we construct request payloads for each row in the filtered DataFrame
    df_filtered['request'] = df_filtered.apply(lambda r: construct_request_payload(r, config), axis=1)
    logger.info(f"payload request entry looks like this -> {json.dumps(df_filtered['request'].iloc[0], indent=2)}")
    
     # Convert the 'request' column of the filtered DataFrame to a JSON Lines string
    json_lines_str = df_filtered['request'].to_json(orient='records', lines=True)
    
    
    lang = dataset_info['language']
    min_len = dataset_info['min_length_in_tokens']
    max_len = dataset_info['max_length_in_tokens']
    file_name = dataset_info['payload_file'].format(lang=lang, min=min_len, max=max_len)

    prompts_path = os.path.join(DATA_DIR, config['dir_paths']['prompts_prefix'])

    ## defining the s3_path these prompts will go to
    s3_file_path = os.path.join(prompts_path, file_name)

    # Write the JSON Lines string to S3
    # get the bucket name, config vars from config file
    write_to_s3(json_lines_str, config['aws']['bucket'], DATA_DIR, config['dir_paths']['prompts_prefix'], file_name)

    logger.info(f"dataset of different payload file structures saved to s3://{config['aws']['bucket']}/{s3_file_path}")
    return f"s3://{config['aws']['bucket']}/{s3_file_path}"

In [14]:
items = ((df, d, config) for d in config['datasets'])

# This results in the creation of payload files for each dataset
paths: List = list(itertools.starmap(create_dataset_payload_file, items))

[2024-01-30 20:50:42,688] p62972 {1154133551.py:16} INFO - going to create a payload file as dataset_info={
  "language": "en",
  "min_length_in_tokens": 1,
  "max_length_in_tokens": 500,
  "payload_file": "payload_{lang}_{min}-{max}.jsonl"
}
[2024-01-30 20:50:42,695] p62972 {1154133551.py:24} INFO - after filtering for {
  "language": "en",
  "min_length_in_tokens": 1,
  "max_length_in_tokens": 500,
  "payload_file": "payload_{lang}_{min}-{max}.jsonl"
}, shape of dataframe is (1, 11)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['request'] = df_filtered.apply(lambda r: construct_request_payload(r, config), axis=1)
[2024-01-30 20:50:42,698] p62972 {1154133551.py:29} INFO - payload request entry looks like this -> {
  "inputs": "<s>[INST] <<SYS>>\nYou are an a