# Generate Data: Gather data, create prompts/payloads of different sizes
---------
*This notebook works best with the conda_python3 kernel on a ml.t3.medium machine*.

### This part of our solution design includes 

- running and downloading our specific dataset

- generating prompts as payloads of different sizes. These payloads are later sent to our model endpoints at different concurrency levels to run inference and to generate benchmarking metrics and visualizations.

#### This file will generate all data on wikiqa (english version) with prompt sizes 300 - 4000 token lengths in different payload sizes to send to the model endpoint during the inference pipeline. You will also be able to generate the normal wikiqa dataset from the actual 'long bench dataset'. This notebook then focuses on 3 main deliverables:

1. Loading the dataset: the dataset files stored in the data directory are read into a single dataframe.


2. Generating payloads: This notebook converts the loaded dataset into prompts based on the input question and records the length (in tokens) of each prompt. The prompts are later sent as part of the payload when running inference on the deployed endpoints.

    - All of the prompts are saved in this data directory in a file named all_prompts.csv.
    

3. Constructing different sized payloads

#### Import all of the necessary libraries below to run this notebook

In [None]:
# Decide where the fmbench package is picked up from, based on INTERACTIVE_MODE_SET:
#   "no"      -> fmbench from the Python installation path (sys.path is left untouched)
#   "yes"     -> fmbench from the current path (one level above this notebook)
#   undefined -> treated like "yes"
# The premise is that a non-interactive run can only happen through main.py,
# which sets interactive mode to "no".
import os
import sys

running_interactively = os.environ.get("INTERACTIVE_MODE_SET", "yes") == "yes"
if running_interactively:
    notebook_parent_dir = os.path.dirname(os.getcwd())
    sys.path.append(notebook_parent_dir)

In [None]:
import io
import copy
import json
import logging
import itertools
import pandas as pd
from fmbench.utils import *
from fmbench.globals import *
from typing import Dict, List, Optional
import importlib.resources as pkg_resources

#### Pygmentize globals.py to view and use any of the globally initialized variables 

#### Set up a logger to log all messages while the code runs

In [None]:
# Configure logging for the notebook (timestamp, process id, source location, level)
# and create the logger used by the cells below.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

In [None]:
## config.yml holds the information shared across this benchmarking environment,
## such as information about the aws account, prompts, payloads to be used for invocations
config = load_main_config(CONFIG_FILE)

## log the loaded configuration as indented JSON so the run is self-describing
config_dump = json.dumps(config, indent=2)
logger.info(config_dump)

#### Define the file path for the prompt template

In [None]:
# Shorthands for the two config sections this cell reads
s3_read_cfg = config['s3_read_data']
datasets_cfg = config['datasets']

# S3 key of the prompt template: <prompt_template_dir>/<prompt_template_file>
s3_file_path = "/".join((s3_read_cfg['prompt_template_dir'],
                         s3_read_cfg['prompt_template_file']))

## try S3 first; a None result means the locally packaged template is used instead
prompt_template_from_s3: str = read_from_s3(s3_read_cfg['read_bucket'], s3_file_path)

# Directory inside the fmbench package that holds the default prompt templates
prompt_template_dir = Path(pkg_resources.files(FMBENCH_PACKAGE_NAME), s3_read_cfg['prompt_template_dir'])
logger.info(f"Using fmbench.{s3_read_cfg['prompt_template_dir']} directory: {prompt_template_dir}")

if prompt_template_from_s3 is not None:
    prompt_template = prompt_template_from_s3
    logger.info(f"Using the prompt template from S3 --> {prompt_template}")
else:
    prompt_template_fpath: str = os.path.join(prompt_template_dir, s3_read_cfg['prompt_template_file'])
    prompt_template = Path(prompt_template_fpath).read_text()
    logger.info(f"Using the default local prompt template --> {prompt_template}")
prompt_template = prompt_template.strip()

# Keys that get substituted into the prompt template
prompt_template_keys = datasets_cfg['prompt_template_keys']
# Optional dataset columns holding the ground truth and the question (None when not configured)
ground_truth_col_key: Optional[str] = datasets_cfg.get('ground_truth_col_key', None)
question_col_key: Optional[str] = datasets_cfg.get('question_col_key', None)

# Calculate the number of tokens in the prompt template itself by rendering it
# with every key set to an empty string
args = dict.fromkeys(prompt_template_keys, "")
empty_prompt_template = prompt_template.format(**args)
logger.info(f"empty prompt template = \"{empty_prompt_template}\"")
empty_prompt_len_in_tokens = count_tokens(empty_prompt_template)

# Log the number of tokens
logger.info(f"prompt template length={empty_prompt_len_in_tokens} tokens")

In [None]:
def list_files():
    """List the keys of all objects under the configured source data prefix.

    The bucket (``read_bucket``) and the prefix (``source_data_prefix``) are
    read from the ``s3_read_data`` section of the global ``config``.

    Returns:
        A list with the ``Key`` of every listed object; an empty list when no
        object matches the prefix.
    """
    # NOTE(review): assumes s3_client is a boto3 S3 client (get_paginator available) - confirm
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=config['s3_read_data']['read_bucket'],
                               Prefix=config['s3_read_data']['source_data_prefix'])
    # A single list_objects_v2 call returns at most 1000 keys, hence the paginator;
    # 'Contents' is missing from a page when nothing matched, hence the .get default.
    return [obj['Key'] for page in pages for obj in page.get('Contents', [])]

# List all .jsonl files under the configured source data prefix
s3_files = list_s3_files(config['s3_read_data']['read_bucket'], config['s3_read_data']['source_data_prefix'], '.jsonl')
logger.info(f"s3 paths of the data set -> {s3_files}")

# Select the files to read: everything that was listed when reading locally,
# otherwise only the files named in source_data_files
if is_read_local():
    jsonl_files = s3_files
else:
    # Strip only the leading "<source_data_prefix>/"; str.replace would also remove
    # the same text if it happened to occur later in the key
    source_prefix = config['s3_read_data']['source_data_prefix'] + "/"
    wanted_files = config['s3_read_data']['source_data_files']
    jsonl_files = [file_key for file_key in s3_files if file_key.removeprefix(source_prefix) in wanted_files]
logger.info(f"jsonl_files={jsonl_files}")

# Fail with an actionable message instead of pandas' generic "No objects to concatenate"
if not jsonl_files:
    raise ValueError(f"no dataset files selected from {s3_files}, check source_data_prefix "
                     f"and source_data_files in the s3_read_data section of the config file")

# Read and concatenate only the selected .jsonl files; ignore_index renumbers the rows
# so that row labels stay unique across files
df = pd.concat([pd.read_json(io.BytesIO(get_s3_object(config['s3_read_data']['read_bucket'], file_key, decode=False)), lines=True)
                for file_key in jsonl_files],
               ignore_index=True)

# Log the files that were actually read and the shape of the resulting dataset
logger.info(f"dataset read from {jsonl_files}\nhas shape {df.shape}")

#### View a portion of the df to view inputs, contexts, and more information on the data

In [None]:
# Show the first rows of the loaded dataset (rendered as the cell output)
df.head()

#### Display basic statistics on the existing dataset: including count, mean, std, min, etc.

In [None]:
# Summary statistics (count, mean, std, min, ...) as computed by DataFrame.describe()
dataset_stats = df.describe()
logger.info(f"distribution of the length field in the dataset is as follows ->\n{dataset_stats}")

### Convert the dataset elements into prompts as payloads for inference purposes

Now we convert the data in our dataset into prompts. These prompts are later sent to our deployed model endpoints during testing and benchmarking to collect results and various metrics.

In [None]:
%%time
# Build one prompt per row: process_item receives the row, the prompt template keys
# and the prompt template
template_keys = config['datasets']['prompt_template_keys']
df['prompt'] = df.apply(process_item, axis=1, args=(template_keys, prompt_template))

# Each 'prompt' value is indexed by 'prompt_len' here; expose that length as its own column
df['prompt_len'] = df['prompt'].map(lambda prompt_info: prompt_info['prompt_len'])

In [None]:
# Serialise the dataframe to CSV text; with no target given, to_csv returns the CSV as a string
csv_data = df.to_csv(index=False)

all_prompts_file = config['dir_paths']['all_prompts_file']
prompts_prefix = config['dir_paths']['prompts_prefix']
prompts_bucket = config['aws']['bucket']

# Write to S3 using the write_to_s3 function
write_to_s3(csv_data, prompts_bucket, DATA_DIR, prompts_prefix, all_prompts_file)

# Log where the prompts are saved
logger.info(f"all prompts dataframe of shape {df.shape} saved to "
            f"s3://{prompts_bucket}/{DATA_DIR}/{os.path.join(prompts_prefix, all_prompts_file)}")

In [None]:
# View some of the prompts: the first rows of the dataframe, which at this point
# also carries the 'prompt' and 'prompt_len' columns (rendered as the cell output)
df.head()

### Convert Prompts into Payloads for inference purposes
------
Now we will prepare data for model inference. It involves converting prompts, created and stored in a specific format, into payloads for inference. We will utilize the prompt file for our model and incorporate the prompt into a payload using that. 

These payloads are tailored to the needs of deployed model endpoints. The conversion considers prompt sizes and specific configurations to further make our benchmarking more detailed and comprehensive. 

The goal is to have a set of well-formatted and parameterized payload requests of various sizes ready to be sent to the model endpoints for inference, with the responses to be used for further analysis.

In [None]:
def construct_request_payload(row, config: Dict) -> Optional[Dict]:
    """Construct a single request payload from one row of prompt data.

    The payload always carries the prompt text under ``inputs``. When the
    module-level ``ground_truth_col_key`` / ``question_col_key`` are set and
    present in the row, their values are added under ``ground_truth`` and
    ``question`` respectively.

    Args:
        row: a row holding a ``prompt`` entry whose ``'prompt'`` item is the prompt text.
        config: the benchmarking configuration; not read by the current
            implementation (inference parameters are not attached here).

    Returns:
        The payload dictionary, or ``None`` if anything went wrong while
        building it (the error is logged).
    """
    # payload field name -> dataset column it is copied from (column may be None)
    optional_fields = (
        ('ground_truth', ground_truth_col_key),
        ('question', question_col_key),
    )
    try:
        # the prompt text is the only mandatory part of the payload
        payload: Dict = {'inputs': row['prompt']['prompt']}
        for payload_field, column in optional_fields:
            if column is not None and column in row:
                payload[payload_field] = row[column]
    except Exception as e:
        logger.error(f"Prompt template could not be constructed: {e}")
        return None

    return payload


def create_dataset_payload_file(df: pd.DataFrame, dataset_info: Dict, config: Dict) -> Optional[str]:
    """Create the payload file (JSON Lines) for one dataset filter and write it to S3.

    A row is kept when its prompt length lies in
    ``[min_length_in_tokens, max_length_in_tokens)`` and, if the dataframe has
    a ``language`` column, when its language equals ``dataset_info['language']``.
    Every kept row is converted into a request with ``construct_request_payload``.
    The input dataframe is not modified.

    Args:
        df: dataframe with a ``prompt`` column whose values are indexable by ``'prompt_len'``.
        dataset_info: one filter entry; reads ``min_length_in_tokens``,
            ``max_length_in_tokens``, ``payload_file`` and ``language``
            (``language`` is only required when ``df`` has a ``language`` column).
        config: the benchmarking configuration; reads ``dir_paths.prompts_prefix``,
            ``s3_read_data.source_data_prefix`` and ``aws.bucket``.

    Returns:
        The ``s3://`` location the payload file was written to, or ``None``
        when no row matched the filter or no request could be constructed.
    """
    # First, log the dataset existing information
    logger.info(f"going to create a payload file as dataset_info={json.dumps(dataset_info, indent=2)}")

    min_len = dataset_info['min_length_in_tokens']
    max_len = dataset_info['max_length_in_tokens']

    # Build the filter as a local boolean Series instead of a new column, so the
    # caller's dataframe (shared across all filters) is left untouched
    prompt_lens = df['prompt'].map(lambda p: p['prompt_len'])
    keep = (prompt_lens >= min_len) & (prompt_lens < max_len)

    # select prompts that are also in the selected language, when the dataset has one
    if 'language' in df.columns:
        keep = keep & (df['language'] == dataset_info['language'])
    df_filtered = df[keep]

    logger.info(f"after filtering for {json.dumps(dataset_info, indent=2)}, shape of dataframe is {df_filtered.shape}")
    if len(df_filtered) == 0:
        logger.error("did not find any prompts in the dataframe that matched the filtering criteria, exiting")
        return None

    # Construct a request payload for each row; kept as a standalone Series so
    # nothing is assigned onto a filtered slice of the input dataframe
    requests = df_filtered.apply(lambda r: construct_request_payload(r, config), axis=1)

    # construct_request_payload returns None for rows it could not convert; leave
    # those out so the payload file does not contain "null" lines
    valid = requests.notna()
    if not valid.all():
        logger.warning(f"dropping {int((~valid).sum())} row(s) for which no request payload could be constructed")
        requests = requests[valid]
    if len(requests) == 0:
        logger.error("could not construct a request payload for any of the filtered prompts, exiting")
        return None
    logger.info(f"payload request entry looks like this -> {json.dumps(requests.iloc[0], indent=2)}")

    # Convert the requests to a JSON Lines string
    json_lines_str = requests.to_json(orient='records', lines=True)

    # language is optional for datasets without a language column; a payload_file
    # template that uses {lang} would then render it as "None"
    lang = dataset_info.get('language')
    file_name = dataset_info['payload_file'].format(lang=lang, min=min_len, max=max_len)

    # Write the JSON Lines string to S3
    # get the bucket name, config vars from config file
    bucket = config['aws']['bucket']
    prefix = f"{config['dir_paths']['prompts_prefix']}/{config['s3_read_data']['source_data_prefix']}"
    write_to_s3(json_lines_str, bucket, DATA_DIR, prefix, file_name)

    # Report the location built from the same prefix that was handed to write_to_s3
    # NOTE(review): assumes write_to_s3 stores the object at <DATA_DIR>/<prefix>/<file_name> - confirm
    s3_uri = f"s3://{bucket}/{DATA_DIR}/{prefix}/{file_name}"
    logger.info(f"dataset of different payload file structures saved to {s3_uri}")
    return s3_uri

In [None]:
# Create one payload file per configured dataset filter and collect what
# create_dataset_payload_file returns for each (None when no file was produced)
paths: List = [create_dataset_payload_file(df, dataset_filter, config)
               for dataset_filter in config['datasets']['filters']]

In [None]:
# Print the payload file locations, one per line, skipping empty entries
print("\n".join(filter(None, paths)))