# Generate Data: Gather data, create prompts/payloads of different sizes
---------
*This notebook works best with the conda_python3 kernel on a ml.t3.medium machine*.

### This part of our solution design includes 

- running and downloading our specific dataset

- generating prompts as payloads of different sizes that we will send to our different model endpoints with different combinations of concurrency levels that we will later use to run inference and generate benchmarking metrics and visualizations.

#### This file will generate all data on wikiqa (english version) with prompt sizes 300 - 4000 token lengths in different payload sizes to send to the model endpoint during the inference pipeline. You will also be able to generate the normal wikiqa dataset from the actual 'long bench dataset'. This notebook then focuses on 3 main deliverables:

1. Loading the dataset that is stored in the data directory.


2. Generating payloads: This notebook also converts the loaded datasets into payloads based on the input question and records the context length of the prompt to send as a part of the payload during running inferences on the deployed endpoints.

    - All of the prompts are saved in this data directory in a file named all_prompts.csv.
    

3. Constructing different sized payloads

#### Import all of the necessary libraries below to run this notebook

In [None]:
# Decide where the fmbench package is picked up from:
#   INTERACTIVE_MODE_SET == "no"  -> use fmbench from the Python installation path
#   INTERACTIVE_MODE_SET == "yes" -> use fmbench from the directory one level above this notebook
#   INTERACTIVE_MODE_SET not set  -> same as "yes"
# Non-interactive runs are expected to go through main.py, which sets the variable to "no".
import os
import sys

interactive_mode = os.environ.get("INTERACTIVE_MODE_SET", "yes")
if interactive_mode == "yes":
    # make the parent of the current working directory importable
    parent_dir = os.path.dirname(os.getcwd())
    sys.path.append(parent_dir)

In [None]:
import io
import re
import copy
import json
import base64
import logging
import itertools
import pandas as pd
from PIL import Image
from io import BytesIO
from pathlib import Path
from fmbench.utils import *
from fmbench.globals import *
from typing import Dict, List, Optional
from datasets import load_dataset, Dataset
import importlib.resources as pkg_resources

#### Pygmentize globals.py to view and use any of the globally initialized variables 

#### Set up a logger to log all messages while the code runs

In [None]:
# Build the stream handler first: timestamp, process id, file:line, level and message
formatter = logging.Formatter('[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# Configure the root logger at INFO level
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Drop any handlers that were attached earlier so that messages are not
# emitted twice, then attach the single handler built above
logger.handlers.clear()
logger.addHandler(handler)

In [None]:
# The config file holds the settings shared across this benchmarking run
# (aws account details, prompts, payloads used for invocations, ...).
# Load it once and log it as indented JSON for reference.
config = load_main_config(CONFIG_FILE)
config_json = json.dumps(config, indent=2)
logger.info(config_json)

#### Define the file path for the prompt template

In [None]:
# Key of the prompt template in the read bucket: <prompt_template_dir>/<prompt_template_file>
s3_read_cfg = config['s3_read_data']
s3_file_path = "/".join([s3_read_cfg['prompt_template_dir'],
                         s3_read_cfg['prompt_template_file']])

# Try the S3 copy first; a None result means we fall back to the packaged copy below
prompt_template_from_s3: str = read_from_s3(s3_read_cfg['read_bucket'], s3_file_path)

# Directory holding the prompt templates that ship inside the fmbench package
prompt_template_dir = Path(pkg_resources.files(FMBENCH_PACKAGE_NAME), s3_read_cfg['prompt_template_dir'])
logger.info(f"Using fmbench.{s3_read_cfg['prompt_template_dir']} directory: {prompt_template_dir}")

if prompt_template_from_s3 is not None:
    prompt_template = prompt_template_from_s3
    logger.info(f"Using the prompt template from S3 --> {prompt_template}")
else:
    prompt_template_fpath: str = os.path.join(prompt_template_dir, s3_read_cfg['prompt_template_file'])
    prompt_template = Path(prompt_template_fpath).read_text()
    logger.info(f"Using the default local prompt template --> {prompt_template}")
prompt_template = prompt_template.strip()

# Dataset related settings: the template placeholders plus the (optional)
# ground truth and question column names
datasets_cfg = config['datasets']
prompt_template_keys = datasets_cfg['prompt_template_keys']
ground_truth_col_key: Optional[str] = datasets_cfg.get('ground_truth_col_key', None)
question_col_key: Optional[str] = datasets_cfg.get('question_col_key', None)

# Render the template with every placeholder set to an empty string so that
# the size of the template itself can be measured
if prompt_template_keys:
    args = dict.fromkeys(prompt_template_keys, "")
    empty_prompt_template = prompt_template.format(**args)
else:
    # Without placeholders the same prompt is used for every payload. This is
    # useful for an image dataset without questions, where one consistent
    # prompt can be sent with all of the images
    args = {}
    logger.info("No prompt template keys provided. Using the prompt template provided for all requests.")
    empty_prompt_template = prompt_template

logger.info(f"empty prompt template = \"{empty_prompt_template}\"")

# Number of tokens taken up by the (empty) prompt template
empty_prompt_len_in_tokens = count_tokens(empty_prompt_template)
logger.info(f"prompt template length={empty_prompt_len_in_tokens} tokens")

### Load the dataset: Handle text & multimodal image datasets dynamically
---

In this portion of the generate data step, FMBench does as follows:

1. If the dataset is a Hugging Face dataset (prefixed with an `hf:` in the `source_data_files` of the `s3_read_data` section in the config file), then FMBench downloads the data using the `dataset id` and `hf token`. The data is then converted into `jsonl` format and sent to the s3 read bucket within the `source_data` directory. If the run is on `local mode`, then the data is converted to `jsonl` and sent to `/tmp/fmbench-read/source_data/` directory.

2. If there is a custom dataset (not prefixed with `hf:`), then FMBench assumes that those files are already provided by the user in the `source_data` folder of the `EC2` instance or within the s3 bucket, available to be used. If the custom dataset is not in `jsonl` format, then use the [`bring_your_own_dataset.ipynb`](fmbench/bring_your_own_dataset.ipynb) notebook to convert your custom dataset into `jsonl` format, place those files in s3 or the local read folder, and FMBench will use them in the generate data step.

In [None]:
# Read the HF token. This HF token is required while loading an HF dataset;
# if the HF token is not provided, an exception might be thrown at the dataset download step.
# HF_TOKEN is assigned on every path (None when no token is available) because
# later cells pass it to load_hf_dataset unconditionally.
try:
    # HF token file name
    hf_token_key: str = os.path.join(config['s3_read_data']['scripts_prefix'], HF_TOKEN_FNAME)
    hf_token_content: Optional[str] = get_s3_object(config['s3_read_data']['read_bucket'], hf_token_key, decode=True)
    if hf_token_content is not None:
        HF_TOKEN = hf_token_content.strip()
        logger.info(f"HF token content successfully retrieved from S3: {HF_TOKEN_FNAME}")
        # only report the file as found when content was actually returned
        logger.info(f"HF toke file found: {HF_TOKEN_FNAME}")
    else:
        logger.info(f"HF token content not found: {hf_token_content}")
        HF_TOKEN = None
except FileNotFoundError:
    logger.error(f"HF token file '{HF_TOKEN_FNAME}' not found.")
    HF_TOKEN = None

In [None]:
# Custom JSON encoder for PIL Images
class PILImageEncoder(json.JSONEncoder):
    """
    JSON encoder that additionally knows how to serialize PIL Image objects.

    A PIL Image is written into an in-memory buffer using DEFAULT_IMAGE_FORMAT
    and the resulting bytes are emitted as a hexadecimal string, wrapped in a
    dictionary of the form {'hex_data': <hex string>, 'format': DEFAULT_IMAGE_FORMAT}.
    Every other object is delegated to the default JSONEncoder behavior.
    """
    def default(self, obj):
        # anything that is not an image is handled by the base encoder
        if not isinstance(obj, Image.Image):
            return super().default(obj)

        # serialize the image into memory and expose its bytes as hex text
        image_buffer = io.BytesIO()
        obj.save(image_buffer, format=DEFAULT_IMAGE_FORMAT)
        return {
            'hex_data': image_buffer.getvalue().hex(),
            'format': DEFAULT_IMAGE_FORMAT,
        }

In [None]:
def load_hf_dataset(dataset_identifier: str,
                    HF_TOKEN: Optional[str],
                    ds_N: int) -> Optional["Dataset"]:
    """
    Load the first ds_N examples of a Hugging Face dataset.

    The identifier is expected in the form
    '<HF_DATASET_PREFIX>dataset-owner/dataset-name[/subset-name[/split-name]]'.
    When the subset name is omitted 'default' is used, and when the split
    name is omitted 'train' is used. The dataset is opened in streaming mode
    so that only the examples that are consumed need to be fetched.

    :param dataset_identifier: The dataset identifier, e.g., 'hf:derek-thomas/ScienceQA'
    :param HF_TOKEN: Hugging Face token, if required
    :param ds_N: Number of samples to load
    :return: A Hugging Face Dataset holding at most ds_N examples, or None if
             the dataset could not be loaded
    """
    try:
        # remove the hf prefix to load the hugging face dataset
        ds_id = dataset_identifier[len(HF_DATASET_PREFIX):]
        # the remaining path may carry the subset and split names
        ds_parts = ds_id.split('/')
        if len(ds_parts) >= 2:
            ds_id = '/'.join(ds_parts[:2])
            logger.info(f"Going to load the hugging face dataset: {ds_id}")
        else:
            # only logged: loading is still attempted with the id as given
            logger.error(f"Invalid hugging face dataset id: {ds_id}. Use a valid dataset id.")
        # fetch the ds subset name and split for loading the dataset. If the subset name is not provided
        # then default to 'default'
        ds_name = ds_parts[2] if len(ds_parts) >= 3 else 'default'
        # If ds_split is not provided, then we will default it to 'train'. If you wish to use
        # a desired split, then provide the subset name and split name in this format within the
        # config file: hf:dataset-id/subset-name/split-name
        ds_split = ds_parts[3] if len(ds_parts) >= 4 else 'train'
        # The 'streaming' parameter enables working with a dataset without downloading it;
        # the data is streamed as the dataset is iterated over.
        load_dataset_kwargs = {'path': ds_id, 'name': ds_name, 'split': ds_split, 'streaming': True}
        logger.info(f"Using the following parameters to load the HF dataset: {load_dataset_kwargs}")
        if HF_TOKEN:
            load_dataset_kwargs['token'] = HF_TOKEN
        try:
            logger.info(f"Loading the HF dataset: {ds_id}")
            # never write the token itself into the logs: log a redacted copy
            loggable_kwargs = {key: ('***' if key == 'token' else value)
                               for key, value in load_dataset_kwargs.items()}
            logger.info(f"load_dataset_kwargs: {loggable_kwargs}")
            streamed_dataset = load_dataset(**load_dataset_kwargs)
            logger.info(f"Done loading the dataset with the provided split {ds_split}: {ds_id}")
        except ValueError as e:
            logger.error(f"Error occurred while loading the HF dataset: {e}")
            return None
        # Take only the first ds_N examples from the stream and materialize them
        first_examples = list(itertools.islice(streamed_dataset, ds_N))
        dataset = Dataset.from_list(first_examples)
    except Exception as e:
        logger.error(f"Error loading the hugging face dataset: {e}")
        dataset = None
    return dataset

In [None]:
def process_dataset(dataset: Dataset,
                    config: Dict,
                    ds_N: int) -> Optional[str]:
    """
    Turn a dataset into a JSON Lines string, for both text and image datasets.

    Rows without an image are dropped when an image column is configured and
    present, then a random sample of at most ds_N rows is serialized with one
    JSON record per line.

    :param dataset: The Dataset object
    :param config: FMBench configuration; config['datasets'].get('image_col') names the image column, if any
    :param ds_N: Maximum number of samples to keep
    :return: JSON Lines content as a string, or None if the conversion failed
    """
    try:
        frame = pd.DataFrame(dataset)
        image_col = config['datasets'].get('image_col')
        # the data is treated as multimodal only if the configured column really exists
        has_image_col = bool(image_col and image_col in frame.columns)

        if has_image_col:
            shape_before_filter = frame.shape
            frame = frame[frame[image_col].notnull()]
            logger.info(f"Filtered dataset to only include rows where '{image_col}' is not None. Shape changed from {shape_before_filter} to {frame.shape}")

        # Randomly pick (and thereby shuffle) at most ds_N rows
        logger.info(f"Dataset shape before random subset: {frame.shape}")
        sample_size = min(ds_N, len(frame))
        frame = frame.sample(n=sample_size)
        logger.info(f"Dataset shape after random subset: {frame.shape}")

        # Serialize to JSON Lines; images need the custom encoder
        if has_image_col:
            logger.info("Image column found in the dataset. The data is multimodal - Using the PILImageEncoder to encode images in JSON Lines.")
            return frame.to_json(orient='records', lines=True, default_handler=PILImageEncoder().default)

        logger.info("The data is standard text data, converting to JSON Lines.")
        return frame.to_json(orient='records', lines=True)
    except Exception as e:
        logger.error(f"Error processing and converting the dataset into jsonl format: {e}")
        return None

In [None]:
# Walk over every configured source data file: hugging face datasets are
# loaded, converted to JSON Lines and stored in the read location; everything
# else is assumed to be provided by the user already
for dataset_file in config['s3_read_data']['source_data_files']:
    if not dataset_file.startswith(HF_DATASET_PREFIX):
        logger.info(f"The provided source data file {dataset_file} is not a hugging face dataset "
            "because it is not prefixed with 'hf:'. Assuming that this file is "
            "already provided")
        continue

    logger.info(f"{dataset_file} is a hugging face dataset. Going to load and process the dataset now.")
    # number of samples to load and keep for this dataset
    ds_N = config['datasets'].get('ds_N', DEFAULT_HF_DS_N_VALUE)

    # Load the dataset
    dataset = load_hf_dataset(dataset_file, HF_TOKEN, ds_N)
    logger.info(f"dataset: {dataset}")
    if dataset is None:
        logger.error(f"Failed to load dataset: {dataset_file}")
        logger.error("If your dataset does not have a 'default' subset or a 'train' split, then provide a dataset with a valid subset id and split name in the format 'hf:dataset-id/subset-name/split-name'.")
        continue

    # Process the dataset and convert to JSON Lines
    jsonl_content = process_dataset(dataset, config, ds_N)
    if jsonl_content is None:
        logger.error(f"Failed to process dataset: {dataset_file}")
        continue

    # The hf dataset is stored as a jsonl file in the fmbench read directory,
    # so a jsonl extension is appended to the name used in the benchmarking test
    file_name = dataset_file + '.jsonl'
    # Upload to S3 or locally within the /tmp/fmbench-read folder
    write_to_s3(jsonl_content,
                config['s3_read_data']['read_bucket'],
                config['s3_read_data']['source_data_prefix'],
                "",
                file_name)
    logger.info(f"Finished processing and uploading dataset: {dataset_file}")

### Prompt payload generation
---

In this portion of the `generate_data` step, FMBench fetches `jsonl` data files and uses those to generate prompt payloads using the specified prompt template in the configuration file.

In [None]:
def list_files():
    """
    Return the keys of all objects stored under the configured source data
    prefix of the read bucket.

    An empty list is returned when nothing matches the prefix (the listing
    response then carries no 'Contents' entry), and truncated listings are
    followed through their continuation token so that no key is left out.
    """
    request_kwargs = {
        'Bucket': config['s3_read_data']['read_bucket'],
        'Prefix': config['s3_read_data']['source_data_prefix'],
    }
    keys = []
    while True:
        response = s3_client.list_objects_v2(**request_kwargs)
        keys.extend(obj['Key'] for obj in response.get('Contents', []))
        if not response.get('IsTruncated'):
            break
        # ask for the next page of the listing
        request_kwargs['ContinuationToken'] = response['NextContinuationToken']
    return keys

# List all .jsonl files under the source data prefix of the read bucket
s3_files = list_s3_files(config['s3_read_data']['read_bucket'], config['s3_read_data']['source_data_prefix'], '.jsonl')
logger.info(f"s3 paths of the data set -> {s3_files}")

# Log the files you're going to read
logger.info(f"dataset files = {s3_files}")

# Work out the file names to look for. For a hugging face dataset the
# prefix is removed and the 'jsonl' file extension is appended, else the
# file is assumed to be provided by the user in the local/s3 read path.
# NOTE: the prefix is removed by slicing. str.lstrip would treat the prefix as
# a set of characters and also eat leading 'h', 'f' or ':' characters of the
# dataset id itself (e.g. 'hf:huggingface/...' would lose its leading 'h').
processed_files = []
for source_file in config['s3_read_data']['source_data_files']:
    if source_file.startswith(HF_DATASET_PREFIX):
        # For Hugging Face datasets, drop the prefix and append .jsonl
        processed_files.append(source_file[len(HF_DATASET_PREFIX):] + '.jsonl')
    else:
        # For S3 files, use as is
        processed_files.append(source_file)

# Keep only the listed files whose key, relative to the source data prefix,
# is one of the configured source data files
jsonl_files = [
    file_key for file_key in s3_files
    if file_key.replace(config['s3_read_data']['source_data_prefix'] + "/", "") in processed_files
]
logger.info(f"jsonl_files={jsonl_files}")

# Fail with an actionable message instead of the generic error raised by
# pd.concat when it is handed an empty list
if not jsonl_files:
    raise ValueError(f"None of the configured source data files {processed_files} "
                     f"were found among the available files {s3_files}")

# Read and concatenate only the matching .jsonl files
df = pd.concat([pd.read_json(io.BytesIO(get_s3_object(config['s3_read_data']['read_bucket'], file_key, decode=False)), lines=True)
                for file_key in jsonl_files])

# Log the source of the dataset and its shape
logger.info(f"dataset read from {s3_files}\nhas shape {df.shape}")

#### View a portion of the df to view inputs, contexts, and more information on the data

In [None]:
# Preview the first rows of the concatenated source data
df.head()

#### Check for if the dataset is a multimodal image dataset
---

In this portion, we check if there is a user provided `image_col` in the `datasets` section of the config file. If there is, the image column is used to generate a new column. This new column contains the `base64` conversion of the image which is later used during the inference step of the benchmarking process. If the user does not provide an image column, then the standard text generation benchmarking test will run as is. If the user has not provided a dataset with an image column but the configuration file contains an image column, an exception will be thrown.

In [None]:
def image_to_base64(img_dict: Optional[Dict[str, str]]) -> Optional[str]:
    """
    Convert an image stored as a hex string into a base64 encoded string.

    Args:
        img_dict (Optional[Dict[str, str]]): Dictionary describing the image. Its 'hex_data'
                                             entry holds the image bytes as a hexadecimal string.

    Returns:
        Optional[str]: The image re-saved in DEFAULT_IMAGE_FORMAT and base64 encoded.
                       None if the input is None, if 'hex_data' is missing or empty,
                       or if an error occurs during conversion.
    """
    try:
        if img_dict is None:
            logger.error("Provided image is None in the dataset. Make sure the dataset contains images.")
            return None

        hex_data = img_dict.get('hex_data')
        if not hex_data:
            logger.error("Hex data is missing in the image dictionary.")
            return None

        # hex string -> raw bytes -> PIL image
        source_image = Image.open(BytesIO(bytes.fromhex(hex_data)))

        # write the image back out in the default format and base64 encode the bytes
        encoded_buffer = BytesIO()
        source_image.save(encoded_buffer, format=DEFAULT_IMAGE_FORMAT)
        return base64.b64encode(encoded_buffer.getvalue()).decode('utf-8')
    except Exception as e:
        logger.error(f"Error occurred while converting the image into base64: {e}")
        return None

In [None]:
# When an image column is configured in the `datasets` section, every entry of
# that column is converted to base64 and stored in a new 'base64_img' column.
# Without an image column the standard text generation benchmarking test is used.
image_col_name = config['datasets'].get('image_col')
if image_col_name is None:
    logger.info("Going to use the standard text generation benchmarking test. No image columns"
                "found in the dataset")
else:
    logger.info(f"Image column provided in the dataset: {image_col_name},"
                f"going to convert the image into base64 and generate a new base64 column")
    df['base64_img'] = df[image_col_name].apply(image_to_base64)

In [None]:
# Preview the first 10 rows, including the 'base64_img' column if it was added
df.head(10)

#### Display basic statistics on the existing dataset: including count, mean, std, min, etc.

In [None]:
# Log the summary statistics that pandas computes for the dataframe via describe()
logger.info(f"distribution of the length field in the dataset is as follows ->\n{df.describe()}")

### Convert the dataset elements into prompts as payloads for inference purposes

Now, we will focus on converting the existing data within our datasets: we extract the information and convert it into prompts that can be sent to our deployed model endpoints while testing and benchmarking for results and various metrics.

In [None]:
%%time
# Build the 'prompt' column (a dict with 'prompt' and 'prompt_len' entries)
# and the 'prompt_len' column for every row of the dataset
if config['datasets']['prompt_template_keys']:
    # fill the template placeholders from the columns of each row
    df['prompt'] = df.apply(lambda row: process_item(row, config['datasets']['prompt_template_keys'], prompt_template), axis=1)
    df['prompt_len'] = df.prompt.map(lambda x: x['prompt_len'])
else:
    print("No prompt template keys provided. Using constant prompt for all rows.")
    # The payload filters compare 'prompt_len' against token bounds
    # (min_length_in_tokens / max_length_in_tokens), so the length of the
    # constant prompt has to be measured in tokens, not in characters
    constant_prompt = {
        'prompt': prompt_template,
        'prompt_len': count_tokens(prompt_template)
    }
    df['prompt'] = [constant_prompt] * len(df)
    df['prompt_len'] = constant_prompt['prompt_len']

In [None]:
# Where the prompts file goes: bucket, prefix and file name all come from the config
all_prompts_file = config['dir_paths']['all_prompts_file']
prompts_bucket = config['aws']['bucket']
prompts_prefix = config['dir_paths']['prompts_prefix']

# Serialize the dataframe (without its index) to CSV text in memory
csv_buffer = io.StringIO()
df.to_csv(csv_buffer, index=False)
csv_data = csv_buffer.getvalue()

# Store the CSV through the write_to_s3 helper
write_to_s3(csv_data, prompts_bucket, DATA_DIR, prompts_prefix, all_prompts_file)

# Log where the prompts are saved
logger.info(f"all prompts dataframe of shape {df.shape} saved to "
            f"s3://{prompts_bucket}/{DATA_DIR}/{os.path.join(prompts_prefix, all_prompts_file)}")

In [None]:
# View some of the prompts: the first rows now include the 'prompt' and 'prompt_len' columns
df.head()

### Convert Prompts into Payloads for inference purposes
------
Now we will prepare data for model inference. It involves converting prompts, created and stored in a specific format, into payloads for inference. We will utilize the prompt file for our model and incorporate the prompt into a payload using that. 

These payloads are tailored to the needs of deployed model endpoints. The conversion considers prompt sizes and specific configurations to further make our benchmarking more detailed and comprehensive. 

The goal is to have a set of well-formatted and parameterized payload requests of various sizes ready to be sent to the model endpoints for inference, with the responses to be used for further analysis.

In [None]:
# Function to construct a single request payload based on row prompt data and configuration
def construct_request_payload(row, config: Dict) -> Dict:
    """
    Build the request payload for one row of the prompts dataframe.

    The payload always carries the prompt text under 'inputs'. When the
    corresponding column names are configured and present in the row, the
    ground truth ('ground_truth') and the question ('question') are added,
    and a non-None 'base64_img' entry of the row is passed along as well.

    :param row: A row holding a 'prompt' dictionary with a 'prompt' entry
    :param config: FMBench configuration (not used by this function)
    :return: The payload dictionary, or None if it could not be constructed
    """
    try:
        payload = {'inputs': row['prompt']['prompt']}

        # optional fields: payload key -> configured column name in the dataset
        optional_columns = (
            ('ground_truth', ground_truth_col_key),
            ('question', question_col_key),
        )
        for payload_key, column in optional_columns:
            if column is not None and column in row:
                payload[payload_key] = row[column]

        # pass the encoded image along for multimodal datasets
        if 'base64_img' in row and row['base64_img'] is not None:
            payload['base64_img'] = row['base64_img']

        return payload
    except Exception as e:
        logger.error(f"Prompt template could not be constructed: {e}")
        return None


# Function to create a dataset payload file from the given dataset
def create_dataset_payload_file(df: pd.DataFrame, dataset_info: Dict, config: Dict) -> Optional[str]:
    """
    Create one payload file holding the prompts that match a dataset filter.

    Prompts are selected when their length lies in
    [min_length_in_tokens, max_length_in_tokens) and, if the dataframe has a
    'language' column, when their language equals dataset_info['language'].
    The selected prompts are turned into request payloads and written as
    JSON Lines through write_to_s3.

    Side effect: a boolean 'prompt_len_in_range' column is (re)written on df.

    :param df: Prompts dataframe with a 'prompt' column of dictionaries carrying 'prompt_len'
    :param dataset_info: Filter settings: 'language', 'min_length_in_tokens',
                         'max_length_in_tokens' and the 'payload_file' name template
    :param config: FMBench configuration
    :return: The s3:// path of the payload file, or None if no prompt matched the filter
    """
    # First, log the dataset existing information
    logger.info(f"going to create a payload file as dataset_info={json.dumps(dataset_info, indent=2)}")

    lang = dataset_info['language']
    min_len = dataset_info['min_length_in_tokens']
    max_len = dataset_info['max_length_in_tokens']

    # Flag the prompts whose length falls into the half-open range [min_len, max_len)
    df['prompt_len_in_range'] = df.prompt.map(lambda x: min_len <= x['prompt_len'] < max_len)

    # select prompts between pre-configured threshold lengths and are in the selected language
    if 'language' in df.columns:
        selection = (df.language == lang) & (df.prompt_len_in_range)
    else:
        selection = df.prompt_len_in_range
    # work on an independent copy so that adding the 'request' column below
    # does not assign into a slice of the caller's dataframe
    df_filtered = df[selection].copy()

    logger.info(f"after filtering for {json.dumps(dataset_info, indent=2)}, shape of dataframe is {df_filtered.shape}")
    if df_filtered.shape[0] == 0:
        logger.error("did not find any prompts in the dataframe that matched the filtering criteria, exiting")
        return None

    # Here, we construct request payloads for each row in the filtered DataFrame
    df_filtered['request'] = df_filtered.apply(lambda r: construct_request_payload(r, config), axis=1)
    logger.info(f"payload request entry looks like this -> {json.dumps(df_filtered['request'].iloc[0], indent=2)}")

    # Convert the 'request' column of the filtered DataFrame to a JSON Lines string
    json_lines_str = df_filtered['request'].to_json(orient='records', lines=True)

    file_name = dataset_info['payload_file'].format(lang=lang, min=min_len, max=max_len)

    # The payload file is written under <prompts_prefix>/<source_data_prefix>
    prefix = f"{config['dir_paths']['prompts_prefix']}/{config['s3_read_data']['source_data_prefix']}"
    write_to_s3(json_lines_str, config['aws']['bucket'], DATA_DIR, prefix, file_name)

    # Report the location using the same prefix that was handed to write_to_s3.
    # NOTE(review): assumes write_to_s3 stores the object under
    # <DATA_DIR>/<prefix>/<file_name> - confirm against its implementation
    s3_file_path = os.path.join(DATA_DIR, prefix, file_name)
    logger.info(f"dataset of different payload file structures saved to s3://{config['aws']['bucket']}/{s3_file_path}")
    return f"s3://{config['aws']['bucket']}/{s3_file_path}"

In [None]:
# Create one payload file per configured dataset filter and collect the
# resulting paths (None for filters that did not match any prompt)
paths: List = [
    create_dataset_payload_file(df, dataset_filter, config)
    for dataset_filter in config['datasets']['filters']
]

In [None]:
# Print the paths of the payload files that were created, one per line,
# leaving out filters for which no file was produced
print("\n".join(filter(None, paths)))