# Load data and store the prompts

1. Download the LLaMA 2 tokenizer from https://huggingface.co/meta-llama/Llama-2-7b-hf/tree/main
   and place the files into a directory named `llama2_tokenizer` in the same
   directory as this notebook.

2. Install the Python packages below:

In [21]:
import glob
import json
import copy
import logging
import itertools
import pandas as pd
from globals import *
from typing import Dict, List
from utils import process_item, load_config, count_tokens

In [2]:
# global constants
!pygmentize globals.py

import os
from enum import Enum
from pathlib import Path

# Name of the configuration file and of the tokenizer directory.
CONFIG_FILE: str = "config.yml"
TOKENIZER_DIR = 'llama2_tokenizer'

# Directory layout: everything lives below DATA_DIR.
DATA_DIR: str = "data"
PROMPTS_DIR = os.path.join(DATA_DIR, "prompts")
METRICS_DIR = os.path.join(DATA_DIR, "metrics")
MODELS_DIR = os.path.join(DATA_DIR, "models")
DATASET_DIR = os.path.join(DATA_DIR, "dataset")
DIR_LIST = [DATA_DIR, PROMPTS_DIR, METRICS_DIR, MODELS_DIR, DATASET_DIR]

# Create every directory at import time; exist_ok=True makes repeated imports harmless.
_ = [os.makedirs(directory, exist_ok=True) for directory in DIR_LIST]

# Well-known file locations inside the directories above.
ENDPOINT_LIST_FPATH: str = os.path.join(MODELS_DIR, "endpoints.json")
REQUEST_PAYLOAD_FPATH: str = os.path.join(PROMPTS_DIR, "payload.jsonl")
RESULTS_FPATH: str = os.path.join(METRICS_DIR, "results.csv")
class TRUNCATE_POLICY(str, Enum):
    """Truncation policy names.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    # The only policy defined here.
    AT_PROMPT_TOKEN_LENGTH = "at-prompt-token-length"


In [3]:
# Configure root logging once: timestamp, process id, source file:line, level, message.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
# Logger used by the rest of this notebook.
logger = logging.getLogger(__name__)

In [22]:
# Load the settings from CONFIG_FILE using load_config (imported from utils)
# and log them as indented JSON.
config = load_config(CONFIG_FILE)
logger.info(json.dumps(config, indent=2))

[2024-01-05 07:37:55,987] p44384 {635462509.py:2} INFO - {
  "aws": {
    "region": "us-east-1",
    "sagemaker_execution_role": "arn:aws:iam::015469603702:role/SageMakerRepoRole"
  },
  "prompt": {
    "template_file": "prompt_template.txt",
    "all_prompts_file": "all_prompts.csv"
  },
  "datasets": [
    {
      "language": "en",
      "min_length_in_tokens": 1,
      "max_length_in_tokens": 500,
      "payload_file": "payload_{lang}_{min}-{max}.jsonl"
    },
    {
      "language": "en",
      "min_length_in_tokens": 500,
      "max_length_in_tokens": 1000,
      "payload_file": "payload_{lang}_{min}-{max}.jsonl"
    },
    {
      "language": "en",
      "min_length_in_tokens": 1000,
      "max_length_in_tokens": 2000,
      "payload_file": "payload_{lang}_{min}-{max}.jsonl"
    },
    {
      "language": "en",
      "min_length_in_tokens": 2000,
      "max_length_in_tokens": 3000,
      "payload_file": "payload_{lang}_{min}-{max}.jsonl"
    },
    {
      "language": "en",
     

In [13]:
# Read the prompt template named in the config from PROMPTS_DIR, then count the
# tokens the template itself uses when both placeholders are left empty.
fpath: str = os.path.join(PROMPTS_DIR, config['prompt']['template_file'])
# Decode explicitly: without an encoding argument read_text() falls back to the
# platform's locale encoding. NOTE(review): assumes the template is stored as UTF-8.
prompt_template: str = Path(fpath).read_text(encoding="utf-8").strip()
logger.info(f"prompt template from {fpath} ->\n{prompt_template}")
empty_prompt_len_in_tokens = count_tokens(prompt_template.format(context="", question=""))
logger.info(f"prompt template length={empty_prompt_len_in_tokens} tokens")

[2024-01-05 07:16:12,844] p44384 {3431602775.py:3} INFO - prompt template from data\prompts\prompt_template.txt ->
<s>[INST] <<SYS>>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context in the section demarcated by "```" to answer the question. If you don't know the answer just say that you don't know. Use three sentences maximum and keep the answer concise.
<</SYS>>

```
{context}
```

Question: {question}

[/INST]
Answer:
[2024-01-05 07:16:12,979] p44384 {3431602775.py:5} INFO - prompt template length=97 tokens


In [1]:
# Read every file in DATASET_DIR as JSON Lines and stack them into one dataframe.
# glob returns paths in arbitrary order, so sort them to keep the row order
# of the combined dataframe reproducible between runs and machines.
data_files = sorted(glob.glob(os.path.join(DATASET_DIR, "*")))
logger.info(f"dataset files = {data_files}")

if not data_files:
    # pd.concat would otherwise fail with a generic "No objects to concatenate".
    raise ValueError(f"no dataset files found in {DATASET_DIR}")

df = pd.concat(pd.read_json(f, lines=True) for f in data_files)

logger.info(f"dataset read from {data_files}\nhas shape {df.shape}")

In [2]:
# Preview the first rows of the combined dataset.
df.head()

In [3]:
# Log the summary statistics (describe()) of the dataframe's `length` field.
logger.info(f"distribution of the length field in the dataset is as follows ->\n{df.length.describe()}")

In [4]:
%%time
# Run process_item (imported from utils) on every row together with the prompt
# template and keep the result per row in the 'prompt' column.
df['prompt'] = df.apply(lambda row: process_item(row, prompt_template), axis=1)
# Each stored result is indexed with 'prompt_len'; copy that value into its own column.
df['prompt_len'] = df.prompt.map(lambda x: x['prompt_len'])


In [5]:
# Write the full prompts dataframe to the CSV file named in the config.
fpath: str = os.path.join(PROMPTS_DIR, config['prompt']['all_prompts_file'])
df.to_csv(fpath, index=False)
# Log only after the write has returned, so "saved" is never reported for a failed write.
logger.info(f"all prompts dataframe of shape {df.shape} saved to {fpath}")

In [6]:
# Preview the first rows again, now including the 'prompt' and 'prompt_len' columns.
df.head()

In [26]:
# convert the prompts into payload we can send to the model
def construct_request_payload(row, config: Dict) -> Dict:
    """Build the request body for one prompt row.

    Args:
        row: Mapping-like row (for example a dataframe row) that provides
            ``row['prompt']['prompt']`` and ``row['prompt_len']``.
        config: Configuration dict; ``config['inference_parameters']``
            supplies the parameters sent along with the prompt.

    Returns:
        A dict with ``inputs`` (taken from ``row['prompt']['prompt']``) and
        ``parameters`` (an independent copy of the configured inference
        parameters).
    """
    # Deep-copy so the per-row ``truncate`` override never leaks into the shared config.
    parameters = copy.deepcopy(config['inference_parameters'])
    # ``truncate`` is optional: rewrite it only when it is present and set to
    # the policy value, replacing the policy name with this row's prompt length.
    if 'truncate' in parameters and parameters['truncate'] == TRUNCATE_POLICY.AT_PROMPT_TOKEN_LENGTH:
        parameters['truncate'] = row['prompt_len']
    return dict(inputs=row['prompt']['prompt'], parameters=parameters)

def create_dataset_payload_file(df: pd.DataFrame, dataset_info: Dict, config: Dict) -> str:
    """Write the prompts matching ``dataset_info`` to a JSON Lines payload file.

    A row of ``df`` is selected when its ``language`` equals
    ``dataset_info['language']`` and the ``prompt_len`` stored in its
    ``prompt`` entry lies between ``min_length_in_tokens`` and
    ``max_length_in_tokens`` (both bounds inclusive, so a prompt whose length
    equals a bound shared by two datasets is selected for both). Every
    selected row is turned into a request with ``construct_request_payload``.

    ``df`` itself is not modified.

    Args:
        df: Dataframe with a ``language`` column and a ``prompt`` column whose
            values can be indexed with ``'prompt_len'``.
        dataset_info: Dict with ``language``, ``min_length_in_tokens``,
            ``max_length_in_tokens`` and ``payload_file`` (a format string
            using ``{lang}``, ``{min}`` and ``{max}``).
        config: Configuration dict handed to ``construct_request_payload``.

    Returns:
        Path of the payload file inside ``PROMPTS_DIR``. When no row matches,
        an empty file is written and a warning is logged.
    """
    logger.info(f"going to create a payload file as dataset_info={json.dumps(dataset_info, indent=2)}")

    # Local names deliberately avoid shadowing the ``min``/``max`` builtins.
    lang = dataset_info['language']
    min_len = dataset_info['min_length_in_tokens']
    max_len = dataset_info['max_length_in_tokens']

    # select prompts between pre-configured threshold lengths and are in the selected language;
    # the mask is kept local so the caller's dataframe does not gain a helper column
    in_range = df.prompt.map(lambda p: min_len <= p['prompt_len'] <= max_len).astype(bool)
    df_filtered = df[(df.language == lang) & in_range]
    logger.info(f"after filtering for {json.dumps(dataset_info, indent=2)}, shape of dataframe is {df_filtered.shape}")

    fpath: str = os.path.join(PROMPTS_DIR, dataset_info['payload_file'].format(lang=lang, min=min_len, max=max_len))

    if df_filtered.empty:
        # Nothing to convert: there is no first request to log, so just create
        # an empty payload file to keep the returned path valid.
        logger.warning(f"no prompts matched dataset_info={json.dumps(dataset_info)}, writing an empty payload file={fpath}")
        with open(fpath, "w"):
            pass
        return fpath

    # Build the requests as a standalone Series instead of assigning a new
    # column onto the filtered slice (which triggers SettingWithCopyWarning).
    requests = df_filtered.apply(lambda r: construct_request_payload(r, config), axis=1)
    logger.info(f"payload request entry looks like this -> {json.dumps(requests.iloc[0], indent=2)}")

    # write the requests to a jsonl file, one request per line
    logger.info(f"creating payload file={fpath}")
    requests.to_json(fpath, orient='records', lines=True)
    logger.info(f"dataset saved to {fpath}")
    return fpath

In [7]:
# Create one payload file per dataset entry in the config and collect the resulting paths.
items = [(df, dataset_info, config) for dataset_info in config['datasets']]
paths: List = [create_dataset_payload_file(*args) for args in items]