# Load data and store the prompts

1. Download the LLaMA 2 tokenizer from https://huggingface.co/meta-llama/Llama-2-7b-hf/tree/main
   and place the files into a directory named `llama2_tokenizer` in the same 
   directory as this notebook.

2. Install the Python packages imported below:

In [1]:
import glob
import time
import json
import copy
import asyncio
import logging
import itertools
import sagemaker
import pandas as pd
from globals import *
from datetime import datetime
from transformers import AutoTokenizer
from sagemaker.predictor import Predictor
from utils import load_config, count_tokens
from sagemaker.serializers import JSONSerializer
from typing import Dict, List, Optional, Tuple, Union

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [2]:
# global constants
!pygmentize globals.py

import os
from enum import Enum
from pathlib import Path

CONFIG_FILE: str = "config.yml"
DATA_DIR: str = "data"
PROMPTS_DIR = os.path.join(DATA_DIR, "prompts")
METRICS_DIR = os.path.join(DATA_DIR, "metrics")
MODELS_DIR = os.path.join(DATA_DIR, "models")
DATASET_DIR = os.path.join(DATA_DIR, "dataset")
DIR_LIST = [DATA_DIR, PROMPTS_DIR, METRICS_DIR, MODELS_DIR, DATASET_DIR]

In [3]:
# Root logging configuration: every record shows timestamp, process id,
# source file and line, level and message; INFO and above are emitted.
logging.basicConfig(
    level=logging.INFO,
    format=(
        '[%(asctime)s] p%(process)s '
        '{%(filename)s:%(lineno)d} '
        '%(levelname)s - %(message)s'
    ),
)
# Module-level logger used by every cell below
logger = logging.getLogger(__name__)

In [4]:
# Load the benchmark configuration from CONFIG_FILE and log it as
# pretty-printed JSON so the run's settings are captured in the output.
config = load_config(CONFIG_FILE)
logger.info(json.dumps(config, indent=2))

[2024-01-09 15:55:46,610] p3238 {635462509.py:2} INFO - {
  "aws": {
    "region": "us-east-1",
    "sagemaker_execution_role": "arn:aws:iam::218208277580:role/service-role/AmazonSageMaker-ExecutionRole-20230807T175994"
  },
  "prompt": {
    "template_file": "prompt_template.txt",
    "all_prompts_file": "all_prompts.csv"
  },
  "datasets": [
    {
      "language": "en",
      "min_length_in_tokens": 1,
      "max_length_in_tokens": 500,
      "payload_file": "payload_{lang}_{min}-{max}.jsonl"
    },
    {
      "language": "en",
      "min_length_in_tokens": 500,
      "max_length_in_tokens": 1000,
      "payload_file": "payload_{lang}_{min}-{max}.jsonl"
    },
    {
      "language": "en",
      "min_length_in_tokens": 1000,
      "max_length_in_tokens": 2000,
      "payload_file": "payload_{lang}_{min}-{max}.jsonl"
    },
    {
      "language": "en",
      "min_length_in_tokens": 2000,
      "max_length_in_tokens": 3000,
      "payload_file": "payload_{lang}_{min}-{max}.jsonl"
  

In [5]:
# Timestamp captured once per run; it is substituted for the {datetime}
# placeholder when the result file paths are formatted later in the notebook.
# NOTE(review): datetime.now() without a tz is local time — confirm that is intended.
date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

In [1]:
# Read the list of deployed endpoints: the file at ENDPOINT_LIST_FPATH is parsed
# as JSON and the parsed content is logged.
# NOTE(review): ENDPOINT_LIST_FPATH is not defined in this notebook; presumably it
# (like Path) arrives through `from globals import *` — confirm.
endpoint_info_list = json.loads(Path(ENDPOINT_LIST_FPATH).read_text())
logger.info(json.dumps(endpoint_info_list, indent=2))

In [3]:
# Names of all deployed endpoints, in the order they appear in endpoint_info_list
endpoint_name_list = list(
    map(lambda info: info['endpoint']['EndpointName'], endpoint_info_list)
)
logger.info(f"there are {len(endpoint_name_list)} deployed endpoint(s), endpoint_name_list->{endpoint_name_list}")

In [8]:
# create predictor objects

## create a sagemaker predictor for these endpoints
def create_predictor(endpoint_name: str) -> Optional[sagemaker.base_predictor.Predictor]:
    """Build a SageMaker ``Predictor`` bound to ``endpoint_name``.

    The predictor uses a freshly created ``sagemaker.Session`` and serializes
    request payloads with ``JSONSerializer``.
    """
    session = sagemaker.Session()
    json_serializer = JSONSerializer()
    return Predictor(
        endpoint_name=endpoint_name,
        sagemaker_session=session,
        serializer=json_serializer,
    )

# One predictor per deployed endpoint, in the same order as endpoint_name_list;
# the list is logged for inspection.
predictor_list: List = [create_predictor(ep) for ep in endpoint_name_list]
logger.info(predictor_list)

[2024-01-09 15:55:47,658] p3238 {2677505418.py:14} INFO - [<sagemaker.base_predictor.Predictor object at 0x7f2e11d18790>, <sagemaker.base_predictor.Predictor object at 0x7f2e10ba7b20>]


In [9]:
def safe_sum(l: List) -> Union[int, float]:
    """Add up the truthy entries of ``l``.

    ``None`` (and any other falsy entry, such as 0) is skipped, so a list
    containing missing values can be summed without raising TypeError.
    An empty list yields 0.
    """
    total = 0
    for value in l:
        if value:
            total = total + value
    return total

def safe_div(n: Union[int, float], d: Union[int, float]) -> Optional[Union[int, float]]:
    """Return ``n / d``, or ``None`` when the denominator is falsy (0 or None)."""
    if not d:
        return None
    return n / d

def calculate_metrics(responses: List[Dict],
                      chunk: List,
                      elapsed_async: float,
                      experiment_name: str,
                      concurrency: int,
                      payload_file: str) -> Dict:
    """Aggregate the responses of one concurrently-issued chunk into summary metrics.

    A response counts as an error when its 'completion' is None. Any ratio
    whose denominator is zero (empty chunk, zero elapsed time, no successes)
    is reported as None instead of raising ZeroDivisionError.

    Args:
        responses: per-request result dicts; the keys 'completion',
            'prompt_tokens', 'completion_tokens' and 'latency' are read.
        chunk: the payloads that were sent; only its length is used.
        elapsed_async: elapsed time for the whole chunk, used as the
            denominator of every per-second rate.
        experiment_name: copied verbatim into the result.
        concurrency: copied verbatim into the result.
        payload_file: copied verbatim into the result.

    Returns:
        A dict of metrics. Note that 'errors' holds the list of failed
        response dicts (not a count); 'transactions' is ``len(chunk)``.
    """
    def _per_second(total: Union[int, float]) -> Optional[float]:
        # Rate rounded to 2 decimals, or None when elapsed_async is zero.
        rate = safe_div(total, elapsed_async)
        return None if rate is None else round(rate, 2)

    errors = [r for r in responses if r['completion'] is None]
    successes = len(chunk) - len(errors)

    # NOTE(review): the token totals are summed over *all* responses while the
    # means divide by the success count; if failed responses carry a non-None
    # 'prompt_tokens' the prompt mean is inflated — confirm this is intended.
    all_prompts_token_count = safe_sum([r['prompt_tokens'] for r in responses])
    prompt_token_throughput = _per_second(all_prompts_token_count)
    prompt_token_count_mean = safe_div(all_prompts_token_count, successes)

    all_completions_token_count = safe_sum([r['completion_tokens'] for r in responses])
    completion_token_throughput = _per_second(all_completions_token_count)
    completion_token_count_mean = safe_div(all_completions_token_count, successes)

    transactions_per_second = _per_second(successes)
    # Derived from the already-rounded per-second value, as before.
    transactions_per_minute = (
        None if transactions_per_second is None else int(transactions_per_second * 60)
    )
    latency_mean = safe_div(safe_sum([r['latency'] for r in responses]), successes)

    return {
        'experiment_name': experiment_name,
        'concurrency': concurrency,
        'payload_file': payload_file,
        'errors': errors,
        'successes': successes,
        'error_rate': safe_div(len(errors), len(chunk)),
        'all_prompts_token_count': all_prompts_token_count,
        'prompt_token_count_mean': prompt_token_count_mean,
        'prompt_token_throughput': prompt_token_throughput,
        'all_completions_token_count': all_completions_token_count,
        'completion_token_count_mean': completion_token_count_mean,
        'completion_token_throughput': completion_token_throughput,
        'transactions': len(chunk),
        'transactions_per_second': transactions_per_second,
        'transactions_per_minute': transactions_per_minute,
        'latency_mean': latency_mean
    }

In [10]:
def set_metrics(endpoint_name: Optional[str] = None,
                prompt: Optional[str] = None,
                inference_params: Optional[Dict] = None,
                completion: Optional[str] = None,
                prompt_tokens: Optional[int] = None,
                completion_tokens: Optional[int] = None,
                latency: Optional[float] = None) -> Dict:
    """Assemble the flat per-request result record.

    The entries of ``inference_params`` are spread into the record as
    top-level keys, placed between 'prompt' and 'completion'. When
    ``inference_params`` is None (the default) no extra keys are added;
    previously the default raised TypeError because ``**None`` is invalid.

    Raises:
        TypeError: if ``inference_params`` contains a key that collides with
            one of the fixed record keys (e.g. 'prompt').
    """
    return dict(endpoint_name=endpoint_name,
                prompt=prompt,
                **(inference_params or {}),
                completion=completion,
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                latency=latency)

def get_inference(predictor, payload) -> Dict:
    """Send one payload to the endpoint and return a per-request result record.

    On success the record (built by ``set_metrics``) contains the prompt, the
    payload's 'parameters', the 'generated_text' from the JSON response, the
    prompt/completion token counts and the latency of ``predictor.predict``.
    On any exception the error is printed and a record with
    ``completion``, ``completion_tokens`` and ``latency`` set to None is
    returned instead of raising.

    Args:
        predictor: object exposing ``endpoint_name`` and ``predict(payload)``.
        payload: dict with 'inputs' (the prompt) and 'parameters'.
    """
    logger.info(f"endpoint={predictor.endpoint_name}, payload={json.dumps(payload, indent=2)}")

    # Bound before the try block so the error path can always reference it,
    # even when the token count itself is what failed.
    prompt_tokens = None
    try:
        prompt_tokens = count_tokens(payload['inputs'])
        print(prompt_tokens)
        st = time.perf_counter()
        response = predictor.predict(payload)
        latency = time.perf_counter() - st  # Inference latency

        # The predictor may hand back raw bytes; decode before parsing as JSON
        if isinstance(response, bytes):
            response = response.decode('utf-8')
        response_dict = json.loads(response)
        completion = response_dict.get("generated_text", "")
        completion_tokens = count_tokens(completion)
        response = set_metrics(predictor.endpoint_name,
                    payload['inputs'],
                    payload['parameters'],
                    completion,
                    prompt_tokens,
                    completion_tokens,
                    latency)
        logger.info(f"endpoint={predictor.endpoint_name}, payload={json.dumps(response, indent=2)}")
    except Exception as e:
        print(f"error occurred with {predictor.endpoint_name}, exception={str(e)}")
        # Use .get() here so a payload missing 'inputs'/'parameters' (which may
        # be the very reason we are in this handler) cannot raise again.
        response = set_metrics(predictor.endpoint_name,
                               payload.get('inputs'),
                               payload.get('parameters') or {},
                               None,
                               prompt_tokens,
                               None,
                               None)

    return response

In [11]:
async def async_get_inference(predictor, payload: Dict) -> Dict:
    """Run the blocking ``get_inference`` call in a separate thread and await its result."""
    inference_result = await asyncio.to_thread(get_inference, predictor, payload)
    return inference_result

async def async_get_all_inferences(predictor, payload_list: List) -> List:
    """Issue one inference per payload concurrently and return the results.

    The results come back from ``asyncio.gather`` in the same order as
    ``payload_list``.
    """
    pending = [async_get_inference(predictor, item) for item in payload_list]
    results = await asyncio.gather(*pending)
    return results

In [12]:
async def run_inferences(predictor: sagemaker.base_predictor.Predictor, chunk: List, experiment: Dict, concurrency: int, payload_file: str) -> Tuple[List, Dict]:
    """Send one chunk of payloads concurrently and compute its metrics.

    Args:
        predictor: predictor for the endpoint under test.
        chunk: payloads to send concurrently.
        experiment: experiment definition; only its 'name' entry is read
            (it is a mapping, not a string).
        concurrency: concurrency level of this run; recorded in the output.
        payload_file: name of the payload file the chunk came from.

    Returns:
        ``(responses, metrics)``: the per-request records, each tagged with
        'experiment_name' and 'concurrency', and the dict produced by
        ``calculate_metrics`` for the whole chunk.
    """
    logger.info(f"Processing chunk with concurrency={concurrency}")
    experiment_name = experiment['name']

    # Time the whole concurrent batch, not the individual requests
    s = time.perf_counter()
    responses = await async_get_all_inferences(predictor, chunk)
    elapsed_async = time.perf_counter() - s

    # Add more metadata about this experiment
    for r in responses:
        r['experiment_name'] = experiment_name
        r['concurrency'] = concurrency

    metrics = calculate_metrics(responses, chunk, elapsed_async, experiment_name, concurrency, payload_file)
    return responses, metrics

In [13]:
## Function to create the predictors from the experiment we are iterating over
def create_predictor_for_experiment(experiment: Dict, config: Dict, endpoint_info_list: List) -> Optional[sagemaker.base_predictor.Predictor]:
    """Create the predictor for the endpoint deployed for ``experiment``.

    Args:
        experiment: experiment definition (a mapping with a 'name' entry);
            must be an element of ``config['experiments']``.
        config: configuration; only ``config['experiments']`` is read.
        endpoint_info_list: endpoint descriptions, each with an
            'experiment_name' and an ``['endpoint']['EndpointName']`` entry.

    Returns:
        A predictor for the first endpoint whose 'experiment_name' matches,
        or None (after logging an error) when there is no such endpoint.

    Raises:
        ValueError: if ``experiment`` is not in ``config['experiments']``.
    """
    # 1-based position of the experiment in the config; used only for logging
    e_idx = config['experiments'].index(experiment) + 1

    # First endpoint entry recorded for this experiment, if any
    ep_info = next(
        (e for e in endpoint_info_list if e['experiment_name'] == experiment['name']),
        None,
    )
    if ep_info is None:
        logger.error(f"endpoint for experiment={experiment['name']} not found, skipping")
        return None
    ep_name = ep_info['endpoint']['EndpointName']
    logger.info(f"experiment={e_idx}, name={experiment['name']}, ep_name={ep_name}")

    # create a predictor from each endpoint in experiments
    return create_predictor(ep_name)

In [14]:
## Here, we will process combinations of concurrency levels, the payload files and then loop through the 
## different combinations to make payloads splitted in terms of the concurrency metric and how we can run 
## it and make inference
def create_combinations(experiment: Dict) -> List[Tuple]:
    """Build the (concurrency, payload file, chunks) work items for an experiment.

    For every pair drawn from ``experiment['concurrency_levels']`` and
    ``experiment['payload_files']`` the payload file is read from PROMPTS_DIR
    as JSON Lines and split into consecutive chunks of ``concurrency``
    payloads. A trailing chunk with fewer than ``concurrency`` payloads is
    dropped so that every chunk exercises the full concurrency level.

    Args:
        experiment: experiment definition (a mapping, not a string).

    Returns:
        A list of ``(concurrency, payload_file, chunks)`` tuples, where
        ``chunks`` is a list of payload lists.
    """
    combinations_data = []

    # Repeat for each concurrency level
    combinations = list(itertools.product(experiment['concurrency_levels'], experiment['payload_files']))
    logger.info(f"there are {len(combinations)} combinations of {combinations} to run")

    for concurrency, payload_file in combinations:
        # Read the payload file: one JSON document per line. Blank lines are
        # skipped so a stray empty line does not abort with JSONDecodeError.
        fpath = os.path.join(PROMPTS_DIR, payload_file)
        payload_list = [
            json.loads(jline)
            for jline in Path(fpath).read_text().splitlines()
            if jline.strip()
        ]
        logger.info(f"read {fpath}, contains {len(payload_list)} lines")

        logger.info(f"creating combinations for concurrency={concurrency}, payload_file={payload_file}, payload_list length={len(payload_list)}")
        # Split the original list into sublists which contain the number of requests we want to send concurrently
        n = concurrency
        n_chunks = (len(payload_list) + n - 1) // n  # ceiling division
        payload_list_splitted = [payload_list[i * n:(i + 1) * n] for i in range(n_chunks)]

        # Only keep lists that have at least concurrency number of elements
        len_before = len(payload_list_splitted)
        payload_list_splitted = [p for p in payload_list_splitted if len(p) == concurrency]
        logger.info(f"after only retaining chunks of length {concurrency}, we have {len(payload_list_splitted)} chunks, previously we had {len_before} chunks")
        combinations_data.append((concurrency, payload_file, payload_list_splitted))
    logger.info(f"there are {len(combinations)} for {experiment}")
    return combinations_data

# process_combinations(experiment, predictor, PROMPTS_DIR)

In [4]:
# for each experiment
#   - for each endpoint and concurrency in an experiment
per_prompt_responses = []
per_concurrency_level_response_metrics = []

for e_idx, experiment in enumerate(config['experiments']):
    e_idx += 1  # Increment experiment index

    # Call do_experiment function to create the predictor object
    predictor = create_predictor_for_experiment(experiment, config, endpoint_info_list)
    if predictor is None:
        logger.error(f"predictor could not be created for experiment={experiment}, moving to next...")
        continue

    # Process combinations of concurrency levels and payload files
    combination_data = create_combinations(experiment)

    for concurrency, payload_file, split_payload in combination_data:
        for chunk_index, chunk in enumerate(split_payload):
            logger.info(f"e_idx={e_idx+1}, chunk_index={chunk_index+1}/{len(split_payload)}")

            # Process each chunk and calculate metrics
            responses, metrics = await run_inferences(predictor, chunk, experiment, concurrency, payload_file)
            if metrics:
                per_concurrency_level_response_metrics.append(metrics)
            per_prompt_responses.extend(responses)
            
            logger.info(f"completed processing chunk {chunk_index+1}/{len(split_payload)} with concurrency={concurrency}")

    logger.info(f"experiment={e_idx+1}, name={experiment['name']}, done")

In [5]:
# Sanity check: log the completion of the first five responses
for i, response in enumerate(itertools.islice(per_prompt_responses, 5)):
    completion = response.get('completion')
    logger.info(f"Response {i}: Completion Exists: {'completion' in response}, Completion Value: {completion}")

# Determine once whether every element is itself a list, report it, and
# flatten one level only in that case
is_list_of_lists = all(isinstance(item, list) for item in per_prompt_responses)
if is_list_of_lists:
    logger.info("per_prompt_responses is a list of lists.")
    per_prompt_responses_flattened = list(itertools.chain.from_iterable(per_prompt_responses))
else:
    logger.info("per_prompt_responses is not a list of lists.")
    per_prompt_responses_flattened = per_prompt_responses

# Keep only dict records that actually carry a completion
per_prompt_responses_valid = [
    record
    for record in per_prompt_responses_flattened
    if isinstance(record, dict) and record.get('completion') is not None
]
logger.info(f"Valid responses: {len(per_prompt_responses_valid)} out of {len(per_prompt_responses_flattened)}")

In [6]:
# One row per valid (completion is not None) response; the last expression
# displays the first rows in the notebook.
df_responses = pd.DataFrame(per_prompt_responses_valid)
logger.info(f"created dataframe of shape {df_responses.shape} from all responses")
df_responses.head()

In [7]:
# Flatten the nested endpoint descriptions into dotted column names
df_endpoints = pd.json_normalize(endpoint_info_list)
# Instance type is taken from the first production variant only
df_endpoints['instance_type'] = df_endpoints['endpoint_config.ProductionVariants'].map(lambda x: x[0]['InstanceType'])
cols_of_interest = ['experiment_name', 
                    'instance_type',
                    'endpoint.EndpointName',
                    'model_config.ModelName',
                    'model_config.PrimaryContainer.Image',   
                    'model_config.PrimaryContainer.ModelDataSource.S3DataSource.S3Uri',
                    'model_config.PrimaryContainer.Environment.OPTION_DTYPE',
                    'model_config.PrimaryContainer.Environment.OPTION_MAX_ROLLING_BATCH_SIZE',
                    'model_config.PrimaryContainer.Environment.OPTION_NEURON_OPTIMIZE_LEVEL',
                    'model_config.PrimaryContainer.Environment.OPTION_N_POSITIONS',
                    'model_config.PrimaryContainer.Environment.OPTION_ROLLING_BATCH',
                    'model_config.PrimaryContainer.Environment.OPTION_TENSOR_PARALLEL_DEGREE',
                    'model_config.PrimaryContainer.Environment.SAGEMAKER_MODEL_SERVER_WORKERS']
# Keep only the columns of interest (raises KeyError if any of them is absent)
df_endpoints = df_endpoints[cols_of_interest]
# Shorten each dotted column name to its last component, e.g.
# 'endpoint.EndpointName' -> 'EndpointName'
cols_of_interest_renamed = [c.split('.')[-1] for c in cols_of_interest]
df_endpoints.columns = cols_of_interest_renamed

# Check if 'experiment_name' column exists in both DataFrames
print("Columns in df_responses:", df_responses.columns)
print("Columns in df_endpoints:", df_endpoints.columns)

# Merge operation: attach the endpoint details to every response row
df_results = pd.merge(left=df_responses, right=df_endpoints, how='left', left_on='experiment_name', right_on='experiment_name')

# Inspect the result
df_results.head()

In [8]:
# Left-join the endpoint details onto the per-request responses by experiment name.
# NOTE(review): the previous cell already computes the identical merge into
# df_results; this cell looks redundant — confirm before removing.
df_results = pd.merge(left=df_responses, right=df_endpoints, how='left', left_on='experiment_name', right_on='experiment_name')
df_results.head()

In [9]:
# Write the merged per-request results as CSV under METRICS_DIR; the
# {datetime} placeholder in the configured path is filled with date_time.
fpath: str = os.path.join(
    METRICS_DIR,
    config['results']['per_inference_request_file'],
).format(datetime=date_time)
df_results.to_csv(path_or_buf=fpath, index=False)
logger.info(f"saved results dataframe of shape={df_results.shape} in {fpath}")

In [10]:
# One row per processed chunk, enriched with the details of the endpoint that
# served the experiment, then written as CSV under METRICS_DIR. The {datetime}
# placeholder in the configured path is filled with date_time.
df_metrics = pd.DataFrame(per_concurrency_level_response_metrics)
df_metrics = pd.merge(left=df_metrics, right=df_endpoints, how='left', left_on='experiment_name', right_on='experiment_name')
fpath: str = os.path.join(METRICS_DIR, config['results']['all_metrics_file']).format(datetime=date_time)
df_metrics.to_csv(fpath, index=False)
logger.info(f"saved metrics results dataframe of shape={df_metrics.shape} in {fpath}")