## Llama 2 Model Deployments (including Neuron on Inferentia-based instances) with Different Model Configurations

-- Before running this notebook, fill in the `config.yml` file with the models (experiments) to deploy as well as your AWS region and SageMaker execution role.

In [1]:
import sys
import time
import json
import boto3
import asyncio
import logging
import pathlib
import importlib.util
from globals import *
from pathlib import Path
from utils import load_config
from typing import Dict, List, Optional
from botocore.exceptions import ClientError

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# global constants
!pygmentize globals.py

import os
import yaml
from enum import Enum
from pathlib import Path

CONFIG_FILE: str = "config.yml"
with open(CONFIG_FILE, 'r') as file:
    config = yaml.safe_load(file)

DATA_DIR: str = "data"
PROMPTS_DIR = os.path.join(DATA_DIR, "prompts")
METRICS_DIR = os.path.join(DATA_DIR, "metrics", config['general']['name'])
METRICS_PER_INFERENCE_DIR  = os.path.join(METRICS_DIR, "per_inference")
METRICS_PER_CHUNK_DIR  = os.path.join(METRICS_DIR, "per_chunk")
MODELS_DIR = os.path.join(DATA_DIR, "models", config['general']['name'])
DATASET_DIR = os.path.join(DATA_DIR, "dataset")
SCRIPTS_DIR: str = "scripts"
DIR_LIST = [DATA_DIR, PROMPTS_DIR, METRICS_DIR, MODELS_DIR, DATASET_DIR, METRICS_PER_INFERENCE_DIR, METRICS_PER_CHUNK_DIR]
TOKENIZER_DIR = 'llama2_tokenizer'

_ = list(map(lambda x: os.makedirs(x, exist_ok=True), DIR_LIST))

ENDPOINT_LIST_FPATH:str = os.path.join(MODELS_DIR, "endpoints.json")
REQUEST_PAYLOAD_FPATH:str = os.path.join(PROMPTS_DIR, "payload.jsonl")
RESULTS_FPATH:str = os.path.j

In [3]:
# Configure root logging once for the notebook: INFO and above, with each
# record showing timestamp, process id, source file:line and level.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
# logger used by the cells below
logger = logging.getLogger(__name__)

In [4]:
# Load the experiment configuration and pull out the two AWS settings
# that the deployment cells need.
config = load_config(CONFIG_FILE)
aws_region, sagemaker_execution_role = (
    config['aws']['region'],
    config['aws']['sagemaker_execution_role'],
)
logger.info(f"aws_region={aws_region}, sagemaker_execution_role={sagemaker_execution_role}")
# dump the full configuration so the run is reproducible from the log
logger.info(f"config={json.dumps(config, indent=2)}")

[2024-01-17 22:37:54,060] p51768 {2442081166.py:4} INFO - aws_region=us-east-1, sagemaker_execution_role=arn:aws:iam::015469603702:role/SageMakerRepoRole
[2024-01-17 22:37:54,062] p51768 {2442081166.py:5} INFO - config={
  "general": {
    "name": "llama2-inf2-g5-p4d-v1"
  },
  "aws": {
    "region": "us-east-1",
    "sagemaker_execution_role": "arn:aws:iam::015469603702:role/SageMakerRepoRole"
  },
  "prompt": {
    "template_file": "prompt_template.txt",
    "all_prompts_file": "all_prompts.csv"
  },
  "datasets": [
    {
      "language": "en",
      "min_length_in_tokens": 1,
      "max_length_in_tokens": 500,
      "payload_file": "payload_{lang}_{min}-{max}.jsonl"
    },
    {
      "language": "en",
      "min_length_in_tokens": 500,
      "max_length_in_tokens": 1000,
      "payload_file": "payload_{lang}_{min}-{max}.jsonl"
    },
    {
      "language": "en",
      "min_length_in_tokens": 1000,
      "max_length_in_tokens": 2000,
      "payload_file": "payload_{lang}_{min}-{ma

In [5]:
# function to deploy a model
def deploy_model(experiment_config: Dict, aws_region: str, role_arn: str) -> Optional[Dict]:
    """Deploy one experiment's model by running its deployment script.

    The script named by ``experiment_config['deployment_script']`` is loaded
    from ``SCRIPTS_DIR`` (resolved against the current working directory) and
    its ``deploy(experiment_config, role_arn)`` function is called.

    Args:
        experiment_config: One experiment entry from the configuration. The
            ``deploy``, ``model_id`` and ``deployment_script`` keys are read.
        aws_region: AWS region; only logged here, it is not passed on to the
            deployment script.
        role_arn: Execution role ARN handed to the script's ``deploy``.

    Returns:
        Whatever the script's ``deploy`` returns, or ``None`` when the
        experiment is not flagged for deployment or a botocore
        ``ClientError`` is raised while deploying.
    """
    logger.info(f"going to deploy {experiment_config}, in {aws_region} with {role_arn}")

    # Any falsy value (False, None, 0, or a missing key) means "do not deploy";
    # an identity check against False would let None/0 slip through.
    deploy = experiment_config.get('deploy', False)
    if not deploy:
        # a configured skip is expected behaviour, not an error
        logger.info(f"skipping deployment of {experiment_config.get('model_id')} because deploy={deploy}")
        return None

    # only the stem is used, so the script is always looked up as <stem>.py in SCRIPTS_DIR
    module_name = Path(experiment_config['deployment_script']).stem
    file_path = os.path.join(pathlib.Path().absolute().resolve(), SCRIPTS_DIR, f"{module_name}.py")
    logger.info(f"going to deploy using code in {file_path}")

    try:
        # import the deployment script as a module straight from its file path
        spec = importlib.util.spec_from_file_location(module_name, file_path)
        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        spec.loader.exec_module(module)
        return module.deploy(experiment_config, role_arn)
    except ClientError as error:
        # go through the logger (with traceback) rather than print so the
        # failure shows up alongside the rest of the deployment log
        logger.exception(f"an error occurred while deploying {experiment_config.get('model_id')}: {error}")
        return None

In [6]:
async def async_deploy_model(experiment_config: Dict, role_arn: str, aws_region: str) -> Optional[Dict]:
    """Run the blocking ``deploy_model`` in a worker thread.

    Args:
        experiment_config: One experiment entry from the configuration.
        role_arn: Execution role ARN to deploy with.
        aws_region: AWS region to deploy in.

    Returns:
        The value returned by ``deploy_model`` for this experiment.
    """
    # deploy_model's positional order is (experiment_config, aws_region, role_arn),
    # which differs from this function's parameter order, so forward by keyword
    # to make sure each value lands in the parameter it is named for.
    return await asyncio.to_thread(deploy_model,
                                   experiment_config,
                                   aws_region=aws_region,
                                   role_arn=role_arn)

async def async_deploy_all_models(config: Dict) -> List[Optional[Dict]]:
    """Deploy every experiment in ``config['experiments']``, a few at a time.

    Experiments are processed in consecutive batches of at most ``n``; the
    deployments inside a batch run concurrently and the next batch starts only
    after the current one has finished.

    Args:
        config: Loaded configuration. Reads ``config['experiments']``,
            ``config['aws']['region']`` and
            ``config['aws']['sagemaker_execution_role']``.

    Returns:
        One result per experiment, in the same order as
        ``config['experiments']``; each is whatever ``deploy_model`` returned.
    """
    experiments: List[Dict] = config['experiments']
    n: int = 4 # max concurrency so as to not get a throttling exception
    aws_region = config['aws']['region']
    role_arn = config['aws']['sagemaker_execution_role']
    results = []
    for start in range(0, len(experiments), n):
        batch = experiments[start:start + n]
        # keyword arguments keep region and role from being swapped
        result = await asyncio.gather(*(async_deploy_model(m,
                                                           role_arn=role_arn,
                                                           aws_region=aws_region) for m in batch))
        results.extend(result)
    return results

In [7]:
# async version: deploy all experiments from `config` and time the whole run
# NOTE: the top-level `await` below relies on the notebook's running event loop;
# it is not valid in a plain Python script.
s = time.perf_counter()
# despite the name, each element is the value returned by deploy_model for one
# experiment (a dict, or None when the experiment was skipped or failed)
endpoint_names = await async_deploy_all_models(config)
elapsed_async = time.perf_counter() - s
print(f"endpoint_names -> {endpoint_names}, deployed in {elapsed_async:0.2f} seconds")

[2024-01-17 22:37:54,101] p51768 {2014866166.py:3} INFO - going to deploy {'name': 'llama2-70b-chat-p4d.24xlarge-tgi-inference-2.0.1-tgi0.9.3-gpu-py39-cu118', 'model_id': 'llama2-70b-chat', 'model_version': '*', 'model_name': 'llama2-70b-MODEL_chat', 'ep_name': 'llama-2-70b-chat-p4d-24xlarge', 'instance_type': 'ml.p4d.24xlarge', 'image_uri': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04', 'deploy': True, 'instance_count': 1, 'deployment_script': 'p4d_hf_tgi.py', 'payload_files': ['payload_en_1-500.jsonl'], 'concurrency_levels': [1], 'accept_eula': True, 'env': {'MODEL_LOADING_TIMEOUT': '3600', 'NUMBER_OF_GPU': 8, 'INSTANCE_COUNT': 1, 'HEALTH_CHECK_TIMEOUT': 300}}, in us-east-1 with arn:aws:iam::015469603702:role/SageMakerRepoRole
[2024-01-17 22:37:54,106] p51768 {2014866166.py:3} INFO - going to deploy {'name': 'llama2-13b-g5.12xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0', 'model_id': 'meta-textgene

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\aroraai\AppData\Local\sagemaker\sagemaker\config.yaml


[2024-01-17 22:37:57,030] p51768 {credentials.py:1278} INFO - Found credentials in shared credentials file: ~/.aws/credentials
[2024-01-17 22:37:57,031] p51768 {credentials.py:1278} INFO - Found credentials in shared credentials file: ~/.aws/credentials
[2024-01-17 22:37:57,031] p51768 {credentials.py:1278} INFO - Found credentials in shared credentials file: ~/.aws/credentials


region name -> us-east-1


[2024-01-17 22:38:00,270] p51768 {p4d_hf_tgi.py:108} INFO - deploying the model using the llm_model and the configurations ....
[2024-01-17 22:38:00,295] p51768 {image_uris.py:579} INFO - Defaulting to only available Python version: py39
[2024-01-17 22:38:00,312] p51768 {image_uris.py:503} INFO - Defaulting to only supported image scope: gpu.
[2024-01-17 22:38:00,313] p51768 {p4d_hf_tgi.py:112} INFO - retrieved the inference uri -> 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04
[2024-01-17 22:38:00,337] p51768 {p4d_hf_tgi.py:116} INFO - the llm_model has been defined .... <sagemaker.huggingface.model.HuggingFaceModel object at 0x000002801277BF90>


first, retrieving the hugging face image uri .....
The image uri being used -> 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04
Setting the model configurations .....
Hugging face model defined using {'HF_MODEL_ID': 'meta-llama/Llama-2-70b-chat-hf', 'SM_NUM_GPUS': '8', 'MAX_INPUT_LENGTH': '4090', 'MAX_TOTAL_TOKENS': '4096', 'MAX_BATCH_TOTAL_TOKENS': '8192', 'HUGGING_FACE_HUB_TOKEN': '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'} -> <sagemaker.huggingface.model.HuggingFaceModel object at 0x000002801277BF90>


Model 'meta-textgeneration-llama-2-13b' requires accepting end-user license agreement (EULA). See https://jumpstart-cache-prod-us-east-1.s3.us-east-1.amazonaws.com/fmhMetadata/eula/llamaEula.txt for terms of use.
[2024-01-17 22:38:00,610] p51768 {utils.py:475} INFO - Model 'meta-textgeneration-llama-2-13b' requires accepting end-user license agreement (EULA). See https://jumpstart-cache-prod-us-east-1.s3.us-east-1.amazonaws.com/fmhMetadata/eula/llamaEula.txt for terms of use.
Using model 'meta-textgeneration-llama-2-13b' with wildcard version identifier '*'. You can pin to version '3.0.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
[2024-01-17 22:38:00,615] p51768 {session.py:3701} INFO - Creating model with name: meta-textgeneration-llama-2-13b-2024-01-18-03-38-00-613
Model 'meta-textgeneration-llama-2-13b' requires accepting end-user license agreement (EULA). See https://jumpstart-cache-prod-us-east-1.s3.us-east-

------------------------------------!-!---!------!

[2024-01-17 22:47:10,863] p51768 {p4d_hf_tgi.py:119} INFO - Deploying the model now ....
[2024-01-17 22:47:11,585] p51768 {p4d_hf_tgi.py:122} INFO - Endpoint status: InService
[2024-01-17 22:47:11,596] p51768 {2014866166.py:3} INFO - going to deploy {'name': 'llama2-13b-inf2.24xlarge-djl-0.24.0-neuronx-sdk-2.14.1-bs=4-tpd=12', 'model_id': 'meta-textgenerationneuron-llama-2-13b-f', 'model_version': '1.0.0', 'model_name': 'llama2-13b', 'ep_name': 'llama-2-13b-inf2-24xlarge', 'instance_type': 'ml.inf2.24xlarge', 'image_uri': '763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1', 'deploy': True, 'instance_count': 1, 'deployment_script': 'jumpstart.py', 'payload_files': ['payload_en_1-500.jsonl'], 'concurrency_levels': [1], 'accept_eula': True, 'env': {'OPTION_DTYPE': 'fp16', 'OPTION_MAX_ROLLING_BATCH_SIZE': '4', 'OPTION_N_POSITIONS': '4096', 'OPTION_TENSOR_PARALLEL_DEGREE': '12', 'SAGEMAKER_MODEL_SERVER_WORKERS': '1', 'SAGEMAKER_TS_RESPONSE_TIMEOUT': '120', 

------------------------------!!endpoint_names -> [{'endpoint_name': 'huggingface-pytorch-tgi-inference-2024-01-18-03-38-07-058', 'experiment_name': 'llama2-70b-chat-p4d.24xlarge-tgi-inference-2.0.1-tgi0.9.3-gpu-py39-cu118'}, {'endpoint_name': 'llama-2-13b-g5-12xlarge-1705549080', 'experiment_name': 'llama2-13b-g5.12xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0'}, {'endpoint_name': 'llama-2-13b-g5-24xlarge-1705549080', 'experiment_name': 'llama2-13b-g5.24xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0'}, {'endpoint_name': 'llama-2-13b-g5-48xlarge-1705549080', 'experiment_name': 'llama2-13b-g5.48xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0'}, {'endpoint_name': 'llama-2-13b-inf2-24xlarge-1705549632', 'experiment_name': 'llama2-13b-inf2.24xlarge-djl-0.24.0-neuronx-sdk-2.14.1-bs=4-tpd=12'}, {'endpoint_name': 'llama-2-13b-inf2-48xlarge-1705549632', 'experiment_name': 'llama2-13b-inf2.48xlarge-djl-0.24.0-neuronx-sdk-2.14.1-bs=4-tpd=24'}], deployed in 1046.00 seco

In [8]:
def get_all_info_for_endpoint(ep: Dict, sm_client=None) -> Optional[Dict]:
    """Collect the endpoint, endpoint-config and model descriptions for one endpoint.

    Args:
        ep: Dict with an ``endpoint_name`` and an ``experiment_name`` key.
        sm_client: Optional SageMaker client to use for the ``describe_*``
            calls. When omitted, ``boto3.client('sagemaker')`` is created,
            which is the original behaviour. Passing one lets the caller reuse
            a single client across endpoints or pin it to a region.

    Returns:
        A dict with the keys ``experiment_name``, ``endpoint``,
        ``endpoint_config`` and ``model_config``, or ``None`` when
        ``ep['endpoint_name']`` is ``None``.
    """
    ep_name = ep['endpoint_name']
    experiment_name = ep['experiment_name']
    if ep_name is None:
        return None
    if sm_client is None:
        sm_client = boto3.client('sagemaker')
    endpoint = sm_client.describe_endpoint(EndpointName=ep_name)
    endpoint_config = sm_client.describe_endpoint_config(EndpointConfigName=endpoint['EndpointConfigName'])
    # only the first production variant's model is described
    model_config = sm_client.describe_model(ModelName=endpoint_config['ProductionVariants'][0]['ModelName'])
    info = dict(experiment_name=experiment_name,
                endpoint=endpoint,
                endpoint_config=endpoint_config,
                model_config=model_config)
    return info

# Describe every deployed endpoint; entries of endpoint_names that are None
# are left out.
all_info = [
    get_all_info_for_endpoint(ep)
    for ep in endpoint_names
    if ep is not None
]
all_info

[{'experiment_name': 'llama2-70b-chat-p4d.24xlarge-tgi-inference-2.0.1-tgi0.9.3-gpu-py39-cu118',
  'endpoint': {'EndpointName': 'huggingface-pytorch-tgi-inference-2024-01-18-03-38-07-058',
   'EndpointArn': 'arn:aws:sagemaker:us-east-1:015469603702:endpoint/huggingface-pytorch-tgi-inference-2024-01-18-03-38-07-058',
   'EndpointConfigName': 'huggingface-pytorch-tgi-inference-2024-01-18-03-38-07-058',
   'ProductionVariants': [{'VariantName': 'AllTraffic',
     'DeployedImages': [{'SpecifiedImage': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04',
       'ResolvedImage': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference@sha256:947399ae8b3fa131fc6d2da99f56c9c41195c7ce7cbd890e1e6c0dc328d238cd',
       'ResolutionTime': datetime.datetime(2024, 1, 17, 22, 38, 8, 323000, tzinfo=tzlocal())}],
     'CurrentWeight': 1.0,
     'DesiredWeight': 1.0,
     'CurrentInstanceCount': 1,
     'Desired

In [9]:
# Persist the collected endpoint info as JSON so that other notebooks can read
# it; default=str turns values json cannot encode natively (e.g. datetimes in
# the describe_* responses) into strings.
Path(ENDPOINT_LIST_FPATH).write_text(
    json.dumps(all_info, default=str, indent=2)
)

28176