## LLaMa Model Deployments (for Neuron on Inferentia-based instances) with different model configurations

-- Before running this notebook, edit the `config.yml` file to list the models (experiments) you want to deploy and to set your account's SageMaker execution role.

In [2]:
import sys
import time
import json
import boto3
import asyncio
import logging
import pathlib
import importlib.util
from globals import *
from pathlib import Path
from utils import load_config
from typing import Dict, List, Optional
from botocore.exceptions import ClientError

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
# global constants: print the syntax-highlighted source of globals.py, the module
# whose names are pulled into this notebook by `from globals import *`, so the
# reader can see which constants (CONFIG_FILE, DATA_DIR, ...) are in scope.
!pygmentize globals.py

[34mimport[39;49;00m [04m[36mos[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36myaml[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36menum[39;49;00m [34mimport[39;49;00m Enum[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mpathlib[39;49;00m [34mimport[39;49;00m Path[37m[39;49;00m
[37m[39;49;00m
CONFIG_FILE: [36mstr[39;49;00m = [33m"[39;49;00m[33mconfig.yml[39;49;00m[33m"[39;49;00m[37m[39;49;00m
[34mwith[39;49;00m [36mopen[39;49;00m(CONFIG_FILE, [33m'[39;49;00m[33mr[39;49;00m[33m'[39;49;00m) [34mas[39;49;00m file:[37m[39;49;00m
    config = yaml.safe_load(file)[37m[39;49;00m
[37m[39;49;00m
DATA_DIR: [36mstr[39;49;00m = [33m"[39;49;00m[33mdata[39;49;00m[33m"[39;49;00m[37m[39;49;00m
PROMPTS_DIR = os.path.join(DATA_DIR, [33m"[39;49;00m[33mprompts[39;49;00m[33m"[39;49;00m)[37m[39;49;00m
METRICS_DIR = os.path.join(DATA_DIR, [33m"[39;49;00m[33mmetrics[39;49;00m[33m"[39;49;00m, config[[33m'[39;49;00m[33mge

In [4]:
# Configure the root logger at INFO level; every record is prefixed with the
# timestamp, process id, source file:line and level name.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
# Logger used by the cells of this notebook.
logger = logging.getLogger(__name__)

In [5]:
# Load the experiment configuration and pick out the AWS settings the
# deployment cells need.
config = load_config(CONFIG_FILE)
aws_region, sagemaker_execution_role = (
    config['aws']['region'],
    config['aws']['sagemaker_execution_role'],
)

# Log the AWS settings first, then the complete configuration for reference.
logger.info(f"aws_region={aws_region}, sagemaker_execution_role={sagemaker_execution_role}")
logger.info(f"config={json.dumps(config, indent=2)}")

[2024-01-18 15:34:20,035] p565 {2442081166.py:4} INFO - aws_region=us-east-1, sagemaker_execution_role=arn:aws:iam::015469603702:role/SageMakerRepoRole
[2024-01-18 15:34:20,036] p565 {2442081166.py:5} INFO - config={
  "general": {
    "name": "llama2-inf2-g5-p4d-v1"
  },
  "aws": {
    "region": "us-east-1",
    "sagemaker_execution_role": "arn:aws:iam::015469603702:role/SageMakerRepoRole"
  },
  "prompt": {
    "template_file": "prompt_template.txt",
    "all_prompts_file": "all_prompts.csv"
  },
  "datasets": [
    {
      "language": "en",
      "min_length_in_tokens": 1,
      "max_length_in_tokens": 500,
      "payload_file": "payload_{lang}_{min}-{max}.jsonl"
    },
    {
      "language": "en",
      "min_length_in_tokens": 500,
      "max_length_in_tokens": 1000,
      "payload_file": "payload_{lang}_{min}-{max}.jsonl"
    },
    {
      "language": "en",
      "min_length_in_tokens": 1000,
      "max_length_in_tokens": 2000,
      "payload_file": "payload_{lang}_{min}-{max}.j

In [6]:
# function to deploy a model
def deploy_model(experiment_config: Dict, aws_region: str, role_arn: str) -> Optional[Dict]:
    """Deploy the model for one experiment via its configured deployment script.

    The script ``<SCRIPTS_DIR>/<stem of deployment_script>.py`` is loaded
    dynamically and its ``deploy(experiment_config, role_arn)`` function is called.

    Args:
        experiment_config: One experiment entry from the config. The keys read
            here are ``deploy``, ``deployment_script`` and ``model_id``.
        aws_region: AWS region name; in this function it is only used for logging.
        role_arn: Execution role ARN forwarded to the deployment script.

    Returns:
        Whatever the script's ``deploy`` function returns, or ``None`` when the
        experiment is not flagged for deployment or a botocore ``ClientError``
        was raised while deploying.

    Raises:
        ImportError: If no import spec/loader can be created for the script.
    """
    logger.info(f"going to deploy {experiment_config}, in {aws_region} with {role_arn}")

    # Treat every falsy value (False, None, 0, missing key) as "do not deploy";
    # an identity check against False would let e.g. `deploy: null` through.
    deploy = experiment_config.get('deploy', False)
    if not deploy:
        # Skipping is an expected outcome, not an error.
        logger.info(f"skipping deployment of {experiment_config.get('model_id')} because deploy={deploy}")
        return None

    try:
        module_name = Path(experiment_config['deployment_script']).stem
        # Built with pathlib so the function does not rely on `os` being
        # re-exported by the star import of globals.
        file_path = str(Path.cwd().resolve() / SCRIPTS_DIR / f"{module_name}.py")
        logger.info(f"going to deploy using code in {file_path}")

        spec = importlib.util.spec_from_file_location(module_name, file_path)
        if spec is None or spec.loader is None:
            raise ImportError(f"cannot load deployment script {file_path}")
        module = importlib.util.module_from_spec(spec)
        # Register before executing so the script can be found by name while it runs.
        sys.modules[module_name] = module
        spec.loader.exec_module(module)
        return module.deploy(experiment_config, role_arn)

    except ClientError as error:
        # Best effort: report the AWS error and let the caller carry on with
        # the remaining experiments.
        logger.error(f"an error occurred: {error}")
        return None

In [7]:
async def async_deploy_model(experiment_config: Dict, aws_region: str, role_arn: str) -> Optional[Dict]:
    """Run the blocking ``deploy_model`` in a worker thread.

    The parameters are named and ordered exactly like ``deploy_model``'s
    ``(experiment_config, aws_region, role_arn)``, so positional and keyword
    callers both hand the region and the role to the right argument.

    Args:
        experiment_config: One experiment entry from the config.
        aws_region: AWS region name.
        role_arn: Execution role ARN.

    Returns:
        The result of ``deploy_model`` (a dict, or ``None``).
    """
    return await asyncio.to_thread(deploy_model, experiment_config, aws_region, role_arn)


async def async_deploy_all_models(config: Dict, max_concurrency: int = 4) -> List[Optional[Dict]]:
    """Deploy every experiment in ``config``, at most ``max_concurrency`` at a time.

    Experiments are processed in consecutive batches; a batch must finish
    before the next one starts.

    Args:
        config: Parsed configuration. Reads ``config['experiments']`` and, under
            ``config['aws']``, the ``region`` and ``sagemaker_execution_role``.
        max_concurrency: Batch size. Kept small (default 4) so as to not get a
            throttling exception.

    Returns:
        One result per experiment, in the order the experiments are listed.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError(f"max_concurrency must be >= 1, got {max_concurrency}")

    experiments: List[Dict] = config['experiments']
    if not experiments:
        return []

    aws_region = config['aws']['region']
    role_arn = config['aws']['sagemaker_execution_role']

    results: List[Optional[Dict]] = []
    for start in range(0, len(experiments), max_concurrency):
        batch = experiments[start:start + max_concurrency]
        # Keyword arguments make it impossible to mix up region and role.
        batch_results = await asyncio.gather(
            *(async_deploy_model(exp, aws_region=aws_region, role_arn=role_arn) for exp in batch)
        )
        results.extend(batch_results)
    return results

In [8]:
# async version: deploy all experiments from the config and time the whole run.
# The top-level `await` relies on IPython's support for awaiting directly in a cell.
s = time.perf_counter()
# Despite its name, `endpoint_names` holds one deploy_model result per experiment
# (or None where deploy_model returned nothing), not bare endpoint-name strings.
endpoint_names = await async_deploy_all_models(config)
elapsed_async = time.perf_counter() - s
print(f"endpoint_names -> {endpoint_names}, deployed in {elapsed_async:0.2f} seconds")

[2024-01-18 15:34:20,250] p565 {2014866166.py:3} INFO - going to deploy {'name': 'llama2-13b-g5.12xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0', 'model_id': 'meta-textgeneration-llama-2-13b', 'model_version': '*', 'model_name': 'llama2-13b', 'ep_name': 'llama-2-13b-g5-12xlarge', 'instance_type': 'ml.g5.12xlarge', 'image_uri': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04', 'deploy': True, 'instance_count': 1, 'deployment_script': 'jumpstart.py', 'payload_files': ['payload_en_1-500.jsonl', 'payload_en_500-1000.jsonl', 'payload_en_1000-2000.jsonl', 'payload_en_2000-3000.jsonl', 'payload_en_3000-4000.jsonl'], 'concurrency_levels': [1, 2, 4], 'accept_eula': True, 'env': {'SAGEMAKER_PROGRAM': 'inference.py', 'ENDPOINT_SERVER_TIMEOUT': '3600', 'MODEL_CACHE_ROOT': '/opt/ml/model', 'SAGEMAKER_ENV': '1', 'HF_MODEL_ID': '/opt/ml/model', 'MAX_INPUT_LENGTH': '4095', 'MAX_TOTAL_TOKENS': '4096', 'SM_NUM_GPUS': '4'

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


Model 'meta-textgeneration-llama-2-13b' requires accepting end-user license agreement (EULA). See https://jumpstart-cache-prod-us-east-1.s3.us-east-1.amazonaws.com/fmhMetadata/eula/llamaEula.txt for terms of use.
[2024-01-18 15:34:22,377] p565 {utils.py:475} INFO - Model 'meta-textgeneration-llama-2-13b' requires accepting end-user license agreement (EULA). See https://jumpstart-cache-prod-us-east-1.s3.us-east-1.amazonaws.com/fmhMetadata/eula/llamaEula.txt for terms of use.
Using model 'meta-textgeneration-llama-2-13b' with wildcard version identifier '*'. You can pin to version '3.0.2' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
Model 'meta-textgeneration-llama-2-13b' requires accepting end-user license agreement (EULA). See https://jumpstart-cache-prod-us-east-1.s3.us-east-1.amazonaws.com/fmhMetadata/eula/llamaEula.txt for terms of use.
[2024-01-18 15:34:22,408] p565 {utils.py:475} INFO - Model 'meta-textgenerati

---------------------------!!-!endpoint_names -> [{'endpoint_name': 'llama-2-13b-g5-12xlarge-1705592062', 'experiment_name': 'llama2-13b-g5.12xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0'}, {'endpoint_name': 'llama-2-13b-g5-24xlarge-1705592062', 'experiment_name': 'llama2-13b-g5.24xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0'}, {'endpoint_name': 'llama-2-13b-g5-48xlarge-1705592062', 'experiment_name': 'llama2-13b-g5.48xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0'}], deployed in 339.28 seconds


In [9]:
def get_all_info_for_endpoint(ep: Dict, sm_client=None) -> Optional[Dict]:
    """Collect the endpoint, endpoint-config and model descriptions for one endpoint.

    Args:
        ep: Dict with an ``endpoint_name`` and an ``experiment_name`` entry.
        sm_client: Optional SageMaker client to use (for example one bound to a
            specific region, or a shared client reused across calls). When
            omitted, a default ``boto3.client('sagemaker')`` is created.

    Returns:
        A dict with the keys ``experiment_name``, ``endpoint``,
        ``endpoint_config`` and ``model_config``, or ``None`` when ``ep`` has
        no endpoint name.
    """
    ep_name = ep.get('endpoint_name')
    if ep_name is None:
        # Nothing was deployed for this entry, so there is nothing to describe.
        return None
    experiment_name = ep['experiment_name']

    if sm_client is None:
        sm_client = boto3.client('sagemaker')

    endpoint = sm_client.describe_endpoint(EndpointName=ep_name)
    endpoint_config = sm_client.describe_endpoint_config(EndpointConfigName=endpoint['EndpointConfigName'])
    # Only the first production variant's model is described.
    model_config = sm_client.describe_model(ModelName=endpoint_config['ProductionVariants'][0]['ModelName'])
    return dict(experiment_name=experiment_name,
                endpoint=endpoint,
                endpoint_config=endpoint_config,
                model_config=model_config)

# Describe every endpoint that came back from the deployment step; entries that
# are None are left out before the lookup.
all_info = [get_all_info_for_endpoint(ep) for ep in endpoint_names if ep is not None]
all_info

[{'experiment_name': 'llama2-13b-g5.12xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0',
  'endpoint': {'EndpointName': 'llama-2-13b-g5-12xlarge-1705592062',
   'EndpointArn': 'arn:aws:sagemaker:us-east-1:015469603702:endpoint/llama-2-13b-g5-12xlarge-1705592062',
   'EndpointConfigName': 'llama-2-13b-g5-12xlarge-1705592062',
   'ProductionVariants': [{'VariantName': 'AllTraffic',
     'DeployedImages': [{'SpecifiedImage': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04',
       'ResolvedImage': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference@sha256:2739b630b95d8a95e6b4665e66d8243dd43b99c4fdb865feff13aab9c1da06eb',
       'ResolutionTime': datetime.datetime(2024, 1, 18, 15, 34, 25, 30000, tzinfo=tzlocal())}],
     'CurrentWeight': 1.0,
     'DesiredWeight': 1.0,
     'CurrentInstanceCount': 1,
     'DesiredInstanceCount': 1}],
   'EndpointStatus': 'InService',
   'CreationTim

In [11]:
# Persist the collected endpoint information as JSON so that other notebooks can
# read it. `default=str` stringifies values json cannot encode natively, such as
# the datetime objects in the describe_* responses.
Path(ENDPOINT_LIST_FPATH).write_text(
    json.dumps(all_info, default=str, indent=2)
)

14174