## Deploy Text Generation Model (GPT NeoXT Chat Base)

#### Imports 

In [2]:
from sagemaker.jumpstart.notebook_utils import list_jumpstart_models
from sagemaker.predictor import Predictor
from sagemaker import get_execution_role
from sagemaker import ModelPackage
from sagemaker.model import Model
from sagemaker import image_uris 
from sagemaker import model_uris
import numpy as np
import sagemaker
import logging
import boto3
import time
import json

##### Setup logging 

In [3]:
# Route the SageMaker SDK's log records (DEBUG and above) to the notebook output.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# Attach the stream handler only once: re-running this cell would otherwise
# stack handlers and print every log line multiple times.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [4]:
# Record the SDK versions this notebook was executed with.
dependency_versions = {
    'sagemaker': sagemaker.__version__,
    'boto3': boto3.__version__,
}
for dependency_name, dependency_version in dependency_versions.items():
    logger.info(f'Using {dependency_name}=={dependency_version}')

Using sagemaker==2.145.0
Using boto3==1.26.111


#### Setup essentials 

In [5]:
# Resolve the AWS region configured for the default boto3 session.
boto_session = boto3.Session()
region = boto_session.region_name
logger.info(f'Region = {region}')

Region = us-east-1


##### Get list of models available in the SageMaker JumpStart model hub

In [6]:
# Fetch the JumpStart model catalogue and report how many entries it holds.
models = list_jumpstart_models()
model_count = len(models)
logger.info(f'Total number of models in SageMaker JumpStart hub = {model_count}')

Total number of models in SageMaker JumpStart hub = 678


##### Setup inference deployment config params

In [7]:
# --- Model selection -------------------------------------------------------
MODEL_ID = 'huggingface-textgeneration2-gpt-neoxt-chat-base-20b-fp16'
MODEL_VERSION = '*'
IMAGE_SCOPE = 'inference'

# --- Hosting resources -----------------------------------------------------
INSTANCE_TYPE = 'ml.p3.8xlarge'
INSTANCE_COUNT = 1
EBS_VOLUME_SIZE = 256  # GB

# --- Timeouts (seconds) ----------------------------------------------------
MODEL_DATA_DOWNLOAD_TIMEOUT = 60 * 60
CONTAINER_STARTUP_HEALTH_CHECK_TIMEOUT = 60 * 60

# --- Request settings ------------------------------------------------------
CONTENT_TYPE = 'application/json'

# Runtime client used later to invoke the endpoint, and the IAM role used
# when creating the model.
client = boto3.client('sagemaker-runtime')
ROLE = get_execution_role()
logger.info(f'Role => {ROLE}')

Role => arn:aws:iam::119174016168:role/service-role/AmazonSageMaker-ExecutionRole-20211014T093628


In [8]:
# Suffix the endpoint name with the current Unix timestamp (whole seconds)
# so each run of the notebook gets its own endpoint name.
unix_time = int(time.time())
endpoint_name = 'neox-chat-' + str(unix_time)
logger.info(f'Endpoint name: {endpoint_name}')

Endpoint name: neox-chat-1686760562


#### Retrieve Image and Model URIs

In [9]:
# Look up the inference container image for the selected JumpStart model.
# The region resolved earlier in the notebook is passed explicitly (instead of
# None) so the image lookup is tied to the same region that was logged.
# `framework` stays None because the image is selected via `model_id`.
deploy_image_uri = image_uris.retrieve(
    region=region,
    framework=None,
    image_scope=IMAGE_SCOPE,
    model_id=MODEL_ID,
    model_version=MODEL_VERSION,
    instance_type=INSTANCE_TYPE,
)
logger.info(f'Deploy image URI => {deploy_image_uri}')

Ignoring unnecessary Python version: py38.
Ignoring unnecessary instance type: ml.p3.8xlarge.
Deploy image URI => 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.0-cu117


In [10]:
# Look up the S3 location of the pre-packaged model artifact for the selected
# JumpStart model. The region resolved earlier in the notebook is passed
# explicitly so the artifact lookup uses the same region as the image lookup.
model_uri = model_uris.retrieve(
    region=region,
    model_id=MODEL_ID,
    model_version=MODEL_VERSION,
    model_scope=IMAGE_SCOPE,
)
logger.info(f'Model URI => {model_uri}')

Model URI => s3://jumpstart-cache-prod-us-east-1/huggingface-infer/prepack/v1.0.1/infer-prepack-huggingface-textgeneration2-gpt-neoxt-chat-base-20b-fp16.tar.gz


In [11]:
# Environment variables handed to the inference container.
# All values must be strings.
env = dict(
    SAGEMAKER_MODEL_SERVER_TIMEOUT='3600',
    MODEL_CACHE_ROOT='/opt/ml/model',
    SAGEMAKER_ENV='1',
    SAGEMAKER_SUBMIT_DIRECTORY='/opt/ml/model/code/',
    SAGEMAKER_PROGRAM='inference.py',
    SAGEMAKER_MODEL_SERVER_WORKERS='1',
    TS_DEFAULT_WORKERS_PER_MODEL='1',
)

#### Create SageMaker model

In [12]:
# Reuse the endpoint name as the model name. The previous
# `.replace('huggingface-textgeneration2-gpt-', '')` call was a no-op because
# the endpoint name is built as 'neox-chat-<timestamp>' and never contains
# that prefix.
model_name = endpoint_name

# Bundle the container image, model artifact, IAM role and container
# environment into a SageMaker Model that can be deployed to an endpoint.
model = Model(
    image_uri=deploy_image_uri,
    model_data=model_uri,
    role=ROLE,
    predictor_cls=Predictor,
    name=model_name,
    env=env,
)

#### Deploy text generation model as SageMaker endpoint for real-time synchronous inference

In [13]:
%%time

# Hosting options for the endpoint, gathered in one place for readability.
deploy_options = {
    'initial_instance_count': INSTANCE_COUNT,
    'instance_type': INSTANCE_TYPE,
    'endpoint_name': endpoint_name,
    'volume_size': EBS_VOLUME_SIZE,
    'model_data_download_timeout': MODEL_DATA_DOWNLOAD_TIMEOUT,
    'container_startup_health_check_timeout': CONTAINER_STARTUP_HEALTH_CHECK_TIMEOUT,
}

# The return value is discarded: the endpoint is invoked later through the
# boto3 runtime client.
_ = model.deploy(**deploy_options)

Creating model with name: neox-chat-1686760562
CreateModel request: {
    "ModelName": "neox-chat-1686760562",
    "ExecutionRoleArn": "arn:aws:iam::119174016168:role/service-role/AmazonSageMaker-ExecutionRole-20211014T093628",
    "PrimaryContainer": {
        "Image": "763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.0-cu117",
        "Environment": {
            "SAGEMAKER_MODEL_SERVER_TIMEOUT": "3600",
            "MODEL_CACHE_ROOT": "/opt/ml/model",
            "SAGEMAKER_ENV": "1",
            "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code/",
            "SAGEMAKER_PROGRAM": "inference.py",
            "SAGEMAKER_MODEL_SERVER_WORKERS": "1",
            "TS_DEFAULT_WORKERS_PER_MODEL": "1"
        },
        "ModelDataUrl": "s3://jumpstart-cache-prod-us-east-1/huggingface-infer/prepack/v1.0.1/infer-prepack-huggingface-textgeneration2-gpt-neoxt-chat-base-20b-fp16.tar.gz"
    },
    "Tags": [
        {
            "Key": "aws-jumpstart-inference-model

--------------------------!CPU times: user 193 ms, sys: 24.7 ms, total: 218 ms
Wall time: 13min 34s


### II. Invoke SageMaker endpoint to test the deployed model for natural language understanding (NLU) and natural language generation (NLG) tasks


***
This model also supports many advanced parameters while performing inference. They include:

* **max_length:** Model generates text until the output length (which includes the input context length) reaches `max_length`. If specified, it must be a positive integer.
* **num_return_sequences:** Number of output sequences returned. If specified, it must be a positive integer.
* **num_beams:** Number of beams used in beam search. If specified, it must be an integer greater than or equal to `num_return_sequences`.
* **no_repeat_ngram_size:** Model ensures that a sequence of words of `no_repeat_ngram_size` is not repeated in the output sequence. If specified, it must be a positive integer greater than 1.
* **temperature:** Controls the randomness in the output. Higher temperature results in output sequence with low-probability words and lower temperature results in output sequence with high-probability words. If `temperature` -> 0, it results in greedy decoding. If specified, it must be a positive float.
* **early_stopping:** If True, text generation is finished when all beam hypotheses reach the end of sentence token. If specified, it must be boolean.
* **do_sample:** If True, sample the next word as per the likelihood. If specified, it must be boolean.
* **top_k:** In each step of text generation, sample from only the `top_k` most likely words. If specified, it must be a positive integer.
* **top_p:** In each step of text generation, sample from the smallest possible set of words with cumulative probability `top_p`. If specified, it must be a float between 0 and 1.
* **seed:** Fix the randomized state for reproducibility. If specified, it must be an integer.
* **stopping_criteria:** A list of strings that, if any member is generated, will stop the text generation process.

We may specify any subset of the parameters mentioned above while invoking an endpoint. Next, we show an example of how to invoke the endpoint with these arguments.

***

In [14]:
# Chat-style prompt: one example exchange followed by the actual request.
# It ends with an open '<bot>:' turn (plus a trailing newline) for the model
# to complete.
prompt = (
    "<human>: hi\n"
    "<bot>:hello\n"
    "<human>: classify the following tweet into 'positive' or 'negative' => the movie was boring\n"
    "<bot>:\n"
)

In [15]:
# Generation settings sent alongside the prompt.
generation_params = {
    'seed': 123,
    'temperature': 0.01,
    'max_new_tokens': 128,
    'num_return_sequences': 1,
    'top_k': 50,
    'top_p': 0.95,
    'do_sample': True,
    'stopping_criteria': ['<human>'],
}

# Request body: the prompt first, followed by the generation settings.
payload = {'text_inputs': prompt, **generation_params}

In [16]:
# Serialize the request body to UTF-8 encoded JSON bytes.
# The guard keeps this cell idempotent: `payload` is rebound from dict to
# bytes here, so re-running the cell without the check would raise a
# TypeError when json.dumps receives bytes.
if not isinstance(payload, (bytes, bytearray)):
    payload = json.dumps(payload).encode('utf-8')

In [17]:
%%time 
# Send the JSON request to the deployed endpoint via the runtime client.
invoke_args = {
    'EndpointName': endpoint_name,
    'ContentType': CONTENT_TYPE,
    'Body': payload,
}
response = client.invoke_endpoint(**invoke_args)

CPU times: user 8.71 ms, sys: 4.14 ms, total: 12.8 ms
Wall time: 1.84 s


#### Parse response to extract completion

In [18]:
# Read the response body and decode it from JSON.
raw_body = response['Body'].read()
model_predictions = json.loads(raw_body)
model_predictions

[[{'generated_text': "<human>: hi\n<bot>:hello\n<human>: classify the following tweet into 'positive' or 'negative' => the movie was boring\n<bot>:\n\nThe sentiment of the tweet is 'negative'. The tweet contains a sentence, and that sentence is a complaint about the movie.\n<human>:"}]]

In [19]:
# First (and only requested) generated sequence; the displayed response shows
# that it echoes the prompt before the model's reply.
generated_text = model_predictions[0][0]['generated_text']

# Drop the echoed prompt. If the echo does not match the prompt exactly,
# fall back to the text after the last '<bot>:' marker.
if generated_text.startswith(prompt):
    reply = generated_text[len(prompt):]
else:
    reply = generated_text.rpartition('<bot>:')[2]

# Cut at the '<human>' stop sequence (present only when generation stopped on
# it) and trim surrounding whitespace. Unlike indexing the second-to-last
# line, this also handles multi-line replies and replies that ended because
# the token limit was reached.
completion = reply.split('<human>', 1)[0].strip()
completion

"The sentiment of the tweet is 'negative'. The tweet contains a sentence, and that sentence is a complaint about the movie."