## Compress model data for artifact upload

In [1]:
import os
import tarfile

# Directory holding the model files, and the archive that is written inside it.
MODEL_DIR = 'model'
ARCHIVE_PATH = os.path.join(MODEL_DIR, 'model.tar.gz')

# Build the archive without os.chdir(): changing the kernel's working directory
# makes this cell fail when it is run a second time (there is no model/model)
# and silently shifts every relative path used by later cells.
with tarfile.open(ARCHIVE_PATH, 'w:gz') as tar:
    # arcname="." stores members relative to the model directory ("./<file>"),
    # exactly as if the archive had been created from inside that directory.
    # tarfile skips the archive file itself while walking the directory.
    tar.add(MODEL_DIR, arcname=".")

## Create SageMaker endpoint

In [None]:
import os
import sagemaker
from sagemaker.pytorch import PyTorchModel
from sagemaker.huggingface.model import HuggingFaceModel


sagemaker_session = sagemaker.Session()
bucket_name = 'abhi-sagemaker'
# Single source of truth for the uploaded archive; derived from bucket_name so
# the bucket only has to be changed in one place.
model_artifacts_location = f's3://{bucket_name}/models/gpt2-artifacts/model.tar.gz'

# Role used to create the model and endpoint, resolved through the same session.
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)


# Describe the model: where the artifacts live and which framework versions
# the inference container should use.
huggingface_model = HuggingFaceModel(
    model_data=model_artifacts_location,
    role=role,
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version='py39',
    sagemaker_session=sagemaker_session,
)

# Deploy the model to SageMaker Inference (creates a live endpoint)
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge"
)


# Smoke-test payload sent to the freshly created endpoint
data = {
    "inputs": "<len> 5 <word> the <text> "
}

# Send the request; as the last expression, the prediction is shown as the cell output
predictor.predict(data)

## Invoke the SageMaker endpoint from a local machine

In [1]:
import json
import boto3

# Name of the deployed SageMaker endpoint and the region the client targets
SAGEMAKER_ENDPOINT_NAME = 'huggingface-pytorch-inference-2024-02-13-03-17-20-651'
AWS_REGION = 'us-east-1'

# Runtime client used to call the endpoint
sagemaker_runtime = boto3.client('sagemaker-runtime', region_name=AWS_REGION)

# Prompt and the JSON-serialisable payload that wraps it
input_text = "<len> 4 <word> the <text> "
request_payload = dict(inputs=input_text)

# Call the endpoint with the payload encoded as JSON
response = sagemaker_runtime.invoke_endpoint(
    ContentType='application/json',
    EndpointName=SAGEMAKER_ENDPOINT_NAME,
    Body=json.dumps(request_payload),
)

# Decode the JSON reply and take 'generated_text' from its first element
response_body = json.loads(response['Body'].read())
output_text = response_body[0]['generated_text']
print("Generated Text:", output_text)


Generated Text: <len> 4 <word> the <text> New Dennis Cash Show


## Measure latency and throughput

In [4]:
import time

# Number of sequential requests sent to the endpoint for the measurement.
NUM_REQUESTS = 100

# Per-request round-trip times, in seconds.
latency = []
for _ in range(NUM_REQUESTS):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=SAGEMAKER_ENDPOINT_NAME,
        Body=json.dumps(request_payload),
        ContentType='application/json'
    )
    # Drain the response body inside the timed window so the measurement
    # covers receiving the full result, not just the start of the response.
    response['Body'].read()
    end_time = time.perf_counter()
    latency.append(end_time - start_time)

# Mean seconds per request, and how many such sequential requests fit in a minute.
mean_latency = sum(latency) / len(latency)
throughput = 60 / mean_latency

print("Latency: ", mean_latency)
print("Throughput per minute: ", throughput)


Latency:  0.5369790315628051
Throughput per minute:  111.7362065803167


## Deployment optimization considerations

- Bigger hammer: use a more powerful SageMaker instance type
- Efficient model: use a distilled or quantized model
- Batch multiple inputs into a single request to reduce the number of requests
- Depending on whether outputs need to be reproducible, we can cache the results
- Send preprocessed requests to offload processing overhead from the server
- Configure the endpoint to process requests in parallel