# Adapt TGI container to long context
Reference: https://huggingface.co/amazon/FalconLite

This notebook requires a SageMaker Notebook instance because it uses Docker to customize the TGI container.

In [1]:
import sagemaker
import boto3

# Temporary session, needed only to resolve the account's default bucket.
sess = sagemaker.Session()

# Bucket SageMaker uses for uploaded data, models and logs; SageMaker creates
# it automatically when it does not exist yet. Put a bucket name here to
# override the default bucket.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    # Fallback: look the role ARN up in IAM by its name.
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

# Final session, bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::843197046435:role/service-role/AmazonSageMaker-ExecutionRole-20230626T095793
sagemaker session region: eu-west-1


## Update configuration

In [9]:
# IAM role ARN resolved in the first cell; handed to the model in the deploy cell
ROLE = role
# Hugging Face Hub repository that is downloaded and packaged below
HF_MODEL_ID = "amazon/FalconLite"
# Name passed to build-container/build.sh and used as the repository part of the ECR image URI
REPO_NAME = "falcon-lctx"

## Package FalconLite model

In [3]:
! pip install huggingface_hub hf_transfer

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [4]:
from pathlib import Path
import os

# Set before importing huggingface_hub so the flag is in place when the library loads
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
from huggingface_hub import snapshot_download


# Local directory named after the last segment of the model id (e.g. "FalconLite")
model_tar_dir = Path(HF_MODEL_ID.split("/")[-1])
model_tar_dir.mkdir(exist_ok=True)

# Download the model from Hugging Face into model_tar_dir.
# mkdir() above guarantees the directory exists, so no existence check is needed.
snapshot_download(
    HF_MODEL_ID,
    local_dir=str(model_tar_dir),
    revision="main",
    local_dir_use_symlinks=False,
    # skip other weight formats so that only safetensors weights are fetched
    ignore_patterns=["*.msgpack*", "*.h5*", "*.bin*"],
)

# check that safetensors weights were downloaded and are available
assert any(model_tar_dir.glob("*.safetensors")), "Model download failed"

In [5]:

parent_dir=os.getcwd()
# change to model dir
os.chdir(str(model_tar_dir))
# use pigz for faster and parallel compression
!tar -cf model.tar.gz --use-compress-program=pigz *
# change back to parent dir
os.chdir(parent_dir)

In [6]:
from sagemaker.s3 import S3Uploader
import sagemaker

# Reuse the session configured in the first cell (instead of creating a fresh
# one) so that a custom `sagemaker_session_bucket` is honored for the upload.
s3_model_uri = S3Uploader.upload(
    local_path=str(model_tar_dir / "model.tar.gz"),
    desired_s3_uri=f"s3://{sess.default_bucket()}/{model_tar_dir}",
)

print(f"model uploaded to: {s3_model_uri}")

model uploaded to: s3://sagemaker-eu-west-1-843197046435/FalconLite/model.tar.gz


## Build Custom Container

In [30]:
!cd build-container; bash build.sh {REPO_NAME}

REPO_NAME: falcon-lctx
REGION: eu-west-1
ACCOUNT_ID: 843197046435
0.8.2: Pulling from huggingface/text-generation-inference
Digest: sha256:a19c3bbd772603286eeeea4b6d5e83b210101e8735c0264cc83c389b787031c8
Status: Image is up to date for ghcr.io/huggingface/text-generation-inference:0.8.2
ghcr.io/huggingface/text-generation-inference:0.8.2
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
{
    "repositories": [
        {
            "repositoryArn": "arn:aws:ecr:eu-west-1:843197046435:repository/falcon-lctx",
            "registryId": "843197046435",
            "repositoryName": "falcon-lctx",
            "repositoryUri": "843197046435.dkr.ecr.eu-west-1.amazonaws.com/falcon-lctx",
            "createdAt": 1691177994.0,
            "imageTagMutability": "MUTABLE",
            "imageScanningConfiguration": {
                "scanOnPush": false
            },
            "encryptionConfiguration": {
                "encryptionType": "AES256"
  

In [31]:
# ! docker image rm

## Deploy SageMaker Endpoint

In [32]:
import boto3


def get_aws_region():
    """Return the region name configured on a default boto3 session."""
    return boto3.session.Session().region_name


def get_aws_account_id():
    """Return the "Account" field of the STS caller identity for the default credentials."""
    caller_identity = boto3.client("sts").get_caller_identity()
    return caller_identity["Account"]


# Resolved once here; both values are interpolated into the ECR image URI in the next cell
REGION = get_aws_region()
ACCOUNT_ID = get_aws_account_id()

In [33]:
# URI of the custom image in the account's ECR registry (no tag is specified)
custom_image = f"{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{REPO_NAME}"
# bare expression so the notebook displays the resulting URI
custom_image

'843197046435.dkr.ecr.eu-west-1.amazonaws.com/falcon-lctx'

In [34]:
import sagemaker
import json
from sagemaker.huggingface import HuggingFaceModel
from sagemaker.huggingface import get_huggingface_llm_image_uri


print(f"sagemaker role arn: {ROLE}")
print(f"MODEL_S3_LOCATION: {s3_model_uri}")

instance_type = "ml.g5.12xlarge"
num_gpu = 4

# passed to deploy() as container_startup_health_check_timeout
health_check_timeout = 600

# Environment variables handed to the container. Non-string values go through
# json.dumps so that every value ends up as a string.
config = {
    # load the model from a local path inside the container
    # (presumably where SageMaker unpacks model_data — confirm)
    "HF_MODEL_ID": "/opt/ml/model",
    "SM_NUM_GPUS": json.dumps(num_gpu),
    "MAX_INPUT_LENGTH": json.dumps(12000),
    "MAX_TOTAL_TOKENS": json.dumps(12001),
    "HF_MODEL_QUANTIZE": "gptq",
    "TRUST_REMOTE_CODE": json.dumps(True),
    "MAX_BATCH_PREFILL_TOKENS": json.dumps(12001),
    "MAX_BATCH_TOTAL_TOKENS": json.dumps(12001),
    "GPTQ_BITS": json.dumps(4),
    "GPTQ_GROUPSIZE": json.dumps(128),
    # NOTE(review): presumably consumed by the customized container built
    # above — confirm against build-container
    "DNTK_ALPHA_SCALER": json.dumps(0.25),
}

endpoint_name = sagemaker.utils.name_from_base(f"falconlite-g5-{num_gpu}gpu")

llm_model = HuggingFaceModel(
    role=ROLE, image_uri=custom_image, env=config, model_data=s3_model_uri
)

# Block until the endpoint is in service: the test cell below invokes it right
# away, which fails under "Run All" while the endpoint is still being created.
llm_model.deploy(
    endpoint_name=endpoint_name,
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=health_check_timeout,
    wait=True,
)

print(f"Endpointname: {endpoint_name}")

sagemaker role arn: arn:aws:iam::843197046435:role/service-role/AmazonSageMaker-ExecutionRole-20230626T095793
MODEL_S3_LOCATION: s3://sagemaker-eu-west-1-843197046435/FalconLite/model.tar.gz
Endpointname: falconlite-g5-4gpu-2023-08-04-20-03-43-999


## Test Endpoint

In [35]:
%%time

import boto3
import json


def call_endpoint(text: str, endpoint_name: str):
    """Send `text` to a SageMaker endpoint and return the generated text.

    Args:
        text: Prompt placed in the "inputs" field of the JSON request.
        endpoint_name: Name of the endpoint to invoke.

    Returns:
        The "generated_text" value of the first element of the JSON response.
    """
    runtime = boto3.client("sagemaker-runtime")

    # Fixed generation settings sent along with every request
    request_body = json.dumps(
        {
            "inputs": text,
            "parameters": {
                "max_new_tokens": 250,
                "do_sample": True,
                "temperature": None,
                "typical_p": 0.2,
                "use_cache": True,
                "seed": 1,
            },
        }
    )

    raw_body = runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=request_body,
        ContentType="application/json",
    )["Body"].read()

    generations = json.loads(raw_body.decode())
    return generations[0]["generated_text"]


# Prompt layout: the user text is wrapped in <|prompter|> ... <|endoftext|><|assistant|> markers
prompt_template = "<|prompter|>{text}<|endoftext|><|assistant|>"
text = "What are the main challenges to support a long context for LLM?"
prompt = prompt_template.format(text=text)
print(prompt)

# endpoint_name comes from the deploy cell above
result = call_endpoint(prompt, endpoint_name)
print(result)

<|prompter|>What are the main challenges to support a long context for LLM?<|endoftext|><|assistant|>
Large Language Models (LLMs) have made significant progress in recent years, but supporting a long context still poses several challenges. Some of the main challenges include:

Memory Constraints: LLMs typically require a large amount of memory to store their parameters. This can become a challenge when dealing with long contexts, as the model needs to keep track of a large number of previous tokens.

Inference Time: Inference time can become slow when dealing with long contexts, as the model needs to process a large amount of information. This can be particularly challenging for real-time applications, such as chatbots.

Model Size: Long contexts can increase the size of the model, which can make it more difficult to train and deploy.

Overfitting: When dealing with long contexts, the model may overfit to the training data, which can lead to poor generalization performance.

Inconsist