## Install dependencies

In [None]:
!pip install huggingface_hub hf_transfer

## Update configuration
Please change the `ROLE` variable to a valid SageMaker execution role.

In [None]:
# IAM role ARN passed to HuggingFaceModel at deploy time; replace the XXXX placeholder account ID.
ROLE = "arn:aws:iam::XXXX:role/service-role/AmazonSageMaker-ExecutionRole-20190405T234154"
# Hugging Face Hub repository to download; its last path component becomes the local model directory name.
HF_MODEL_ID = "amazon/FalconLite2"
# Name handed to sm_build.sh and used as the repository part of the ECR image URI.
REPO_NAME = "falconlite2-tgi103-ecr"


## Package FalconLite2 model

In [None]:
from pathlib import Path
import os

# Set before importing huggingface_hub so the hf_transfer download backend is picked up.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
from huggingface_hub import snapshot_download


# Local directory named after the last component of the Hub repo id (e.g. "FalconLite2").
model_tar_dir = Path(HF_MODEL_ID.split("/")[-1])
model_tar_dir.mkdir(exist_ok=True)

# Download the model from Hugging Face into model_tar_dir.
# The directory is guaranteed to exist after mkdir above, so the download is unconditional.
snapshot_download(
    HF_MODEL_ID,
    local_dir=str(model_tar_dir),
    revision="main",
    local_dir_use_symlinks=False,
    ignore_patterns=["*.msgpack*", "*.h5*", "*.bin*"], # to load safetensor weights only
)

# Fail early if no safetensors weights ended up in the model directory.
assert len(list(model_tar_dir.glob("*.safetensors"))) > 0, "Model download failed"



In [None]:
if True:
    parent_dir=os.getcwd()
    # change to model dir
    os.chdir(str(model_tar_dir))
    # use pigz for faster and parallel compression
    !tar -cf model.tar.gz --use-compress-program=pigz *
    # change back to parent dir
    os.chdir(parent_dir)

In [None]:
if True:
    import sagemaker
    from sagemaker.s3 import S3Uploader

    sess = sagemaker.Session()

    # Upload the archive under a prefix named after the model directory in the
    # session's default bucket; the returned S3 URI is used later as model_data.
    archive_path = model_tar_dir / "model.tar.gz"
    s3_prefix = f"s3://{sess.default_bucket()}/{model_tar_dir}"
    s3_model_uri = S3Uploader.upload(local_path=str(archive_path), desired_s3_uri=s3_prefix)

    print(f"model uploaded to: {s3_model_uri}")


## Clean up the Docker build context
If you do not want to delete the model files, manually move the model directory elsewhere before running the next cell.

In [None]:
# Recursively delete the local model directory, including the model.tar.gz created inside it.
!rm -rf {model_tar_dir}

## Build Custom Container
First we augment the Dockerfile for SageMaker.

In [None]:
# Dockerfile instructions appended to the base Dockerfile: copy the SageMaker entrypoint
# script into the image, make it executable, and use it as the container ENTRYPOINT.
sm_entry_stmt = """

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]
CMD [ "" ]
"""

In [None]:
# Build Dockerfile_sm by appending the SageMaker entrypoint instructions to the base Dockerfile.
with open("Dockerfile_rebuild_vllm_rope-theta", "r") as base_file:
    base_dockerfile = base_file.read()

with open("Dockerfile_sm", "w") as sm_file:
    sm_file.write(base_dockerfile + sm_entry_stmt)

Then we build the image. This could take 10 minutes, so feel free to run it directly in the terminal in case the notebook cell times out.  

**Important Note** - Please ensure the `ROLE` has sufficient permissions to push Docker images to Elastic Container Registry.

In [None]:
# Run the build script with the repository name as its only argument.
# NOTE(review): presumably builds Dockerfile_sm and pushes the image to ECR — confirm in sm_build.sh.
!bash sm_build.sh {REPO_NAME}

## Deploy SageMaker Endpoint

In [None]:
import boto3

def get_aws_region():
    """Return the region name of the default boto3 session.

    Raises:
        RuntimeError: if no default region is configured. Without this check the
            missing value would be formatted into the ECR image URI as "None".
    """
    region = boto3.session.Session().region_name
    if not region:
        raise RuntimeError(
            "No default AWS region is configured; set AWS_DEFAULT_REGION or run `aws configure`."
        )
    return region

def get_aws_account_id():
    """Return the AWS account ID reported by STS for the current caller identity."""
    caller_identity = boto3.client("sts").get_caller_identity()
    return caller_identity["Account"]

# Resolved once here; both values are used to build the ECR image URI.
REGION = get_aws_region()
ACCOUNT_ID = get_aws_account_id()


In [None]:
# ECR image URI for the custom container; no tag is given in the URI.
custom_image = f"{ACCOUNT_ID}.dkr.ecr.{REGION}.amazonaws.com/{REPO_NAME}"
# Bare expression so the notebook displays the resolved URI.
custom_image

In [None]:
import sagemaker
import json
from sagemaker.huggingface import HuggingFaceModel
from sagemaker.huggingface import get_huggingface_llm_image_uri


print(f"sagemaker role arn: {ROLE}")
print(f"MODEL_S3_LOCATION: {s3_model_uri}")

# Hosting instance type and the GPU count handed to the container via SM_NUM_GPUS.
instance_type = "ml.g5.12xlarge"
num_gpu = 4

# Seconds allowed for the container to pass its startup health check.
health_check_timeout = 600

# Token limits passed to the container.
max_input_length = 24000
max_total_tokens = 24576

# Container environment. Values must be strings, so json.dumps is used to render
# ints and booleans (e.g. 4 -> "4", True -> "true").
config = {
    "HF_MODEL_ID": "/opt/ml/model",
    "SM_NUM_GPUS": json.dumps(num_gpu),
    "MAX_INPUT_LENGTH": json.dumps(max_input_length),
    "MAX_TOTAL_TOKENS": json.dumps(max_total_tokens),
    "HF_MODEL_QUANTIZE": "gptq",
    "TRUST_REMOTE_CODE": json.dumps(True),
    "MAX_BATCH_PREFILL_TOKENS": json.dumps(max_input_length),
}

endpoint_name = sagemaker.utils.name_from_base(f"falconlite2-g5-{num_gpu}gpu")

llm_model = HuggingFaceModel(
    role=ROLE,
    image_uri=custom_image,
    env=config,
    model_data=s3_model_uri
)

# Print the name before deploying so the endpoint can be tracked while it is created.
print(f"Endpointname: {endpoint_name}")

# Block until the endpoint is in service: the test cells below invoke it immediately,
# which fails under "Run All" if deployment is still in progress.
llm_model.deploy(
    endpoint_name=endpoint_name,
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=health_check_timeout,
    wait=True,
)

## Test Endpoint

In [None]:
%%time

import boto3
import json

def call_endpoint(text:str, endpoint_name:str):
    """Send `text` to the SageMaker endpoint and return the generated text.

    Args:
        text: Fully formatted prompt sent as the "inputs" field of the request.
        endpoint_name: Name of the SageMaker endpoint to invoke.

    Returns:
        The "generated_text" field of the first element in the JSON response.
    """
    request_body = json.dumps(
        {
            "inputs": text,
            "parameters": {
                "max_new_tokens": 250,
                "do_sample": True,
                "temperature": None,
                "typical_p": 0.2,
                "use_cache": True,
                "seed": 1,
            },
        }
    )

    runtime = boto3.client("sagemaker-runtime")
    raw_response = runtime.invoke_endpoint(
        EndpointName=endpoint_name, Body=request_body, ContentType="application/json"
    )

    generations = json.loads(raw_response["Body"].read().decode())
    return generations[0]["generated_text"]


# Template that wraps the user text in the prompter/assistant markers; reused by the long-context cell.
prompt_template = "<|prompter|>{text}<|endoftext|><|assistant|>"
text = "What are the main challenges to support a long context for LLM?"
prompt = prompt_template.format(text=text)
print(prompt)

# Invoke the deployed endpoint with the short prompt and show the generated text.
result = call_endpoint(prompt, endpoint_name)
print(result)

Try a long context of over 13,400 tokens, copied from the [Amazon Aurora FAQs](https://aws.amazon.com/rds/aurora/faqs/).

In [None]:
%%time
with open("example_long_ctx.txt", "r") as ctx_file:
    long_ctx_template = ctx_file.read()

# The context file carries a {my_question} placeholder; fill it in, then wrap the
# whole text in the chat prompt template before calling the endpoint.
question = "please tell me how does pgvector help with Generative AI and give me some examples."
prompt = prompt_template.format(text=long_ctx_template.format(my_question=question))
result = call_endpoint(prompt, endpoint_name)
print(result)
