# Supervised Fine-Tuning (SFT) with Serverless customization on SageMaker AI

## Lab 4 - LLM Deployment

In this notebook, we deploy the fine-tuned LLM to a SageMaker real-time endpoint.

***

### Prerequisites

#### Setup and dependencies

In [None]:
import boto3
import os
from rich.pretty import pprint
from sagemaker.core.helper.session_helper import Session, get_execution_role

# Bootstrap session, used only to look up the default bucket when none is given.
sess = Session()
sagemaker_session_bucket = None  # set to a bucket name to override the default

if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

# get_execution_role() raises ValueError when it cannot resolve a role;
# in that case fall back to looking the role up by name through IAM.
try:
    role = get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Clients and the final session, pinned to the chosen bucket.
s3_client = boto3.client("s3")
sess = Session(default_bucket=sagemaker_session_bucket)
sm_client = boto3.client("sagemaker", region_name=sess.boto_region_name)
bucket_name = sess.default_bucket()
default_prefix = sess.default_bucket_prefix

print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)

Edit the model package group name and the model package version if needed.

In [None]:
from sagemaker.core.resources import ModelPackage, ModelPackageGroup

base_model_id = "huggingface-llm-qwen2-5-7b-instruct"

# Model package group and version that hold the fine-tuned model.
model_package_group_name = base_model_id + "-mpg"
model_package_version = "1"

# Names of the hosting resources created in this notebook; all share the "-sft" stem.
model_name = f"{base_model_id}-sft"
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"
ic_name = f"{model_name}-ic"

In [None]:
# Resolve the model package group, then derive the ARN of the requested version.
model_package_group = ModelPackageGroup.get(model_package_group_name)

fine_tuned_model_package_group_arn = model_package_group.model_package_group_arn
print(f"Fine-tuned Model Package Group ARN: {fine_tuned_model_package_group_arn}")

# The versioned package ARN is the group ARN with the resource type swapped
# (first occurrence only) and the version appended. The replacement is done
# outside the f-string: reusing double quotes inside a double-quoted f-string
# is a SyntaxError on Python versions before 3.12.
model_package_arn_prefix = fine_tuned_model_package_group_arn.replace(
    "model-package-group", "model-package", 1
)
fine_tuned_model_package_arn = f"{model_package_arn_prefix}/{model_package_version}"
print(f"Fine-tuned Model Package ARN: {fine_tuned_model_package_arn}")

model_package = ModelPackage.get(fine_tuned_model_package_arn)

# S3 URIs always use "/" regardless of the local OS, so build the prefix with
# plain string operations instead of os.path.join. The trailing "/" is kept.
training_output_s3_uri = model_package.inference_specification.containers[
    0
].model_data_source.s3_data_source.s3_uri
model_s3_uri = "/".join(
    [training_output_s3_uri.rstrip("/"), "checkpoints", "hf_merged"]
) + "/"
print(f"Fine-tuned Model S3 Path: {model_s3_uri}")

***

### Utility functions

Utility functions to check the creation status of endpoints and inference components

In [None]:
import time

In [None]:
def wait_for_endpoint(sm_client, endpoint_name, poll_interval=30, timeout=None):
    """Poll an endpoint until it reaches a settled status and return that status.

    Args:
        sm_client: Client exposing ``describe_endpoint(EndpointName=...)``.
        endpoint_name: Name of the endpoint to poll.
        poll_interval: Seconds to sleep between polls (default 30).
        timeout: Optional maximum number of seconds to wait. ``None`` (default)
            waits indefinitely.

    Returns:
        The final ``EndpointStatus`` string.

    Raises:
        TimeoutError: If ``timeout`` is set and elapses before the endpoint settles.
    """
    # Besides InService/Failed, these statuses do not progress on their own;
    # without them the loop would poll forever.
    settled_statuses = {"InService", "Failed", "OutOfService", "UpdateRollbackFailed"}
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        status = sm_client.describe_endpoint(EndpointName=endpoint_name)[
            "EndpointStatus"
        ]
        print(f"Endpoint status: {status}")
        if status in settled_statuses:
            return status
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Endpoint {endpoint_name} still in status {status} after {timeout} seconds"
            )
        time.sleep(poll_interval)


def wait_for_inference_component(sm_client, component_name, poll_interval=30, timeout=None):
    """Poll an inference component until it is InService or Failed.

    Args:
        sm_client: Client exposing
            ``describe_inference_component(InferenceComponentName=...)``.
        component_name: Name of the inference component to poll.
        poll_interval: Seconds to sleep between polls (default 30).
        timeout: Optional maximum number of seconds to wait. ``None`` (default)
            waits indefinitely.

    Returns:
        The final ``InferenceComponentStatus`` string.

    Raises:
        TimeoutError: If ``timeout`` is set and elapses before the component settles.
    """
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        status = sm_client.describe_inference_component(
            InferenceComponentName=component_name
        )["InferenceComponentStatus"]
        print(f"Inference component status: {status}")
        if status in ("InService", "Failed"):
            return status
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Inference component {component_name} still in status {status} "
                f"after {timeout} seconds"
            )
        time.sleep(poll_interval)

***

### Create Endpoint Configuration

Define inference configuration

In [None]:
# Hosting settings used when creating the endpoint configuration.
instance_type = "ml.g5.2xlarge"
instance_count = 1
number_of_gpu = 1
health_check_timeout = 700  # seconds

In [None]:
# Endpoint configuration with a single variant and no model attached: the model
# is added later as an inference component that targets this variant.
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ExecutionRoleArn=role,
    ProductionVariants=[
        {
            "VariantName": "AllTraffic",
            "InstanceType": instance_type,
            "InitialInstanceCount": instance_count,
            "ModelDataDownloadTimeoutInSeconds": health_check_timeout,
            # health_check_timeout was previously applied only to the model
            # download; also apply it to the container startup health check,
            # which is the setting its name refers to.
            "ContainerStartupHealthCheckTimeoutInSeconds": health_check_timeout,
            "RoutingConfig": {"RoutingStrategy": "LEAST_OUTSTANDING_REQUESTS"},
        }
    ],
)

pprint(endpoint_config_response)

### Create Endpoint

A SageMaker Endpoint is a fully managed, always-on HTTPS API that hosts your deployed model and serves real-time inference requests.

In [None]:
# Request the endpoint from the configuration created above. The call returns
# before the endpoint is ready, which is why the next cell polls its status.
endpoint_response = sm_client.create_endpoint(
    EndpointConfigName=endpoint_config_name,
    EndpointName=endpoint_name,
)
pprint(endpoint_response)

Wait until the endpoint has finished creating.

In [None]:
# Blocks until the endpoint settles; the returned status is shown as the cell output.
wait_for_endpoint(sm_client=sm_client, endpoint_name=endpoint_name)

### Create Model from Model Package

Get the image URI

In [None]:
import json
from sagemaker.core import image_uris

In [None]:
# Look up a DJL LMI image for the session region; only its registry host is reused.
image_uri = image_uris.retrieve(
    framework="djl-lmi",
    version="latest",
    region=sess.boto_region_name,
)

# Keep everything before the first "/" and pin the repository and tag explicitly.
image_uri = f'{image_uri.split("/")[0]}/djl-inference:0.36.0-lmi18.0.0-cu128'

image_uri

In [None]:
# Container environment; every value must be a string.
max_model_len = 32 * 1024

env = dict(
    HF_MODEL_ID="/opt/ml/model",  # path to where sagemaker stores the model
    OPTION_TRUST_REMOTE_CODE="true",
    OPTION_MODEL_LOADING_TIMEOUT="3600",
    OPTION_TENSOR_PARALLEL_DEGREE="max",
    SERVING_FAIL_FAST="true",
    OPTION_ROLLING_BATCH="disable",
    OPTION_ASYNC_MODE="true",
    OPTION_ENTRYPOINT="djl_python.lmi_vllm.vllm_async_service",
    OPTION_DTYPE="bf16",
    OPTION_QUANTIZE="fp8",
    OPTION_MAX_MODEL_LEN=json.dumps(max_model_len),
)

In [None]:
# The model artifacts are read uncompressed from the S3 prefix resolved earlier.
model_data_source = {
    "S3DataSource": {
        "S3Uri": model_s3_uri,
        "S3DataType": "S3Prefix",
        "CompressionType": "None",
    }
}

# Register the model: serving image + environment + artifact location.
model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": image_uri,
        "Environment": env,
        "ModelDataSource": model_data_source,
    },
)

pprint(model_response)

### Create Inference Component

In [None]:
# Attach the model to the endpoint's "AllTraffic" variant as an inference component.
ic_response = sm_client.create_inference_component(
    InferenceComponentName=ic_name,
    EndpointName=endpoint_name,
    VariantName="AllTraffic",
    Specification={
        "ModelName": model_name,
        "ComputeResourceRequirements": {
            "MinMemoryRequiredInMb": 12288,
            # Use the inference configuration value instead of a second
            # hard-coded 1, so changing number_of_gpu takes effect here.
            "NumberOfAcceleratorDevicesRequired": number_of_gpu,
        },
    },
    RuntimeConfig={"CopyCount": 1},
)

pprint(ic_response)

Wait until the inference component has finished creating.

In [None]:
# Blocks until the component settles; the returned status is shown as the cell output.
wait_for_inference_component(sm_client=sm_client, component_name=ic_name)

***

### Test endpoint

In [None]:
import io
import json
import boto3

In [None]:
# Runtime client used for invoking the endpoint (distinct from the "sagemaker" control-plane client).
sagemaker_client = boto3.client("sagemaker-runtime")

### Iterator class for streaming inference

Utility class to parse streaming responses

In [None]:
class LineIterator:
    """Iterate over newline-delimited lines of a chunked event stream.

    ``stream`` is an iterable of event dicts. Events that carry
    ``{"PayloadPart": {"Bytes": ...}}`` are appended to an internal buffer;
    other events are skipped. Each call to ``next()`` returns the next complete
    line as ``bytes`` with the trailing ``"\\n"`` removed. A line may span
    several chunks, and a chunk may hold several lines.

    If the stream ends with data that is not newline-terminated, that remainder
    is returned once as a final line before ``StopIteration`` is raised.
    """

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        # Offset of the first byte in the buffer that has not been returned yet.
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            # Try to read one full line starting at the unread offset.
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()

            if line and line[-1] == ord("\n"):
                self.read_pos += len(line)
                return line[:-1]

            # No complete line buffered yet: pull the next event from the stream.
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if line:
                    # Stream is exhausted but an unterminated tail remains.
                    # Return it once and advance past it; looping again here
                    # would never terminate because no more data can arrive.
                    self.read_pos += len(line)
                    return line
                raise

            if "PayloadPart" not in chunk:
                continue

            # Append at the end of the buffer; the read offset is restored above.
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])

Utility function to parse model answer

In [None]:
def parse_streaming_response(line_str):
    """Extract the text content from one line of a streaming chat response.

    Args:
        line_str: A decoded line from the response stream, optionally prefixed
            with ``data:`` (with or without a following space).

    Returns:
        The value of ``delta["content"]`` from the first choice whose delta has
        a ``content`` key, or ``None`` when the line is blank, is the
        ``[DONE]`` marker, is not valid JSON, or carries no such content.
    """
    payload = line_str.strip()
    if payload.startswith("data:"):
        payload = payload[len("data:"):].strip()

    if not payload or payload == "[DONE]":
        return None

    try:
        data = json.loads(payload)
    except json.JSONDecodeError:
        return None

    # Valid JSON is not necessarily an object (e.g. `null` or a bare number);
    # membership tests on those would raise TypeError, so check shapes first.
    if not isinstance(data, dict):
        return None
    choices = data.get("choices")
    if not isinstance(choices, list):
        return None

    for choice in choices:
        delta = choice.get("delta") if isinstance(choice, dict) else None
        if isinstance(delta, dict) and "content" in delta:
            return delta["content"]

    return None

In [None]:
# Sample multi-part question; sent below as the text of the single user message.
prompt = """
Regarding the temporomandibular joint, which statements are true or false: 
Is the temporomandibular joint a synovial joint? 
Is the articular disc a remnant of the tendon of the medial pterygoid? 
Do gliding movements occur in the lower compartment and rotatory movements occur in the upper compartment? 
Is the joint capsule thick and tight in the lower part and loose and lax in the upper part? 
Does the sphenomandibular ligament act as a false support to the joint and attach to the angle of the mandible?
"""

In [None]:
# Chat request: one user message carrying the prompt, with streaming enabled.
user_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": prompt},
    ],
}

request_body = {
    "messages": [user_message],
    "max_tokens": 4096,
    "temperature": 0.3,
    "top_p": 0.9,
    "stop": ["<|im_end|>"],
    "stream": True,
}

# Target the inference component explicitly on the shared endpoint.
response = sagemaker_client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    InferenceComponentName=ic_name,
    ContentType="application/json",
    Body=json.dumps(request_body),
)

# Print each piece of content as it arrives and keep the full text.
generated_text = ""

for raw_line in LineIterator(response["Body"]):
    if not raw_line:
        continue
    content = parse_streaming_response(raw_line.decode("utf-8"))
    if not content:
        continue
    generated_text += content
    print(content, end="", flush=True)

***

### Delete resources

In [None]:
import boto3

# Re-create the client and resource names so the cleanup cells below do not
# rely on variables defined earlier in the notebook.
sm_client = boto3.client("sagemaker")

base_model_id = "huggingface-llm-qwen2-5-7b-instruct"

model_name = base_model_id + "-sft"
endpoint_config_name = model_name + "-config"
endpoint_name = model_name + "-endpoint"
ic_name = model_name + "-ic"

In [None]:
# Delete the inference component that hosts the model on the endpoint.
# This is the first step of the cleanup sequence.
sm_client.delete_inference_component(InferenceComponentName=ic_name)

In [None]:
# Delete the SageMaker model resource created from the fine-tuned model package.
sm_client.delete_model(ModelName=model_name)

In [None]:
# Delete the endpoint configuration (optional).
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)

In [None]:
# Delete the endpoint itself (optional - run this if you want to remove the endpoint too).
sm_client.delete_endpoint(EndpointName=endpoint_name)