# Deploy Fine-Tuned Models with Amazon SageMaker AI

In [87]:
import sagemaker
from sagemaker.session import Session
import boto3
import json

In [88]:
# Bootstrap session: used to resolve the default bucket when no name is supplied.
sagemaker_session = Session()
sagemaker_session_bucket = None

if sagemaker_session is not None and sagemaker_session_bucket is None:
    # No bucket name was given, so fall back to the session's default bucket.
    sagemaker_session_bucket = sagemaker_session.default_bucket()

# Resolve the execution role; if the SDK lookup raises ValueError,
# fetch the role ARN by name through IAM instead.
try:
    role = sagemaker.session.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

# Rebuild the session bound to the resolved bucket and record bucket details.
sagemaker_session = Session(default_bucket=sagemaker_session_bucket)
bucket_name = sagemaker_session.default_bucket()
default_prefix = sagemaker_session.default_bucket_prefix

# Clients for SageMaker and STS, both pinned to the session's region.
sm_client, sts = (
    boto3.client(service_name, region_name=sagemaker_session.boto_region_name)
    for service_name in ("sagemaker", "sts")
)

print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sagemaker_session.default_bucket()}",
    f"sagemaker session region: {sagemaker_session.boto_region_name}",
    sep="\n",
)

sagemaker role arn: arn:aws:iam::905418447590:role/service-role/AmazonSageMaker-ExecutionRole-20251117T113783
sagemaker bucket: sagemaker-us-west-2-905418447590
sagemaker session region: us-west-2


## Attach to a Completed Training Job

In [89]:
from sagemaker.estimator import Estimator

# Name of the completed training job whose artifacts will be deployed.
training_job_name = "meta-llama--Llama-3-2-3B-Instruct-finetune-20251118054934"

# Re-attach to that job to recover the S3 location of its model artifact.
pytorch_estimator = Estimator.attach(training_job_name=training_job_name)
s3_model_data_uri = pytorch_estimator.model_data

print(f"Fine-tuned model location: {s3_model_data_uri}")


2025-11-18 06:16:27 Starting - Starting the training job
2025-11-18 06:16:27 Pending - Preparing the instances for training
2025-11-18 06:16:27 Downloading - Downloading the training image
2025-11-18 06:16:27 Training - Training image download completed. Training in progress.
2025-11-18 06:16:27 Uploading - Uploading generated training model
2025-11-18 06:16:27 Completed - Instances not retained as a result of warmpool resource limits being exceeded
Fine-tuned model location: s3://sagemaker-us-west-2-905418447590/meta-llama--Llama-3-2-3B-Instruct-finetune/meta-llama--Llama-3-2-3B-Instruct-finetune-20251118054934/output/model.tar.gz


### Untar the final model weights - `model.tar.gz` and upload the weights to S3

In [90]:
import os
import boto3
import json
import sagemaker
import tarfile
from getpass import getpass
from datetime import datetime
from sagemaker.s3 import S3Downloader, S3Uploader
from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements

In [91]:
# Read the Hugging Face token interactively so it never appears in the notebook source.
# An explicit prompt replaces getpass's default "Password: ", which is misleading here.
hf_token = getpass("Hugging Face access token: ")

 ········


In [92]:
# Local scratch directory that receives the downloaded model archive.
local_model_path = "/tmp/tmp_cache_local_model"
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(local_model_path, exist_ok=True)

In [93]:
# Copy the training job's model artifact from S3 into the local cache directory.
S3Downloader.download(s3_uri=s3_model_data_uri, local_path=local_model_path)

print(f"download model file to {local_model_path}")

In [9]:
# Directory under the local cache where the model archive gets unpacked.
# NOTE(review): the name mentions "gpt-oss" although the attached training job is a
# Llama fine-tune — looks like a leftover; confirm before renaming, since the variable
# and the resulting path are used as-is by the extraction and upload steps.
local_gpt_oss_model_path = os.path.join(local_model_path, "gpt-oss-model-finetuned-spectrum")
os.makedirs(local_gpt_oss_model_path, exist_ok=True)


def untar_file(tar_path: str, destination: str) -> None:
    """Extract a gzip-compressed tar archive into ``destination``.

    Extraction is restricted to the destination directory: on Python versions
    that provide tarfile extraction filters the ``"data"`` filter is used
    (which also avoids the ``extractall`` DeprecationWarning); otherwise every
    member path is checked manually before extracting.

    Args:
        tar_path: Path to the ``.tar.gz`` archive.
        destination: Directory to extract into; created if missing.

    Raises:
        FileNotFoundError: If ``tar_path`` is not an existing file.
        tarfile.TarError: If a member would be written outside ``destination``.
    """
    if not os.path.isfile(tar_path):
        raise FileNotFoundError(f"The file '{tar_path}' does not exist.")

    os.makedirs(destination, exist_ok=True)

    with tarfile.open(tar_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: reject unsafe members
            # (path traversal, absolute paths, special files).
            tar.extractall(path=destination, filter="data")
        else:
            # Older Python without extraction filters: at least block path traversal.
            root = os.path.realpath(destination)
            for member in tar.getmembers():
                target = os.path.realpath(os.path.join(root, member.name))
                if os.path.commonpath([root, target]) != root:
                    raise tarfile.TarError(
                        f"Refusing to extract '{member.name}' outside of '{destination}'."
                    )
            tar.extractall(path=destination)
        print(f"Extracted '{tar_path}' to '{destination}'.")


# untar model file
# The archive was downloaded under its S3 object name, so reuse that basename locally.
model_archive_path = os.path.join(local_model_path, os.path.basename(s3_model_data_uri))

untar_file(tar_path=model_archive_path, destination=local_gpt_oss_model_path)

  tar.extractall(path=destination)


Extracted '/tmp/tmp_cache_local_model/model.tar.gz' to '/tmp/tmp_cache_local_model/gpt-oss-model-finetuned-spectrum'.


In [13]:
# Destination prefix: a "full-model-paths" folder next to the original model.tar.gz.
# S3 URIs always use "/" as the separator, so split and join on "/" directly rather
# than through os.path, whose separator handling is platform-specific.
model_s3_uri = f"{s3_model_data_uri.rsplit('/', 1)[0]}/full-model-paths"

# Upload the extracted model directory; the returned URI is reused as the model data source.
uploaded_model_s3_uri = S3Uploader.upload(
    local_path=local_gpt_oss_model_path,
    desired_s3_uri=model_s3_uri
)
print(f"Uploaded {local_gpt_oss_model_path} to > {uploaded_model_s3_uri}")

Uploaded /tmp/tmp_cache_local_model/gpt-oss-model-finetuned-spectrum to > s3://sagemaker-us-west-2-905418447590/meta-llama--Llama-3-2-3B-Instruct-finetune/meta-llama--Llama-3-2-3B-Instruct-finetune-20251118054934/output/full-model-paths


## Deploy as a SageMaker Endpoint

In [95]:
import time
import sys
from datetime import datetime

In [96]:
# The serving container is pulled from the ECR registry of the session's region.
region = sagemaker_session.boto_region_name
inference_image = (
    "763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128"
).format(region=region)

print(f"inference image: {inference_image}")

inference image: 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128


In [97]:
# Generate the model name once and reuse it as the suffix of every related resource name.
model_name = sagemaker.utils.name_from_base("tuned-model")
inference_component_name, endpoint_config_name, endpoint_name = (
    f"{prefix}-{model_name}" for prefix in ("ic", "epc", "ep")
)

In [98]:
# Hosting settings used by the endpoint configuration and inference component cells.
# Instance type for the endpoint's production variant.
instance_type = "ml.g6e.2xlarge"
# Number of accelerator devices requested for the inference component.
num_gpu = 1
# Production variant name, shared by the endpoint config and the inference component.
variant_name = "AllTraffic"

In [99]:
# Summarize the resource names and instance type used for this deployment.
print(
    f">> Model name: {model_name}",
    f">> IC name: {inference_component_name}",
    f">> Endpoint Config name: {endpoint_config_name}",
    f">> Endpoint name: {endpoint_name}",
    f">> Instance: {instance_type}",
    sep="\n",
)

>> Model name: tuned-model-2025-11-18-20-04-22-021
>> IC name: ic-tuned-model-2025-11-18-20-04-22-021
>> Endpoint Config name: epc-tuned-model-2025-11-18-20-04-22-021
>> Endpoint name: ep-tuned-model-2025-11-18-20-04-22-021
>> Instance: ml.g6e.2xlarge


### Endpoint Configuration

In [100]:
# Single production variant: one instance, with managed scaling pinned to exactly one.
production_variant = {
    "VariantName": variant_name,
    "InstanceType": instance_type,
    "InitialInstanceCount": 1,
    "ModelDataDownloadTimeoutInSeconds": 3600,
    "ContainerStartupHealthCheckTimeoutInSeconds": 3600,
    "ManagedInstanceScaling": {
        "Status": "ENABLED",
        "MinInstanceCount": 1,
        "MaxInstanceCount": 1,
    },
    "RoutingConfig": {"RoutingStrategy": "LEAST_OUTSTANDING_REQUESTS"},
}

# The config carries the execution role and the variant; no model is referenced here.
sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ExecutionRoleArn=role,
    ProductionVariants=[production_variant],
)

{'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:905418447590:endpoint-config/epc-tuned-model-2025-11-18-20-04-22-021',
 'ResponseMetadata': {'RequestId': '6cdaea58-b09d-4ddb-b1ee-e3bee0bab791',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6cdaea58-b09d-4ddb-b1ee-e3bee0bab791',
   'strict-transport-security': 'max-age=47304000; includeSubDomains',
   'x-frame-options': 'DENY',
   'content-security-policy': "frame-ancestors 'none'",
   'cache-control': 'no-cache, no-store, must-revalidate',
   'x-content-type-options': 'nosniff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '120',
   'date': 'Tue, 18 Nov 2025 20:04:39 GMT'},
  'RetryAttempts': 0}}

### SageMaker Endpoint

In [101]:
# Create the endpoint from the endpoint configuration defined above.
sm_client.create_endpoint(EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name)

In [82]:
# Wait for the endpoint created above; the returned endpoint description is the cell output.
sagemaker_session.wait_for_endpoint(endpoint_name)

----!

{'EndpointName': 'ep-tuned-model-2025-11-18-19-46-16-841',
 'EndpointArn': 'arn:aws:sagemaker:us-west-2:905418447590:endpoint/ep-tuned-model-2025-11-18-19-46-16-841',
 'EndpointConfigName': 'epc-tuned-model-2025-11-18-19-46-16-841',
 'ProductionVariants': [{'VariantName': 'AllTraffic',
   'CurrentInstanceCount': 1,
   'DesiredInstanceCount': 1,
   'ManagedInstanceScaling': {'Status': 'ENABLED',
    'MinInstanceCount': 1,
    'MaxInstanceCount': 1},
   'RoutingConfig': {'RoutingStrategy': 'LEAST_OUTSTANDING_REQUESTS'}}],
 'EndpointStatus': 'InService',
 'CreationTime': datetime.datetime(2025, 11, 18, 19, 46, 19, 103000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2025, 11, 18, 19, 48, 26, 877000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'a7a165ea-2687-42e2-b600-70b146d2afe5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a7a165ea-2687-42e2-b600-70b146d2afe5',
   'strict-transport-security': 'max-age=47304000; includeSubDomains',
   'x-frame-o

In [94]:
# Display the S3 prefix holding the extracted model files (used as the model data source below).
uploaded_model_s3_uri

's3://sagemaker-us-west-2-905418447590/meta-llama--Llama-3-2-3B-Instruct-finetune/meta-llama--Llama-3-2-3B-Instruct-finetune-20251118054934/output/full-model-paths'

### Model

In [83]:
# Uncompressed model files are read directly from the uploaded S3 prefix.
model_data_source = {
    "S3DataSource": {
        "S3Uri": f"{uploaded_model_s3_uri}/",
        "S3DataType": "S3Prefix",
        "CompressionType": "None",
    }
}

# Serving container settings, passed as environment variables.
# NOTE(review): HF_TOKEN is embedded in the model definition's environment — confirm
# it is actually required when the weights are loaded from S3.
container_environment = {
    "SAGEMAKER_MODEL_SERVER_WORKERS": "1",
    "MESSAGES_API_ENABLED": "true",
    "OPTION_MAX_ROLLING_BATCH_SIZE": "8",
    "OPTION_MODEL_LOADING_TIMEOUT": "1500",
    "SERVING_FAIL_FAST": "true",
    "OPTION_ROLLING_BATCH": "disable",
    "OPTION_ASYNC_MODE": "true",
    "OPTION_ENTRYPOINT": "djl_python.lmi_vllm.vllm_async_service",
    "OPTION_ENABLE_STREAMING": "true",
    "HF_TOKEN": hf_token,
    "MAX_TOTAL_TOKENS": json.dumps(4096),
}

# Container definition handed to create_model.
model_configuration = {
    "Image": inference_image,
    "ModelDataSource": model_data_source,
    "Environment": container_environment,
}

In [84]:
# Register the model: a single container described by model_configuration.
sm_client.create_model(ModelName=model_name, ExecutionRoleArn=role, Containers=[model_configuration])

{'ModelArn': 'arn:aws:sagemaker:us-west-2:905418447590:model/tuned-model-2025-11-18-19-46-16-841',
 'ResponseMetadata': {'RequestId': '9297deb0-8eac-492e-a88a-7d99d2f3351e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '9297deb0-8eac-492e-a88a-7d99d2f3351e',
   'strict-transport-security': 'max-age=47304000; includeSubDomains',
   'x-frame-options': 'DENY',
   'content-security-policy': "frame-ancestors 'none'",
   'cache-control': 'no-cache, no-store, must-revalidate',
   'x-content-type-options': 'nosniff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '97',
   'date': 'Tue, 18 Nov 2025 19:48:50 GMT'},
  'RetryAttempts': 0}}

### Inference Component

In [85]:
# Resources reserved for each copy of the model on the endpoint instance.
compute_requirements = {
    "NumberOfAcceleratorDevicesRequired": num_gpu,
    "NumberOfCpuCoresRequired": 1,
    "MinMemoryRequiredInMb": 1024,
}

# Place one copy of the model on the endpoint's variant as an inference component.
sm_client.create_inference_component(
    InferenceComponentName=inference_component_name,
    EndpointName=endpoint_name,
    VariantName=variant_name,
    Specification={
        "ModelName": model_name,
        "ComputeResourceRequirements": compute_requirements,
    },
    RuntimeConfig={"CopyCount": 1},
)

{'InferenceComponentArn': 'arn:aws:sagemaker:us-west-2:905418447590:inference-component/ic-tuned-model-2025-11-18-19-46-16-841',
 'ResponseMetadata': {'RequestId': '741fafa1-7b3c-4186-a421-6aba023410d6',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '741fafa1-7b3c-4186-a421-6aba023410d6',
   'strict-transport-security': 'max-age=47304000; includeSubDomains',
   'x-frame-options': 'DENY',
   'content-security-policy': "frame-ancestors 'none'",
   'cache-control': 'no-cache, no-store, must-revalidate',
   'x-content-type-options': 'nosniff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '127',
   'date': 'Tue, 18 Nov 2025 19:48:51 GMT'},
  'RetryAttempts': 0}}

In [86]:
# Poll the inference component every 30 seconds until it reaches a terminal status.
start_time = time.time()
while True:
    desc = sm_client.describe_inference_component(
        InferenceComponentName=inference_component_name
    )
    status = desc["InferenceComponentStatus"]
    # flush=True so each status line shows up immediately while the cell is running.
    print(status, flush=True)
    if status in ["InService", "Failed"]:
        break
    time.sleep(30)
total_time = time.time() - start_time
print(f"\nTotal time taken: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")

# Stop here on failure so that later cells do not try to invoke a broken component.
if status == "Failed":
    raise RuntimeError(
        f"Inference component '{inference_component_name}' failed to deploy: "
        f"{desc.get('FailureReason', 'no failure reason reported')}"
    )

Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating


## Run Inference

Invoke the running endpoint through the boto3 `sagemaker-runtime` client, using either the `invoke_endpoint` or the `invoke_endpoint_with_response_stream` API call. If you already have a deployed endpoint, you don't need to recreate a predictor: follow the example below and invoke the endpoint directly by its endpoint name.

In [102]:
import boto3
import json

In [None]:
# Runtime client used to invoke the endpoint. Pin it to the session's region, like the
# other clients in this notebook, so it cannot pick up a different default region.
sagemaker_runtime = boto3.client(
    "sagemaker-runtime", region_name=sagemaker_session.boto_region_name
)

In [None]:
# Chat-style request: a single user message plus sampling parameters.
chat_messages = [
    {"role": "user", "content": "How many R are in STRAWBERRY? Keep your answer and explanation short!"}
]
prompt = {
    "messages": chat_messages,
    "temperature": 0.7,
    "top_p": 0.8,
    "top_k": 20,
    "max_tokens": 512,
}

# Send the JSON-encoded request to the inference component hosted on the endpoint.
request_body = json.dumps(prompt)
response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    InferenceComponentName=inference_component_name,
    ContentType="application/json",
    Body=request_body,
)

In [None]:
# Read the response body, decode it as UTF-8 JSON, and print the first choice's message text.
raw_body = response["Body"].read()
response_dict = json.loads(raw_body.decode("utf-8"))
first_choice = response_dict["choices"][0]
response_content = first_choice["message"]["content"]
print(response_content)