In [1]:
%pip install -U sagemaker
%pip install -U xgboost
%pip install -U jinja2

Collecting sagemaker
  Downloading sagemaker-2.235.2-py3-none-any.whl.metadata (16 kB)
Collecting boto3<2.0,>=1.34.142 (from sagemaker)
  Downloading boto3-1.35.70-py3-none-any.whl.metadata (6.7 kB)
Collecting numpy<2.0,>=1.9.0 (from sagemaker)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting sagemaker-core<2.0.0,>=1.0.15 (from sagemaker)
  Downloading sagemaker_core-1.0.16-py3-none-any.whl.metadata (4.9 kB)
Collecting botocore<1.36.0,>=1.35.70 (from boto3<2.0,>=1.34.142->sagemaker)
  Downloading botocore-1.35.70-py3-none-any.whl.metadata (5.7 kB)
Collecting mock<5.0,>4.0 (from sagemaker-core<2.0.0,>=1.0.15->sagemaker)
  Downloading mock-4.0.3-py3-none-any.whl.metadata (2.8 kB)
Downloading sagemaker-2.235.2-py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading boto3-1.35.70-py3-none-any.whl (139 kB)
Downloading numpy

In [22]:
import sagemaker
from sagemaker.multidatamodel import MultiDataModel
from sagemaker.model import Model

from pathlib import Path
import boto3
import json
import shutil
import datetime as dt
import tarfile
import xgboost as xgb
import pandas as pd
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor


In [4]:
role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region = sess.boto_region_name

triton_framework = "sagemaker-tritonserver"
version = "24.09"
instance_type = "ml.g5.2xlarge"

test_data_path = (
    "s3://sagemaker-us-east-1-152804913371/nvidia-aws-fraud-detection-demo/test/"
)
trained_model_path = "s3://sagemaker-us-east-1-152804913371/pytorch-training-2024-11-22-15-54-43-056/output/model.tar.gz"

mme_s3_uri = f"s3://{sess.default_bucket()}/xgboost-mme"

mme_triton_image_uri = sagemaker.image_uris.retrieve(
    framework=triton_framework,
    region=region,
    version=version,
    instance_type=instance_type,
)

In [5]:
model_tar_name = Path(trained_model_path).name
!aws s3 cp {trained_model_path} {model_tar_name}
with tarfile.open(model_tar_name, "r:gz") as tar:
    model_file_name = tar.getnames()[0]
    tar.extractall()

print(f"Model file name: {model_file_name}")

download: s3://sagemaker-us-east-1-152804913371/pytorch-training-2024-11-22-15-54-43-056/output/model.tar.gz to ./model.tar.gz
Model file name: xgboost.json


In [6]:
bst = xgb.Booster()
bst.load_model(model_file_name)
num_features = bst.num_features()

In [7]:
from jinja2 import Template

config_template = """backend: "fil"
max_batch_size: 1000
input [                                 
 {  
  name: "input__0"
  data_type: TYPE_FP32
  dims: [ {{ input_size }} ]                    
  } 
]
output [
 {
  name: "output__0"
  data_type: TYPE_FP32
  dims: [ 2 ]
  }
]
instance_group [{ kind: KIND_{{ device }} }]
parameters [
  {
  key: "model_type"
  value: { string_value: "xgboost_json" }
  },
  {
  key: "predict_proba"
  value: { string_value: "true" }
  },
  {
  key: "output_class"
  value: { string_value: "true" }
  },
  {
  key: "threshold"
  value: { string_value: "0.5" }
  },
  {
  key: "storage_type"
  value: { string_value: "AUTO" }
  }
]

dynamic_batching {

}
"""

template = Template(config_template)

In [8]:
model_workspace = Path("workspace")
gpu_model_path = model_workspace / "xgboost_gpu"
cpu_model_path = model_workspace / "xgboost_cpu"

for device, model_path in zip(["GPU", "CPU"], [gpu_model_path, cpu_model_path]):
    model_path.mkdir(parents=True, exist_ok=True)
    with open(model_path / "config.pbtxt", "w") as f:
        f.write(
            template.render(
                input_size=num_features,
                device=device,
            )
        )
    (model_path / "1").mkdir(parents=True, exist_ok=True)
    shutil.copy(model_file_name, model_path / "1" / model_file_name) 

cpu_tar_name = "xgboost_cpu.tar.gz"
gpu_tar_name = "xgboost_gpu.tar.gz"

with tarfile.open(cpu_tar_name, "w:gz") as tar:
    tar.add(cpu_model_path, arcname=cpu_model_path.name)

with tarfile.open(gpu_tar_name, "w:gz") as tar:
    tar.add(gpu_model_path, arcname=gpu_model_path.name)


In [9]:
gpu_s3_uri = sess.upload_data(gpu_tar_name, bucket=sess.default_bucket(), key_prefix="xgboost-mme")
cpu_s3_uri = sess.upload_data(cpu_tar_name, bucket=sess.default_bucket(), key_prefix="xgboost-mme")

In [10]:
model = Model(
    model_data=cpu_s3_uri,
    image_uri=mme_triton_image_uri,
    role=role,
    sagemaker_session=sess
)

mme = MultiDataModel(
    name="triton-fil-" + dt.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"),
    model_data_prefix=mme_s3_uri,
    model=model,
    sagemaker_session=sess,
)

In [11]:
mme.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
)

-----------!

In [13]:
list(mme.list_models())

['/xgboost_cpu.tar.gz', '/xgboost_gpu.tar.gz']

In [14]:
predictor = sagemaker.predictor.Predictor(endpoint_name=mme.endpoint_name, sagemaker_session=sess)

In [15]:
test_data_bucket = test_data_path.split("/")[2]
test_data_prefix = "/".join(test_data_path.split("/")[3:])


test_file = sess.list_s3_files(test_data_bucket, test_data_prefix)[0]
test_file =f"s3://{test_data_bucket}/{test_file}"

In [16]:
df = pd.read_parquet(test_file)

In [17]:
df.drop(columns=["TX_FRAUD_1"], inplace=True)

In [18]:
def prepare_payload(input_data: np.ndarray, num_features: int) -> dict:

    input_data = input_data.reshape(-1, num_features)
    
    payload = {
        "inputs": [
            {
                "name": "input__0",
                "shape": input_data.shape,
                "datatype": "FP32",
                "data": input_data.tolist(),
            }
        ]
    }
    return json.dumps(payload)

def make_batches (data: np.ndarray, batch_size: int) -> list:
    return [data[i:i+batch_size] for i in range(0, len(data), batch_size)]

In [26]:
BATCH_SIZE = 500
payloads = [prepare_payload(batch, num_features=32) for batch in make_batches(df.values, BATCH_SIZE)]

In [27]:
NUM_CLIENTS = 10
from functools import partial

start_time = time.perf_counter()
with ThreadPoolExecutor(max_workers=NUM_CLIENTS) as executor:
    cpu_predict = partial(predictor.predict, target_model="/xgboost_cpu.tar.gz")
    results = list(executor.map(cpu_predict, payloads))
cpu_throughput = len(df) / (time.perf_counter() - start_time)
print(f"CPU throughput: {cpu_throughput:.2f} records per second")

start_time = time.perf_counter()
with ThreadPoolExecutor(max_workers=NUM_CLIENTS) as executor:
    cpu_predict = partial(predictor.predict, target_model="/xgboost_gpu.tar.gz")
    results = list(executor.map(cpu_predict, payloads))
cpu_throughput = len(df) / (time.perf_counter() - start_time)
print(f"GPU throughput: {cpu_throughput:.2f} records per second")

CPU throughput: 123673.93 records per second
GPU throughput: 348148.16 records per second


In [None]:
mme.delete_endpoint()