In [None]:
import numpy as np
import torch
from transformers import AutoTokenizer
import tritonclient.http as httpclient
from tritonclient.utils import InferenceServerException

# Synchronous HTTP client for the inference server at 127.0.0.1:7000.
# `test_infer` below sends its requests through this module-level client.
triton_client = httpclient.InferenceServerClient(url="127.0.0.1:7000")


def test_infer(
    model_name,
    input_ids__0,
    attention_mask__1,
    token_type_ids__2,
    headers=None,
    request_compression_algorithm=None,
    response_compression_algorithm=None,
):
    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput("input_ids", [1, 512], "INT64"))
    inputs.append(httpclient.InferInput("attention_mask", [1, 512], "INT64"))
    inputs.append(httpclient.InferInput("token_type_ids", [1, 512], "INT64"))

    # Initialize the data
    inputs[0].set_data_from_numpy(input_ids__0, binary_data=False)
    inputs[1].set_data_from_numpy(attention_mask__1, binary_data=False)
    inputs[2].set_data_from_numpy(token_type_ids__2, binary_data=False)

    outputs.append(
        httpclient.InferRequestedOutput("last_hidden_state", binary_data=False)
    )

    results = triton_client.infer(
        model_name,
        inputs,
        headers=headers,
        request_compression_algorithm=request_compression_algorithm,
        response_compression_algorithm=response_compression_algorithm,
    )

    return results


# Tokenizer loaded from the local bge-small-zh-v1.5-onnx directory; later
# cells use it to turn text into the tensors sent for inference.
tokenizer = AutoTokenizer.from_pretrained(
    "/huggingface/sentence_transformers/bge-small-zh-v1.5-onnx/"
)

In [None]:
import tritonclient.grpc.aio as grpcclient

# Async gRPC client (tritonclient.grpc.aio) for the server at 127.0.0.1:7001.
# NOTE(review): the name is spelled `trion_...` (probably meant `triton_...`).
# Other cells reference it under this exact spelling, so any rename has to be
# applied to every use at once.
trion_grpc_client = grpcclient.InferenceServerClient(url="127.0.0.1:7001")

In [None]:
# Query the server's inference statistics over gRPC. No arguments are passed,
# so presumably every loaded model is reported — confirm against the client docs.
result = await trion_grpc_client.get_inference_statistics()
# Bare expression: shown as the cell output.
result

model_stats {
  name: "embedding"
  version: "1"
  last_inference: 1720592176099
  inference_count: 16254
  execution_count: 16254
  inference_stats {
    success {
      count: 16254
      ns: 66613751697476
    }
    fail {
    }
    queue {
      count: 16254
      ns: 66576533712199
    }
    compute_input {
      count: 16254
      ns: 662801534
    }
    compute_infer {
      count: 16254
      ns: 30256391063
    }
    compute_output {
      count: 16254
      ns: 2323889201
    }
    cache_hit {
    }
    cache_miss {
    }
  }
  batch_stats {
    batch_size: 1
    compute_input {
      count: 16254
      ns: 662801534
    }
    compute_infer {
      count: 16254
      ns: 30256391063
    }
    compute_output {
      count: 16254
      ns: 2323889201
    }
  }
}

In [None]:
from sre_parse import FLAGS


async def test_infer_async(
    model_name, input_ids__0, attention_mask__1, token_type_ids__2, headers=None
):
    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput("input_ids", [1, 512], "INT64"))
    inputs.append(grpcclient.InferInput("attention_mask", [1, 512], "INT64"))
    inputs.append(grpcclient.InferInput("token_type_ids", [1, 512], "INT64"))

    # Initialize the data
    inputs[0].set_data_from_numpy(input_ids__0)
    inputs[1].set_data_from_numpy(attention_mask__1)
    inputs[2].set_data_from_numpy(token_type_ids__2)

    outputs.append(grpcclient.InferRequestedOutput("last_hidden_state"))

    results = await trion_grpc_client.infer(
        model_name,
        inputs=inputs,
        outputs=outputs,
        headers=headers,
        compression_algorithm="gzip",
    )

    return results


inputs = tokenizer(
    ["我爱北京天安门"],
    is_split_into_words=False,
    return_tensors="pt",
    padding="max_length",
    max_length=512,
)

results = await test_infer_async(
    "embedding",
    inputs["input_ids"].numpy(),
    inputs["attention_mask"].numpy(),
    inputs["token_type_ids"].numpy(),
)

model_output = torch.from_numpy(results.as_numpy("last_hidden_state"))
embeddings = model_output[:, 0]
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
embeddings

  from sre_parse import FLAGS


tensor([[ 4.8712e-02,  1.1275e-01,  5.5719e-02,  3.1040e-02,  2.5992e-02,
         -2.7015e-02,  1.2640e-02,  3.2104e-02,  3.8956e-02,  1.9829e-02,
          5.6384e-02, -2.4350e-01, -2.2022e-02,  3.0023e-02, -6.0440e-02,
          6.8122e-04, -4.3383e-02, -3.3204e-02, -2.5863e-02, -3.1667e-02,
         -5.8114e-02, -4.7395e-02,  3.5730e-03, -1.9483e-02,  3.2852e-02,
          1.4670e-02,  1.2862e-02, -6.6240e-02, -1.0591e-02, -3.6343e-02,
         -1.0499e-04,  3.6151e-02,  4.2404e-02, -3.7466e-02,  1.4994e-02,
         -8.7496e-02, -7.3626e-02,  1.8903e-03,  7.7163e-03,  7.2207e-02,
         -9.2240e-03, -3.8684e-03, -2.6567e-03, -2.8499e-02, -1.3102e-02,
          4.2263e-03,  1.8580e-02, -8.9796e-03, -8.3271e-03,  3.2754e-02,
         -1.5525e-02,  9.7978e-03,  3.6212e-02, -1.5610e-02,  1.5459e-02,
         -2.4204e-02,  7.4954e-03,  7.8971e-02,  2.0036e-03,  6.4317e-02,
         -9.3554e-03, -1.1949e-02, -5.7218e-02, -3.8853e-02, -6.1532e-02,
          1.4281e-02, -1.7761e-03,  3.

In [None]:
import asyncio

import time

start = time.time()
tasks = []
for i in range(5000):
    inputs = tokenizer(
        ["我爱北京天安门"],
        is_split_into_words=False,
        return_tensors="pt",
        padding="max_length",
        max_length=512,
    )
    tasks.append(
        test_infer_async(
            "embedding",
            inputs["input_ids"].numpy(),
            inputs["attention_mask"].numpy(),
            inputs["token_type_ids"].numpy(),
        )
    )

print("elapsed: ", time.time() - start)
results = await asyncio.gather(*tasks)
print("elapsed: ", time.time() - start)

embedding_list = []
for result in results:
    model_output = torch.from_numpy(result.as_numpy("last_hidden_state"))
    embeddings = model_output[:, 0]
    embedding_list.append(torch.nn.functional.normalize(embeddings, p=2, dim=1))

elapsed:  1.4345321655273438
elapsed:  12.041364431381226


In [None]:
import onnx

# Load the exported ONNX graph and list the tensors a caller has to feed and
# the tensors it gets back.
model = onnx.load(
    "/huggingface/sentence_transformers/bge-small-zh-v1.5-onnx/model.onnx"
)
output = [node.name for node in model.graph.output]

input_all = [node.name for node in model.graph.input]
input_initializer = [node.name for node in model.graph.initializer]

# Graph inputs that are not initializers are the ones fed at run time.
# Filter in declaration order rather than with a set difference: a set has no
# defined order, so the printed list could change from run to run.
# dict.fromkeys() drops duplicate names (as the set did) while keeping order.
initializer_names = set(input_initializer)
net_feed_input = list(
    dict.fromkeys(name for name in input_all if name not in initializer_names)
)

print("Inputs: ", net_feed_input)
print("Outputs: ", output)

Inputs:  ['input_ids', 'token_type_ids', 'attention_mask']
Outputs:  ['last_hidden_state']


In [None]:
from pai_rag.modules.embedding.my_ort_embedding import MyORTModelForFeatureExtraction
from transformers import AutoTokenizer

# Load the same exported model for local inference, requesting the
# CUDAExecutionProvider, together with the tokenizer from the same directory.
# NOTE: this rebinds `model` and `tokenizer`, which earlier cells bound to
# other objects; cells run after this one see these new bindings.
onnx_path = "/huggingface/sentence_transformers/bge-small-zh-v1.5-onnx"
model = MyORTModelForFeatureExtraction.from_pretrained(
    onnx_path, file_name="model.onnx", provider="CUDAExecutionProvider"
)
tokenizer = AutoTokenizer.from_pretrained(onnx_path)
# Truncation limit used by the tokenizer call in the following cell.
max_length = model.config.max_position_embeddings

/huggingface/sentence_transformers/bge-small-zh-v1.5-onnx/model.onnx
False
<onnxruntime.capi.onnxruntime_pybind11_state.SessionOptions object at 0x7f978eede2f0>
[<onnxruntime.capi.onnxruntime_pybind11_state.NodeArg object at 0x7f978eede2f0>, <onnxruntime.capi.onnxruntime_pybind11_state.NodeArg object at 0x7f978eedef30>, <onnxruntime.capi.onnxruntime_pybind11_state.NodeArg object at 0x7f978eede7b0>]
[<onnxruntime.capi.onnxruntime_pybind11_state.NodeArg object at 0x7f96819311b0>]


[0;93m2024-07-10 15:12:47.787512132 [W:onnxruntime:, transformer_memcpy.cc:74 ApplyImpl] 4 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2024-07-10 15:12:47.788369070 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-07-10 15:12:47.788377674 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [None]:
# Run the same sentence through the locally loaded model 5000 times.
text = "我爱北京天安门"
for i in range(5000):
    encoded_input = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        padding=True,
    )
    model_output = model(**encoded_input)

    # First model output, index 0 along the second axis (presumably the [CLS]
    # position — confirm), L2-normalized per row.
    embeddings = torch.nn.functional.normalize(
        model_output[0][:, 0], p=2, dim=1
    )