# Deploy NCD model to Triton Inference Server on MIR

description: (preview) deploy a bi-directional attention flow (bidaf) Q&A model to V100s on MIR via Triton

Please note that this Public Preview release is subject to the [Supplemental Terms of Use for Microsoft Azure Previews](https://azure.microsoft.com/support/legal/preview-supplemental-terms/).

In [None]:
import os
from azureml.core import Workspace

# Read the workspace coordinates from the environment, falling back to the
# "<...>" placeholder strings when a variable is not set.
subscription_id = os.environ.get("SUBSCRIPTION_ID", "<subscription_id>")
resource_group = os.environ.get("RESOURCE_GROUP", "<resource_group>")
workspace_name = os.environ.get("WORKSPACE_NAME", "<workspace_name>")

# Fetch the workspace identified by the three values above.
ws = Workspace.get(
    name=workspace_name,
    subscription_id=subscription_id,
    resource_group=resource_group,
)

# Echo the workspace details, one per line.
print(
    "\n".join(
        map(str, (ws.name, ws.resource_group, ws.location, ws.subscription_id))
    )
)

## Download model

In [None]:
import os
import sys
from pathlib import Path
from src.model_utils import download_triton_models, delete_triton_models

# The current directory is the location handed to the download helper; the
# same `prefix` is passed to `delete_triton_models` in the cleanup cell.
prefix = Path(os.curdir)
download_triton_models(prefix)

## Register model

In [None]:
from azureml.core.model import Model

# Register the contents of ./models in the workspace under the name "bidaf-9".
model = Model.register(
    workspace=ws,
    model_name="bidaf-9",
    model_path="./models",
    model_framework=Model.Framework.MULTI,
    description="Question answering from ONNX model zoo",
    tags={
        "area": "Natural language processing",
        "type": "Question-answering",
    },
)

# Show the registered model object.
print(model)

## Deploy webservice

In [None]:
from azureml.contrib.mir.webservice import MirWebservice
from azureml.exceptions import WebserviceException
from azureml.core import Webservice

# Name given to the web service created by this deployment.
service_name = "triton-ncd-bidaf-gpus-1"

# Deployment settings: a single replica on the Standard_NC24s_v3 SKU with
# one CPU core, one GPU core and 4 GB of memory.
mir_config = MirWebservice.deploy_configuration(
    sku="Standard_NC24s_v3",
    num_replicas=1,
    cpu_cores=1,
    gpu_cores=1,
    memory_gb=4,
    tags={"mir.enableMount": "true"},
)

# Deploy the registered model and wait for the deployment, showing its output.
mir_service = Model.deploy(
    ws,
    service_name,
    [model],
    deployment_config=mir_config,
)
mir_service.wait_for_deployment(show_output=True)

In [None]:
# Report the service state and its scoring URI, one per line.
print(mir_service.state, mir_service.scoring_uri, sep="\n")

## Test the webservice

In [None]:
# Obtain the bearer token used to authenticate requests to the service.
access_token = mir_service.get_access_token().access_token
# Do not echo the token itself: notebook output is stored with the notebook,
# so printing it would leak the credential. Only confirm that one was obtained.
print(f"Access token acquired ({len(access_token)} characters).")

# The service endpoint is the scoring URI with "/score" removed.
service_endpoint = mir_service.scoring_uri.replace("/score", "")
print(service_endpoint)

Install the Triton client libraries from https://github.com/triton-inference-server/server/releases.

Open a terminal and run the following commands (the leading `!` is only needed when running them from a notebook cell):

! cd /tmp && \
  wget https://github.com/NVIDIA/triton-inference-server/releases/download/v2.6.0/v2.6.0_ubuntu2004.clients.tar.gz && \
  tar xzf v2.6.0_ubuntu2004.clients.tar.gz && \
  pip install python/tritonclient-2.6.0-py3-none-any.whl numpy pillow

Note that the client archive to download depends on the local OS and its version (in this case, Ubuntu 20.04).

In [None]:
import tritonclient.http as tritonhttpclient
from tritonclient.utils import triton_to_np_dtype

# NOTE(review): `preprocess` and `postprocess` are called below but are not
# defined or imported anywhere in the visible notebook -- presumably they come
# from a helper module under `src`; confirm and import them before running.

# Every request carries the bearer token obtained from the service.
headers = {"Authorization": f"Bearer {access_token}"}

# The client is given the endpoint without its scheme. Split on "://" instead
# of slicing a fixed number of characters so that "http://" endpoints (and
# endpoints with no scheme at all) are handled as well as "https://".
scheme, separator, remainder = service_endpoint.partition("://")
if separator:
    uri = remainder
    # Only a plain-HTTP endpoint turns SSL off.
    use_ssl = scheme.lower() != "http"
else:
    uri = service_endpoint
    use_ssl = True
print(uri)

triton_client = tritonhttpclient.InferenceServerClient(uri, ssl=use_ssl)

context = "A quick brown fox jumped over the lazy dog."
query = "Which animal was lower?"

model_name = "bidaf-9"

# Ask the server which inputs and outputs the model declares.
model_metadata = triton_client.get_model_metadata(
    model_name=model_name, headers=headers
)

input_meta = model_metadata["inputs"]
output_meta = model_metadata["outputs"]

# Per the original author's note, string data uses the NumPy object dtype;
# the dtype is derived from the datatype declared by the first model input.
np_dtype = triton_to_np_dtype(input_meta[0]["datatype"])
cw, cc = preprocess(context, np_dtype)
qw, qc = preprocess(query, np_dtype)

# Model input name -> preprocessed array to send for it.
input_mapping = {
    "query_word": qw,
    "query_char": qc,
    "context_word": cw,
    "context_char": cc,
}

# Populate the inputs array, one InferInput per input declared by the model.
inputs = []
for in_meta in input_meta:
    input_name = in_meta["name"]
    data = input_mapping[input_name]

    # Named `infer_input` so the builtin `input` is not shadowed.
    infer_input = tritonhttpclient.InferInput(
        input_name, data.shape, in_meta["datatype"]
    )
    infer_input.set_data_from_numpy(data, binary_data=False)
    inputs.append(infer_input)

# Populate the outputs array: request every output the model declares.
outputs = [
    tritonhttpclient.InferRequestedOutput(out_meta["name"], binary_data=False)
    for out_meta in output_meta
]

# Run inference
res = triton_client.infer(
    model_name,
    inputs,
    request_id="0",
    outputs=outputs,
    model_version="1",
    headers=headers,
)

result = postprocess(context_words=cw, answer=res)

# Bare expression so the notebook displays the result.
result

## Delete the webservice and the downloaded model

In [None]:
%%time
# Tear down everything this notebook created.
# NOTE(review): the order looks deliberate -- presumably the web service has to
# be removed before the model it serves can be deleted; confirm before reordering.
mir_service.delete()
model.delete()
# Remove the model files that were downloaded under `prefix` earlier.
delete_triton_models(prefix)