# gRPC Inference

### Setup

In [1]:
# List the installed packages whose names match grpcio or protobuf, with their versions.
!pip list | grep -e grpcio -e protobuf

grpcio                          1.56.0
grpcio-tools                    1.33.2
protobuf                        3.20.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
!pip install grpcio==1.56.0 grpcio-tools==1.33.2 protobuf==3.20.3


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Re-list the grpcio / protobuf packages to confirm the pinned versions are installed.
!pip list | grep -e grpcio -e protobuf

grpcio                          1.56.0
grpcio-tools                    1.33.2
protobuf                        3.20.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# Host and port used to open the gRPC channel to the serving endpoint.
grpc_host, grpc_port = "modelmesh-serving", 8033

# Names under which the individual models are addressed in gRPC requests.
textencoder_model_name = "textencoder"
unet_model_name = "unet"
vaeencoder_model_name = "vaeencoder"
vaedecoder_model_name = "vaedecoder"

### Inspecting the gRPC Endpoint

Let's check out the gRPC endpoint's model metadata.

In [5]:
import grpc
import grpc_predict_v2_pb2
import grpc_predict_v2_pb2_grpc


# Open a plaintext (insecure) channel to the serving endpoint and create the
# inference-service stub; the request function defined later reuses `stub`.
channel = grpc.insecure_channel(target=f"{grpc_host}:{grpc_port}")
stub = grpc_predict_v2_pb2_grpc.GRPCInferenceServiceStub(channel)

# Ask the server for the metadata of a single model and print it.
# To inspect a different model, assign textencoder_model_name,
# vaeencoder_model_name, or vaedecoder_model_name to `request.name` instead.
request = grpc_predict_v2_pb2.ModelMetadataRequest()
request.name = unet_model_name
response = stub.ModelMetadata(request)
print(response)


name: "unet__isvc-6ee2a9ae97"
versions: "1"
platform: "onnxruntime_onnx"
inputs {
  name: "encoder_hidden_states"
  datatype: "FP32"
  shape: -1
  shape: -1
  shape: 768
}
inputs {
  name: "timestep"
  datatype: "INT64"
  shape: -1
  shape: 1
}
inputs {
  name: "sample"
  datatype: "FP32"
  shape: -1
  shape: -1
  shape: -1
  shape: -1
}
outputs {
  name: "out_sample"
  datatype: "FP32"
  shape: -1
  shape: -1
  shape: -1
  shape: -1
}



### Request Function

Defines a helper that builds the gRPC inference request for the UNet model, submits it, and returns the model output as a tensor.

In [6]:
import numpy as np
import torch

# Maps a tensor datatype string to the name of the typed `contents` field
# that carries values of that type in an InferInputTensor.
_CONTENTS_FIELD = {"FP32": "fp32_contents", "INT64": "int64_contents"}


def _as_numpy(values, dtype):
    """Return `values` as a NumPy array of `dtype`; torch tensors are accepted too."""
    if isinstance(values, torch.Tensor):
        # Detach and copy to host memory first so the conversion also works for
        # tensors that track gradients or live on an accelerator.
        values = values.detach().cpu().numpy()
    return np.asarray(values, dtype=dtype)


def _build_input_tensor(name, datatype, array):
    """Create one InferInputTensor whose shape and contents both come from `array`."""
    # The nested message class is reached through the class itself; there is
    # no need to instantiate a throw-away ModelInferRequest for it.
    tensor = grpc_predict_v2_pb2.ModelInferRequest.InferInputTensor()
    tensor.name = name
    tensor.datatype = datatype
    tensor.shape.extend(array.shape)
    # Row-major flattening; `tolist()` yields plain Python scalars, which the
    # protobuf repeated fields accept directly.
    getattr(tensor.contents, _CONTENTS_FIELD[datatype]).extend(array.ravel().tolist())
    return tensor


def unet_grpc_request(encoder_hidden_states, timestep, sample):
    """Run one UNet inference call over gRPC and return the output as a tensor.

    The shape sent for each input is taken from the data itself instead of
    being hard-coded, so other batch sizes and latent resolutions work as well.

    Args:
        encoder_hidden_states: Array-like (NumPy array or torch tensor) sent as
            the FP32 input "encoder_hidden_states" with its own shape.
        timestep: Array-like of integers sent as the INT64 input "timestep".
            It is reshaped to (-1, 1), i.e. one timestep per row, matching the
            two-dimensional `[-1, 1]` shape reported by the model metadata.
        sample: Array-like sent as the FP32 input "sample" with its own shape.

    Returns:
        A float32 `torch.Tensor` built from the first raw output buffer of the
        response. It is reshaped to the shape declared for the first output in
        the response, or to (-1, 4, 64, 64) if the response declares none.
    """
    hidden_states = _as_numpy(encoder_hidden_states, np.float32)
    timesteps = _as_numpy(timestep, np.int64).reshape(-1, 1)
    latents = _as_numpy(sample, np.float32)

    request = grpc_predict_v2_pb2.ModelInferRequest()
    request.model_name = unet_model_name
    request.inputs.extend(
        [
            _build_input_tensor("encoder_hidden_states", "FP32", hidden_states),
            _build_input_tensor("timestep", "INT64", timesteps),
            _build_input_tensor("sample", "FP32", latents),
        ]
    )

    response = stub.ModelInfer(request)
    out_sample = np.frombuffer(response.raw_output_contents[0], dtype=np.float32)

    # Prefer the shape the server declares for the output; keep the previous
    # fixed layout as a fallback when the response carries no output metadata.
    declared_shape = list(response.outputs[0].shape) if response.outputs else []
    out_shape = declared_shape or [-1, 4, 64, 64]

    # `np.frombuffer` returns a read-only view; `torch.tensor` copies it into
    # a tensor that owns its data.
    return torch.tensor(out_sample.reshape(out_shape))

### Run the Request

In [7]:
import numpy as np
import torch

from IPython.display import Image

# Load the previously saved arrays from the current working directory.
text_embeddings = np.load("text_embeddings.npy")
latent_model_input = np.load("latent_model_input.npy")
# NOTE(review): the request cell below passes torch.tensor([1, 1]) as the
# timestep rather than this loaded value; confirm that is intended.
timestep = np.load("t.npy")


In [8]:
import torch
import numpy as np


# Send the loaded embeddings and latents to the UNet model; the timestep is a
# fixed two-element tensor of ones.
out_sample = unet_grpc_request(
    encoder_hidden_states=text_embeddings,
    timestep=torch.tensor([1, 1]),
    sample=latent_model_input,
)

In [9]:
# Show the dtype, the shape, and the values of the returned tensor, one per line.
print(out_sample.dtype, out_sample.shape, out_sample, sep="\n")

torch.float32
torch.Size([2, 4, 64, 64])
tensor([[[[-0.1189, -0.1315, -0.1506,  ..., -0.3761,  0.3015,  0.1556],
          [ 0.1426,  0.1699,  0.1616,  ...,  0.3701, -0.1057,  0.4558],
          [-0.1506, -0.0131, -0.3082,  ..., -0.1181, -0.4185, -0.4856],
          ...,
          [ 0.0302, -0.0854, -0.3603,  ..., -0.4310, -0.0114, -0.0228],
          [ 0.1836, -0.2188, -0.2780,  ...,  0.4091, -0.1020,  0.3825],
          [ 0.1844, -0.0516,  0.2451,  ..., -0.8658, -0.3165,  0.3079]],

         [[ 0.4641,  0.2499, -0.0863,  ...,  0.1582, -0.1716,  0.3040],
          [-0.1720, -0.3084, -0.0990,  ...,  0.0532,  0.2248,  0.3627],
          [ 0.0750, -0.0261,  0.3229,  ..., -0.0181, -0.3284, -0.0850],
          ...,
          [-0.2300,  0.1531,  0.1063,  ..., -0.2409, -0.1880, -0.3696],
          [-0.3809,  0.0884,  0.1340,  ..., -0.3222,  0.1739, -0.1053],
          [-0.0616,  0.2329, -0.0213,  ..., -0.0629, -0.1140, -0.0694]],

         [[ 0.0459, -0.0218, -0.0303,  ...,  0.1230, -0.4340,