In this notebook I will try to convert the models into tensorrt engines by firs converting them into onnx counterparts then using trt_exec cli we will optimize the computation graph' s for our specific hardware

Let us start by our t5 model

In [1]:
import torch
import torch_tensorrt

from wan.utils.timer import Timer

from wan.modules.t5 import T5EncoderModel

# wan_shared_cfg.t5_model = 'umt5_xxl'
# wan_shared_cfg.t5_dtype = torch.bfloat16
# wan_shared_cfg.text_len = 512
# ti2v_5B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
# ti2v_5B.t5_tokenizer = 'google/umt5-xxl'


print("Loading model to cpu first")
model = T5EncoderModel(
    text_len=512, # from config
    dtype=torch.float16,
    device=torch.device("cpu"),
    checkpoint_path="Wan2.2-TI2V-5B/models_t5_umt5-xxl-enc-bf16.pth",
    tokenizer_path="Wan2.2-TI2V-5B/google/umt5-xxl"

)
# print("Transfering model to cuda")
# model.model.eval().to("cuda")



texts = ["Alper example input"]


  from .autonotebook import tqdm as notebook_tqdm
TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops


Loading model to cpu first


Tokenization logic from T5EncoderModel

In [2]:
device = "cpu"
to_optimize = model.model
ids, mask = model.tokenizer(
    texts, return_mask=True, add_special_tokens=True)
ids = ids.to(device).to(torch.int32)
mask = mask.to(device).to(torch.int32)
seq_lens = mask.gt(0).sum(dim=1).long()

In [None]:
torch.onnx.export(
    model,           
    (ids, mask),
    "t5model.onnx",
    opset_version=17,           
    input_names=['input'],      
    output_names=['output']
)

I save the relevant tensors to disk in order not to generate the over and over again

In [12]:
torch.save(ids, open("ids.pth", "wb"))
torch.save(mask, open("mask.pth", "wb"))

In [1]:
import torch
ids = torch.load(open("ids.pth", "rb"))
mask = torch.load(open("mask.pth", "rb"))
ids_device = ids.cuda().to(torch.long).contiguous()
mask_device = mask.cuda().to(torch.long).contiguous()

In [2]:
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit

# 1. Setup Logger and Load Engine
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# engine_path = "/home/alpfischer/Wan2.2/t5_model_last.engine"
engine_path = "/home/alpfischer/Wan2.2/Wan2.2-TI2V-5B/t5_model_last.engine"


print(f"Loading engine from {engine_path}...")
with open(engine_path, "rb") as f:
    engine_data = f.read()

runtime = trt.Runtime(TRT_LOGGER)
engine = runtime.deserialize_cuda_engine(engine_data)

if engine is None:
    raise RuntimeError("Failed to load engine!")

# 2. Create Context and Stream
context = engine.create_execution_context()
stream = cuda.Stream()

# 3. Define Input Shapes
# You stated inputs are (1, 512). T5 usually takes INT32 for input_ids.
input_shape = (1, 512) 
input_dtype = np.int64 

# Prepare dictionaries to hold memory pointers
host_inputs = []
cuda_inputs = []
host_outputs = []
cuda_outputs = []
bindings = [] # For legacy viewing, though we use set_tensor_address

print("\nINSPECTING TENSORS:")
# 4. Allocate Memory Helper
for i in range(engine.num_io_tensors):
    tensor_name = engine.get_tensor_name(i)
    tensor_mode = engine.get_tensor_mode(tensor_name)
    
    # Check if it is input or output
    if tensor_mode == trt.TensorIOMode.INPUT:
        print(f"  Input found: {tensor_name}")
        # Set the shape for the specific input
        context.set_input_shape(tensor_name, input_shape)
    else:
        print(f"  Output found: {tensor_name}")

    # Determine required memory size
    # Note: We query the context for shape because output sizes might be dynamic based on input
    shape = context.get_tensor_shape(tensor_name)


    # Get the TensorRT type
    trt_dtype = engine.get_tensor_dtype(tensor_name)

    print(f"trt_dtype for {tensor_name}:{trt_dtype}")

    # manual mapping for types that trt.nptype() misses
    if trt_dtype == trt.DataType.BF16:
        # NumPy doesn't have native BF16, so we usually map it to int16 for storage 
        # or float32 if you plan to convert immediately. 
        # For pure buffer size calculation, int16 (2 bytes) is the correct size equivalent.
        dtype = np.dtype(np.int16) 
    elif trt_dtype == trt.DataType.FP8:
        # FP8 takes 1 byte, similar to int8/uint8
        dtype = np.dtype(np.int8)
    else:
        # Fallback to the standard mapping for Float32, Float16, Int32, etc.
        try:
            dtype = np.dtype(trt.nptype(trt_dtype))
        except TypeError:
            # If it still fails, print the type to debug
            print(f"Error: Unsupported TensorRT type found: {trt_dtype}")
            raise

    # Calculate volume (product of dimensions)
    vol = 1
    for dim in shape:
        # Safety check for dynamic shapes that weren't resolved
        if dim < 0: 
            raise RuntimeError(f"Found dynamic shape dimension {dim} for {tensor_name}. Ensure input shapes are set correctly.")
        vol *= dim
            
    size = vol * dtype.itemsize
 
    # Allocate Device Memory
    d_mem = cuda.mem_alloc(size)
    
    # Allocation Host Memory (Page-locked/Pinned for speed)
    h_mem = cuda.pagelocked_empty(vol, dtype)
    
    # Store pointers and bind address
    # IMPORTANT: TensorRT > 8.5 uses set_tensor_address
    context.set_tensor_address(tensor_name, int(d_mem))

    if tensor_mode == trt.TensorIOMode.INPUT:
        host_inputs.append(h_mem)
        cuda_inputs.append(d_mem)
    else:
        host_outputs.append(h_mem)
        cuda_outputs.append(d_mem)

# 5. Prepare Dummy Data (Replace this with your actual T5 Token IDs)
# T5 Inputs: Usually [input_ids, decoder_input_ids]
print("\nGenerating dummy input data...")
np.copyto(host_inputs[0], ids.detach().numpy().astype(input_dtype))
np.copyto(host_inputs[1], mask.detach().numpy().astype(input_dtype))


# 6. Inference Loop
print("Running Inference...")

# Copy Host -> Device
for h_mem, d_mem in zip(host_inputs, cuda_inputs):
    cuda.memcpy_htod_async(d_mem, h_mem, stream)

# Execute (Async v3 is the standard for modern TRT)
success = context.execute_async_v3(stream_handle=stream.handle)

torch.cuda.synchronize()

print(f"Did execution request succeed? {success}")


# Copy Device -> Host
for h_mem, d_mem in zip(host_outputs, cuda_outputs):
    cuda.memcpy_dtoh_async(h_mem, d_mem, stream)

# Synchronize to ensure completion
stream.synchronize()

print("Inference Complete.")
print(f"Output Shape: {host_outputs[0].shape}")
print("Output Data snippet:", host_outputs[0][:10])

Loading engine from /home/alpfischer/Wan2.2/Wan2.2-TI2V-5B/t5_model_last.engine...

INSPECTING TENSORS:
  Input found: input
trt_dtype for input:DataType.INT64
  Input found: onnx::Reshape_1
trt_dtype for onnx::Reshape_1:DataType.INT64
  Output found: output
trt_dtype for output:DataType.BF16

Generating dummy input data...
Running Inference...
Did execution request succeed? True
Inference Complete.
Output Shape: (2097152,)
Output Data snippet: [ 15029 -17462 -17242  15968 -17255 -17593  15572  15915 -16869 -16961]


In [6]:
tensor_name = engine.get_tensor_name(2)
shape = context.get_tensor_shape(tensor_name)
shape

(1, 512, 4096)

In [7]:
len(host_outputs[0])

2097152

In [3]:
import numpy as np
import ml_dtypes

# 1. Assume 'trt_output' is your int16 array from TensorRT
#    e.g., trt_output = np.array([16256, 16384], dtype=np.int16) 

# 2. View the int16 bits as bfloat16
bf16_data = host_outputs[0].view(ml_dtypes.bfloat16)

# 3. Convert to float32 for comparison/math
final_output = bf16_data.astype(np.float32)

print("Decoded values:", final_output)

Decoded values: [ 0.  0. -0. ...  0.  0.  0.]


In [15]:
seq_lens = mask.gt(0).sum(dim=1).long()

In [23]:
final_output = torch.Tensor(final_output).reshape([1,512,4096])
t5_engine_output = [u[:v] for u, v in zip( final_output, seq_lens)]

In [24]:
final_output.shape

torch.Size([1, 512, 4096])

In [25]:
with open("t5_engine_outputs.pth", "wb") as f:
    torch.save(torch.Tensor(final_output), f)

In [2]:

model.model.eval().to("cuda")
pytorch_model_output = model(texts, "cuda")

In [3]:
with open("t5_engine_outputs.pth", "rb") as f:
    engine_output = torch.load(f)

In [4]:
engine_output.shape

torch.Size([1, 512, 4096])

In [5]:
pytorch_model_output[0].shape


torch.Size([5, 4096])

In [6]:
engine_output[:5]

tensor([[[ 0.0014, -0.0062, -0.0203,  ...,  0.0002,  0.1084, -0.1260],
         [ 0.0030,  0.0118, -0.0171,  ...,  0.0014,  0.0227, -0.0204],
         [ 0.0016,  0.0204,  0.0635,  ...,  0.0008, -0.0471, -0.1504],
         ...,
         [ 0.0009, -0.0082,  0.0596,  ...,  0.0002,  0.0461,  0.0265],
         [ 0.0009, -0.0082,  0.0596,  ...,  0.0002,  0.0461,  0.0265],
         [ 0.0009, -0.0082,  0.0596,  ...,  0.0002,  0.0461,  0.0265]]])

In [10]:
pytorch_model_output[0]

tensor([[ 0.0014, -0.0061, -0.0205,  ...,  0.0002,  0.1085, -0.1257],
        [ 0.0030,  0.0117, -0.0172,  ...,  0.0014,  0.0228, -0.0203],
        [ 0.0016,  0.0202,  0.0638,  ...,  0.0008, -0.0472, -0.1509],
        [ 0.0014,  0.0591,  0.0010,  ...,  0.0005, -0.0332, -0.0512],
        [-0.0011, -0.0063,  0.0008,  ...,  0.0003, -0.0091,  0.0115]],
       device='cuda:0', dtype=torch.float16)

In [13]:
torch.abs(engine_output[0][:5].to("cpu")-pytorch_model_output[0].to("cpu")).mean()

tensor(0.0001)

As we can see diff is very small that means our engine export from trtexec is valid. Let's move to the implementation and utilize the engine

In [16]:
ids.shape

torch.Size([1, 512])

In [17]:
mask.shape

torch.Size([1, 512])

After integrating tensorrt engine into our pipeline we see that single inference with trt engine reduce the total inference time of a single t5 encoder from 0.1 ( which was obtained via negative prompt caching and torch.compile) seconds to 0.009 seconds which is a huge speed gain of %50 means we can process x10 more text in one go. We can apply the same operation to VAE and Diffusion models as well. Because this demonstration of huge speed gain can be easily extended to other models I am skipping doing the same for them