<a href="https://colab.research.google.com/github/ayyucedemirbas/Model_Serving/blob/main/Torch_TensorRT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install timm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting timm
  Downloading timm-0.6.12-py3-none-any.whl (549 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m549.1/549.1 KB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, timm
Successfully installed huggingface-hub-0.13.3 timm-0.6.12


In [3]:
!pip install torch-tensorrt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch-tensorrt
  Downloading torch_tensorrt-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.2/15.2 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorrt<8.6.0,>=8.5.1.7
  Downloading tensorrt-8.5.3.1-cp39-none-manylinux_2_17_x86_64.whl (549.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m549.5/549.5 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cublas-cu11
  Downloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux1_x86_64.whl (417.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.9/417.9 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11
  Downloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (875 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import torch
import torch_tensorrt
import timm
import time
import numpy as np
import torch.backends.cudnn as cudnn

In [5]:
torch.hub._validate_not_a_forked_repo=lambda a,b,c: True

In [6]:
efficientnet_b0 = timm.create_model('efficientnet_b0',pretrained=True)

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_ra-3dd342df.pth


In [7]:
model = efficientnet_b0.eval().to("cuda")
detections_batch = model(torch.randn(128, 3, 224, 224).to("cuda"))
detections_batch.shape

torch.Size([128, 1000])

In [8]:
cudnn.benchmark = True

In [9]:
def benchmark(model, input_shape=(1024, 3, 512, 512), dtype='fp32', nwarmup=50, nruns=1000):
    input_data = torch.randn(input_shape)
    input_data = input_data.to("cuda")
    if dtype=='fp16':
        input_data = input_data.half()
        
    print("Warm up ...")
    with torch.no_grad():
        for _ in range(nwarmup):
            features = model(input_data)
    torch.cuda.synchronize()
    print("Start timing ...")
    timings = []
    with torch.no_grad():
        for i in range(1, nruns+1):
            start_time = time.time()
            pred_loc  = model(input_data)
            torch.cuda.synchronize()
            end_time = time.time()
            timings.append(end_time - start_time)
            if i%10==0:
                print('Iteration %d/%d, avg batch time %.2f ms'%(i, nruns, np.mean(timings)*1000))

    print("Input shape:", input_data.size())
    print('Average throughput: %.2f images/second'%(input_shape[0]/np.mean(timings)))

In [10]:
model = efficientnet_b0.eval().to("cuda")
benchmark(model, input_shape=(1, 3, 224, 224), nruns=100)

Warm up ...
Start timing ...
Iteration 10/100, avg batch time 14.80 ms
Iteration 20/100, avg batch time 19.79 ms
Iteration 30/100, avg batch time 16.18 ms
Iteration 40/100, avg batch time 14.26 ms
Iteration 50/100, avg batch time 13.18 ms
Iteration 60/100, avg batch time 12.44 ms
Iteration 70/100, avg batch time 11.91 ms
Iteration 80/100, avg batch time 11.59 ms
Iteration 90/100, avg batch time 11.25 ms
Iteration 100/100, avg batch time 10.97 ms
Input shape: torch.Size([1, 3, 224, 224])
Average throughput: 91.19 images/second


In [11]:
traced_model = torch.jit.trace(model, torch.randn((1,3,224,224)).to("cuda"))
torch.jit.save(traced_model, "efficientnet_b0_traced.jit.pt")
benchmark(traced_model, input_shape=(1, 3, 224, 224), nruns=100)

Warm up ...
Start timing ...
Iteration 10/100, avg batch time 26.23 ms
Iteration 20/100, avg batch time 31.20 ms
Iteration 30/100, avg batch time 24.86 ms
Iteration 40/100, avg batch time 22.31 ms
Iteration 50/100, avg batch time 20.23 ms
Iteration 60/100, avg batch time 19.72 ms
Iteration 70/100, avg batch time 18.57 ms
Iteration 80/100, avg batch time 17.68 ms
Iteration 90/100, avg batch time 18.47 ms
Iteration 100/100, avg batch time 17.84 ms
Input shape: torch.Size([1, 3, 224, 224])
Average throughput: 56.04 images/second


In [14]:
trt_model = torch_tensorrt.compile(model, 
    inputs= [torch_tensorrt.Input((1, 3, 224, 224))],
    enabled_precisions= { torch_tensorrt.dtype.half} # Run with FP16
)

In [15]:
benchmark(trt_model, input_shape=(1, 3, 224, 224), nruns=100, dtype="fp16")

Warm up ...


RuntimeError: ignored