# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cene555/ruCLIP-SB/blob/main/notebooks/ruCLIP_SB_onnx.ipynb)

In [3]:
#@title Allowed Resources
# Report the CPU count, RAM, PyTorch/CUDA/cuDNN versions and GPU status of this runtime.
import multiprocessing
import torch
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to GiB (1024**3) and rounded to 0.1.
ram_gb = round(virtual_memory().total / 1024**3, 1)

print('CPU:', multiprocessing.cpu_count())
print('RAM GB:', ram_gb)
print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())
# Use the first CUDA device when one is available, otherwise the CPU.
# NOTE: a later cell reassigns `device` to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device.type)

# Shell escape: print the NVIDIA driver / GPU status table.
!nvidia-smi

CPU: 2
RAM GB: 12.7
PyTorch version: 1.10.0+cu111
CUDA version: 11.1
cuDNN version: 8005
device: cuda
Tue Jan 25 17:45:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    28W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                  

## Install requirements

In [2]:
%%capture
!pip install git+https://github.com/cene555/ruCLIP-SB.git
!pip install pymorphy2
!gdown -O ruCLIP-SB.pkl https://drive.google.com/uc?id=1-CghuC9TCIDyn5H3zQS6ho_TNiudzJCX

!pip install git+https://github.com/Lednik7/CLIP-ONNX.git
!pip install git+https://github.com/openai/CLIP.git
!pip install onnxruntime-gpu

!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true

## Import libraries

In [1]:
import torch
from torchvision import transforms
import transformers
from transformers import BertTokenizer
from ruclipsb import ruCLIPSB
from ruclipsb.utils import tokenize, _convert_image_to_rgb
from PIL import ImageCms, Image
import cv2
import numpy as np
# Resolve the bicubic interpolation constant: use torchvision's InterpolationMode
# when it can be imported, otherwise fall back to Pillow's constant.
# NOTE(review): BICUBIC is not referenced by any of the visible cells below.
try:
    from torchvision.transforms import InterpolationMode
except ImportError:
    BICUBIC = Image.BICUBIC
else:
    BICUBIC = InterpolationMode.BICUBIC

In [3]:
import warnings

# Suppress every warning of category UserWarning from this point on.
warnings.filterwarnings("ignore", category=UserWarning)

In [6]:
# Fix PyTorch's random seed so repeated runs start from the same RNG state.
torch.manual_seed(1)
# Force the CPU device; this overrides the CUDA/CPU choice made in the resources cell.
device = torch.device('cpu')

## Load RuCLIP-SB model

In [7]:
# Build the model and restore the downloaded weights onto `device`.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model = ruCLIPSB()
checkpoint_state = torch.load('ruCLIP-SB.pkl', map_location=device)
model.load_state_dict(checkpoint_state)

# Cast to half precision, move to `device`, and switch to evaluation mode.
model = model.half().to(device).eval()

# Inference only: turn off gradient tracking for every parameter.
for x in model.parameters():
    x.requires_grad = False

torch.cuda.empty_cache()

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Load the pretrained tokenizer published under the "cointegrated/rubert-tiny" name.
tokenizer = BertTokenizer.from_pretrained("cointegrated/rubert-tiny")

In [9]:
# Image preprocessing pipeline: resize, center-crop to 224, convert to RGB,
# turn into a tensor, then normalize each channel with the given mean/std.
# Resize is called without an interpolation argument, so the BICUBIC constant
# defined with the imports is not used here.
preprocess_steps = [
    transforms.Resize(224),
    transforms.CenterCrop(224),
    _convert_image_to_rgb,
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(preprocess_steps)

## Prepare functions

In [12]:
# Image input, batch first: preprocess the sample picture and add a batch axis.
sample_picture = Image.open("CLIP.png")
image = transform(sample_picture).unsqueeze(0).cpu()  # [1, 3, 224, 224]
# float32 NumPy copy that is fed to the ONNX sessions.
image_onnx = image.detach().cpu().numpy().astype(np.float32)

# Text input, batch first: tokenize the candidate labels to length 77.
texts = ['диаграмма', 'собака', 'кошка']
text_tokens, attention_mask = tokenize(tokenizer, texts, 77)
text_tokens = text_tokens.cpu()        # [3, 77]
attention_mask = attention_mask.cpu()  # [3, 77]
# Token ids and attention mask are stacked into one int64 array for ONNX.
stacked_text = torch.stack([text_tokens, attention_mask])
text_onnx = stacked_text.detach().cpu().numpy().astype(np.int64)

In [14]:
class Textual(torch.nn.Module):
    """Wrapper exposing the text branch of the wrapped model as a single-input module.

    The wrapped ``model`` must provide a ``transformer`` callable (accepting
    ``input_ids`` and ``attention_mask`` keyword arguments and returning an
    object with ``last_hidden_state``) and a ``final_ln`` callable.
    """

    def __init__(self, model):
        """Store the wrapped model; no other state is created."""
        super().__init__()
        self.model = model

    def forward(self, input_data):
        """Encode text.

        ``input_data`` is unpacked into ``(input_ids, attention_mask)``; in this
        notebook it is the tensor produced by stacking those two tensors.
        Returns ``final_ln`` applied to the hidden state at sequence position 0.
        """
        input_ids, attention_mask = input_data
        encoded = self.model.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # Keep only the hidden state of the first token of every sequence.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.model.final_ln(first_token_state)

## Convert RuCLIP-SB model to ONNX

In [None]:
from clip_onnx import clip_onnx

def convert_textual(self, dummy_input):
    """Export the text encoder of ``self.model`` to ONNX at ``self.textual_path``.

    ``dummy_input`` is the example input used for the export. Axis 1 of the
    input and axis 0 of the output are declared dynamic under the name
    ``batch_size``. After the export, the written file is passed to
    ``self.onnx_checker``.
    """
    text_encoder = Textual(self.model)
    batch_axes = {
        'input': {1: 'batch_size'},
        'output': {0: 'batch_size'},
    }
    torch.onnx.export(
        text_encoder,
        dummy_input,
        self.textual_path,
        export_params=True,
        do_constant_folding=True,
        opset_version=14,
        verbose=False,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes=batch_axes,
    )
    self.onnx_checker(self.textual_path)

# Replace clip_onnx's convert_textual with the version defined above, so the
# text export goes through the Textual wrapper.
clip_onnx.convert_textual = convert_textual

# Output files for the exported visual and textual ONNX graphs.
visual_path = "clip_visual.onnx"
textual_path = "clip_textual.onnx"

# Example text input for the export: token ids and attention mask stacked into one tensor.
dummy_input_text = torch.stack([text_tokens, attention_mask]).detach().cpu()

In [None]:
# The model was cast to half precision when it was loaded; cast it to float32
# on the CPU before handing it to the converter.
onnx_model = clip_onnx(model.float().cpu(), visual_path=visual_path, textual_path=textual_path)
# Run the conversion using the prepared image tensor and stacked text tensor as example inputs.
onnx_model.convert2onnx(image, dummy_input_text, verbose=True)

## [ONNX] CUDA inference mode

In [None]:
# Provider names that can be passed here:
# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
# Only the CUDA provider is requested in this cell.
onnx_model.start_sessions(providers=["CUDAExecutionProvider"]) # cuda mode

In [16]:
# Encode each modality separately with the ONNX model.
# NOTE(review): these two results are not used by the visible cells below.
image_features = onnx_model.encode_image(image_onnx)
text_features = onnx_model.encode_text(text_onnx)

# Full forward pass returning logits in both directions; softmax over the last
# axis of the per-image logits gives the label probabilities for the image.
logits_per_image, logits_per_text = onnx_model(image_onnx, text_onnx)
probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy()

print("Label probs:", probs) # [[0.9844646  0.01167088 0.00386453]]

Label probs: [[0.9844646  0.01167088 0.00386453]]


In [17]:
# Time repeated calls of the ONNX image encoder on the prepared image array.
%timeit onnx_model.encode_image(image_onnx)

10 loops, best of 5: 18 ms per loop


In [18]:
# Time repeated calls of the ONNX text encoder on the prepared text array.
%timeit onnx_model.encode_text(text_onnx)

100 loops, best of 5: 2.76 ms per loop
