In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import onnx
import numpy as np
import pandas as pd
from torch.onnx import TrainingMode

In [2]:
# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## Pipeline

In [None]:
# Build the zero-shot classification pipeline from the checkpoint named by model_name.
model_name = "joeddav/xlm-roberta-large-xnli"
classifier = pipeline(task="zero-shot-classification", model=model_name)

In [3]:
# Text to classify in the worked example.
sequence_to_classify = "Manchester United"

# Candidate labels scored against the text; they may be written in Russian
# or any other language.
candidate_labels = [
    "football team",
    "city",
    "England",
    "Masha",
]

In [5]:
# Score every candidate label for the example text. With multi_label=True the
# returned scores do not sum to 1 (see the output below).
classifier(sequence_to_classify, candidate_labels, multi_label=True)

{'sequence': 'Manchester United',
 'labels': ['football team', 'city', 'England', 'Masha'],
 'scores': [0.9946267604827881,
  0.5824565887451172,
  0.4575895667076111,
  0.0036103464663028717]}

## Model components

In [4]:
# Load the tokenizer and the sequence-classification model on their own so the
# zero-shot scoring can be reproduced step by step without the pipeline wrapper.
model_name = "joeddav/xlm-roberta-large-xnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Get scores

In [9]:
import pandas as pd
from tqdm import tqdm

# Folder holding the input CSV; the arrays and scores produced below are written here too.
data_folder = "/root/data/"
data = pd.read_csv(f"{data_folder}markup_w_scores.csv")

In [10]:
# Hypothesis template shared by every row ("This is a product from the category {}."),
# so it is defined once instead of on every iteration.
hypothesis_template = "Это товар из категории {}."

data_input = []  # per-row input_ids arrays
data_attention = []  # per-row attention_mask arrays
data_scores = []  # per-row dicts of scores

# Score at most the first 500 rows; capping at len(data) keeps a shorter CSV
# from raising KeyError on a missing row label.
num_rows = min(500, len(data))

# Inference only: no_grad() avoids building an autograd graph on every forward
# pass, so no .detach() is needed before converting to numpy.
with torch.no_grad():
    for i in tqdm(range(num_rows)):
        # Three candidate labels per row, taken from these three columns.
        candidates = data.loc[i, ["level3_name", "level4_name", "type"]].values
        name = data.loc[i, "name"]

        # One [premise, hypothesis] pair per candidate label.
        sequence_pairs = [
            [name, hypothesis_template.format(candidate_label)]
            for candidate_label in candidates
        ]

        # Pad to a multiple of 128 and cap at 128 tokens so every row has the
        # same width and the per-row arrays can be stacked afterwards.
        # truncation="only_first" shortens only the first element of each pair.
        x = tokenizer(
            sequence_pairs,
            add_special_tokens=True,
            return_tensors="pt",
            padding=True,
            truncation="only_first",
            pad_to_multiple_of=128,
            max_length=128,
        )
        data_input.append(x["input_ids"].numpy())
        data_attention.append(x["attention_mask"].numpy())

        output = model(x["input_ids"].to(device), x["attention_mask"].to(device))

        # Keep logit columns 0 and 2 and softmax over that pair; column 1 of the
        # result is the score stored for the label.
        # (Going by the variable names, columns 0/2 are contradiction/entailment
        # — TODO confirm against the model's label mapping.)
        entail_contradiction_logits = output.logits[:, [0, 2]]
        probs = entail_contradiction_logits.softmax(dim=1)
        prob_label_is_true = probs[:, 1].cpu().numpy()

        example = dict(
            zip(["cat3_score", "cat4_score", "type_score"], prob_label_is_true)
        )
        data_scores.append(example)

100%|██████████| 500/500 [00:13<00:00, 35.88it/s]


In [11]:
# Stack the per-row token id arrays into a single array and write it to input.npy.
np.save(data_folder + "input.npy", np.stack(data_input))

In [12]:
# Stack the per-row attention masks into a single array and write it to attention.npy.
np.save(data_folder + "attention.npy", np.stack(data_attention))

In [13]:
# Write the per-row score dicts as a CSV (one column per score name, no index column).
pd.DataFrame(data_scores).to_csv(
    path_or_buf=data_folder + "scores.csv",
    index=False,
)

### Example

In [7]:
# English hypothesis template for the worked example; each candidate label
# fills the placeholder.
hypothesis_template = "This example is {}."

# Pair the same text with one hypothesis per candidate label.
sequence_pairs = [
    [sequence_to_classify, hypothesis_template.format(candidate_label)]
    for candidate_label in candidate_labels
]

In [112]:
# Inspect the [text, hypothesis] pairs that will be tokenized.
sequence_pairs

[['Manchester United', 'This example is football team.'],
 ['Manchester United', 'This example is city.'],
 ['Manchester United', 'This example is England.'],
 ['Manchester United', 'This example is Masha.']]

In [101]:
# Maximum sequence length reported by the tokenizer.
tokenizer.model_max_length

512

In [132]:
# Tokenize the example pairs into PyTorch tensors, padded to a multiple of 15
# and capped at 15 tokens so the result is small enough to read in full.
# truncation="only_first" shortens only the first element of each pair.
x = tokenizer(
    sequence_pairs,
    truncation="only_first",
    max_length=15,
    padding=True,
    pad_to_multiple_of=15,
    add_special_tokens=True,
    return_tensors="pt",
)

In [128]:
# Shape of the token id tensor: one row per pair, padded to a common width.
x["input_ids"].shape

torch.Size([4, 15])

In [129]:
# Full tokenizer output: token ids plus the attention mask (0 marks padding positions).
x

{'input_ids': tensor([[     0,  30749,  14098,      2,      2,   3293,  27781,     83, 101740,
           7175,      5,      2,      1,      1,      1],
        [     0,  30749,  14098,      2,      2,   3293,  27781,     83,  26349,
              5,      2,      1,      1,      1,      1],
        [     0,  30749,  14098,      2,      2,   3293,  27781,     83,  30715,
              5,      2,      1,      1,      1,      1],
        [     0,  30749,  14098,      2,      2,   3293,  27781,     83, 122203,
              5,      2,      1,      1,      1,      1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}

In [133]:
# Decode the last pair back to text to see where special and padding tokens were inserted.
tokenizer.decode(x["input_ids"][3])

'<s> Manchester United</s></s> This example is Masha.</s><pad><pad><pad><pad>'

In [155]:
# Forward pass on the tokenized example batch.
# NOTE(review): this runs with autograd enabled (the results displayed below carry
# grad_fn); wrapping it in torch.no_grad() would be the usual choice for inference.
output = model(x["input_ids"].to(device), x["attention_mask"].to(device))

# Keep logit columns 0 and 2 and softmax over that pair; column 1 of the result
# is used as the probability that the label applies.
# (Going by the variable names, columns 0/2 are contradiction/entailment — TODO confirm.)
entail_contradiction_logits = output.logits[:, [0, 2]]
probs = entail_contradiction_logits.softmax(dim=1)
prob_label_is_true = probs[:, 1]

In [156]:
# For comparison: softmax over all three logits rather than over columns 0 and 2 only.
output.logits.softmax(dim=1)

tensor([[0.0021, 0.6138, 0.3842],
        [0.1256, 0.6962, 0.1782],
        [0.2491, 0.5425, 0.2084],
        [0.9781, 0.0184, 0.0036]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [136]:
# Labels, shown in the same order as the per-label scores in the next cell.
candidate_labels

['football team', 'city', 'England', 'Masha']

In [137]:
# Per-label scores computed by hand, for comparison with the pipeline output.
prob_label_is_true

tensor([0.9946, 0.5850, 0.4567, 0.0036], device='cuda:0',
       grad_fn=<SelectBackward0>)

## ONNX

In [14]:
def save_as_onnx(net, x, device, path="/root/onnx_models/roberta.onnx"):
    """Export ``net`` to an ONNX file, using one tokenized batch as example input.

    Args:
        net: Model to export; it is called with ``(input_ids, attention_mask)``.
        x: Mapping with ``"input_ids"`` and ``"attention_mask"`` tensors that
            serve as the example inputs for the export.
        device: Device the model and the example tensors are moved to.
        path: Destination of the ONNX file. Defaults to the location this
            notebook has always written to, so existing calls are unaffected.

    Only axis 0 (the batch) is declared dynamic; every other dimension keeps
    the size of the example tensors in ``x``.
    """
    import os

    # Make sure the target directory exists before exporting.
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # The example inputs are cast to int32, so the exported graph takes int32 inputs.
    input_ids = x["input_ids"].type(torch.int32).to(device)
    attention_mask = x["attention_mask"].type(torch.int32).to(device)
    net = net.to(device)

    with torch.no_grad():
        torch.onnx.export(
            net,
            (input_ids, attention_mask),  # the model takes two input tensors
            path,
            verbose=False,
            export_params=True,
            opset_version=13,
            do_constant_folding=True,
            input_names=["input_ids", "attention_mask"],
            output_names=["logits"],
            dynamic_axes={
                "input_ids": {0: "batch"},
                "attention_mask": {0: "batch"},
                "logits": {0: "batch"},
            },
            training=TrainingMode.EVAL,
        )

In [15]:
# Tokenize the current pairs padded to 128 tokens; this batch is the example
# input handed to the ONNX export.
x = tokenizer(
    sequence_pairs,
    truncation="only_first",
    max_length=128,
    padding=True,
    pad_to_multiple_of=128,
    add_special_tokens=True,
    return_tensors="pt",
)

save_as_onnx(net=model, x=x, device=device)

## Test data

In [24]:
data_folder = "/root/data/"

# Scores from scores.csv and from triton_scores.csv, as plain arrays.
scores = pd.read_csv(f"{data_folder}scores.csv").to_numpy()
triton_scores = pd.read_csv(f"{data_folder}triton_scores.csv").to_numpy()

# Tokenized inputs saved earlier in the notebook.
data_input = np.load(data_folder + "input.npy")
data_attention = np.load(data_folder + "attention.npy")

### Test ONNX

In [11]:
import onnxruntime as ort


# Open the exported model with ONNX Runtime, listing the TensorRT, CUDA and CPU providers.
# NOTE(review): presumably the first listed provider that is available gets used — confirm.
ort_sess = ort.InferenceSession(
    "/root/onnx_models/roberta.onnx",
    providers=[
        "TensorrtExecutionProvider",
        "CUDAExecutionProvider",
        "CPUExecutionProvider",
    ],
)



In [148]:
# Run the example batch through ONNX Runtime; inputs are cast to int32 as in the export.
onnx_inputs = {
    "input_ids": x["input_ids"].numpy().astype("int32"),
    "attention_mask": x["attention_mask"].numpy().astype("int32"),
}
onnx_res = ort_sess.run(None, onnx_inputs)

# Softmax over logit columns 0 and 2, mirroring the PyTorch computation above.
pre_output = onnx_res[0][:, [0, 2]]
exp_logits = np.exp(pre_output)
output = exp_logits / exp_logits.sum(axis=1, keepdims=True)

In [12]:
# Shape of the saved token ids array loaded from input.npy.
data_input.shape

(500, 3, 128)

In [35]:
# Print the indices among the first 50 rows where any Triton score differs from
# the score in scores.csv by more than 0.1.
for i in range(50):
    gap = np.abs(triton_scores[i] - scores[i])
    if np.any(gap > 0.1):
        print(i, end=" ")

8 12 33 

In [28]:
# Absolute difference between the Triton scores and the scores.csv values for row 8.
np.abs(triton_scores[8] - scores[8])

array([0.1790491, 0.0003463, 0.0007632])

In [29]:
# Re-run row 8 through ONNX Runtime to see which backend the discrepancy comes from.
onnx_inputs = {
    "input_ids": data_input[8].astype("int32"),
    "attention_mask": data_attention[8].astype("int32"),
}
onnx_res = ort_sess.run(None, onnx_inputs)

# Softmax over logit columns 0 and 2.
pre_output = onnx_res[0][:, [0, 2]]
exp_logits = np.exp(pre_output)
onnx_output = exp_logits / exp_logits.sum(axis=1, keepdims=True)

In [33]:
# Absolute difference between the ONNX Runtime scores and the scores.csv values for row 8.
# (The stray trailing comma was removed: it made the cell evaluate to a 1-tuple
# instead of the array shown in the recorded output.)
np.abs(onnx_output[:, 1] - scores[8])

array([2.09523384e-03, 2.29477386e-06, 4.38036804e-06])

In [34]:
# Absolute difference between the ONNX Runtime scores and the Triton scores for row 8.
np.abs(onnx_output[:, 1] - triton_scores[8])

array([0.17695387, 0.00034401, 0.00075882])

In [36]:
# Same check for row 12, another row flagged by the mismatch scan.
onnx_inputs = {
    "input_ids": data_input[12].astype("int32"),
    "attention_mask": data_attention[12].astype("int32"),
}
onnx_res = ort_sess.run(None, onnx_inputs)

# Softmax over logit columns 0 and 2.
pre_output = onnx_res[0][:, [0, 2]]
exp_logits = np.exp(pre_output)
onnx_output = exp_logits / exp_logits.sum(axis=1, keepdims=True)

In [38]:
# Absolute difference between the ONNX Runtime scores and the scores.csv values for row 12.
np.abs(onnx_output[:, 1] - scores[12])

array([2.43510107e-04, 2.85875992e-06, 2.85875992e-06])

### TensorRT

In [16]:
# Build a TensorRT engine from the exported ONNX model with trtexec and save it as roberta.trt.
# The shape flags cover batch sizes from 1 (min) to 10 (max), with 3 as the
# optimization target, all at sequence length 128.
!/usr/src/tensorrt/bin/trtexec --onnx=/root/onnx_models/roberta.onnx --saveEngine=/root/trt_models/roberta.trt --workspace=80000 --minShapes=input_ids:1x128,attention_mask:1x128 --optShapes=input_ids:3x128,attention_mask:3x128 --maxShapes=input_ids:10x128,attention_mask:10x128

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
&&&& RUNNING TensorRT.trtexec [TensorRT v8003] # /usr/src/tensorrt/bin/trtexec --onnx=/root/onnx_models/roberta.onnx --saveEngine=/root/trt_models/roberta.trt --workspace=80000 --minShapes=input_ids:1x128,attention_mask:1x128 --optShapes=input_ids:3x128,attention_mask:3x128 --maxShapes=input_ids:10x128,attention_mask:10x128
[05/29/2022-20:23:07] [I] === Model Options ===
[05/29/2022-20:23:07] [I] Format: ONNX
[05/29/2022-20:23:07] [I] Model: /root/onnx_models/roberta.onnx
[05/29/2022-20:23:07] [I] Output:
[05/29/2022-20:23:07] [I] === Build Options ===
[05/29/2022-20:23:07] [I] Max batch: explicit
[05/29/2022-20:23:07] [I] Workspace: 80000 MiB
[05/29/2022-20:23:07] [I] minTiming: 1
[05/29/2022-20:23:07] [I] 

In [None]:
# Copy the built engine to model.plan.
# NOTE(review): presumably the file name the serving side expects — confirm.
!cp /root/trt_models/roberta.trt /root/trt_models/model.plan