# dummy classic ml model

In [18]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
import joblib

X, y = make_regression(1000, 3)

model = LinearRegression().fit(X, y)

In [19]:
model.predict([[1,2,3]])

array([506.48185252])

In [11]:
# классическое сохранение в joblib
joblib.dump(model, "model.joblib")

# загрузка из joblib
# joblib.load("model.joblib")

['model.joblib']

In [20]:
# from skl2onnx import convert_sklearn
import onnx
from skl2onnx import to_onnx
from skl2onnx.common.data_types import FloatTensorType

initial_type = [("input", FloatTensorType([None, X.shape[1]]))]
# onnx_model = convert_sklearn(model, initial_types=initial_type)
onnx_model = to_onnx(model, initial_types=initial_type)

with open("model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

print("Inputs:", [inp.name for inp in onnx_model.graph.input])
print("Outputs:", [out.name for out in onnx_model.graph.output])

Inputs: ['input']
Outputs: ['variable']


In [None]:
import requests

r = requests.post(
    url="http://localhost:8080/triton_classic_ml",
    json={"a": 1, "b": 2, "c": 3},
    params={"model_name": "classic_model", "model_version": 1}
)

r.json()

In [18]:
for _ in range(10):
    r = requests.post(url="http://localhost:8080/classic", json={"a":2, "b": 2000, "c": 10})
    r.json()

# dummy LLM (BERT) model

In [1]:
from transformers import BertModel, BertTokenizer

model = BertModel.from_pretrained("ai-forever/ruBert-base")
tokenizer = BertTokenizer.from_pretrained("ai-forever/ruBert-base", do_lower_case=True)

dtypes = set(param.dtype for param in model.parameters())
print(f"Модель содержит параметры следующих типов: {dtypes}")

for name, module in model.named_modules():
    print(f"{name}: {type(module)}")

  from .autonotebook import tqdm as notebook_tqdm


Модель содержит параметры следующих типов: {torch.float32}
: <class 'transformers.models.bert.modeling_bert.BertModel'>
embeddings: <class 'transformers.models.bert.modeling_bert.BertEmbeddings'>
embeddings.word_embeddings: <class 'torch.nn.modules.sparse.Embedding'>
embeddings.position_embeddings: <class 'torch.nn.modules.sparse.Embedding'>
embeddings.token_type_embeddings: <class 'torch.nn.modules.sparse.Embedding'>
embeddings.LayerNorm: <class 'torch.nn.modules.normalization.LayerNorm'>
embeddings.dropout: <class 'torch.nn.modules.dropout.Dropout'>
encoder: <class 'transformers.models.bert.modeling_bert.BertEncoder'>
encoder.layer: <class 'torch.nn.modules.container.ModuleList'>
encoder.layer.0: <class 'transformers.models.bert.modeling_bert.BertLayer'>
encoder.layer.0.attention: <class 'transformers.models.bert.modeling_bert.BertAttention'>
encoder.layer.0.attention.self: <class 'transformers.models.bert.modeling_bert.BertSdpaSelfAttention'>
encoder.layer.0.attention.self.query: <c

## сохранение модели

In [None]:
encodings = tokenizer.encode_plus(
    " ".join(["word"] * 10),
    add_special_tokens=True,
    max_length=512,
    truncation=True,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors="pt",
)
output = model(**encodings)
# output.pooler_output[0]



In [2]:
import torch

encodings = tokenizer.encode_plus(
    " ".join(["word"] * 10),
    add_special_tokens=True,
    max_length=512,
    truncation=True,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors="pt",
)
dummy_input = (encodings["input_ids"], encodings["attention_mask"], encodings["token_type_ids"]) # Пример входного тензора
onnx_file_path = "model.onnx"  # Путь для сохранения ONNX файла

# Экспорт модели
with torch.no_grad():
    torch.onnx.export(
        model,  # Модель PyTorch
        dummy_input,  # Пример входных данных
        onnx_file_path,  # Путь для сохранения ONNX файла
        export_params=True,  # Экспортировать обученные параметры
        opset_version=14,  # Версия ONNX операторов
        do_constant_folding=True,  # Оптимизация констант
        input_names=["input_ids", "attention_mask", "token_type_ids"],
        output_names=["pooler_output"],
        dynamic_axes={
            "input_ids": {0: "batch_size", 1: "seq_len"},
            "attention_mask": {0: "batch_size", 1: "seq_len"},
            "token_type_ids": {0: "batch_size", 1: "seq_len"},
            "pooler_output": {0: "batch_size"},
        },
    )

print(f"Модель успешно экспортирована в файл: {onnx_file_path}")



Модель успешно экспортирована в файл: model.onnx


## квантизация модели

In [11]:
import torch
from torch.ao.quantization import quantize_dynamic

torch.backends.quantized.engine = "qnnpack"  # Intel/AMD: "fbgemm"

quantized_model = quantize_dynamic(
    model=model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8
)

In [12]:
print(f"FP32 size: {sum(p.numel() for p in model.parameters()) * 4 / 1e6:.2f} MB")
print(f"INT8 size: {sum(p.numel() for p in quantized_model.parameters()) * 1 / 1e6:.2f} MB")

FP32 size: 713.23 MB
INT8 size: 92.70 MB


In [None]:
from transformers import BertTokenizer, BertModel
import torch

model_name = "ai-forever/ruBert-base"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

model.eval()

# Пример входных данных
inputs = tokenizer("Тестовое предложение", return_tensors="pt")
input_names = ["input_ids", "attention_mask"]
output_names = ["last_hidden_state", "pooler_output"]

# Экспорт в ONNX
torch.onnx.export(
    model,
    (inputs["input_ids"], inputs["attention_mask"]),
    "rubert_base.onnx",
    input_names=input_names,
    output_names=output_names,
    opset_version=13,
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "seq_len"},
        "attention_mask": {0: "batch_size", 1: "seq_len"},
        "last_hidden_state": {0: "batch_size", 1: "seq_len"},
        "pooler_output": {0: "batch_size"},
    },
)

In [4]:
from onnxruntime.quantization import quantize_dynamic, QuantType

quantize_dynamic(
    model_input="model.onnx",
    model_output="rubert_base_quant.onnx",
    weight_type=QuantType.QInt8,  # Можно использовать QuantType.QUInt8
)

AttributeError: INT4

Конвертация модели в tensorrt

https://docs.nvidia.com/deeplearning/tensorrt/latest/reference/command-line-programs.html

```bash
docker exec -it trtexec_container bash

trtexec --onnx=model.onnx --help

trtexec \
    --onnx=model.onnx \
    --saveEngine=model.plan \
    --minShapes=input:1x3 \
    --optShapes=input:8x3 \
    --maxShapes=input:16x3 \
    --fp16 \
    --useSpinWait
```

## пример обращения по API

In [1]:
import requests

r = requests.post(
    url="http://localhost:8080/triton_llm",
    params={"model_name": "bert_model"},
    json={"text": "example of the text"},
)

r.json()

{'embedding': [-0.4801458716392517,
  0.13364841043949127,
  -0.058410122990608215,
  0.26510411500930786,
  0.20883221924304962,
  -0.06469129025936127,
  1.700914978981018,
  -0.27123305201530457,
  0.4615176022052765,
  0.8974533677101135,
  0.6616724133491516,
  0.7245630621910095,
  0.03962193801999092,
  0.670417845249176,
  0.6577914357185364,
  -0.16094788908958435,
  0.6497836112976074,
  0.7598335146903992,
  -0.09708482027053833,
  -0.33649399876594543,
  -0.2876434624195099,
  -0.29156625270843506,
  1.0977646112442017,
  -0.49361521005630493,
  0.8974349498748779,
  -0.5249939560890198,
  -0.6229562163352966,
  0.21012605726718903,
  -0.47258394956588745,
  0.2673443555831909,
  0.12862682342529297,
  0.06678874790668488,
  -0.3363344669342041,
  0.14100868999958038,
  0.05123268440365791,
  -1.5838018655776978,
  0.06379370391368866,
  0.15384118258953094,
  -0.507167637348175,
  0.1046915352344513,
  -0.3378352224826813,
  -0.656221866607666,
  -0.5526315569877625,
  -0.

# dummy GPT-2 model

In [4]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
input_ids = tokenizer("Hello, my name is", return_tensors="pt")["input_ids"]


# Создаём класс-обёртку, чтобы избавиться от past_key_values
class GPT2Wrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        outputs = self.model(input_ids=input_ids, use_cache=False)
        return outputs.logits


# Оборачиваем
wrapped_model = GPT2Wrapper(model)

# Трейсим
traced = torch.jit.trace(wrapped_model, input_ids)
traced.save("model.pt")

print("✅ TorchScript модель успешно сохранена.")

✅ TorchScript модель успешно сохранена.


In [24]:
import requests

r = requests.post(
    url="http://localhost:8080/triton_gpt2",
    params={
        "model_name": "gpt2_model",
        "max_new_tokens": 20
    },
    json={"text": "once upon a time there was a"},
)

r.json()

{'generated_text': 'once upon a time there was a great deal of talk about the possibility of a new world order.\n\nThe idea of a new'}