In [1]:
from encoding_service.src.global_configs import CLIP_IMAGE1_ONNX_PATH, CLIP_TEXT1_ONNX_PATH, DEVICE, CLIP_TEXT1_ONNX_PATH_FP16, CLIP_IMAGE1_ONNX_PATH_FP16
from encoding_service.src.models.clip_openai.config import MODEL_CLIP
import torch
import onnx
from onnxconverter_common import float16
from transformers import CLIPProcessor, CLIPModel
from encoding_service.src.models.clip_openai.encoder_image.clip_image_encoder import CLIPImageEncoder
from encoding_service.src.models.clip_openai.encoder_text.clip_text_encoder import CLIPTextEncoder
import warnings

warnings.filterwarnings("ignore")


model = CLIPModel.from_pretrained(MODEL_CLIP).to(DEVICE)
processor = CLIPProcessor.from_pretrained(MODEL_CLIP)

text_encoder = CLIPTextEncoder(model).to(DEVICE)
image_encoder = CLIPImageEncoder(model).to(DEVICE)

model.eval()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05,

In [2]:
from pathlib import Path


#============================Text===============================================
def convert_text():
    try:
        text_input = processor(
            text=["hello world"],
            return_tensors="pt",
            padding=True
        ).to(DEVICE)
        with torch.no_grad():
            torch.onnx.export(
                text_encoder,
                (text_input["input_ids"], text_input["attention_mask"]),
                CLIP_TEXT1_ONNX_PATH,
                do_constant_folding=True,
                export_params=True,
                input_names=["input_ids", "attention_mask"],
                output_names=["text_features"],
                dynamic_axes={
                    "input_ids": {0: "batch", 1: "seq_len"},
                    "attention_mask": {0: "batch", 1: "seq_len"},
                    "text_features": {0: "batch"},
                },
                opset_version=17
            )
    except Exception as ex:
        print(ex)

convert_text()



  if seq_length > max_position_embedding:
  if input_shape[-1] > 1 or self.sliding_window is not None:
  if past_key_values_length > 0:
  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask


In [3]:
model = onnx.load(CLIP_TEXT1_ONNX_PATH)
model_fp16 = float16.convert_float_to_float16(model)
onnx.save(model_fp16, CLIP_TEXT1_ONNX_PATH_FP16)



In [3]:
#=======================Image===================================================
def convert_image():
    try:
        image = torch.rand(1, 3, 224, 224, device=DEVICE)
        image_input = processor(
            images=image,
            return_tensors="pt"
        )["pixel_values"].to(DEVICE)
        with torch.no_grad():
            torch.onnx.export(
                image_encoder,
                (image_input,),
                CLIP_IMAGE1_ONNX_PATH,
                do_constant_folding=True,
                export_params=True,
                input_names=["pixel_values"],
                output_names=["image_features"],
                dynamic_axes={
                    "pixel_values": {0: "batch"},
                    "image_features": {0: "batch"},
                },
                opset_version=17
            )
    except Exception as ex:
        print(ex)

convert_image()


It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


In [4]:
model = onnx.load(CLIP_IMAGE1_ONNX_PATH)
model_fp16 = float16.convert_float_to_float16(model)
onnx.save(model_fp16, CLIP_IMAGE1_ONNX_PATH_FP16)