In [None]:
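# One-time environment setup (kept commented out; uncomment and run once in a fresh environment).
# The two sed commands patch ./clip/model.py so that the model's forward(text) is the former
# encode_text(text); the original two-argument forward(image, text) is kept as old_forward.
# Later cells call model(text_tokens) and export `model` itself as the text encoder, which relies on this patch.
# NOTE(review): the sed paths are relative, so they presumably must be run from the directory containing ./clip/ — confirm.
# !git clone https://github.com/openai/CLIP
# !conda activate clipenv
# ! pip install ftfy regex tqdm
# !sed -i -e 's/def forward(self, image, text):/def old_forward(self, image, text):/g' ./clip/model.py
# !sed -i -e 's/def encode_text(self, text):/def forward(self, text):/g' ./clip/model.py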

In [None]:
import numpy as np
import clip

# Query the model names known to this CLIP install. The value is not the last
# expression of the cell, so it is not displayed.
clip.available_models()

# Load the ViT-B/32 checkpoint together with its image preprocessing callable,
# then move the model to the GPU and switch it to evaluation mode.
model, preprocess = clip.load("ViT-B/32")
model.cuda()
model.eval()

# Basic facts about the loaded model, read straight from its attributes.
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# Total number of scalar parameters: product of each parameter's shape, summed.
parameter_count = np.sum([int(np.prod(param.shape)) for param in model.parameters()])

print("Model parameters:", f"{parameter_count:,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

In [None]:
import os
import skimage
from PIL import Image
import numpy as np

from collections import OrderedDict
import torch

# images in skimage to use and their textual descriptions
descriptions = {
    "astronaut": "a portrait of an astronaut with the American flag",
}

# Collect every bundled skimage sample image (.png / .jpg) whose base name has a
# description above. The directory listing is sorted so that the batch order is
# reproducible across runs and platforms.
original_images = []
images = []
texts = []
for filename in sorted(os.listdir(skimage.data_dir)):
    if not filename.endswith((".png", ".jpg")):
        continue
    name = os.path.splitext(filename)[0]
    if name not in descriptions:
        continue
    image = Image.open(os.path.join(skimage.data_dir, filename)).convert("RGB")
    original_images.append(image)
    images.append(preprocess(image))
    texts.append(descriptions[name])

# Fail early with a readable message instead of a cryptic "need at least one
# array/tensor to stack" error further down.
if not images:
    raise FileNotFoundError(
        f"None of the described images {sorted(descriptions)} were found in {skimage.data_dir}"
    )

# Batch the example inputs and move them to the GPU.
# Assumes `preprocess` returns torch tensors (as CLIP's transform does), so they
# can be stacked directly without a round-trip through numpy.
image_input = torch.stack(images).cuda()
text_tokens = clip.tokenize(["This is " + desc for desc in texts]).cuda()

# Cast the whole model (including model.visual) to float32 before exporting.
model = model.to(torch.float32)

# Smoke-test both encoders on the example batch before exporting. No gradients
# are needed, so skip building the autograd graph.
# NOTE: model(text_tokens) only works when the model's forward() takes just the
# token tensor, i.e. after the sed patch from the setup cell has been applied.
with torch.no_grad():
    image_embedding = model.visual(image_input)[0]  # astronaut pic embedding
    text_embedding = model(text_tokens)[0]  # astronaut text embedding


def _export_to_onnx(module, example_input, onnx_path):
    """Export `module` to an ONNX file, traced with `example_input`.

    Both encoders are exported with identical settings: opset 14, constant
    folding enabled, a single tensor named 'input', a single tensor named
    'output', and axis 0 of each marked as a dynamic 'batch_size' axis.

    Args:
        module: the torch module to export.
        example_input: example tensor passed to the module during export.
        onnx_path: destination path of the .onnx file.
    """
    torch.onnx.export(
        module,
        example_input,
        onnx_path,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
    )


_export_to_onnx(model, text_tokens, "clip-text-vit-32.onnx")
_export_to_onnx(model.visual, image_input, "clip-image-vit-32.onnx")
    

In [None]:
# Attempt at dynamically quantizing the exported image encoder to uint8.
# Status: doesn't seem to work? It raises no errors, but the resulting onnx file is the same size.
# Reference: https://github.com/minimaxir/imgbeddings/blob/36fb4d7ac6b82694d109cef6f887d4cb9c49da0f/imgbeddings/models.py#L94
# Here's the model the above code generates: https://huggingface.co/minimaxir/imgbeddings/blob/main/patch32_v1.onnx
# Here's a demo of the above ONNX model with ORT Web: https://jsbin.com/nupehazaju/edit?html,output
#   ^ That demo seems to work, but that model doesn't have the projection head that squashes the
#     768-element vector down to 512 elements (which is what lets it be compared to text embeddings of the same length).
# !pip install onnxruntime
# !pip install onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

quantize_dynamic(
    "clip-image-vit-32.onnx",
    "clip-image-vit-32-uint8.onnx",
    weight_type=QuantType.QUInt8,
    # MatMulConstBOnly was added as a guess, because of warnings that are output without it.
    extra_options={"MatMulConstBOnly": False},
)

# The code below is for converting to tflite, tfjs and tf saved model:
# (The shell commands in the string below are not executed here. The onnx-tf
# commands write the ./clip-*-vit-32-tf SavedModel directories that the TFLite
# conversion further down reads, so they have to be run beforehand.)
"""
!pip install git+https://github.com/onnx/onnx-tensorflow.git
!onnx-tf convert -i clip-image-vit-32.onnx -o clip-image-vit-32-tf
!onnx-tf convert -i clip-text-vit-32.onnx -o clip-text-vit-32-tf
!pip install tensorflowjs
!tensorflowjs_converter --input_format tf_saved_model ./clip-image-vit-32-tf ./clip-image-vit-32-tfjs
!tensorflowjs_converter --input_format tf_saved_model ./clip-text-vit-32-tf ./clip-text-vit-32-tfjs
"""

import tensorflow as tf

# (SavedModel directory, output .tflite path) — image encoder first, then text encoder.
tflite_jobs = [
    ("./clip-image-vit-32-tf", "clip-image-vit-32.tflite"),
    ("./clip-text-vit-32-tf", "clip-text-vit-32.tflite"),
]

for saved_model_dir, tflite_path in tflite_jobs:
    converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
    # Allowing SELECT_TF_OPS alongside the builtin TFLite ops is needed because:
    # https://github.com/tensorflow/tfjs/issues/5844
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    tflite_model = converter.convert()
    with open(tflite_path, 'wb') as tflite_file:
        tflite_file.write(tflite_model)
