In [None]:
import argparse
import os
import random

import skimage
import torch
from PIL import Image
# !pip install onnxruntime
# !pip install onnx
# !pip install git+https://github.com/onnx/onnx-tensorflow.git
# !pip install tensorflowjs
# !git clone https://github.com/apple/ml-mobileclip.git

In [None]:

def get_args_parser():
    """Build the argument parser that holds this notebook's model settings.

    Returns:
        argparse.ArgumentParser: parser with two string options,
        ``--pretrained_path`` (default ``./ml-mobileclip/checkpoints``) and
        ``--model`` (default ``mobileclip_s0``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--pretrained_path', type=str, default='./ml-mobileclip/checkpoints')
    parser.add_argument('--model', type=str, default='mobileclip_s0')
    return parser


parser = get_args_parser()
# Parse an explicit empty argument list so the defaults above are used
# instead of whatever happens to be in sys.argv.
args = parser.parse_args(args=[])
# Expand the checkpoint directory into the checkpoint file path: <dir>/<model>.pt
args.pretrained_path = os.path.join(args.pretrained_path, args.model) + '.pt'
print('args.pretrained_path:', args.pretrained_path)

In [6]:
# Build the model and its image preprocessing transform from the checkpoint
# path computed above; the second returned value is discarded.
# NOTE(review): `mobileclip` is not imported and `args.model_cfg` is not
# assigned in any visible cell -- confirm both are provided before this runs.
model, _, preprocess = mobileclip.create_model_and_transforms(
    args,
    args.model,
    pretrained=args.pretrained_path,
    reparameterize=False,
)
tokenizer = mobileclip.get_tokenizer(args.model_cfg)

In [7]:

# Caption pool sampled when building the example text input.
label_to_caption = [
    # Captions describing a real (bonafide) face.
    "This is an example of a real face",
    "This is a bonafide face",
    "This is a real face",
    "This is how a real face looks like",
    "a photo of a real face",
    "This is not a spoof face",
    # Captions describing a spoof (attack) face.
    "This is an example of a spoof face",
    "This is an example of an attack face",
    "This is not a real face",
    "This is how a spoof face looks like",
    "a photo of a spoof face",
    "a printout shown to be a spoof face",
]

In [8]:
# Only files whose base name is a key of this dict are loaded; the description
# text itself is not read in this cell.
descriptions = {
    "astronaut": "a portrait of an astronaut with the American flag",
}
original_images = []
images = []
texts = []
# Dedicated seeded generator so the sampled caption is the same on every run
# and the global `random` state is left untouched.
caption_rng = random.Random(0)
# Sorted listing: os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(skimage.data_dir)):
    if not filename.endswith((".png", ".jpg")):
        continue
    name = os.path.splitext(filename)[0]
    if name not in descriptions:
        continue
    image = Image.open(os.path.join(skimage.data_dir, filename)).convert("RGB")
    original_images.append(image)
    images.append(preprocess(image))
    caption = caption_rng.choice(label_to_caption)
    # NOTE(review): `return_tensors="pt"` and `.squeeze()` are kept exactly as
    # in the original call -- confirm the tokenizer in use accepts this keyword.
    text = tokenizer(caption, return_tensors="pt").squeeze()
    texts.append(text)

# Later cells index images[0] / texts[0]; fail here with a clear message
# instead of an IndexError further down.
if not images:
    raise RuntimeError(
        "No image in skimage.data_dir matched the keys of `descriptions`: "
        + ", ".join(sorted(descriptions))
    )

In [16]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()
# Inference only: disable autograd so no graph is recorded and kept alive
# together with the returned features.
with torch.no_grad():
    # unsqueeze(0) adds a leading axis of size 1 to the single example.
    image_features = model.image_encoder(images[0].unsqueeze(0).to(device))
    text_features = model.text_encoder(texts[0].unsqueeze(0).to(device))

In [None]:
output = './tflite_tmp'
os.makedirs(output, exist_ok=True)


def export_encoder_to_onnx(encoder, example_input, onnx_path):
    """Export one encoder to an ONNX file using a single example input.

    Args:
        encoder: module handed to ``torch.onnx.export``.
        example_input: one example tensor; a leading axis of size 1 is added
            and the tensor is moved to the device of the encoder's parameters.
        onnx_path: destination path of the ``.onnx`` file.
    """
    # Follow the encoder's own device instead of hard-coding CUDA.
    encoder_device = next(encoder.parameters()).device
    torch.onnx.export(
        encoder,
        example_input.unsqueeze(0).to(encoder_device),
        onnx_path,
        verbose=False,
        opset_version=12,
        # NOTE(review): the graph input is named 'images' for BOTH encoders,
        # including the text encoder. Kept unchanged so the exported files
        # keep the same input name; rename only together with every consumer.
        input_names=['images'],
        output_names=['output'],
        dynamic_axes=None,
    )


# Text encoder part
export_encoder_to_onnx(model.text_encoder, texts[0], os.path.join(output, "mobileclip-text-vit-32_v2.onnx"))
# Image encoder part
export_encoder_to_onnx(model.image_encoder, images[0], os.path.join(output, "mobileclip-image-vit-32_v2.onnx"))

In [20]:
# Attempt at quantizing model to uint8 (doesn't seem to work? no errors, but onnx file is same size)
# Reference: https://github.com/minimaxir/imgbeddings/blob/36fb4d7ac6b82694d109cef6f887d4cb9c49da0f/imgbeddings/models.py#L94
# Here's the model the above code generates: https://huggingface.co/minimaxir/imgbeddings/blob/main/patch32_v1.onnx
# Here's a demo of the above ONNX model with ORT Web: https://jsbin.com/nupehazaju/edit?html,output  <-- seems to work, but this model doesn't have the projection head that squashes 768 vec to 512 elements (so can be compared to text embeddings of same length)

# Convert the float32 ONNX models to uint8
from onnxruntime.quantization import quantize_dynamic, QuantType

# Quantize the image encoder first, then the text encoder.
# Output files are named "mobileclip-<kind>-vit-32-uint8_v2.onnx"; the earlier
# "mogileclip-..." spelling was a typo that did not match the other artifacts.
for encoder_kind in ("image", "text"):
    quantize_dynamic(
        os.path.join(output, f"mobileclip-{encoder_kind}-vit-32_v2.onnx"),
        os.path.join(output, f"mobileclip-{encoder_kind}-vit-32-uint8_v2.onnx"),
        weight_type=QuantType.QUInt8,
        # MatMulConstBOnly was added as a guess, because of warnings emitted without it.
        extra_options={"MatMulConstBOnly": False},
    )



In [22]:
# Run the onnx-tf command-line converter on each exported ONNX file:
# `-i` is the input .onnx file, `-o` the output path, both under tflite_tmp/.
# The directory name is written out literally here, so it must match the
# value of `output` used by the export cell.
!onnx-tf convert -i tflite_tmp/mobileclip-image-vit-32_v2.onnx -o tflite_tmp/mobileclip-image-vit-32-tf_v2
!onnx-tf convert -i tflite_tmp/mobileclip-text-vit-32_v2.onnx -o tflite_tmp/mobileclip-text-vit-32-tf_v2


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

2024-08-06 04:24:13,342 - onnx-tf - INFO - Start converting onnx pb to tf saved model
2024-08-06 04:24:13.944396: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-08-06 04:24:30,529 - onnx-tf - INFO - Converting completes successfully.
INFO:onnx-tf:Converting completes succe

In [None]:
     
import tensorflow as tf

# Image encoder: load the SavedModel written by onnx-tf and convert it to TFLite.
image_saved_model_dir = os.path.join(output, "mobileclip-image-vit-32-tf_v2")
converter = tf.lite.TFLiteConverter.from_saved_model(image_saved_model_dir)
# Allow select TensorFlow ops in addition to the TFLite builtins.
# This is needed because: https://github.com/tensorflow/tfjs/issues/5844
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,
    tf.lite.OpsSet.SELECT_TF_OPS,
]
tflite_model = converter.convert()
image_tflite_path = os.path.join(output, 'mobileclip-image-vit-32_v2.tflite')
with open(image_tflite_path, 'wb') as tflite_file:
    tflite_file.write(tflite_model)

In [None]:
# Text encoder: load the SavedModel written by onnx-tf and convert it to TFLite.
text_saved_model_dir = os.path.join(output, "mobileclip-text-vit-32-tf_v2")
converter = tf.lite.TFLiteConverter.from_saved_model(text_saved_model_dir)
# Allow select TensorFlow ops in addition to the TFLite builtins.
# This is needed because: https://github.com/tensorflow/tfjs/issues/5844
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,
    tf.lite.OpsSet.SELECT_TF_OPS,
]
tflite_model = converter.convert()
text_tflite_path = os.path.join(output, 'mobileclip-text-vit-32_v2.tflite')
with open(text_tflite_path, 'wb') as tflite_file:
    tflite_file.write(tflite_model)