In [9]:
import numpy as np
import tensorflow as tf
from transformers import CLIPProcessor, CLIPTokenizer, CLIPTextConfig, CLIPVisionConfig
from transformers.modeling_tf_utils import get_initializer
import os
from transformers.models.clip.modeling_tf_clip import (
    TFCLIPTextMainLayer, 
    TFCLIPTextTransformer,
    TFCLIPMainLayer,
    TFCLIPModel,
    TFCLIPVisionMainLayer,
    TFCLIPVisionTransformer
)

class ClipTextEmbedder(tf.keras.Model):
    """Keras model that turns tokenized text into L2-normalised embeddings.

    The model chains a ``TFCLIPTextTransformer`` (configured from
    ``openai/clip-vit-base-patch32``) with a bias-free dense projection of
    width ``config.projection_dim``. Only the configuration is fetched in the
    constructor; weights are expected to be restored afterwards, e.g. with
    ``load_weights``.
    """

    def __init__(self):
        super().__init__()
        self.config = CLIPTextConfig.from_pretrained("openai/clip-vit-base-patch32")
        self.encoder = TFCLIPTextTransformer(self.config)
        self.projection = tf.keras.layers.Dense(
            units=self.config.projection_dim,
            use_bias=False,
            name="text_projection",
        )
        # To bootstrap from the Hugging Face base model instead (useful for
        # saving the initial weights locally the first time), replace the two
        # layers above with:
        #     self.encoder=TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32").clip.text_model
        #     self.projection=TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32").clip.text_projection

    def call(self, x, training = True):
        """Embed a batch of tokenized texts.

        Args:
            x: mapping providing ``'input_ids'`` and ``'attention_mask'``.
            training: forwarded to the encoder; defaults to ``True``.

        Returns:
            The projected encoder output, divided by its Euclidean norm along
            the last axis.
        """
        encoder_outputs = self.encoder(
            input_ids=x['input_ids'],
            attention_mask=x['attention_mask'],
            position_ids=None,
            output_attentions=False,
            output_hidden_states=False,
            return_dict=False,
            training=training,
        )
        # With return_dict=False the encoder returns a tuple; element 1 is
        # presumably the pooled sentence representation (confirm against the
        # installed transformers version).
        pooled = encoder_outputs[1]
        projected = self.projection(inputs=pooled)
        norms = tf.norm(tensor=projected, ord="euclidean", axis=-1, keepdims=True)
        return projected / norms
    
class ClipImageEmbedder(tf.keras.Model):
    """Keras model that turns preprocessed images into L2-normalised embeddings.

    The model chains a ``TFCLIPVisionTransformer`` (configured from
    ``openai/clip-vit-base-patch32``) with a bias-free dense projection of
    width ``config.projection_dim``. Only the configuration is fetched in the
    constructor; weights are expected to be restored afterwards, e.g. with
    ``load_weights``.
    """

    def __init__(self):
        super().__init__()
        self.config = CLIPVisionConfig.from_pretrained("openai/clip-vit-base-patch32")
        self.encoder = TFCLIPVisionTransformer(self.config)
        self.projection = tf.keras.layers.Dense(
            units=self.config.projection_dim,
            use_bias=False,
            name="visual_projection",
        )
        # To bootstrap from the Hugging Face base model instead (useful for
        # saving the initial weights locally the first time), replace the two
        # layers above with:
        #     self.encoder=TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32").clip.vision_model
        #     self.projection=TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32").clip.visual_projection

    def call(self, x, training = True):
        """Embed a batch of preprocessed images.

        Args:
            x: mapping providing ``'pixel_values'``.
            training: forwarded to the encoder; defaults to ``True``.

        Returns:
            The projected encoder output, divided by its Euclidean norm along
            the last axis.
        """
        encoder_outputs = self.encoder(
            pixel_values=x['pixel_values'],
            output_attentions=False,
            output_hidden_states=False,
            return_dict=False,
            training=training,
        )
        # With return_dict=False the encoder returns a tuple; element 1 is
        # presumably the pooled image representation (confirm against the
        # installed transformers version).
        pooled = encoder_outputs[1]
        projected = self.projection(inputs=pooled)
        norms = tf.norm(tensor=projected, ord="euclidean", axis=-1, keepdims=True)
        return projected / norms

def tokenize_texts(texts, max_length=64):
    """Tokenize a batch of strings into fixed-length int32 NumPy arrays.

    Args:
        texts: list of strings to encode.
        max_length: number of tokens every row is padded (and, if longer,
            truncated) to. Defaults to 64.

    Returns:
        dict mapping each tokenizer output name (e.g. ``'input_ids'``,
        ``'attention_mask'``) to an ``int32`` NumPy array.
    """
    tokenizer=CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
    encoded = tokenizer.batch_encode_plus(
        texts,
        padding='max_length',
        # Without truncation a text longer than max_length produces a longer
        # row than the others, which cannot be packed into one array.
        truncation=True,
        return_tensors="np",
        # Honour the caller's value; this was previously hard-coded to 64.
        max_length=max_length,
    )
    return {key: np.array(encoded[key]).astype('int32') for key in encoded}

def tokenize_imgs(images):
    """Preprocess images with the CLIP processor and return NumPy arrays.

    Args:
        images: image or list of images accepted by ``CLIPProcessor``.

    Returns:
        dict mapping each processor output name (e.g. ``'pixel_values'``) to
        a NumPy array.
    """
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    processed = clip_processor(images=images, return_tensors="np")
    return {key: np.array(processed[key]) for key in processed}

def compute_sim(image_embeddings, text_embeddings, logit_scale=4.6):
    """Return softmax-normalised image-to-text similarity scores.

    Args:
        image_embeddings: 2-D tensor/array with one embedding per row (images).
        text_embeddings: 2-D tensor/array with one embedding per row (texts),
            same embedding width as ``image_embeddings``.
        logit_scale: Python float; the dot products are multiplied by
            ``exp(logit_scale)`` before the softmax. The default 4.6 presumably
            approximates CLIP's learned logit scale (ln 100 ~= 4.605); confirm
            against the checkpoint if exact parity matters.

    Returns:
        Tensor with one row per image and one column per text; each row is a
        softmax distribution over the texts.
    """
    scale = tf.math.exp(logit_scale)
    # Use the arguments, not notebook-level globals, so the result always
    # reflects the embeddings that were actually passed in.
    logits_per_text = tf.matmul(text_embeddings, image_embeddings, transpose_b=True) * scale
    logits_per_image = tf.transpose(logits_per_text)
    return tf.nn.softmax(logits_per_image, axis=1)
    
# Build the standalone text encoder and restore its weights from the local checkpoint.
model_text = ClipTextEmbedder()
model_text.load_weights('clip-base/clip_text_embedding/weights')
# First-time setup only: uncomment to write the checkpoint to disk.
# model_text.save_weights('clip-base/clip_text_embedding/weights')

# Build the standalone image encoder and restore its weights the same way.
model_image = ClipImageEmbedder()
model_image.load_weights('clip-base/clip_image_embedding/weights')
# First-time setup only: uncomment to write the checkpoint to disk.
# model_image.save_weights('clip-base/clip_image_embedding/weights')

You are using a model of type clip to instantiate a model of type clip_text_model. This is not supported for all configurations of models and can yield errors.
You are using a model of type clip to instantiate a model of type clip_vision_model. This is not supported for all configurations of models and can yield errors.


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x206f0d66850>

In [28]:
class TFClip(tf.keras.Model):
    """Combined text/image model returning image-to-text similarity scores.

    Holds one ``ClipTextEmbedder`` and one ``ClipImageEmbedder``. When
    ``save_folder`` is given (and truthy), each encoder's weights are loaded
    from ``<save_folder>/clip_text_embedding/weights`` and
    ``<save_folder>/clip_image_embedding/weights`` respectively.
    """

    def __init__(self, save_folder=None):
        super().__init__()
        self.text_encoder = ClipTextEmbedder()
        self.image_encoder = ClipImageEmbedder()

        if not save_folder:
            return
        # Restore the text encoder first, then the image encoder.
        text_ckpt = os.path.join(save_folder, 'clip_text_embedding', 'weights')
        self.text_encoder.load_weights(text_ckpt)
        image_ckpt = os.path.join(save_folder, 'clip_image_embedding', 'weights')
        self.image_encoder.load_weights(image_ckpt)

    def call(self, x, training=True):
        """Score every image against every text.

        Args:
            x: two-element sequence ``[text_inputs, image_inputs]``, in that
                order.
            training: forwarded to both encoders; defaults to ``True``.

        Returns:
            Whatever ``compute_sim`` returns for the two embedding batches.
        """
        text_inputs, image_inputs = x
        text_vectors = self.text_encoder(text_inputs, training=training)
        image_vectors = self.image_encoder(image_inputs, training=training)
        return compute_sim(image_vectors, text_vectors)
    
# Assemble the combined model, loading both encoders' weights from the 'clip-base' folder.
clip = TFClip('clip-base')

You are using a model of type clip to instantiate a model of type clip_text_model. This is not supported for all configurations of models and can yield errors.
You are using a model of type clip to instantiate a model of type clip_vision_model. This is not supported for all configurations of models and can yield errors.


In [29]:
from PIL import Image
import requests

# Candidate captions to score against the image.
txts = ["a photo of a cat", "a photo of a dog"]
text_inputs=tokenize_texts(txts)


# Sample image to score the captions against.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Bound the wait and fail loudly on an HTTP error, so a bad download does not
# hang the kernel or surface later as an unrelated image-decoding error.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

imgs=[image]
image_inputs = tokenize_imgs(imgs)

# Order matters: TFClip.call unpacks its input as [text_inputs, image_inputs].
inputs=[text_inputs, image_inputs]

In [23]:
# Embed the captions and the image with the standalone encoders.
# The tokenized captions live in `text_inputs`; no `txt_inputs` name exists
# at notebook scope, so referencing it fails on a fresh kernel.
text_embeds=model_text.predict(text_inputs, batch_size=2,verbose=True)
image_embeds=model_image.predict(image_inputs, batch_size=2,verbose=True)



<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.99471927, 0.00528076]], dtype=float32)>

In [25]:
# Raw (pre-softmax) similarity logits: text-by-image dot products scaled by exp(4.6).
logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True)* tf.math.exp(4.6)
# Transpose so that rows correspond to images and columns to texts.
logits_per_image = tf.transpose(logits_per_text)
# Display the unnormalised logits.
logits_per_image

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[24.442019, 19.203629]], dtype=float32)>

In [None]:
# Softmax-normalised image-to-text scores from the standalone encoders' embeddings.
compute_sim(image_embeds, text_embeds)

In [31]:
# Score the captions with the combined model. TFClip.call defaults to
# training=True, so request inference mode explicitly.
clip(inputs, training=False)

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.99471927, 0.00528076]], dtype=float32)>