In [4]:
import torch
import clip

# Load CLIP model and the tokenizer
device = "mps" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Define the fully connected layer
class CLIPWithFC(torch.nn.Module):
    def __init__(self, clip_model, output_dim):
        super(CLIPWithFC, self).__init__()
        self.clip_model = clip_model
        self.fc = torch.nn.Linear(clip_model.visual.output_dim * 2, output_dim) # for concatenated dimensions

    def forward(self, image, text):
        with torch.no_grad():
            # Encode image and text with CLIP
            image_features = self.clip_model.encode_image(image)
            text_features = self.clip_model.encode_text(text)

        # Combine embeddings (here, we're just concatenating as both are in the same embedding space)
        combined_features = torch.cat((image_features, text_features), dim=1)
        
        print(combined_features.shape)
        print(self.clip_model.visual.output_dim)
        
        # Pass through the fully connected layer
        scores = self.fc(combined_features)
        return scores

# Create the model instance
output_dim = 1  # Assuming you want a single score as the output
model_with_fc = CLIPWithFC(model, output_dim).to(device)

# Example usage
from PIL import Image

# Prepare input image and text (adapt according to your data)
image_path = './10594-screenshot.jpg'
image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
text = clip.tokenize(["Your text description"]).to(device)

# Predict the score
model_with_fc.eval()  # Set the model to evaluation mode
score = model_with_fc(image, text)
print(f'Predicted Score: {score.item()}')

torch.Size([1, 1024])
512
Predicted Score: 0.03832745552062988
