### Contrastive Loss

In this notebook I will explore the contrastive loss and will try to implement it for fine-tuning the CLIP model for my task.

If it's sucessful, perhaps I will try to extend it to more classes.

Contrastive loss will be implemented following the paper [Supervised Contrastive Learning](https://arxiv.org/pdf/2004.11362v5.pdf).

In [1]:
import pandas as pd

movies = pd.read_csv('../scraper/data/movies_with_posters_and_rich_desc.csv')

In [2]:
#movies.loc[movies['imdb_id'] == 'tt0111161']
id_to_name = {idx: movies.loc[movies['imdb_id'] == idx]['title'].values[0] for idx in movies['imdb_id']}


In [12]:
movies.loc[movies['title'] == 'Iron Man']

Unnamed: 0,imdb_id,plot_synopsis,tags,adult,belongs_to_collection,budget,genres,homepage,id,original_language,...,runtime,spoken_languages,status,tagline,video,vote_average,vote_count,num_description_tokens,title,poster_count
2477,tt0371746,A convoy of military Humvees drives across the...,"murder, cult, violence, flashback, good versus...",False,"{'id': 131292, 'name': 'Iron Man Collection', ...",140000000,"[{'id': 28, 'name': 'Action'}, {'id': 878, 'na...",http://www.ironmanmovie.com/,1726,en,...,126.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Heroes aren't born. They're built.,False,7.4,8951.0,3162,Iron Man,22.0
7096,tt0043678,"Genius, billionaire, and playboy Tony Stark, w...",violence,False,,0,"[{'id': 18, 'name': 'Drama'}]",,69592,en,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,He's all man in the ring -- or anywhere!,False,5.0,1.0,679,Iron Man,2.0


In [3]:
from transformers import CLIPProcessor, CLIPModel
import torch, os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [4]:
import random
from PIL import Image

cls = 'tt0013442'
prompt = "{}"
img = "/home/barti/PosterRecognition/scraper/data/posters/tt0013442/train/9vp3ml7IMVg4rZ5xlhOflPYehAQ.jpg"

imdb_ids = os.listdir("../scraper/data/posters/")
imdb_ids.remove(cls)
# random.shuffle(imdb_ids)

classes = list(map(lambda item: prompt.format(id_to_name[item]), random.choices(imdb_ids, k=1000)))
classes.append(prompt.format(id_to_name[cls]))

img = Image.open(img)
a = ['horror', 'thriler', 'comedy', 'romance', 'adventure']

inputs = processor(text=a, images=img, return_tensors="pt", padding=True)
inputs.to(device)
outputs = model(**inputs)

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print({f"{prob.item():.4f}": a[idx] for idx, prob in enumerate(probs[0])})
# values, indices = probs[0].topk(5)

# for value, index in zip(values, indices):
#     print(f"{value:.5f}: {classes[index]}")

# print("Predicted class:", classes[indices[0]])


{'0.8266': 'horror', '0.0688': 'thriler', '0.0542': 'comedy', '0.0134': 'romance', '0.0369': 'adventure'}


In [16]:
from transformers import CLIPTextModel, CLIPConfig

text_model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
text_inputs = processor(text=['siema eniu dobry mudzin z afrika', 'eessa'], return_tensors="pt", padding=True) 
print(text_inputs)
print(type(text_inputs))
text_inputs
text_outputs = text_model(**text_inputs)
print(text_outputs[1].shape)
# assert text_outputs[1][0] == text_outputs.pooler_output[0]
torch.equal(text_outputs[1], text_outputs.pooler_output)

config = CLIPConfig.from_pretrained("openai/clip-vit-base-patch32")

print(config.text_config.projection_dim)

Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPTextModel: ['vision_model.encoder.layers.6.self_attn.q_proj.bias', 'vision_model.encoder.layers.11.self_attn.k_proj.bias', 'vision_model.encoder.layers.4.mlp.fc2.bias', 'vision_model.encoder.layers.7.self_attn.out_proj.weight', 'vision_model.encoder.layers.4.self_attn.v_proj.bias', 'vision_model.encoder.layers.6.self_attn.out_proj.weight', 'vision_model.encoder.layers.2.self_attn.out_proj.weight', 'vision_model.encoder.layers.6.self_attn.k_proj.bias', 'vision_model.encoder.layers.0.mlp.fc1.weight', 'vision_model.encoder.layers.2.self_attn.v_proj.bias', 'vision_model.encoder.layers.10.self_attn.v_proj.bias', 'vision_model.encoder.layers.4.self_attn.k_proj.bias', 'vision_model.encoder.layers.7.mlp.fc1.bias', 'vision_model.encoder.layers.6.mlp.fc1.bias', 'vision_model.encoder.layers.10.self_attn.v_proj.weight', 'vision_model.encoder.layers.6.mlp.fc2.weight', 'vision_model.encoder.layer

{'input_ids': tensor([[49406,   564, 11131,  4395,   340,   639, 28228, 17491, 21278,   345,
         45833, 49407],
        [49406,  2644,  6088, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}
<class 'transformers.tokenization_utils_base.BatchEncoding'>
torch.Size([2, 512])
512


In [14]:
path = "/home/barti/PosterRecognition/scraper/data/posters/tt0114709/test/voln3hFAJwZUgcLdhvDmsjK6Lpq.jpg"

In [4]:
from transformers import CLIPModel, CLIPProcessor, CLIPTextModel
import os, torch

prompts = [f"Poster of {name}" for name in id_to_name.values()]

clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
# clip_text_model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32").to("cuda")

def cache_text_embeddings(classes, batch_size: int = 64) -> torch.Tensor:
    # text_embeddings = [clip_text_model(
    #     **clip_processor(text=classes[i:i+batch_size], return_tensors="pt", padding=True).to("cuda")).pooler_output\
    #     for i in range(0, len(classes), batch_size)]
    text_embeddings = []
    for i in range(0, len(classes), batch_size):
        text_embeddings.append(**clip_processor(text=classes[i:i+batch_size],
                                        return_tensors="pt",
                                        padding=True).to("cuda"))
    return torch.cat(text_embeddings, dim=0)

# emb = cache_text_embeddings(prompts)
a = clip_processor(text=prompts,
                    return_tensors="pt",
                    padding=True).to('cuda')
# model

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import CLIPModel, CLIPProcessor, CLIPTextModel, CLIPVisionModel
import os, torch
from PIL import Image

import pandas as pd

movies = pd.read_csv('../scraper/data/movies_with_posters_and_rich_desc.csv')
#movies.loc[movies['imdb_id'] == 'tt0111161']
id_to_name = {idx: movies.loc[movies['imdb_id'] == idx]['title'].values[0] for idx in movies['imdb_id']}


prompts = [f"Poster of a movie: {name}" for name in id_to_name.values()]

clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
# clip_text_model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32").to("cuda")
# clip_vision_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32").to('cuda')
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to('cuda')

for param in clip_model.parameters():
    param.grad = None

path = "/home/barti/PosterRecognition/scraper/data/posters/tt0411951/test/xZ1LElNefMw4xnOxpzaSkQFzeZu.jpg"

img = Image.open(path)
img = img.convert('RGB')

def cache_text_embeddings(classes, batch_size: int = 512) -> torch.Tensor:
    text_embeddings = []
    for i in range(0, len(classes), batch_size):
        text_embeddings.append(clip_model.get_text_features(**clip_processor(text=classes[i:i+batch_size],
                                        return_tensors="pt",
                                        padding=True).to('cuda')).detach())
    return text_embeddings

text_emb = torch.cat(cache_text_embeddings(prompts), dim=0)

img_emb = clip_model.get_image_features(**clip_processor(images=img,
                                        return_tensors="pt",
                                        padding=True).to('cuda')).detach()

text_emb = text_emb/text_emb.norm(p=2, dim=-1, keepdim=True)
img_emb = img_emb/img_emb.norm(p=2, dim=-1, keepdim=True)
logits = torch.matmul(text_emb, img_emb.T) * torch.exp(torch.tensor([0.1], device='cuda'))

print(logits.shape)

values, indices = logits.topk(3, dim=0)
for i, idx in enumerate(indices):
    print(prompts[idx], values[i])


torch.Size([9771, 1])
Poster of a movie: TEKKEN tensor([0.4137], device='cuda:0')
Poster of a movie: Ten Inch Hero tensor([0.3346], device='cuda:0')
Poster of a movie: Teen Titans: Trouble in Tokyo tensor([0.3300], device='cuda:0')
