In [26]:
from PIL import Image
import requests
import matplotlib.pyplot as plt
import numpy as np
import scipy
import pandas as pd
import os
from sklearn.manifold import TSNE
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel
from transformers import logging
logging.set_verbosity_error()

In [5]:
# Load the CLIP model and its matching processor from one checkpoint id so the
# two can never drift apart when switching checkpoints.
# Alternative checkpoint tried earlier: "openai/clip-vit-large-patch14"
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

  return self.fget.__get__(instance, owner)()


In [6]:
# Parameter counts noted for the CLIP checkpoints tried so far:
#   large-patch14: 427616513
#   base-patch32:  151277313
#   base-patch16:  149620737
param_count = model.num_parameters()
print(param_count)

151277313


In [None]:
# # Boilerplate CLIP usage example from the Hugging Face docs (disabled; kept for reference)
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)
# plt.imshow(image)
# inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

# outputs = model(**inputs)
# logits_per_image = outputs.logits_per_image # this is the image-text similarity score
# probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
# print(outputs.text_embeds.shape)

In [7]:
# Loading data
# Discover caption summary CSVs and cartoon JPGs and pair them by contest id
# (the first three characters of each file name).
#
# The data root can be overridden with the CAPTION_CONTEST_DATA environment
# variable; the default is the original hard-coded location.
DATA_ROOT = os.environ.get(
    "CAPTION_CONTEST_DATA",
    "/Users/echen/Desktop/CSE_6242.nosync/caption-contest-data",
)
caption_path = os.path.join(DATA_ROOT, "summaries")
cartoon_path = os.path.join(DATA_ROOT, "cartoons")

# One caption CSV per contest id. Iterating in sorted order makes the choice
# deterministic when several CSVs share an id (os.listdir order is arbitrary).
cap_by_id = {}
for file in sorted(os.listdir(caption_path)):
    file_path = os.path.join(caption_path, file)
    contest_id = file[:3]
    if not os.path.isfile(file_path) or not file.endswith('csv') or contest_id in cap_by_id:
        continue
    cap_by_id[contest_id] = file_path

# One cartoon per contest id; images without a caption file are reported and skipped.
img_by_id = {}
for file in sorted(os.listdir(cartoon_path)):
    file_path = os.path.join(cartoon_path, file)
    contest_id = file[:3]
    if not os.path.isfile(file_path) or not file.endswith('jpg'):
        continue
    if contest_id not in cap_by_id:
        print("extra_img: ", file)
        continue
    if contest_id in img_by_id:
        # A second image for the same contest would shift every later pairing.
        print("duplicate_img: ", file)
        continue
    img_by_id[contest_id] = file_path

# Every caption file needs a cartoon; fail loudly and name the offenders.
missing_imgs = sorted(set(cap_by_id) - set(img_by_id))
assert not missing_imgs, f"caption files without a cartoon: {missing_imgs}"

# Build the three lists from the same sorted id sequence so that position k in
# each list is guaranteed to refer to the same contest.
ids = set(cap_by_id)
id_list = sorted(ids)
cap_paths = [cap_by_id[contest_id] for contest_id in id_list]
img_paths = [img_by_id[contest_id] for contest_id in id_list]
assert(len(img_paths) == len(cap_paths))

extra_img:  890.jpg


In [16]:
def get_embeds(model, image, caption_list):
    """Embed one image and a batch of captions with a CLIP-style model.

    Args:
        model: callable that accepts the processor output as keyword arguments
            and returns an object exposing `text_embeds` and `image_embeds`.
        image: image passed to the processor's `images` argument.
        caption_list: list of caption strings passed to the processor's
            `text` argument (padded by the processor).

    Returns:
        Tuple `(text_embeds, img_embeds)` converted to NumPy arrays.

    Note:
        Uses the notebook-level `processor` to build the model inputs.
    """
    import torch  # local import: only needed for the no-grad context below

    # Use the `image` argument; the previous version silently read the global `img`.
    inputs = processor(text=caption_list, images=image, return_tensors="pt", padding=True)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    text_embeds = outputs.text_embeds.detach().numpy()
    img_embeds = outputs.image_embeds.detach().numpy()
    return text_embeds, img_embeds

In [31]:
# Build one DataFrame per contest holding the caption statistics plus the CLIP
# image embedding ("img_feat") and per-caption text embedding ("cap_feat").
columns = ["img_id", "img_feat", "cap_feat", "mean","precision", "votes", "not_funny", "somewhat_funny", "funny"]
new_cols = ["img_id", "img_feat", "cap_feat"]
dataset = pd.DataFrame(columns=columns)
batch_size = 32
# Number of contests to process. 1 reproduces the earlier single-contest debug
# run (the loop used to `break` after the first contest); set to None for all.
max_contests = 1
df_list = []
contests = list(zip(id_list, img_paths, cap_paths))[:max_contests]
for img_id, img_path, cap_path in contests:
    # Context manager closes the image file once its embeddings are computed.
    with Image.open(img_path) as img:
        # The processor is given a dummy caption; only the image embedding is kept.
        _, img_feat = get_embeds(model, img, ['test'])
        cap_csv = pd.read_csv(cap_path)
        cap_csv["img_id"] = img_id
        cap_csv["cap_feat"] = None
        cap_csv["img_feat"] = [img_feat]*len(cap_csv.index)
        captions = cap_csv["caption"].tolist()
        # Walk the captions in fixed-size chunks. The write-back position is
        # derived from the chunk start, so the final, shorter chunk lands on the
        # correct rows (it used to be offset by a full batch_size and overwrote
        # earlier rows while leaving the last rows as None).
        for start in range(0, len(captions), batch_size):
            caption_list = captions[start:start + batch_size]
            text_embeds, _ = get_embeds(model, img, caption_list)
            for offset, embed in enumerate(text_embeds):
                cap_csv.at[cap_csv.index[start + offset], "cap_feat"] = embed
    df_list.append(cap_csv)
# Print once after the loop; guarded because pd.concat raises on an empty list.
if df_list:
    print(pd.concat(df_list))
# NOTE(review): the original comment says ids span 510-889 and that 525 is
# missing from the dataset for an unknown reason — unverified here.
print(len(ids))

      rank                                            caption      mean  \
0        0          I'm a congressman--obstruction is my job.  1.913043   
1        1  I'm what they mean when they say, 'The middle ...  1.842105   
2        2                  Does this suit make me look flat?  1.711111   
3        3    When the right woman comes along, I'll know it.  1.625000   
4        4  I used to lie in the gutter, but then I quit d...  1.617647   
...    ...                                                ...       ...   
3900  3900  Just getting material for my book, " Humanity ...  1.000000   
3901  3901     This has 'Alice in Wonderland' beat by a mile.  1.000000   
3902  3902                    I could use a quick pick-me-up.  1.000000   
3903  3903     This is the secret to attracting foot traffic.  1.000000   
3904  3904                 An aloof view of clouds colliding.  1.000000   

      precision  votes  not_funny  somewhat_funny  funny img_id  \
0      0.094022     69         2