In [54]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy
import torch
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel
from transformers import logging
logging.set_verbosity_error()

In [55]:
# Checkpoint shared by the model and its processor. Keeping the name in one
# place stops the two from drifting apart when switching CLIP variants.
# clip_checkpoint = "openai/clip-vit-large-patch14"
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

In [56]:
# Sanity check on which checkpoint is loaded; reference counts per variant are listed below.
print(model.num_parameters()) # 427616513 params for large-patch14
                              # 151277313 params for base-patch32
                              # 149620737 params for base-patch16

151277313


In [57]:
# # boilerplate code from huggingface docs
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)
# plt.imshow(image)
# inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

# outputs = model(**inputs)
# logits_per_image = outputs.logits_per_image # this is the image-text similarity score
# probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
# print(outputs.text_embeds.shape)

In [58]:
# Loading data
# NOTE: machine-specific absolute paths; point these at the local copy of the
# caption-contest data before running on another machine.
caption_path = "/Users/echen/Desktop/CSE_6242.nosync/caption-contest-data/summaries"
cartoon_path = "/Users/echen/Desktop/CSE_6242.nosync/caption-contest-data/cartoons"

# Contest id (first three characters of the file name) -> caption summary CSV.
# os.listdir returns entries in arbitrary order, so iterate in sorted order to
# make the choice deterministic when several CSVs share the same id prefix.
cap_by_id = {}
for file in sorted(os.listdir(caption_path)):
    file_path = os.path.join(caption_path, file)
    if not os.path.isfile(file_path) or file[-3:]!='csv' or file[:3] in cap_by_id:
        continue
    cap_by_id[file[:3]] = file_path
ids = set(cap_by_id)

# Contest id -> every cartoon image carrying that id prefix. Images whose id
# has no caption CSV are reported and skipped.
imgs_by_id = {}
for file in sorted(os.listdir(cartoon_path)):
    file_path = os.path.join(cartoon_path, file)
    if not os.path.isfile(file_path) or file[-3:]!='jpg':
        continue
    if file[:3] not in ids:
        print("extra_img: ", file)
        continue
    imgs_by_id.setdefault(file[:3], []).append(file_path)

# Build the three lists from the id-keyed maps so that position i of each list
# always refers to the same contest. Comparing only the list lengths would not
# catch one id with two images cancelling out another id with none.
id_list = sorted(ids)
bad_ids = [i for i in id_list if len(imgs_by_id.get(i, [])) != 1]
assert not bad_ids, f"expected exactly one cartoon per caption file; check ids: {bad_ids}"
cap_paths = [cap_by_id[i] for i in id_list]
img_paths = [imgs_by_id[i][0] for i in id_list]
assert len(img_paths) == len(cap_paths)

extra_img:  890.jpg


In [59]:
# pca = PCA(n_components=50)
# condensed_data = pca.fit_transform(img1_cap)
# print(condensed_data.shape, np.max(condensed_data), np.min(condensed_data))
# tsne = TSNE()
# tsne_embeds = tsne.fit_transform(condensed_data)
# norm_tsne_embeds = np.zeros(tsne_embeds.shape)
# mins = np.min(tsne_embeds, axis=0, keepdims=True)
# maxes = np.max(tsne_embeds, axis=0, keepdims=True)
# norm_tsne_embeds = (tsne_embeds-mins)/(maxes-mins)
# print(norm_tsne_embeds.shape, np.max(norm_tsne_embeds), np.min(norm_tsne_embeds))
# print(norm_tsne_embeds[:5])

# TSNE Processing Functions

In [60]:
def get_embeds(model, image, caption_list):
    """Embed an image together with a batch of captions.

    The module-level ``processor`` prepares the inputs (captions are padded to
    a common length) and the result is fed through ``model``.

    Args:
        model: model called as ``model(**inputs)``; its output must expose
            ``text_embeds`` and ``image_embeds``.
        image: image (or images) accepted by ``processor``.
        caption_list: list of caption strings embedded in one forward pass.

    Returns:
        Tuple ``(text_embeds, img_embeds)`` of NumPy arrays.
    """
    inputs = processor(text=caption_list, images=image, return_tensors="pt", padding=True)
    # Inference only: disable autograd so no graph is built and kept per batch.
    with torch.no_grad():
        outputs = model(**inputs)
    text_embeds = outputs.text_embeds.detach().numpy()
    img_embeds = outputs.image_embeds.detach().numpy()
    return text_embeds, img_embeds

In [62]:
pca = PCA(n_components=50)
tsne = TSNE()
def get_tsne_embeds(img_df):
    """Project one image's caption embeddings to min-max-scaled t-SNE coordinates.

    The ``cap_feat`` vectors are stacked into a matrix, reduced with the
    module-level ``pca`` and then embedded with the module-level ``tsne``. Both
    estimators are refit on every call. Each output axis is rescaled to [0, 1].

    Args:
        img_df: DataFrame with a ``cap_feat`` column holding one embedding
            vector per row.

    Returns:
        DataFrame with columns ``X``, ``Y`` (scaled coordinates) and
        ``caption``, indexed like ``img_df``.
    """
    caps = img_df["cap_feat"]
    cap_data = np.vstack(caps)
    # NOTE(review): neither estimator is given a random_state, so layouts are
    # presumably not reproducible between runs -- confirm and seed if needed.
    condensed_data = pca.fit_transform(cap_data)
    tsne_embeds = tsne.fit_transform(condensed_data)
    mins = np.min(tsne_embeds, axis=0, keepdims=True)
    maxes = np.max(tsne_embeds, axis=0, keepdims=True)
    spans = maxes - mins
    # A constant axis has zero range; divide by 1 there so it maps to 0.0
    # instead of producing NaN from 0/0.
    spans[spans == 0] = 1
    norm_tsne_embeds = (tsne_embeds - mins) / spans
    # NOTE(review): the "caption" column carries the embedding vectors taken
    # from cap_feat, not the caption text -- confirm this is what consumers
    # of the frame expect before renaming or replacing it.
    norm_tsne_embeds_df = pd.DataFrame({'X': norm_tsne_embeds[:,0], 'Y': norm_tsne_embeds[:,1], "caption": caps})
    return norm_tsne_embeds_df

In [63]:
batch_size = 32
def get_embeds_from_path(img_id, img_path, cap_path, batch_size=32):
    """Load one contest's cartoon and captions and attach embeddings to each row.

    Args:
        img_id: identifier written to the ``img_id`` column of every row.
        img_path: path of the cartoon image.
        cap_path: path of the caption CSV; it must contain a ``caption`` column.
        batch_size: number of captions embedded per forward pass (>= 1).

    Returns:
        The caption DataFrame with three added columns: ``img_id``,
        ``cap_feat`` (one text embedding per caption) and ``img_feat`` (the
        image embedding array, repeated on every row).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Keep the image open only while it is needed for the forward passes.
    with Image.open(img_path) as img:
        # The processor is called with a placeholder text; only the image
        # embedding of this forward pass is used.
        inputs = processor(text=['test'], images=img, return_tensors="pt", padding=True)
        with torch.no_grad():
            img_feat = model(**inputs).image_embeds.detach().numpy()

        cap_csv = pd.read_csv(cap_path)
        cap_csv["img_id"] = img_id
        cap_csv["cap_feat"] = None  # object column, filled cell by cell below
        cap_csv["img_feat"] = [img_feat]*len(cap_csv.index)

        # Embed captions in consecutive slices; the last slice may be shorter.
        # read_csv yields a 0..n-1 index, so list positions are row labels.
        captions = cap_csv["caption"].tolist()
        for start in range(0, len(captions), batch_size):
            text_embeds, _ = get_embeds(model, img, captions[start:start + batch_size])
            for offset, embed in enumerate(text_embeds):
                cap_csv.at[start + offset, "cap_feat"] = embed
    return cap_csv

In [66]:
relevant_columns = ["caption", "img_id", "mean","votes"]
def process_dataset_tsne(id_list, img_paths, cap_paths):
    """Build the t-SNE table and the trimmed caption table for every contest.

    The three arguments are parallel sequences: position ``i`` of each one
    describes the same contest.

    Args:
        id_list: contest ids.
        img_paths: cartoon image path for each id.
        cap_paths: caption CSV path for each id.

    Returns:
        Tuple ``(tsne_df, cap_df)``: the concatenated per-contest t-SNE frames
        (with an added ``img_id`` column) and the concatenated caption frames
        restricted to ``relevant_columns``. Both are empty frames when no
        contests are given.

    Raises:
        ValueError: if the three sequences differ in length.
    """
    id_list, img_paths, cap_paths = list(id_list), list(img_paths), list(cap_paths)
    # zip() would silently drop the unmatched tail, hiding a pairing mistake.
    if not (len(id_list) == len(img_paths) == len(cap_paths)):
        raise ValueError(
            "id_list, img_paths and cap_paths must have the same length, got "
            f"{len(id_list)}, {len(img_paths)} and {len(cap_paths)}"
        )
    # pd.concat rejects an empty list, so answer the empty case directly.
    if not id_list:
        return (pd.DataFrame(columns=["X", "Y", "caption", "img_id"]),
                pd.DataFrame(columns=relevant_columns))

    tsne_list = []
    cap_list = []
    for img_id, img_path, cap_path in zip(id_list, img_paths, cap_paths):
        # process for TSNE data
        img_df = get_embeds_from_path(img_id, img_path, cap_path)
        tsne_feats = get_tsne_embeds(img_df)
        tsne_feats["img_id"] = img_id
        tsne_list.append(tsne_feats)

        # save complete dataset: img_df already holds the CSV columns plus
        # img_id, so the file does not need to be read a second time.
        cap_list.append(img_df[relevant_columns])

    return pd.concat(tsne_list), pd.concat(cap_list)

# Create complete TSNE DataFrame

In [67]:
# Run the pipeline over every contest: tsne_df gets the scaled t-SNE
# coordinates per caption, condensed_cap_df the trimmed caption table.
tsne_df, condensed_cap_df = process_dataset_tsne(id_list, img_paths, cap_paths)
tsne_df.head(5)


Unnamed: 0,X,Y,caption,img_id
0,0.472153,0.110353,"[-0.0024316842, 0.015905969, 0.011368003, 0.01...",510
1,0.601796,0.318031,"[0.013644679, -0.013433615, 0.018261917, -0.00...",510
2,0.194784,0.741925,"[0.013936422, 0.0062422995, -0.026340615, 0.00...",510
3,0.402756,0.582557,"[0.024949286, 0.017259136, 0.012959481, -0.016...",510
4,0.438213,0.649546,"[0.017066555, 0.027702762, -0.015552031, -0.02...",510


In terms of preprocessing, we will likely want to save the original dataset dataframes without the precision or score breakdown columns. The TSNE dataframe above can be saved as is, since it should be possible to filter for the correct img_id when necessary.

In [69]:
# Preview the trimmed caption table (only the columns listed in relevant_columns).
condensed_cap_df.head(5)

Unnamed: 0,caption,img_id,mean,votes
0,I'm a congressman--obstruction is my job.,510,1.913043,69
1,"I'm what they mean when they say, 'The middle ...",510,1.842105,19
2,Does this suit make me look flat?,510,1.711111,45
3,"When the right woman comes along, I'll know it.",510,1.625,32
4,"I used to lie in the gutter, but then I quit d...",510,1.617647,34
