In [14]:
import jsonlines
import pandas as pd
# Read the captions from the jsonl file
with jsonlines.open('captions/captions.jsonl') as reader:
    captions = list(reader)

target_langs = ['ar', 'en', 'fr', 'de']
cols  =[
    'image',
    'arabic_caption', 'num_arabic_captions',
    'english_caption','num_english_captions',
    'french_caption', 'num_french_captions',
    'german_caption', 'num_german_captions',
    'locale'
    ]

langs_abbrev = { 'ar': 'arabic', 'en': 'english', 'fr': 'french', 'de': 'german' }
# Create a dictionary to store the captions
captions_dict = {col: [] for col in cols}

# Loop through the captions
for caption_item in captions:
    # Get the image name
    image = caption_item['image/key']
    # Add the image name to the dictionary
    captions_dict['image'].append(image)
    # Get the locale
    locale = caption_item['image/locale']
    # Add the locale to the dictionary
    captions_dict['locale'].append(locale)

    # Get the caption in the target language
    for lang in target_langs:
        # Get the caption in the target language
        caption = caption_item[f'{lang}']['caption']
        # Get the number of captions for the image
        num_captions = len(caption)
        # Add the caption to the dictionary
        captions_dict[f'{langs_abbrev[lang]}_caption'].append(caption)
        # Add the number of captions to the dictionary
        captions_dict[f'num_{langs_abbrev[lang]}_captions'].append(num_captions)

# Create a dataframe from the dictionary
captions_df = pd.DataFrame(captions_dict)
    

# Save the dataframe to a pickle file
captions_df.to_pickle('captions/captions.pkl')

In [25]:
import json
import clip
import torch
import pickle
import pandas as pd
from PIL import Image
from tqdm import tqdm
import skimage.io as io
import os


def create_captions_json(path):
    '''
    Create a json file for the arabic and english captions
    '''
    # load the merged captions
    merged_captions = pd.read_pickle(path)

    # Get the arabic, english captions, French and German captions
    arabic_captions_df = merged_captions[['image', 'arabic_caption']]
    english_captions_df = merged_captions[['image', 'english_caption']]
    french_captions_df = merged_captions[['image', 'french_caption']]
    german_captions_df = merged_captions[['image', 'german_caption']]

    # Split the three captions per image into three rows
    arabic_captions_df = arabic_captions_df.explode('arabic_caption')
    english_captions_df = english_captions_df.explode('english_caption')
    french_captions_df = french_captions_df.explode('french_caption')
    german_captions_df = german_captions_df.explode('german_caption')

    #Rename the columns image and arabic_caption to image_id and caption
    arabic_captions_df = arabic_captions_df.rename(columns={'image': 'image_id', 'arabic_caption': 'caption'})
    english_captions_df = english_captions_df.rename(columns={'image': 'image_id', 'english_caption': 'caption'})
    french_captions_df = french_captions_df.rename(columns={'image': 'image_id', 'french_caption': 'caption'})
    german_captions_df = german_captions_df.rename(columns={'image': 'image_id', 'german_caption': 'caption'})

    #Convert the dataframe to list of Dictionaries
    arabic_captions = arabic_captions_df.to_dict('records')
    english_captions = english_captions_df.to_dict('records')
    french_captions = french_captions_df.to_dict('records')
    german_captions = german_captions_df.to_dict('records')

    # create a dictionary name annotations if it doesn't exist
    os.makedirs('./annotations', exist_ok=True)

    # Save the list of dictionaries to a json file
    with open('annotations/arabic_captions.json', 'w') as f:
        json.dump(arabic_captions, f)
    with open('annotations/english_captions.json', 'w') as f:
        json.dump(english_captions, f)
    with open('annotations/french_captions.json', 'w') as f:
        json.dump(french_captions, f)
    with open('annotations/german_captions.json', 'w') as f:
        json.dump(german_captions, f)



def create_CLIP_embeddings_for_images(lang, clip_model_type='ViT-B/32', device='cuda'):
    '''
    Create the CLIP embeddings for the images and save them to a pickle file
    '''
    # make directory for the embeddings if it doesn't exist
    os.makedirs('./embeddings', exist_ok=True)
    # create the output path
    clip_model_name = clip_model_type.replace('/', '-') 
    out_path = f"./embeddings/{lang}_CLIP-{clip_model_name}_embeddings.pkl" 

    # load the CLIP model
    clip_model, preprocess = clip.load(clip_model_type, device=device, jit=False)

    # load the annotations
    annotations_file = f"./annotations/{lang}_captions.json"
    with open(annotations_file, 'r') as f:
        data = json.load(f)

    # create list of dictionaries with the image id, the CLIP embedding and the caption
    all_embeddings = []
    all_captions = []
    for i in tqdm(range(len(data))):
        # load the image
        d = data[i]
        img_id = d["image_id"]
        filename = f"./images/{img_id}.jpg"
        image = io.imread(filename)

        # preprocess the image and encode it with the CLIP model
        image = preprocess(Image.fromarray(image)).unsqueeze(0).to(device)
        with torch.no_grad():
            prefix = clip_model.encode_image(image).cpu()

        # add the index , embedding and caption to the dictionary
        d["clip_embedding"] = i 
        all_embeddings.append(prefix)
        all_captions.append(d)

    # save the dictionary to a pickle file
    with open(out_path, 'wb') as f:
        pickle.dump({"clip_embedding": torch.cat(all_embeddings, dim=0), "captions": all_captions}, f)




In [None]:
path = 'captions/captions.pkl'
create_captions_json(path)

In [26]:
target_langs = ['arabic', 'english', 'french', 'german']

for lang in target_langs:
    print(f'Creating CLIP embeddings for {lang} captions')
    create_CLIP_embeddings_for_images(lang)

Creating CLIP embeddings for arabic captions


100%|██████████| 7367/7367 [06:23<00:00, 19.22it/s]


Creating CLIP embeddings for english captions


100%|██████████| 7200/7200 [05:46<00:00, 20.76it/s]


Creating CLIP embeddings for french captions


100%|██████████| 8562/8562 [06:57<00:00, 20.52it/s]


Creating CLIP embeddings for german captions


100%|██████████| 8643/8643 [07:01<00:00, 20.52it/s]
