In [None]:
import preprocess
from dataset import ClipGPTFlickr8kDataset

# Preprocessing step (skipped): per the original note, the data is already
# saved to drive, so the embedding-creation call below stays commented out.
# preprocess.create_CLIP_embeddings_for_images(lang='arabic')
# Build the dataset object from the saved Arabic CLIP ViT-B/32 embeddings pickle.
# NOTE(review): the meaning of the second positional argument (10) is not
# visible in this notebook — presumably a prefix length; confirm against
# dataset.ClipGPTFlickr8kDataset.
dataset = ClipGPTFlickr8kDataset('./data/embeddings/arabic_CLIP-ViT-B-32_embeddings.pkl', 10)


In [1]:
from bleu import belu_score


# Checkpoint to evaluate with the project's BLEU helper.
model_path = './checkpoints/arabic_exp_1-029.pt'


# `belu_score` (spelled this way in the project's `bleu` module) is called for
# its side effects: the recorded output of this cell shows a progress bar
# followed by a printed BLEU score.
belu_score(model_path)

100%|██████████| 100/100 [00:29<00:00,  3.42it/s]

The BLEU for Langauge arabic score is 26.197629218220865





In [None]:
import os
import clip
import json
import torch
import PIL.Image 
import pandas as pd
from tqdm import tqdm
import skimage.io as io
from nltk.translate.bleu_score import corpus_bleu
from inference_gpt import load_model, beam_search
from transformers import AutoTokenizer, GPT2Tokenizer

def prepare_data_for_bleu(file_path, n=200, images_dir=None):
    """Group reference captions by image and build the matching image paths.

    The annotation file is a JSON list of records, each providing an
    ``'image_id'`` and a ``'caption'`` key. The first ``n`` distinct image ids,
    in order of first appearance in the file, are selected.

    Args:
        file_path: Path to the JSON annotation file.
        n: Maximum number of distinct images to keep (``None`` keeps all).
        images_dir: Directory that each image id is joined onto. When ``None``
            the notebook-level ``sample_images_dir`` global is used, which is
            what the original two-argument call relied on.

    Returns:
        A tuple ``(captions_per_image, sample_images_paths)`` of two parallel
        lists: the captions belonging to each selected image, and the path of
        that image.
    """
    if images_dir is None:
        # Backward compatibility with callers that only set the global.
        images_dir = sample_images_dir

    # Read as UTF-8 explicitly: the locale default encoding is not guaranteed
    # to decode non-ASCII (e.g. Arabic) captions.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Single pass over the annotations. A dict keeps insertion order, so the
    # selection below is deterministic, unlike slicing a `set`, whose order
    # for strings can change between interpreter runs.
    captions_by_image = {}
    for item in data:
        captions_by_image.setdefault(item['image_id'], []).append(item['caption'])

    selected_image_ids = list(captions_by_image)[:n]
    captions_per_image = [captions_by_image[image_id] for image_id in selected_image_ids]
    sample_images_paths = [os.path.join(images_dir, image_id) for image_id in selected_image_ids]

    return captions_per_image, sample_images_paths
    
    

def generate_caption(image_path, model, preprocess, clip_model, tokenizer, prefix_length, lang, device):
    """Caption a single image with the CLIP-prefix GPT model.

    The image is read from ``image_path``, run through ``preprocess`` and
    encoded by ``clip_model``; the float32 features are projected with
    ``model.clip_project`` into ``prefix_length`` prefix embeddings, which are
    decoded by ``beam_search`` (imported from ``inference_gpt``) with
    ``entry_length=10``.

    ``lang`` is accepted for call-site compatibility but is not used here.

    Returns:
        Whatever ``beam_search`` returns — presumably the caption string;
        confirm against ``inference_gpt``.
    """
    raw_pixels = io.imread(image_path)
    # Add a leading batch dimension of 1 before moving the input to the device.
    clip_input = preprocess(PIL.Image.fromarray(raw_pixels)).unsqueeze(0).to(device)

    with torch.no_grad():
        image_features = clip_model.encode_image(clip_input).to(device, dtype=torch.float32)
        projected = model.clip_project(image_features)
        prefix_embed = projected.reshape(1, prefix_length, -1)
        caption = beam_search(model, tokenizer, embed=prefix_embed, entry_length=10)

    return caption


# Evaluation inputs: annotation file, checkpoint to score, and image directory.
# `sample_images_dir` has to be bound before `prepare_data_for_bleu` is called,
# because that function falls back to it as a global.
file_path = './data/annotations/arabic_captions.json'
model_path = './checkpoints/arabic_exp_1-029.pt'
sample_images_dir = './data/images/'

# The caption language is inferred from the checkpoint file name. 'arabic' is
# tested first so that a path containing both names still resolves to 'arabic'.
# An unrecognised name fails here with a clear message instead of leaving
# `lang` / `tokenizer` unbound (or silently reusing values from an earlier
# kernel run).
if 'arabic' in model_path:
    lang = 'arabic'
elif 'english' in model_path:
    lang = 'english'
else:
    raise ValueError(
        f"Cannot infer the caption language from model_path {model_path!r}: "
        "expected the path to contain 'english' or 'arabic'."
    )

sample_image_captions, sample_images_paths = prepare_data_for_bleu(file_path)


# Load the CLIP model
device = 'cuda' if torch.cuda.is_available() else "cpu"
# NOTE: this rebinds the name `preprocess` to the value returned by clip.load;
# a module imported under the same name in another cell is shadowed afterwards.
clip_model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

# Load the GPT model Tokenizer
if lang == 'arabic':
    tokenizer = AutoTokenizer.from_pretrained("akhooli/gpt2-small-arabic")
elif lang == 'english':
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load the GPT model
model, prefix_length = load_model(model_path)
model.eval()
model = model.to(device)

In [None]:
# Build the corpus-level BLEU inputs: one space-split hypothesis per image and,
# in the same position, the list of space-split reference captions for it.
candidates, references = [], []
for idx, image_path in enumerate(tqdm(sample_images_paths)):
    prediction = generate_caption(
        image_path, model, preprocess, clip_model, tokenizer, prefix_length, lang, device
    )
    candidates.append(prediction.split(' '))
    references.append([ref.split(' ') for ref in sample_image_captions[idx]])

# corpus_bleu's result is scaled by 100 before printing.
score = 100 * corpus_bleu(references, candidates)
print(f'The BLEU score is {score}')

In [None]:
# Preview the first few reference sets only; displaying the whole list dumps
# every image's tokenised captions into the cell output.
references[:5]

In [None]:
# Preview the first few generated captions only; displaying the whole list
# floods the cell output.
candidates[:5]

In [None]:
import json

# Reload the raw annotation records for ad-hoc inspection.
file_path = './data/annotations/arabic_captions.json'
# Read as UTF-8 explicitly: the locale default encoding is not guaranteed to
# decode the non-ASCII captions in this file.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flat lists with one entry per annotation record.
# NOTE: `sample_image_captions` is also the name of the per-image grouping
# returned by prepare_data_for_bleu in the setup cell. Running this cell
# overwrites it, so re-run the setup cell before re-running the BLEU loop.
sample_image_captions = [item['caption'] for item in data]
sample_image_ids = [item['image_id'] for item in data]