In [None]:
%pip install transformers
%pip install torch
%pip install torchvision

In [1]:
import pandas as pd
import numpy as np

from PIL import Image

import torch
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from sentence_transformers import SentenceTransformer, util

from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


Dataset Path

In [4]:
# Root folder of the training split. Later cells build file names by plain
# string concatenation (PATH + 'file'), so the trailing slash is required.
PATH = (
    "../../semeval-2023-task-1-V-WSD-train-v1/train_v1/"
)

In [5]:
# Tab-separated file without a header row; read_table uses '\t' as its default separator.
data = pd.read_table(PATH+'train.data.v1.txt', header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,moorhen,moorhen swamphen,image.3.jpg,image.8.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.0.jpg,image.5.jpg,image.6.jpg,image.7.jpg,image.9.jpg
1,serinus,serinus genus,image.3.jpg,image.23.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.20.jpg,image.5.jpg,image.24.jpg,image.22.jpg,image.21.jpg
2,pegmatite,pegmatite igneous,image.41.jpg,image.39.jpg,image.42.jpg,image.43.jpg,image.40.jpg,image.44.jpg,image.37.jpg,image.38.jpg,image.36.jpg,image.35.jpg
3,bangalores,bangalores torpedo,image.58.jpg,image.59.jpg,image.64.jpg,image.57.jpg,image.55.jpg,image.56.jpg,image.62.jpg,image.63.jpg,image.61.jpg,image.60.jpg
4,bonxie,bonxie skua,image.3.jpg,image.77.jpg,image.78.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.5.jpg,image.79.jpg,image.76.jpg,image.75.jpg


In [6]:
# Gold answers file, also tab-separated and header-less.
keys = pd.read_table(PATH+'train.gold.v1.txt', header=None)
keys.head()

Unnamed: 0,0
0,image.0.jpg
1,image.20.jpg
2,image.35.jpg
3,image.55.jpg
4,image.75.jpg


In [7]:
# Put the gold answers next to the candidate data (column-wise concat), then
# name the 13 resulting columns: keyword, context, img1..img10, gold_key.
df = pd.concat([data, keys], axis=1)
df.columns = ['keyword', 'context'] + [f'img{i}' for i in range(1, 11)] + ['gold_key']
df.head()

Unnamed: 0,keyword,context,img1,img2,img3,img4,img5,img6,img7,img8,img9,img10,gold_key
0,moorhen,moorhen swamphen,image.3.jpg,image.8.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.0.jpg,image.5.jpg,image.6.jpg,image.7.jpg,image.9.jpg,image.0.jpg
1,serinus,serinus genus,image.3.jpg,image.23.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.20.jpg,image.5.jpg,image.24.jpg,image.22.jpg,image.21.jpg,image.20.jpg
2,pegmatite,pegmatite igneous,image.41.jpg,image.39.jpg,image.42.jpg,image.43.jpg,image.40.jpg,image.44.jpg,image.37.jpg,image.38.jpg,image.36.jpg,image.35.jpg,image.35.jpg
3,bangalores,bangalores torpedo,image.58.jpg,image.59.jpg,image.64.jpg,image.57.jpg,image.55.jpg,image.56.jpg,image.62.jpg,image.63.jpg,image.61.jpg,image.60.jpg,image.55.jpg
4,bonxie,bonxie skua,image.3.jpg,image.77.jpg,image.78.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.5.jpg,image.79.jpg,image.76.jpg,image.75.jpg,image.75.jpg


In [8]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [9]:
# Settings passed to model.generate() when captioning images.
max_length = 16
num_beams = 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

#### Generate a description for each image

In [10]:
# Captioning model, feature extractor and tokenizer, keyed by (checkpoint, device).
# Filled on first use so repeated calls do not reload the checkpoint every time.
_CAPTIONING_COMPONENTS = {}


def _get_captioning_components(checkpoint="nlpconnect/vit-gpt2-image-captioning"):
    """Return the (model, feature_extractor, tokenizer) triple, loading it once."""
    cache_key = (checkpoint, str(device))
    if cache_key not in _CAPTIONING_COMPONENTS:
        model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
        model.to(device)
        feature_extractor = ViTFeatureExtractor.from_pretrained(checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        _CAPTIONING_COMPONENTS[cache_key] = (model, feature_extractor, tokenizer)
    return _CAPTIONING_COMPONENTS[cache_key]


def get_images_description(image_path, image_names):
    """Generate one caption per image with the vit-gpt2 captioning checkpoint.

    Parameters
    ----------
    image_path : str
        Prefix prepended to every file name by plain string concatenation,
        so a directory must include its trailing separator.
    image_names : sequence of str
        File names of the images to caption.

    Returns
    -------
    list of str
        The decoded, whitespace-stripped captions, in the order of
        ``image_names``. An empty input yields an empty list.

    Notes
    -----
    Reads the module-level ``device`` and ``gen_kwargs``. The model, feature
    extractor and tokenizer are loaded on the first call and reused afterwards.
    """
    if len(image_names) == 0:
        return []

    images = list()
    for img in image_names:
        # The context manager releases the file handle right away; convert()
        # returns a fully loaded RGB image that stays usable after the file closes.
        with Image.open(image_path + img) as image:
            images.append(image.convert("RGB"))

    model, feature_extractor, tokenizer = _get_captioning_components()

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]

    # Drop the per-call tensors; the cached model itself is kept for reuse.
    del pixel_values
    del output_ids

    torch.cuda.empty_cache()

    return preds

#### Measure the similarity between each description and the context

In [11]:
# Sentence-embedding models keyed by name, filled on first use so the model is
# not rebuilt for every sentence pair.
_SENTENCE_MODELS = {}


def _get_sentence_model(model_name='paraphrase-MiniLM-L6-v2'):
    """Return the SentenceTransformer for ``model_name``, creating it once."""
    if model_name not in _SENTENCE_MODELS:
        _SENTENCE_MODELS[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODELS[model_name]


def get_sentence_cosine_similarity(sentence1, sentence2):
    """Return the cosine similarity between the embeddings of two sentences.

    Both sentences are encoded with the 'paraphrase-MiniLM-L6-v2' model. The
    score is extracted with ``.item()``, so each argument must produce a single
    embedding (one sentence each); the result is a plain Python number.
    """
    sentence_model = _get_sentence_model()

    embeddings_1 = sentence_model.encode(sentence1, convert_to_tensor=True)
    embeddings_2 = sentence_model.encode(sentence2, convert_to_tensor=True)

    return util.pytorch_cos_sim(embeddings_1, embeddings_2).item()

In [13]:
# Evaluate on the half-open row range [start_row, end_row).
start_row = 200
end_row = 205

answers = []
for row in range(start_row, end_row):
    record = df.iloc[row]

    # The 10 candidate images of this row and one generated caption for each.
    image_names = [record[f'img{j+1}'] for j in range(10)]
    descriptions = get_images_description(PATH+"train_images_v1/", image_names)

    # Cosine similarity between the row's context and every caption.
    score_list = [
        get_sentence_cosine_similarity(record['context'], sentence)
        for sentence in descriptions
    ]

    # Predict the image whose caption scores highest.
    index = np.argmax(score_list)
    answers.append(image_names[index])
    torch.cuda.empty_cache()

accuracy_score(df['gold_key'].iloc[start_row:end_row].tolist(), answers)

0.0