In [43]:
# Owner tag for this data split; it is interpolated into the input and output
# CSV file names used throughout the notebook.
NAME = "Aung"

In [44]:
# Mount Google Drive at /content/drive; every data path in this notebook
# points below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%pip install transformers
%pip install sentence-transformers

In [46]:
import pandas as pd
import numpy as np

from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
Image.MAX_IMAGE_PIXELS = None

import torch
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from sentence_transformers import SentenceTransformer, util

from sklearn.metrics import accuracy_score

import os
import io
import glob
import zipfile
import tarfile

In [47]:
# Folder with the per-person split CSVs. The trailing slash is required because
# file names are appended to it by plain string concatenation.
PATH = "/content/drive/MyDrive/SemEval_Data/SplitData/"

In [48]:
# Load the list of unique image names assigned to NAME.
data = pd.read_csv(f"{PATH}unique_image_list_for_{NAME}.csv")
data.head()

Unnamed: 0,image_name
0,image.4841.jpg
1,image.5419.jpg
2,image.2263.jpg
3,image.436.jpg
4,image.1032.jpg


## Model Definition

In [49]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [50]:
# Decoding settings that are unpacked into `model.generate` when captioning.
max_length, num_beams = 16, 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

### Generate a description for each image

In [51]:
# Image archive; individual entries are read from it on demand when captioning.
zipFile = zipfile.ZipFile("/content/drive/MyDrive/semeval-2023-task-1-V-WSD-train-v1.zip")
IMG_PATH = "semeval-2023-task-1-V-WSD-train-v1"
# The first column of `data` holds the image file names.
img_list = data.iloc[:, 0].tolist()
img_list[:5]

['image.4841.jpg',
 'image.5419.jpg',
 'image.2263.jpg',
 'image.436.jpg',
 'image.1032.jpg']

In [52]:
_CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"
# Filled on first use so the checkpoint is loaded once per kernel session
# instead of once per batch.
_caption_components = {}


def _load_caption_components():
    """Return the cached (model, feature_extractor, tokenizer) triple, loading it on first use."""
    if "loaded" not in _caption_components:
        model = VisionEncoderDecoderModel.from_pretrained(_CAPTION_CHECKPOINT)
        model.to(device)
        feature_extractor = ViTFeatureExtractor.from_pretrained(_CAPTION_CHECKPOINT)
        tokenizer = AutoTokenizer.from_pretrained(_CAPTION_CHECKPOINT)
        # Store only after all three loaded, so a failed load is retried on the next call.
        _caption_components["loaded"] = (model, feature_extractor, tokenizer)
    return _caption_components["loaded"]


def get_images_description(image_path, image_names):
    """Caption a batch of images stored in the module-level ``zipFile`` archive.

    Relies on the notebook globals ``zipFile`` (open ``zipfile.ZipFile``),
    ``device`` (torch device) and ``gen_kwargs`` (generation settings).

    Args:
        image_path: Prefix of the archive entries, including the trailing "/".
            It is concatenated directly with each image name.
        image_names: Iterable of image file names inside ``image_path``.

    Returns:
        A list of stripped caption strings, one per image, in input order.
        An empty input yields an empty list without loading the model.

    Raises:
        KeyError: If an entry is missing from the archive (from ``ZipFile.read``).
    """
    image_names = list(image_names)
    if not image_names:
        return []

    images = []
    for name in image_names:
        image = Image.open(io.BytesIO(zipFile.read(image_path + name)))
        # The feature extractor is fed RGB images only; convert anything else.
        if image.mode != "RGB":
            image = image.convert("RGB")
        images.append(image)

    model, feature_extractor, tokenizer = _load_caption_components()

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]

    # Release the per-batch tensors; the model itself stays cached for the next batch.
    del pixel_values
    del output_ids
    torch.cuda.empty_cache()

    return preds

In [53]:
# The trailing separator matters: the description CSV path is built by plain
# string concatenation (description_path + file name). Without it the file is
# written beside the folder as ".../Descriptionsdescription_for_<NAME>.csv".
description_path = "/content/drive/MyDrive/SemEval_Data/Descriptions/"
# exist_ok makes the cell safe to re-run.
os.makedirs(description_path, exist_ok=True)

In [None]:
# Caption every image in `img_list`, `batch_size` images at a time, and save the result.
batch_size = 10
images_dir = "semeval-2023-task-1-V-WSD-train-v1/train_v1/train_images_v1/"
descriptions = list()

# Step by `batch_size` so the final, possibly shorter, batch is processed too.
# Using round(len / batch_size) as the batch count would skip trailing images.
for start in range(0, len(img_list), batch_size):
  end = min(start + batch_size, len(img_list))
  print(f"Working on Data[{start}:{end}]")
  image_names = img_list[start:end]
  desc = get_images_description(images_dir, image_names)
  descriptions.extend(desc)

# Every image must have exactly one caption before the two lists are paired up.
assert len(descriptions) == len(img_list), "caption count does not match image count"

df = pd.DataFrame({"image": img_list, "description": descriptions})
df.to_csv(description_path+f"description_for_{NAME}.csv", index=False)

Working on Data[0:10]
Working on Data[10:20]
Working on Data[20:30]
Working on Data[30:40]
Working on Data[40:50]


### Calculate Cosine Scores

In [None]:
# The trailing separator matters: the results CSV path is built by plain string
# concatenation (result_path + file name).
result_path = "/content/drive/MyDrive/SemEval_Data/Results/"
# Create the *results* folder (not the descriptions folder); exist_ok makes the
# cell safe to re-run.
os.makedirs(result_path, exist_ok=True)

In [None]:
# Reload the saved captions (columns: image, description) from disk.
descriptions = pd.read_csv(f"{description_path}description_for_{NAME}.csv")
descriptions.head()

In [None]:
# Load the split assigned to NAME; note that this rebinds `data`.
data = pd.read_csv(f"{PATH}data_split_for_{NAME}.csv")
data.head()

In [None]:
# Empty frame with one similarity column per candidate image plus the predicted label.
result = pd.DataFrame(columns=[f'img_{i}' for i in range(1, 11)] + ['label'])

In [None]:
# Score each candidate image's caption against the row's context sentence and
# predict the image whose caption is most similar.
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

image_columns = [f'img_{i}' for i in range(1, 11)]

# image name -> caption. Keep the first caption when an image is listed more
# than once, and use a dict so each lookup avoids a scan of the whole frame.
description_by_image = (
    descriptions.drop_duplicates(subset='image')
    .set_index('image')['description']
    .to_dict()
)

# image name -> caption embedding, filled lazily so each caption is encoded
# only once even when the image appears as a candidate in several rows.
description_embeddings = {}


def _description_embedding(image_name):
    """Return the (cached) sentence embedding of the caption for `image_name`."""
    if image_name not in description_embeddings:
        if image_name not in description_by_image:
            raise KeyError(f"No description found for image {image_name!r}")
        description_embeddings[image_name] = sentence_model.encode(
            description_by_image[image_name], convert_to_tensor=True
        )
    return description_embeddings[image_name]


records = []
for idx, row in data.iterrows():
    context_embedding = sentence_model.encode(row['context'], convert_to_tensor=True)
    scores = [
        util.pytorch_cos_sim(context_embedding, _description_embedding(row[col])).item()
        for col in image_columns
    ]

    record = dict(zip(image_columns, scores))
    # The predicted label is the candidate image with the highest similarity.
    record['label'] = row[image_columns[int(np.argmax(scores))]]
    records.append(record)

# Build the frame in one go rather than growing it cell by cell.
result = pd.DataFrame(records, columns=image_columns + ['label'], index=data.index)
result.to_csv(result_path+f"similarities_and_prediction_for_{NAME}.csv", index=False)

### Accuracy

In [None]:
# Fraction of rows whose predicted image matches the gold image.
accuracy_score(y_true=data['gold_key'], y_pred=result['label'])