In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from PIL import Image

import torch
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

from sklearn.metrics import accuracy_score
from rouge import Rouge

import json
import os

from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.wsd import lesk
# Sanity check of the WordNet corpus: list every synset returned for the
# ambiguous word "bank" next to its dictionary definition.
for synset in wn.synsets('bank'):
    print(synset, synset.definition())

Synset('bank.n.01') sloping land (especially the slope beside a body of water)
Synset('depository_financial_institution.n.01') a financial institution that accepts deposits and channels the money into lending activities
Synset('bank.n.03') a long ridge or pile
Synset('bank.n.04') an arrangement of similar objects in a row or in tiers
Synset('bank.n.05') a supply or stock held in reserve for future use (especially in emergencies)
Synset('bank.n.06') the funds held by a gambling house or the dealer in some gambling games
Synset('bank.n.07') a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
Synset('savings_bank.n.02') a container (usually with a slot in the top) for keeping money at home
Synset('bank.n.09') a building in which the business of banking transacted
Synset('bank.n.10') a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning)
Synset('bank.v.01') tip laterally
Sy

In [3]:
# Folder that holds the trial data file and the trial gold file.
PATH = "/Users/carlosito/Documents/vsCode/vs-DANI/DL-Project/semeval-2023-task-1-V-WSD-train-v1/trial_v1/"

# Both files are read as tab-separated text without a header row, so pandas
# assigns integer column labels that are replaced further down.
data = pd.read_csv(f"{PATH}trial.data.v1.txt", sep='\t', header=None)
keys = pd.read_csv(f"{PATH}trial.gold.v1.txt", sep='\t', header=None)

# Place the gold answers next to the data columns (column-wise concat), then
# give the 13 resulting columns readable names: keyword, context, the ten
# candidate images img1..img10, and the gold image.
df = pd.concat((data, keys), axis="columns")
df.columns = ['keyword', 'context', *(f'img{i}' for i in range(1, 11)), 'gold_key']
df.head()


Unnamed: 0,keyword,context,img1,img2,img3,img4,img5,img6,img7,img8,img9,img10,gold_key
0,andromeda,andromeda tree,image.155.jpg,image.68.jpg,image.9.jpg,image.72.jpg,image.158.jpg,image.86.jpg,image.7.jpg,image.132.jpg,image.36.jpg,image.27.jpg,image.86.jpg
1,angora,angora city,image.5.jpg,image.52.jpg,image.96.jpg,image.70.jpg,image.46.jpg,image.91.jpg,image.76.jpg,image.139.jpg,image.14.jpg,image.115.jpg,image.70.jpg
2,anteater,marsupial anteater,image.147.jpg,image.16.jpg,image.107.jpg,image.135.jpg,image.93.jpg,image.59.jpg,image.88.png,image.131.jpg,image.89.jpg,image.121.jpg,image.107.jpg
3,bank,bank erosion,image.104.jpg,image.64.jpg,image.108.jpg,image.80.jpg,image.21.jpg,image.99.jpg,image.117.jpg,image.146.jpg,image.87.jpg,image.34.jpg,image.64.jpg
4,router,internet router,image.127.jpg,image.0.jpg,image.20.jpg,image.18.jpg,image.112.jpg,image.97.jpg,image.24.jpg,image.1.jpg,image.56.jpg,image.26.jpg,image.18.jpg


In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hub id shared by the captioning model, its image preprocessor and its
# tokenizer, so all three are guaranteed to come from the same checkpoint.
CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(CAPTION_CHECKPOINT)
# The captioning loop sends its inputs to `device`; the model must live on the
# same device or `generate` fails with a device-mismatch error on CUDA machines.
model.to(device)

feature_extractor = ViTFeatureExtractor.from_pretrained(CAPTION_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CAPTION_CHECKPOINT)

# Generation settings forwarded to `model.generate` as keyword arguments.
max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

In [20]:
# Cache of generated captions keyed by image file name; the captioning loop
# fills it and skips any image that already has an entry.
generated_text_dict = dict()

In [27]:
# Caption every candidate image referenced in `df` (columns img1..img10) and
# store the result in `generated_text_dict` as {image file name: caption}.

# Loop-invariant values, defined once instead of on every iteration.
IMAGE_PATH = "/Users/carlosito/Documents/vsCode/vs-DANI/DL-Project/semeval-2023-task-1-V-WSD-train-v1/trial_v1/trial_images_v1/"
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

for row in tqdm(range(len(df))):
    for img in range(1, 11):
        selected_col = df.iloc[row][f'img{img}']

        # The same image file can appear in several rows; caption it only once.
        if selected_col in generated_text_dict:
            continue

        IMAGE_PATH_EACH = IMAGE_PATH + selected_col

        # The context manager closes the file handle opened by `Image.open`.
        # `convert` returns a separate, fully loaded RGB image (a plain copy
        # when the source is already RGB), so it stays usable after the close.
        with Image.open(IMAGE_PATH_EACH) as opened_img:
            plot_img = opened_img.convert(mode="RGB")

        pixel_values = feature_extractor(images=plot_img, return_tensors="pt").pixel_values
        # Put the inputs on whatever device the model weights are on, so the
        # two can never end up on different devices.
        pixel_values = pixel_values.to(model.device)
        output_ids = model.generate(pixel_values, **gen_kwargs)
        pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

        # One image in, one decoded sequence out: keep the single caption.
        generated_text_dict[selected_col] = pred[0]

100%|██████████| 16/16 [02:09<00:00,  8.08s/it]


## Write the generated captions to a JSON file

In [26]:
# Save the caption cache as JSON, indented by four spaces and with keys
# sorted, in a file created in the current working directory.
filename = 'test-image-description.json'
with open(filename, mode='w') as out_file:
    json.dump(obj=generated_text_dict, fp=out_file, indent=4, sort_keys=True)