In [1]:
from transformers import (
    VisionEncoderDecoderModel, 
    ViTFeatureExtractor, 
    PreTrainedTokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    default_data_collator,
)
import requests
import torch
import numpy as np
from PIL import Image
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torchvision import transforms
import json
from sklearn.model_selection import train_test_split
from datasets import load_metric
from nltk.translate.bleu_score import corpus_bleu
import json
import pandas as pd

In [2]:
# Hub checkpoint id shared by the image preprocessor and the tokenizer below.
model_name_or_path = "ddobokki/vit-kogpt_trinity-coco-ko"
#encoder_model_name_or_path = 'google/vit-base-patch16-224-in21k'
# Both preprocessing components are loaded from the same checkpoint.
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name_or_path)

Downloading:   0%|          | 0.00/228 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/109 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/403 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the encoder-decoder captioning model from the checkpoint named by model_name_or_path.
model = VisionEncoderDecoderModel.from_pretrained(model_name_or_path)
# Move the model to the selected device; as the last expression of the cell
# this also echoes the module structure in the cell output.
model.to(device)

Downloading:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/6.02G [00:00<?, ?B/s]

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): PatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out

In [4]:
# Caption annotations. The encoding is given explicitly so the Korean text is
# decoded as UTF-8 regardless of the platform's default locale encoding.
with open('./data/MSCOCO_train_val_Korean.json', 'r', encoding='utf-8') as f:
    coco = json.load(f)

# Split metadata (read below through its 'images' list), also decoded as UTF-8.
with open('./data/dataset_coco_kor.json', 'r', encoding='utf-8') as f:
    coco_split = json.load(f)

In [5]:
# Build four parallel lists (image path, id, split name, five captions),
# one element per COCO entry that passes both filters below.
data_path = './data/'
img_path, data_id, total_caption_lst, split_type = [], [], [], []

for i, entry in enumerate(coco):
    captions = entry['caption_ko']
    # Entries with fewer than five captions are skipped.
    if len(captions) < 5:
        continue
    # Entries whose id differs from the split record at the same position are skipped.
    if entry['id'] != coco_split['images'][i]['cocoid']:
        continue

    # Image path, id and split name for this entry.
    img_path.append(data_path + entry['file_path'])
    data_id.append(entry['id'])
    split_type.append(coco_split['images'][i]['split'])

    # The first five captions that belong to this image.
    total_caption_lst.append([captions[j] for j in range(5)])

In [7]:
#pred_df = pd.read_csv('vit-gpt_tri-pred_labels.csv')

In [None]:
#pred_li = pred_df['pred'].to_list()

In [8]:
# One row per kept image: its id, its list of five captions, its file path
# and the split name taken from the split metadata.
coco_df = pd.DataFrame(data={
    'id' : data_id,
    'labels': total_caption_lst,
    'img_paths': img_path,
    'type': split_type
})

In [9]:
# Keep only the rows whose split is 'test' (despite the "valid" name).
# reset_index() renumbers the rows from 0 and keeps the previous index as an 'index' column.
valid_df = coco_df[coco_df['type'] == 'test'].reset_index()

In [10]:
# Token ids for every row's list of reference captions, in row order.
labels = [tokenizer(captions).input_ids for captions in valid_df['labels']]

In [13]:
#hypo = tokenizer(pred_li).input_ids

In [11]:
#len(hypo)
len(labels)

5000

In [18]:
from nltk.translate.bleu_score import corpus_bleu

In [19]:
#belu4 = corpus_bleu(list_of_references=labels, hypotheses=hypo)

In [20]:
#belu4

0.24534819193938331

In [14]:
# Rebuild the 'test' subset of coco_df with a fresh 0..n-1 index
# (the old index is kept as an 'index' column by reset_index()).
valid_df = coco_df[coco_df['type'] == 'test'].reset_index()

In [15]:
def get_pixel_values_and_tokenized_labels(df, tokenizer):
    """Preprocess every image in ``df`` and tokenize its captions.

    Relies on the notebook-level ``feature_extractor`` for image preprocessing.

    Args:
        df: DataFrame with an ``'img_paths'`` column (image file paths) and a
            ``'labels'`` column (the captions of each image).
        tokenizer: Tokenizer called on each row's captions.

    Returns:
        A tuple ``(img_lst, labels)`` of two lists aligned with the rows of
        ``df``: the ``pixel_values`` produced by the feature extractor, and the
        ``input_ids`` produced by the tokenizer (padded / truncated to 32 tokens).
    """
    # Cache the preprocessed images up front so they are only decoded once.
    img_lst = []
    # Iterating the column (instead of df[col][i]) is positional, so it also
    # works when the frame's index is not 0..n-1.
    for path in tqdm(df['img_paths'], desc='img_cache'):
        # Close the file handle as soon as the RGB copy exists; otherwise one
        # open file per image lingers until garbage collection.
        with Image.open(path) as raw_image:
            image = raw_image.convert("RGB")
        image_tensor = np.array(image)
        pixel_values = feature_extractor(image_tensor, return_tensors="pt").pixel_values
        img_lst.append(pixel_values)

    # Tokenize the captions in the same row order as the cached images:
    # [image1, image2, image3, ...]
    # [label1, label2, label3, ...]
    labels = [
        tokenizer(
            captions,
            max_length=32,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
        ).input_ids
        for captions in df['labels']
    ]
    return img_lst, labels

In [16]:
valid_img, valid_labels = get_pixel_values_and_tokenized_labels(valid_df,tokenizer)

  tensor = as_tensor(value)
img_cache: 100%|██████████| 5000/5000 [02:34<00:00, 32.45it/s]


In [17]:
class COCODataset(Dataset):
    """Map-style dataset pairing cached image tensors with their labels.

    Args:
        img_lst: Sequence of image tensors, one per sample. Presumably each is
            the feature extractor's ``pixel_values`` with a leading batch axis
            of size 1 — TODO confirm against the caller.
        labels: Sequence of label entries aligned index-by-index with ``img_lst``.
    """

    def __init__(self, img_lst, labels) -> None:
        super().__init__()
        self.img_lst = img_lst
        self.labels = labels

    def __len__(self):
        """Return the number of cached images."""
        return len(self.img_lst)

    def __getitem__(self, index):
        """Return a dict with the sample's ``pixel_values`` and ``labels``."""
        return {
            # Drop only the leading axis. A bare squeeze() would also remove
            # any other size-1 axis and silently change the tensor's rank.
            "pixel_values": self.img_lst[index].squeeze(0),
            "labels": self.labels[index],
        }

In [18]:
# Wrap the cached images and tokenized captions in a dataset.
valid_dataset = COCODataset(valid_img, valid_labels)
# shuffle=False keeps the batches in dataset order, so generated captions can
# later be aligned with the rows they were built from.
valid_loader = DataLoader(valid_dataset, batch_size=32 ,shuffle=False)

In [19]:
# Generate one caption per image in valid_loader using beam search with 5 beams.
model.eval()
all_preds = []
with torch.no_grad():
    for batch in tqdm(valid_loader):
        # Only the images are needed for generation; the reference captions in
        # batch['labels'] are not used here.
        batch_pixel_values = batch['pixel_values']
        outputs = model.generate(batch_pixel_values.to(device), num_beams=5)
        # Decode the generated ids to text, dropping special tokens.
        preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        all_preds.extend(preds)



100%|██████████| 157/157 [48:44<00:00, 18.63s/it]


In [20]:
# Split each row's five reference captions into five parallel lists:
# captionK holds the K-th caption of every row, in row order.
caption1, caption2, caption3, caption4, caption5 = [
    [captions[k] for captions in valid_df['labels']]
    for k in range(5)
]

In [21]:
# Table with one row per image: its id, the generated caption, the five
# reference captions and the image path.
# NOTE(review): 'pred' is a plain list; it lines up with the other columns
# only if all_preds was generated in valid_df row order — confirm.
infer = pd.DataFrame(data = {
    'id': valid_df['id'],
    'pred' : all_preds,
    'label1': caption1,
    'label2': caption2,
    'label3': caption3,
    'label4': caption4,
    'label5': caption5,
    'img_path': valid_df['img_paths']
})

In [22]:
# Save the predictions next to their reference captions; index=False leaves the row index out of the file.
infer.to_csv('vit-gpt_trinity-pred_labels(beam5).csv',index=False)

In [63]:
def gen_text(index):
    """Show the image at position ``index`` and return its generated caption.

    Relies on the notebook-level ``img``, ``feature_extractor``, ``model``,
    ``tokenizer`` and ``device``.

    NOTE(review): ``img`` is indexed and passed to ``Image.open``, so it is
    presumably a sequence of image paths — confirm where it is defined before
    calling this on a fresh kernel.

    Args:
        index: Position of the image inside ``img``.

    Returns:
        The list of decoded strings returned by ``tokenizer.batch_decode``.
    """
    # Close the underlying file as soon as the RGB copy has been made.
    with Image.open(img[index]) as raw_image:
        image = raw_image.convert("RGB")
    image_tensor = np.array(image)
    pixel_values = feature_extractor(image_tensor, return_tensors="pt").pixel_values
    image.show()

    # Beam search with 5 beams, then decode without special tokens.
    generated_ids = model.generate(pixel_values.to(device), num_beams=5)
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    return generated_text

In [21]:
import requests
from PIL import Image
from transformers import (
    VisionEncoderDecoderModel, 
    ViTFeatureExtractor, 
    PreTrainedTokenizerFast,
)

# device setting
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load feature extractor and tokenizer
# (this rebinds the notebook-level feature_extractor / tokenizer to a different checkpoint)
encoder_model_name_or_path = "ddobokki/vision-encoder-decoder-vit-gpt2-coco-ko"
feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_model_name_or_path)
tokenizer = PreTrainedTokenizerFast.from_pretrained(encoder_model_name_or_path)

# load model
model = VisionEncoderDecoderModel.from_pretrained(encoder_model_name_or_path)
model.to(device)

# inference
# Download a sample image. The timeout keeps the cell from hanging on a stalled
# connection, raise_for_status() surfaces HTTP errors instead of handing an
# error page to PIL, and the context manager releases the connection.
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    with Image.open(response.raw) as img:
        pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values

# Beam search with 5 beams, then decode without special tokens.
generated_ids = model.generate(pixel_values.to(device),num_beams=5)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

Downloading:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/939M [00:00<?, ?B/s]

  tensor = as_tensor(value)


In [22]:
generated_text

['고양이 두마리가 담요 위에 누워 있다.']

In [None]:
'''
벽돌 건물 앞에 있는 표지판.
나는 그 앞에서 걸음을 멈추고 지도를 펼친다.
표지가 없어져도 길의 끝은 항상 보인다.
그곳에도 길이 있겠지.
그럼 그곳에 나의 발자국이 남겠지.
한걸음 두 걸음 지도 위를 걸어다니다 보면 내가 가본 적도 없는 어느 골목길, 누군가의 무덤이 있던 자리로.
'''

In [None]:
'''
벽돌 건물 앞에 있는 표지판.
문득, 문 닫힌 대문에 오래도록 눈이 머물렀다
새로운 계절에 어울리는 집이라 이름 지었을 것이고
또한, 한 번도 버려져 본 적 없는 낡은 이층집이
세상 모든 곳에서 온기로 전해질 수 있도록,
모든 것을 하얗게
'''