In [1]:
from transformers import (
    VisionEncoderDecoderModel, 
    ViTFeatureExtractor, 
    PreTrainedTokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    default_data_collator,
    AutoConfig,
    EncoderDecoderConfig,
    VisionEncoderDecoderConfig
)
import torch
import numpy as np
from PIL import Image
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torchvision import transforms
import json
from sklearn.model_selection import train_test_split
from datasets import load_metric
from nltk.translate.bleu_score import corpus_bleu
from transformers import AdamW
import wandb

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Decoder checkpoint. The kogpt2 line below is a commented-out alternative.
#decoder_model_name_or_path = "skt/kogpt2-base-v2"
decoder_model_name_or_path = "skt/ko-gpt-trinity-1.2B-v0.5"
# Encoder checkpoint (ViT).
encoder_model_name_or_path = 'google/vit-base-patch16-224-in21k'
#config = AutoConfig.from_pretrained(decoder_model_name_or_path)

# config_decoder = AutoConfig.from_pretrained(decoder_model_name_or_path)
# config_encoder = AutoConfig.from_pretrained(encoder_model_name_or_path)

# config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)


#decoder_model_name_or_path = 'klue/bert-base'

# The feature extractor comes from the encoder checkpoint; the model combines
# the pretrained encoder checkpoint with the pretrained decoder checkpoint.
feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_model_name_or_path)
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_model_name_or_path, decoder_model_name_or_path)
#model = VisionEncoderDecoderModel(config=config)

# Both the encoder and the feature extractor are ViT.
# Move the model to the selected device (as the last expression of the cell,
# this also displays the model).
model.to(device)

In [3]:
# Load the tokenizer of the decoder checkpoint, passing the special tokens explicitly.
# NOTE(review): the recorded output of this cell warns that the checkpoint declares
# 'GPT2Tokenizer' while it is loaded through PreTrainedTokenizerFast — confirm that
# the resulting tokenization is the intended one.
tokenizer = PreTrainedTokenizerFast.from_pretrained(decoder_model_name_or_path, bos_token='<s>', 
                                                    eos_token='</s>', unk_token='<unk>',pad_token='<pad>', mask_token='<mask>')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [4]:
# Caption file (each entry carries 'id', 'file_path' and 'caption_ko').
# The encoding is given explicitly: the file holds Korean text, and the platform
# default encoding used by open() is not UTF-8 everywhere.
with open('./data/MSCOCO_train_val_Korean.json', 'r', encoding='utf-8') as f:
    coco = json.load(f)

# Split file (its 'images' entries carry 'cocoid' and 'split'), read the same way.
with open('./data/dataset_coco_kor.json', 'r', encoding='utf-8') as f:
    coco_split = json.load(f)

In [5]:
# Build four parallel lists, one entry per usable image:
# image path, COCO id, its five captions, and the split it belongs to.
img_path = []
data_id = []
total_caption_lst = []
split_type = []
data_path = './data/'

# Number of captions kept per image.
captions_per_image = 5

# Look the split up by COCO id instead of by list position: the two files are
# then no longer required to have the same length and ordering (a positional
# lookup raises IndexError on a shorter split file and silently drops every
# image whose position differs).
split_by_id = {image['cocoid']: image['split'] for image in coco_split['images']}

for entry in coco:
    captions = entry['caption_ko']
    # Do not add images that have fewer than five captions.
    if len(captions) < captions_per_image:
        continue
    # Do not add images for which the split file has no entry.
    if entry['id'] not in split_by_id:
        continue

    # Image path, id and split.
    img_path.append(data_path + entry['file_path'])
    data_id.append(entry['id'])
    split_type.append(split_by_id[entry['id']])

    # The first five captions that belong to this image.
    total_caption_lst.append(list(captions[:captions_per_image]))

In [6]:
# One row per image: COCO id, list of captions, image path and split name.
coco_df = pd.DataFrame(
    dict(
        id=data_id,
        labels=total_caption_lst,
        img_paths=img_path,
        type=split_type,
    )
)

In [7]:
# Spot check: show the row(s) whose image path equals this file.
coco_df.loc[coco_df['img_paths'].eq('./data/val2014/COCO_val2014_000000531234.jpg')]

Unnamed: 0,id,labels,img_paths,type
7306,531234,"[차고에 매달려 오토바이를 타고 있는 한 남자, 모자로 오토바이를 고치는 남자, 오...",./data/val2014/COCO_val2014_000000531234.jpg,restval


In [8]:
# Preview the first rows of the assembled table.
coco_df.head()

Unnamed: 0,id,labels,img_paths,type
0,391895,"[빨간 헬멧을 쓴 남자가 작은 모터 달린 비포장 도로를 달려 있다., 시골의 비포장...",./data/val2014/COCO_val2014_000000391895.jpg,test
1,522418,"[케이크를 자르고 있는 머리에 그물을 두른 여자, 큰 하얀 시트 케이크를 자르고 있...",./data/val2014/COCO_val2014_000000522418.jpg,restval
2,184613,"[꽃이 핀 우산을 들고 북을 치고 있는 아이., 한 젊은 남자가 소떼 옆에 우산을 ...",./data/val2014/COCO_val2014_000000184613.jpg,val
3,318219,"[컴퓨터 키보드 앞에 서 있는 어린 소년, 헤드폰을 끼고 컴퓨터 모니터를 보고 있는...",./data/val2014/COCO_val2014_000000318219.jpg,restval
4,554625,"[긴 컴퓨터 한줄에 헤드폰을 끼고 있는 소년, 이어폰을 끼고 무언가를 듣는 어린 소...",./data/val2014/COCO_val2014_000000554625.jpg,restval


In [9]:
# Training data: the 'train' rows followed by the 'restval' rows, re-indexed from 0.
coco_train = coco_df.loc[coco_df['type'].eq('train')]
coco_restval = coco_df.loc[coco_df['type'].eq('restval')]
train_df = pd.concat((coco_train, coco_restval), ignore_index=True)
# Uncomment to work on the first 100 rows only.
#train_df = train_df.iloc[:100]

In [10]:
# Validation data: the 'val' rows, re-indexed from 0.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column, so valid_df has exactly the same columns as train_df.
valid_df = coco_df[coco_df['type'] == 'val'].reset_index(drop=True)
# Uncomment to work on the first 100 rows only.
#valid_df = valid_df.iloc[:100]

In [11]:
def get_pixel_values_and_tokenized_labels(df, tokenizer, max_length=32, extractor=None):
    """Pre-compute the model inputs for every row of ``df``.

    Args:
        df: DataFrame with an ``img_paths`` column (image file paths) and a
            ``labels`` column (the list of captions of each image).
        tokenizer: Tokenizer called on the captions of each row.
        max_length: Length every caption is padded / truncated to.
        extractor: Image feature extractor. When omitted, the notebook-level
            ``feature_extractor`` is used.

    Returns:
        ``(img_lst, labels)``: two lists aligned with the rows of ``df``.
        ``img_lst[k]`` is the ``pixel_values`` of image ``k`` and ``labels[k]``
        the ``input_ids`` of its captions.
    """
    if extractor is None:
        extractor = feature_extractor

    # Read both columns positionally, so the function does not depend on df
    # carrying a 0..n-1 index (df[col][i] is a label lookup and fails on a
    # filtered frame that was not re-indexed).
    paths = df['img_paths'].tolist()
    captions_per_row = df['labels'].tolist()

    # Image cache.
    img_lst = []
    for path in tqdm(paths, 'img_cache'):
        # The context manager closes the image file as soon as the pixels
        # have been copied into the array.
        with Image.open(path) as image:
            image_tensor = np.array(image.convert("RGB"))
        pixel_values = extractor(image_tensor, return_tensors="pt").pixel_values
        img_lst.append(pixel_values)

    # Tokenize the captions in the same row order as the cached images:
    # [image1, image2, image3, ...]
    # [label1, label2, label3, ...]
    labels = [
        tokenizer(captions, max_length=max_length, return_tensors="pt",
                  padding="max_length", truncation=True).input_ids
        for captions in captions_per_row
    ]
    return img_lst, labels

In [12]:
# Pre-compute pixel values and tokenized captions for both splits.
# Every image is read and processed here; in the recorded run the training
# split took about one hour and the validation split about three minutes.
train_img, train_labels = get_pixel_values_and_tokenized_labels(train_df,tokenizer)
valid_img, valid_labels = get_pixel_values_and_tokenized_labels(valid_df,tokenizer)

  tensor = as_tensor(value)
img_cache: 100%|██████████| 113287/113287 [1:02:58<00:00, 29.99it/s]
img_cache: 100%|██████████| 5000/5000 [02:53<00:00, 28.75it/s]


In [13]:
class COCODataset(Dataset):
    """Map-style dataset over pre-computed image tensors and tokenized captions.

    Args:
        img_lst: Sequence of image tensors, each with a leading batch axis of size 1.
        labels: Sequence of label tensors, aligned with ``img_lst``.
    """

    def __init__(self, img_lst, labels) -> None:
        super().__init__()
        self.img_lst = img_lst
        self.labels = labels

    def __len__(self):
        return len(self.img_lst)

    def __getitem__(self, index):
        """Return ``{"pixel_values", "labels"}`` for the sample at ``index``."""
        # Remove only the leading batch axis. A bare squeeze() would also
        # remove any other axis that happens to have size 1 and silently
        # change the rank of the sample.
        return {
            "pixel_values": self.img_lst[index].squeeze(0),
            "labels": self.labels[index],
        }

In [14]:
def validate(pred,labels,batch_size):
    """Decode generated ids and reference ids to text for validation metrics.

    Returns ``(preds, total_labels)``: the decoded predictions, and for each of
    the first ``batch_size`` entries of ``labels`` the list of its decoded
    references. Special tokens are skipped in both.
    """
    decode = tokenizer.batch_decode
    decoded_preds = decode(pred, skip_special_tokens=True)
    decoded_refs = [
        decode(labels[sample_idx], skip_special_tokens=True)
        for sample_idx in range(batch_size)
    ]
    return decoded_preds, decoded_refs

In [15]:
# Wrap the pre-computed tensors of each split in a Dataset.
train_dataset = COCODataset(train_img, train_labels)
valid_dataset = COCODataset(valid_img, valid_labels)

In [16]:
batch_size = 4
# Training batches are shuffled; the last incomplete batch is dropped.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
# Validation is not shuffled: shuffling brings nothing to evaluation and only
# makes the order of the evaluated samples differ from run to run.
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [17]:
# wandb.init(
#         project="image_caption",
#         entity="chungye-mountain-sherpa",
#         name="check_belu4",
#         group="vit-gpt2",
#     )

In [18]:
# Take the special-token ids from the tokenizer instead of hard-coding them, so
# that they always agree with the ids used when the captions were tokenized.
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

In [19]:
optim = AdamW(model.parameters(), lr=5e-5)
best_score = -1
for epoch in range(5):
    # Each image has several captions; this epoch trains on the caption at
    # this position for every image.
    caption_idx = epoch % 5

    model.train()
    for batch in tqdm(train_loader):
        optim.zero_grad()

        # batch['labels'] is indexed as (sample, caption, token); pick one
        # caption per sample. contiguous() makes the slice a standalone tensor.
        one_labels = batch['labels'][:, caption_idx].contiguous()

        outputs = model(pixel_values=batch['pixel_values'].to(device), labels=one_labels.to(device))
        loss = outputs.loss
        loss.backward()
        optim.step()

    model.eval()
    with torch.no_grad():
        all_preds = []
        all_labels = []
        for batch in tqdm(valid_loader):
            batch_pixel_values, batch_labels = batch['pixel_values'], batch['labels']

            outputs = model.generate(batch_pixel_values.to(device))
            #print(outputs)
            string_pred, string_labels = validate(outputs, batch_labels, batch_labels.shape[0])

            # corpus_bleu expects token sequences. Given raw strings it iterates
            # over characters and scores character n-grams, so the sentences
            # are split on whitespace first.
            all_preds.extend(pred.split() for pred in string_pred)
            all_labels.extend([ref.split() for ref in refs] for refs in string_labels)

        belu4 = corpus_bleu(list_of_references=all_labels, hypotheses=all_preds)
        if belu4 > best_score:
            # NOTE(review): the recorded output of this cell ends with
            # "TypeError: Object of type VisionEncoderDecoderConfig is not JSON
            # serializable", presumably raised by this save — cause not yet
            # identified, confirm before a long run.
            model.save_pretrained('./finetuned')
            best_score = belu4
        print(belu4)
        #wandb.log({'belu4':belu4})
#wandb.finish()
#model.save_pretrained('./finetuned')
        

100%|██████████| 28321/28321 [3:25:28<00:00,  2.30it/s]  
100%|██████████| 1250/1250 [18:18<00:00,  1.14it/s]


TypeError: Object of type VisionEncoderDecoderConfig is not JSON serializable

In [None]:
# Save the tokenizer and the feature extractor into the same './finetuned'
# directory that the training loop saves the model to.
tokenizer.save_pretrained('./finetuned')
feature_extractor.save_pretrained('./finetuned')