In [1]:
from transformers import (
    VisionEncoderDecoderModel, 
    ViTFeatureExtractor, 
    PreTrainedTokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    default_data_collator,
)
import torch
import numpy as np
from PIL import Image
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torchvision import transforms
import json
from sklearn.model_selection import train_test_split
from datasets import load_metric
from nltk.translate.bleu_score import corpus_bleu

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# gpu 사용
decoder_model_name_or_path = "skt/kogpt2-base-v2"
#decoder_model_name_or_path = 'klue/bert-base'
encoder_model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_model_name_or_path)
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_model_name_or_path, decoder_model_name_or_path)
# encoder, extractor -> vit
model.to(device)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['transformer.h.10.ln_cross_attn.weight', 'transformer.h.2.crossattention.c_proj.weight', 'transformer.h.0.crossattention.bias', 'transformer.h.5.crossattention.masked_bias', 'transformer.h.10.crossattention.c_proj.bias', 'transformer.h.6.crossattention.c_attn.weight', 'transformer.h.4.crossattention.c_proj.bias', 'transformer.h.7.crossattention.c_proj.weight', 'transformer.h.9.ln_cross_attn.weight', 'transformer.h.4.crossattention.bias', 'transformer.h.11.ln_cross_attn.weight', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.5.crossattention.c_proj.weight', 'transformer.h.7.crossattention.q_attn.weight', 'transformer.h.8.ln_cross_attn.weight', 'transformer.h.3.crossattention.q_attn.weight', 'transformer.h.3.crossattention.c_attn.weight', 'transformer.h.7.crossattention.c_proj.bias', 'transformer.h.11.crossattention.bias', 'transformer.h.3.cross

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): PatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out

In [3]:
tokenizer = PreTrainedTokenizerFast.from_pretrained(decoder_model_name_or_path, bos_token='<s>', 
                                                    eos_token='</s>', unk_token='<unk>',pad_token='<pad>', mask_token='<mask>')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [2]:
with open('./data/MSCOCO_train_val_Korean.json', 'r') as f:
    coco = json.load(f)

with open('./data/dataset_coco_kor.json', 'r') as f:
    coco_split = json.load(f)

In [10]:
coco[7]

{'file_path': 'val2014/COCO_val2014_000000060623.jpg',
 'captions': ['A young girl inhales with the intent of blowing out a candle. ',
  'A young girl is preparing to blow out her candle.',
  'A kid is to blow out the single candle in a bowl of birthday goodness. ',
  'Girl blowing out the candle on an ice-cream ',
  'A little girl is getting ready to blow out a candle on a small dessert.'],
 'id': 60623,
 'caption_ko': ['한 어린 소녀가 촛불을 불어 끌 의도를 갖고 있다.',
  '한 어린 소녀가 촛불을 끌 준비를 하고 있다.',
  '어린 아이는 착한 일이 있는 그릇에 들어 있는 촛불을 불어 끄는 것이다.',
  '아이스크림에 촛불을 불어 끄는 소녀',
  '어린 소녀가 작은 디저트 위에 촛불을 불어서 끌 준비를 하고 있다.']}

In [5]:
img_path = []
data_id = []
total_caption_lst = []
split_type = []
data_path = './data/'
for i in range(len(coco)):
    # 캡션 5개 미만이면 추가하지 않음
    if len(coco[i]['caption_ko']) < 5:
        continue
    if coco[i]['id'] != coco_split['images'][i]['cocoid']:
        continue
    # img path 추가
    img_path.append(data_path + coco[i]['file_path'])
    data_id.append(coco[i]['id'])
    split_type.append(coco_split['images'][i]['split'])

    # img path와 매칭되는 caption 5개 추가
    caption_lst = []
    for j in range(5):
        caption_lst.append(coco[i]['caption_ko'][j])
    total_caption_lst.append(caption_lst)

In [7]:
coco_df = pd.DataFrame(data={
    'id' : data_id,
    'labels': total_caption_lst,
    'img_paths': img_path,
    'type': split_type
})

In [8]:
coco_train = coco_df[coco_df['type'] == 'train']
coco_restval = coco_df[coco_df['type'] == 'restval']
train_df = pd.concat([coco_train,coco_restval],ignore_index=True)
train_df = train_df.reset_index()
val_df = coco_df[coco_df['type'] == 'val'].reset_index()

In [9]:
def get_pixel_values_and_tokenized_labels(df, tokenizer):
    # 이미지 캐싱
    img_lst = []
    for i in tqdm(range(len(df)),'img_cache'):
        image = Image.open(df['img_paths'][i]).convert("RGB")
        image_tensor = np.array(image)
        pixel_values = feature_extractor(image_tensor, return_tensors="pt").pixel_values
        img_lst.append(pixel_values)
    
    # 캐싱된 이미지의 인덱스에 맞추어서 label들을 리스트에 넣고 tokenizing을 해줌
    # [iamge1, image2, image3, ... image1, image2, image3 ...]
    # [label1, label2, label3, ... label1, label2, label3 ...]
    labels = []
    for i in range(len(df)):
        labels.append(tokenizer(df['labels'][i],max_length=32 ,return_tensors="pt",padding="max_length" , truncation=True).input_ids)
    return img_lst, labels

In [10]:
class COCODataset(Dataset):
    def __init__(self, img_lst, labels) -> None:
        super().__init__()
        self.img_lst = img_lst
        self.labels = labels
    
    def __len__(self):
        return len(self.img_lst)
    def __getitem__(self, index):
        item = {
            "pixel_values": self.img_lst[index].squeeze(),
            "labels": self.labels[index],
        }
        return item

In [11]:
train_pixel, train_labels = get_pixel_values_and_tokenized_labels(df = train_df,tokenizer = tokenizer)
valid_pixel, valid_labels = get_pixel_values_and_tokenized_labels(df = val_df,tokenizer = tokenizer)

  tensor = as_tensor(value)
img_cache: 100%|██████████| 98629/98629 [1:38:30<00:00, 16.69it/s]
img extend: 100%|██████████| 5/5 [00:00<00:00, 392.45it/s]
tokenizing: 100%|██████████| 5/5 [00:03<00:00,  1.27it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
img_cache: 100%|██████████| 24658/24658 [25:16<00:00, 16.26it/s]
img extend: 100%|██████████| 5/5 [00:00<00:00, 1062.93it/s]
tokenizing: 100%|██████████| 5/5 [00:00<00:00,  7.04it/s]


In [12]:
train_dataset = COCODataset(train_pixel,train_labels)
valid_dataset = COCODataset(valid_pixel,valid_labels)

In [13]:
model.config.decoder_start_token_id = 0
model.config.pad_token_id = 3
model.config.vocab_size = model.config.decoder.vocab_size

In [14]:
def compute_metrics(pred):
  """ validation을 위한 metrics function """
  labels = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
  preds = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)


  # labels -> [sen1, sen2, sen3 ...]
  # list_of_references -> [[sen1],[sen2],[sen3]...]
  list_of_references = []
  for i in range(len(labels)):
    list_of_references.append([labels[i]])
      
  # calculate blue4
  blue4 = corpus_bleu(list_of_references= list_of_references, hypotheses = preds)

  return {
      'blue4': blue4
  }

In [17]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps = 8,
    overwrite_output_dir=True,
    fp16=True,
    load_best_model_at_end=True,
    output_dir='finetuned',
    logging_steps=500,
    save_steps=500,
    eval_steps=500,
    num_train_epochs=5,
    save_total_limit=2
)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=feature_extractor,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics = compute_metrics,
    data_collator=default_data_collator,
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp fp16 backend
***** Running training *****
  Num examples = 493145
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 8
  Total optimization steps = 4815


Step,Training Loss,Validation Loss,Blue4
500,0.5311,0.507908,0.242063
1000,0.4752,0.485473,0.244087
1500,0.4445,0.474927,0.24965
2000,0.4321,0.469131,0.251926
2500,0.4136,0.464637,0.252627
3000,0.4068,0.463035,0.255133
3500,0.3929,0.460689,0.257245
4000,0.3877,0.460486,0.254683
4500,0.3779,0.459354,0.255253


***** Running Evaluation *****
  Num examples = 123290
  Batch size = 64
Saving model checkpoint to finetuned/checkpoint-500
Configuration saved in finetuned/checkpoint-500/config.json
Model weights saved in finetuned/checkpoint-500/pytorch_model.bin
Configuration saved in finetuned/checkpoint-500/preprocessor_config.json
Deleting older checkpoint [finetuned/checkpoint-10] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 123290
  Batch size = 64
Saving model checkpoint to finetuned/checkpoint-1000
Configuration saved in finetuned/checkpoint-1000/config.json
Model weights saved in finetuned/checkpoint-1000/pytorch_model.bin
Configuration saved in finetuned/checkpoint-1000/preprocessor_config.json
Deleting older checkpoint [finetuned/checkpoint-20] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 123290
  Batch size = 64
Saving model checkpoint to finetuned/checkpoint-1500
Configuration saved in finetuned/checkpoint-1500/config.json

In [None]:
model.save_pretrained('./finetuned')