In [3]:
import torch
import random
import editdistance
from fuzzy import fuzzy
from datasets import load_dataset
from transformers import AutoTokenizer

class DocVQADataset(torch.utils.data.Dataset):
    def __init__(self,split):
        datasets = load_dataset("Trailblazer-Yoo/boostcamp-docvqa-test")
        self.dataset = datasets['test']
        
        try:
          model_checkpoint = "microsoft/layoutlmv2-base-uncased"
          self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        except:
          model_checkpoint = "microsoft/layoutlmv2-base-uncased"
          self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)   

    def encode_dataset(self,example, max_length=512):
      # take a batch 
      questions = example['question']
      words = [w for w in example['words']] #handles numpy and list
      boxes = example['boxes']

      # encode it
      encoding = self.tokenizer([questions], [words], [boxes], max_length=max_length, padding="max_length", truncation=True,return_tensors="pt")
      encoding = {
              'input_ids': encoding['input_ids'],
              'attention_mask': encoding['attention_mask'],
              'token_type_ids': encoding['token_type_ids'],
              'bbox': encoding['bbox'],
              }
      ## 바뀐 부분 example['image'].copy() -> example['image'].copy()[0]
      encoding['image'] = torch.LongTensor(example['image'].copy()[0])

      return encoding

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self,index):
        data = self.dataset[index]
        data = self.encode_dataset(data)

        if data is None:
            #return self.__getitem__((index+1)%len(self))
            index = random.randrange(len(self))
            return self.__getitem__(index)

        return data



In [4]:
def collate(data):
    return {
            'input_ids': torch.cat([d['input_ids'] for d in data],dim=0),
            'attention_mask': torch.cat([d['attention_mask'] for d in data],dim=0),
            'token_type_ids': torch.cat([d['token_type_ids'] for d in data],dim=0),
            'bbox': torch.cat([d['bbox'] for d in data],dim=0),
            'image': torch.stack([d['image'] for d in data],dim=0),
            }

In [5]:
import os
import torch
from transformers import AutoModelForQuestionAnswering

os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = 'cuda'
model = AutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")
checkpoint = torch.load('/opt/ml/docvqa/model.pt')
model.load_state_dict(checkpoint)

Some weights of the model checkpoint at microsoft/layoutlmv2-base-uncased were not used when initializing LayoutLMv2ForQuestionAnswering: ['layoutlmv2.visual.backbone.bottom_up.res4.19.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.15.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.19.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.5.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res5.0.shortcut.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.stem.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res5.2.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res2.1.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.10.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.18.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.1.conv3.norm.num_batches_tracked

<All keys matched successfully>

In [6]:
from transformers import LayoutLMv2Processor
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")

def run (batch, start_logits,end_logits):
    length = len(batch['input_ids'])
    predicts = []
    for i in range(length):
        predicted_start_idx = start_logits[i].argmax(-1).item()
        predicted_end_idx = end_logits[i].argmax(-1).item()
        valid = processor.tokenizer.decode(batch['input_ids'][i][predicted_start_idx:predicted_end_idx+1])
        predicts.append(valid)

    return predicts

In [7]:
import torch
# from coll import collate
test_dataset = DocVQADataset('test')
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=16,collate_fn=collate, shuffle=False)

Using custom data configuration Trailblazer-Yoo--boostcamp-docvqa-test-be900c9a47ce7d4b
Found cached dataset parquet (/opt/ml/.cache/huggingface/datasets/Trailblazer-Yoo___parquet/Trailblazer-Yoo--boostcamp-docvqa-test-be900c9a47ce7d4b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 368.21it/s]


In [9]:
test_dataset[0]

{'input_ids': tensor([[  101,  2054,  2003,  1996, 11443,  4859,  3477,  5833,  1999,  2262,
           1029,   102,  2009,  2278,  3132,  1012,  3189,  1998,  6115,  2262,
           5658,  4276, 21665,  2015,  1008, 21665,  2015,  6187, 16523,  2385,
           1003,  6187, 16523,  2539,  1003,  7261,  2475, 13539,  2581, 11176,
           2487,  2727,  2541,  2294,  2289,  2263,  2268,  2230,  2249,  2262,
           2727,  2541,  2294,  2289,  2263,  2268,  2230,  2249,  2262, 21665,
           2015,  6187, 16523,  2539,  1003,  6187, 16523,  2382,  1003,  2727,
           2541,  2294,  2289,  2263,  2268,  1024,  2230,  2249,  2262,  2727,
           1016,  2294,  2289,  2263,  2268,  2230,  2249,  2262, 18678,  1997,
          11443,  4859,  4353,  4171,  2230,  1011,  2569, 17705, 11443,  4859,
           2249,  1011,  2569, 11443,  4859,  2006, 16919, 12943,  2213, 16565,
           1998, 11443,  4859,  2566,  3745,  1006, 10426,  1005,  1022,  1013,
           2871,  1003,  23

In [11]:
import gc
from tqdm.auto import tqdm
answers = []
model.to(device)
def inference():
    model.eval()
    gc.collect()
    global answers
    
    with torch.no_grad():
        pbar = tqdm(test_dataloader)
        for idx, batch in enumerate(pbar):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            bbox = batch["bbox"].to(device)
            image = batch["image"].to(device)
         
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                       bbox=bbox, image=image, 
                       )
            predicts = run(batch, outputs.start_logits, outputs.end_logits)
            answers.extend(predicts)
    pbar.close()

In [12]:
inference()

100%|██████████| 325/325 [08:51<00:00,  1.64s/it]


In [22]:
import json
with open('/opt/ml/docvqa/data/test/test_v1.0.json') as f:
    data = json.load(f)

In [14]:
datasets = load_dataset("Trailblazer-Yoo/boostcamp-docvqa-test")
datasets

Using custom data configuration Trailblazer-Yoo--boostcamp-docvqa-test-be900c9a47ce7d4b
Found cached dataset parquet (/opt/ml/.cache/huggingface/datasets/Trailblazer-Yoo___parquet/Trailblazer-Yoo--boostcamp-docvqa-test-be900c9a47ce7d4b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 397.98it/s]


DatasetDict({
    test: Dataset({
        features: ['questionId', 'question', 'image', 'docId', 'ucsf_document_id', 'ucsf_document_page_no', 'data_split', 'words', 'boxes'],
        num_rows: 5188
    })
})

In [20]:
datasets['test'][0]['questionId']

57344

In [15]:
len(answers)

5188

In [22]:
submission = []
for i in tqdm(range(len(answers))):
    tmp = {
        'answer':answers[i],
        'questionId':datasets['test'][i]['questionId']
    }
    submission.append(tmp)

100%|██████████| 5188/5188 [06:27<00:00, 13.40it/s]


In [24]:
import json

with open("/opt/ml/docvqa/submission.json", "w") as json_file:
    json.dump(submission, json_file)