## Environment Setup

In [1]:
!pip install -q gdown numpy pandas scikit-learn pillow opencv-python

In [2]:
!pip install -q torch torchvision protobuf sentencepiece transformers datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolve

In [12]:
# hyper-parameter
epochs = 10
batch_size = 32
lr = 1e-04
seed = 42
early_stop = False
patience = 5
aug_data = 1.0

In [11]:
import random
import numpy as np
import torch

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(seed=42)

## Download the Dataset

In [5]:
!gdown 1amslR99ZakBHnfBkeSeTazO6XhzOTYsc

Downloading...
From (original): https://drive.google.com/uc?id=1amslR99ZakBHnfBkeSeTazO6XhzOTYsc
From (redirected): https://drive.google.com/uc?id=1amslR99ZakBHnfBkeSeTazO6XhzOTYsc&confirm=t&uuid=9ea7924c-78e6-42f1-b5c7-8312350291a6
To: /content/data_a.zip
100% 339M/339M [00:15<00:00, 22.0MB/s]


In [9]:
!unzip data_a.zip -d ./

Archive:  data_a.zip
   creating: ./data/
   creating: ./data/test_images/
  inflating: ./__MACOSX/data/._test_images  
  inflating: ./data/train_doc_info.pkl  
   creating: ./data/printed_images/
  inflating: ./__MACOSX/data/._printed_images  
  inflating: ./data/.DS_Store        
  inflating: ./__MACOSX/data/._.DS_Store  
   creating: ./data/val_images/
  inflating: ./__MACOSX/data/._val_images  
  inflating: ./data/val_doc_info.pkl  
  inflating: ./data/submission.csv   
  inflating: ./__MACOSX/data/._submission.csv  
  inflating: ./data/test_doc_info.pkl  
  inflating: ./data/train_dataframe.csv  
  inflating: ./__MACOSX/data/._train_dataframe.csv  
  inflating: ./data/val_dataframe.csv  
  inflating: ./__MACOSX/data/._val_dataframe.csv  
   creating: ./data/train_images/
  inflating: ./__MACOSX/data/._train_images  
  inflating: ./data/test_printed_dataframe.csv  
  inflating: ./__MACOSX/data/._test_printed_dataframe.csv  
  inflating: ./data/printed_doc_info.pkl  
  inflating: ./

# Data Augmentation

In [10]:
import pickle
import tqdm
import os

data_dir = './data'
noised_dir = './data/aug_images'

doc_info_dir = './data'

with open(f'{doc_info_dir}/train_doc_info.pkl', 'rb') as f:
    train_doc_info = pickle.load(f)
with open(f'{doc_info_dir}/val_doc_info.pkl', 'rb') as f:
    val_doc_info = pickle.load(f)

image_dirs = ['train_images', 'val_images']

def rotate_bbox(bbox, angle):

    # 각도를 라디안으로 변환
    angle_rad = np.deg2rad(angle)

    # 경계 상자의 중심점 계산
    center_x = bbox[0] + bbox[2] / 2
    center_y = bbox[1] + bbox[3] / 2

    # 각 모서리의 원래 좌표 계산
    corners = [
        (bbox[0], bbox[1]),  # 왼쪽 상단 모서리
        (bbox[0] + bbox[2], bbox[1]),  # 오른쪽 상단 모서리
        (bbox[0] + bbox[2], bbox[1] + bbox[3]),  # 오른쪽 하단 모서리
        (bbox[0], bbox[1] + bbox[3])  # 왼쪽 하단 모서리
    ]

    # 각 모서리를 회전시킨 새 좌표 계산
    new_corners = [(
            center_x + (corner[0] - center_x) * np.cos(angle_rad) - (corner[1] - center_y) * np.sin(angle_rad),
            center_y + (corner[0] - center_x) * np.sin(angle_rad) + (corner[1] - center_y) * np.cos(angle_rad)
        ) for corner in corners]

    # 회전 후 새 경계 상자의 좌표를 계산하여 반환
    min_x = min(corner[0] for corner in new_corners)
    min_y = min(corner[1] for corner in new_corners)
    max_x = max(corner[0] for corner in new_corners)
    max_y = max(corner[1] for corner in new_corners)

    # 회전된 경계 상자의 'x, y, width, height' 형태로 변환하여 반환
    return [min_x, min_y, max_x - min_x, max_y - min_y]

for image_dir in image_dirs:
    if image_dir == 'train_images':
        doc_info = train_doc_info
    else:
        doc_info = val_doc_info

    os.makedirs(os.path.join(noised_dir, image_dir), exist_ok=True)

    for filename in tqdm.tqdm(os.listdir(os.path.join(data_dir, image_dir)), desc="Processing images"):
        image_path = os.path.join(data_dir, image_dir, filename)
        noised_path = os.path.join(noised_dir, image_dir, filename)
        angle = combined_image_processing(image_path, noised_path)

        image_name = filename.split('.')[0]

        for object_id, object in doc_info[image_name]['objects'].items():
            new_bbox = rotate_bbox(object['bbox'], angle)
            doc_info[image_name]['objects'][object_id]['bbox'] = new_bbox

    if image_dir == 'train_images':
        with open(f'{doc_info_dir}/aug_train_doc_info.pkl', 'wb') as f:
            pickle.dump(doc_info, f)
    else:
        with open(f'{doc_info_dir}/aug_val_doc_info.pkl', 'wb') as f:
            pickle.dump(doc_info, f)

Processing images: 100%|██████████| 535/535 [01:53<00:00,  4.70it/s]
Processing images: 100%|██████████| 76/76 [00:15<00:00,  5.00it/s]


In [11]:
from PIL import Image, ImageFilter
import numpy as np
import cv2

def add_noise_and_rotate(image_path, output_path):
    # Load the image
    image = Image.open(image_path)

    # Randomly choose a small angle to rotate
    angle = np.random.uniform(-2, 2)  # Rotate between -2 and 2 degrees
    rotated_image = image.rotate(angle, expand=False, fillcolor='white', resample=Image.BICUBIC)

    # Convert the image to numpy array
    image_array = np.asarray(rotated_image, dtype=np.uint8)

    # Generate Gaussian noise
    mean = 0
    std_dev = 5  # Standard deviation of the noise
    noise = np.random.normal(mean, std_dev, image_array.shape[:2])

    # Add noise to the image
    noisy_image_array = image_array + noise[:, :, None]
    noisy_image_array = np.clip(noisy_image_array, 0, 255)  # Ensure values are within valid range

    # Convert array back to image
    noisy_image = Image.fromarray(noisy_image_array.astype(np.uint8))

    # Apply dilation and blurring
    # noisy_image = noisy_image.filter(ImageFilter.CONTOUR())  # Dilation
    noisy_image = noisy_image.filter(ImageFilter.GaussianBlur(radius=0.3))  # Blur

    # Save the modified image
    noisy_image.save(output_path)

def erode_text(image_path, output_path, erode_prob=0.02):
    # 이미지를 그레이스케일로 로드
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # 픽셀 유실 처리
    for i in range(image.shape[0]):
        for j in range(image.shape[1]):
            # 검은색 픽셀(가정하에)을 하얀색으로 변경
            if image[i, j] < 128 and random.random() < 0.05:
                image[i, j] = 255  # 검은색 픽셀을 하얀색으로 변경

    # 결과 이미지 저장
    cv2.imwrite(output_path, image)

def combined_image_processing(image_path, output_path):
    # 이미지를 PIL로 로드하고 회전
    image = Image.open(image_path)
    angle = np.random.uniform(-2, 2)  # Rotate between -2 and 2 degrees
    rotated_image = image.rotate(angle, expand=False, fillcolor='white', resample=Image.BICUBIC)

    # PIL 이미지를 OpenCV 형식으로 변환
    rotated_image_cv = cv2.cvtColor(np.array(rotated_image), cv2.COLOR_RGB2GRAY)

    # Gaussian noise 추가
    mean = 0
    std_dev = 5  # Standard deviation of the noise
    noise = np.random.normal(mean, std_dev, rotated_image_cv.shape)
    noisy_image_array = rotated_image_cv + noise
    noisy_image_array = np.clip(noisy_image_array, 0, 255)  # Ensure values are within valid range

    # 픽셀 유실 처리
    for i in range(noisy_image_array.shape[0]):
        for j in range(noisy_image_array.shape[1]):
            if noisy_image_array[i, j] < 128 and random.random() < 0.05:
                noisy_image_array[i, j] = 255  # 검은색 픽셀을 하얀색으로 변경

    # 블러링 적용
    final_image = cv2.GaussianBlur(noisy_image_array.astype(np.uint8), (3, 3), 0.3)

    # 결과 이미지 저장
    cv2.imwrite(output_path, final_image)

    return angle


# Feature Extraction

We newly extracted visual/textual feature for modeling

In [12]:
import pickle

with open('data/train_doc_info.pkl','rb') as f:
    train_doc_info = pickle.load(f)

with open('data/val_doc_info.pkl','rb') as f:
    val_doc_info = pickle.load(f)

with open('data/test_doc_info.pkl','rb') as f:
    test_doc_info = pickle.load(f)

with open('data/printed_doc_info.pkl','rb') as f:
    printed_doc_info = pickle.load(f)

with open('data/aug_train_doc_info.pkl','rb') as f:
    aug_train_doc_info = pickle.load(f)

with open('data/aug_val_doc_info.pkl','rb') as f:
    aug_val_doc_info = pickle.load(f)

In [13]:
import torch
from torchvision import models, transforms
from PIL import Image
import os
from transformers import AutoModel, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 모델과 전처리 함수를 전역 변수로 선언하여 매번 로드하지 않도록 합니다
visual_model = models.resnet101(pretrained=True)
visual_model = torch.nn.Sequential(*list(visual_model.children())[:-2])
visual_model = visual_model.to(device)
visual_model.eval()

visual_preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
textual_model = AutoModel.from_pretrained('microsoft/deberta-v3-large')
textual_model = textual_model.to(device)
textual_model.eval()

def extract_visual_features_batch(image_paths, boundary_boxes=None):
    batch_size = len(image_paths)
    images = []

    for i, image_path in enumerate(image_paths):
        image = Image.open(image_path).convert('RGB')
        if boundary_boxes is not None:
            image = image.crop(boundary_boxes[i])
        image = visual_preprocess(image)
        images.append(image)

    images = torch.stack(images).to(device)

    with torch.no_grad():
        features = visual_model(images)

    # features는 [batch_size, 2048, 7, 7] 크기의 텐서로 반환됨
    # [batch_size, 2048]로 변환
    features = torch.nn.functional.adaptive_avg_pool2d(features, (1, 1))
    features = features.view(batch_size, -1)

    return features.cpu().numpy()

def extract_textual_features_batch(texts):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = textual_model(**inputs)

    textual_features = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    return textual_features

Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to /root/.cache/torch/hub/checkpoints/resnet101-63fe2227.pth
100%|██████████| 171M/171M [00:01<00:00, 161MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

In [14]:
from tqdm import tqdm
import os
import pickle
import numpy as np

def process_batch(batch, img_dir, data_dir):
    results = []
    for img_name, img_info in batch:
        if 'objects' not in img_info:
            print(f"Warning: 'objects' not found in image info for {img_name}")
            continue

        for obj_idx, obj_info in img_info['objects'].items():
            if 'text' not in obj_info or 'bbox' not in obj_info:
                print(f"Warning: 'text' or 'bbox' not found in object info for {img_name}, object {obj_idx}")
                continue

            text = obj_info['text']
            bbox = obj_info['bbox']

            # bbox in form of [[x0, y0], [x0+w, y0], [x0+w, y0+h], [x0, y0+h]]
            # Convert to [x0, y0, x1, y1]
            boundary_box = [bbox[0], bbox[1], bbox[2] + bbox[0], bbox[3] + bbox[1]]

            results.append({
                'img_name': img_name,
                'obj_idx': obj_idx,
                'text': text,
                'bbox': bbox,
                'boundary_box': boundary_box,
                'img_path': os.path.join(data_dir, img_dir, f"{img_name}.png")
            })

    if not results:
        print(f"Warning: No valid objects found in this batch")
        return batch

    visual_features = extract_visual_features_batch([r['img_path'] for r in results], [r['boundary_box'] for r in results])
    textual_features = extract_textual_features_batch([r['text'] for r in results])

    for i, result in enumerate(results):
        img_name = result['img_name']
        obj_idx = result['obj_idx']

        if img_name not in dict(batch):
            print(f"Warning: {img_name} not found in batch")
            continue

        img_info = dict(batch)[img_name]
        if 'objects' not in img_info:
            img_info['objects'] = {}

        if obj_idx not in img_info['objects']:
            img_info['objects'][obj_idx] = {}

        img_info['objects'][obj_idx]['bbox'] = result['bbox']
        img_info['objects'][obj_idx]['visual_feature'] = visual_features[i].tolist()
        img_info['objects'][obj_idx]['textual_feature'] = textual_features[i].tolist()

    return batch

datasets = {
    'train_images': train_doc_info,
    'val_images': val_doc_info,
    'test_images': test_doc_info,
    'printed_images': printed_doc_info,
    'aug_images/train_images': aug_train_doc_info,
    'aug_images/val_images': aug_val_doc_info
}

save_file_directory = 'doc_info'
os.makedirs(save_file_directory, exist_ok=True)

save_file_path = ['train_doc_info.pkl', 'val_doc_info.pkl', 'test_doc_info.pkl', 'printed_doc_info.pkl', 'aug_train_doc_info.pkl', 'aug_val_doc_info.pkl']

process_batch_size = 8

for i, (img_dir, doc_info) in enumerate(datasets.items()):
    batches = [list(doc_info.items())[i:i+process_batch_size] for i in range(0, len(doc_info), process_batch_size)]

    for batch in tqdm(batches, desc=f"Processing {img_dir}"):
        processed_batch = process_batch(batch, img_dir, data_dir)

        for img_name, img_info in processed_batch:
            doc_info[img_name] = img_info

    with open(f'{save_file_directory}/{save_file_path[i]}', 'wb') as f:
        pickle.dump(doc_info, f)

    print(f'{save_file_path[i]} file is saved!')

Processing train_images: 100%|██████████| 67/67 [07:55<00:00,  7.09s/it]


train_doc_info.pkl file is saved!


Processing val_images: 100%|██████████| 10/10 [01:09<00:00,  6.93s/it]


val_doc_info.pkl file is saved!


Processing test_images: 100%|██████████| 19/19 [02:08<00:00,  6.76s/it]


test_doc_info.pkl file is saved!


Processing printed_images: 100%|██████████| 7/7 [01:02<00:00,  8.98s/it]


printed_doc_info.pkl file is saved!


Processing aug_images/train_images: 100%|██████████| 67/67 [08:12<00:00,  7.36s/it]


aug_train_doc_info.pkl file is saved!


Processing aug_images/val_images: 100%|██████████| 10/10 [01:12<00:00,  7.29s/it]


aug_val_doc_info.pkl file is saved!


In [15]:
import gc

def clear_gpu_memory():
    global visual_model, textual_model

    # 모델을 CPU로 이동
    visual_model = visual_model.cpu()
    textual_model = textual_model.cpu()

    del visual_model
    del textual_model

    # CUDA 캐시 정리
    torch.cuda.empty_cache()

    # 가비지 컬렉션 실행
    gc.collect()

# 특징 추출이 완료된 후 이 함수를 호출
clear_gpu_memory()

print("GPU memory cleared and models moved to CPU.")

GPU memory cleared and models moved to CPU.


## Loading Dataset

In [7]:
import pandas as pd

train_df = pd.read_csv('data/train_dataframe.csv')
train_df.rename(columns={'label(global_id)': 'label'}, inplace=True)
val_df = pd.read_csv('data/val_dataframe.csv')
val_df.rename(columns={'label(global_id)': 'label'}, inplace=True)

In [8]:
import pickle

with open('doc_info/train_doc_info.pkl','rb') as f:
  train_doc_info = pickle.load(f)
with open('doc_info/val_doc_info.pkl','rb') as f:
  val_doc_info = pickle.load(f)

In [9]:
with open(f'doc_info/aug_train_doc_info.pkl', "rb") as f:
    aug_train_doc_info = pickle.load(f)

with open(f'doc_info/aug_val_doc_info.pkl', "rb") as f:
    aug_val_doc_info = pickle.load(f)

## Dataloader

In [5]:
from torch.utils.data import Dataset

class formnlu_taskb(Dataset):
    def __init__(self, dataframe, doc_info, tokenizer, max_question_length=50, max_obj_num=35):
        self.doc = dataframe.file
        self.answer = dataframe.label
        self.text = dataframe.key_fix_text
        self.tokenizer = tokenizer
        self.doc_info = doc_info
        self.max_question_length = max_question_length
        self.max_obj_num = max_obj_num
        self.feature_obj_num = max_obj_num*2

    def __len__(self):
        return len(self.doc)

    def __getitem__(self, idx):
        doc_id = self.doc[idx]
        doc_id = doc_id.replace('.png','')
        text = self.text[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_question_length,
            padding='max_length',
            return_token_type_ids=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        answer_id = self.answer[idx]
        seq_id = 0
        target_ids = []
        page = self.doc_info[doc_id]
        textual_feature_list = []
        visual_feature_list = []
        bbox_list = []
        obj_mask = []
        obj_token_type_ids = []

        for i, (obj_local_id, obj) in enumerate(page['objects'].items()):
            global_id = obj['global_id']
            if int(global_id) == int(answer_id):
                target_ids.append(1)
            else:
                target_ids.append(0)
            textual_feature_list.append(obj['textual_feature'])
            visual_feature_list.append(obj['visual_feature'])
            bbox_list.append(obj['bbox'])
            obj_mask.append(1)
            obj_token_type_ids.append(1)

        if len(textual_feature_list) >= self.max_obj_num:
            textual_feature_list = textual_feature_list[:self.max_obj_num]
            visual_feature_list = visual_feature_list[:self.max_obj_num]
            bbox_list = bbox_list[:self.max_obj_num]
            obj_mask = obj_mask[:self.max_obj_num]
            obj_token_type_ids = obj_token_type_ids[:self.max_obj_num]
            target_ids = target_ids[:self.max_obj_num]
        else:
            textual_feature_list.extend([[0.0]*1024]*(self.max_obj_num-len(textual_feature_list)))
            visual_feature_list.extend([[0.0]*2048]*(self.max_obj_num-len(visual_feature_list)))
            bbox_list.extend([[0.0]*4]*(self.max_obj_num-len(bbox_list)))
            obj_mask.extend([0]*(self.max_obj_num-len(obj_mask)))
            obj_token_type_ids.extend([1]*(self.max_obj_num-len(obj_token_type_ids)))
            target_ids.extend([-1]*(self.max_obj_num-len(target_ids)))

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.float),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'textual_feature': torch.tensor(textual_feature_list, dtype=torch.float),
            'visual_feature': torch.tensor(visual_feature_list, dtype=torch.float),
            'bbox': torch.tensor(bbox_list, dtype=torch.float),
            'target_ids': torch.tensor(target_ids, dtype=torch.float),
            'obj_mask': torch.tensor(obj_mask, dtype=torch.long),
            'file': self.doc[idx]
        }

## Modeling

In [2]:
import torch
from torch import nn
from transformers import LxmertModel

# Visual-Textual Fusion Transformer
class VTFT(nn.Module):
    def __init__(self, model_path):
        super(VTFT, self).__init__()
        self.text_proj = nn.Linear(1024, 2048)
        self.lxmert = LxmertModel.from_pretrained(model_path)
        self.fc = nn.Linear(768*2, 2048)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(2048, 1)

    def forward(self, input_ids, attention_mask, token_type_ids, textual_feature, visual_feats, visual_pos, obj_attention_mask,):
        batch_size = input_ids.shape[0]

        textual_feature = self.text_proj(textual_feature)

        feature = torch.cat([textual_feature, visual_feats], dim=-1).view(batch_size, -1, 2048)
        pos = torch.cat([visual_pos, visual_pos], dim=-1).view(batch_size, -1, 4)
        mask = torch.cat([obj_attention_mask.unsqueeze(-1), obj_attention_mask.unsqueeze(-1)], dim=-1).view(batch_size, -1)

        outputs = self.lxmert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                              visual_feats=feature, visual_pos=pos, visual_attention_mask=mask)
        cross_outputs = outputs.vision_output.view(batch_size, -1, 768*2)
        output = self.fc(cross_outputs)
        output = self.dropout(output)
        logits = self.classifier(output)
        return logits


In [3]:
from torch import cuda
from transformers import AutoTokenizer

device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

tokenizer_path = 'unc-nlp/lxmert-base-uncased'
model_name = 'unc-nlp/lxmert-base-uncased'

model_class = VTFT

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = model_class(model_name).to(device)
model

cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at unc-nlp/lxmert-base-uncased were not used when initializing LxmertModel: ['answer_head.logit_fc.0.bias', 'answer_head.logit_fc.0.weight', 'answer_head.logit_fc.2.bias', 'answer_head.logit_fc.2.weight', 'answer_head.logit_fc.3.bias', 'answer_head.logit_fc.3.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.w

VTFT(
  (text_proj): Linear(in_features=1024, out_features=2048, bias=True)
  (lxmert): LxmertModel(
    (embeddings): LxmertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768, padding_idx=0)
      (token_type_embeddings): Embedding(2, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LxmertEncoder(
      (visn_fc): LxmertVisualFeatureEncoder(
        (visn_fc): Linear(in_features=2048, out_features=768, bias=True)
        (visn_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (box_fc): Linear(in_features=4, out_features=768, bias=True)
        (box_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (layer): ModuleList(
        (0-8): 9 x LxmertLayer(
          (attention): LxmertSelfAttentionLayer(
    

In [13]:
from torch.utils.data import Subset

train_set = formnlu_taskb(train_df,train_doc_info, tokenizer)
aug_train_set = formnlu_taskb(train_df, aug_train_doc_info, tokenizer)

indices = np.random.permutation(len(aug_train_set))
subset_indices = indices[:int(len(aug_train_set) * aug_data)]
random_train_subset = Subset(aug_train_set, subset_indices)

valid_set = formnlu_taskb(val_df,val_doc_info, tokenizer)
aug_valid_set = formnlu_taskb(val_df, aug_val_doc_info, tokenizer)

indices = np.random.permutation(len(aug_valid_set))
subset_indices = indices[:int(len(aug_valid_set) * aug_data)]
random_valid_subset = Subset(aug_valid_set, subset_indices)


In [14]:
from torch.utils.data import ConcatDataset, DataLoader

train_set = ConcatDataset([train_set, random_train_subset, valid_set, random_valid_subset])
valid_set = ConcatDataset([valid_set, random_valid_subset])

train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_set, batch_size=batch_size, shuffle=False)

In [15]:
from transformers import AdamW, get_linear_schedule_with_warmup

warmup_steps = 0.1 * len(train_dataloader)
loss_function = torch.nn.BCEWithLogitsLoss()
optimizer = AdamW(params=model.parameters(), lr=lr, weight_decay=0.1)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader)*epochs)



In [16]:
import os

model_path = f'./models/{model_class.__name__}'

os.makedirs(model_path, exist_ok=True)

def save_model(model, epoch, path):
    save_path = f'{model_path}/epoch{epoch}.pth'
    print(save_path)
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {path}_epoch{epoch}")

In [20]:
from tqdm import tqdm
import torch.nn.functional as F

# Defining the training function on the 80% of the dataset for tuning the distilbert model
def calcuate_exact_match_correct(probs, targets, obj_mask, thresholds=0.5):
    probs[probs>thresholds] = 1
    probs[probs<=thresholds] = 0
    probs[obj_mask==0] = -1

    corrects = torch.sum(torch.eq(probs, targets),dim=1)
    n_corrects = torch.sum(corrects==targets.shape[1]) # 50

    return n_corrects.item()

# Creating the loss function and optimizer
def train(model, epoch, train_dataloader, loss_function, optimizer, scheduler, device):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()

    for _,data in tqdm(enumerate(train_dataloader, 0)):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        target_ids = data['target_ids'].to(device)
        textual_feature = data['textual_feature'].to(device)
        visual_feature = data['visual_feature'].to(device)
        bbox = data['bbox'].to(device)
        obj_mask = data['obj_mask'].to(device)

        logits = model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids, textual_feature=textual_feature,
                       visual_feats=visual_feature, obj_attention_mask=obj_mask, visual_pos=bbox).view(ids.shape[0],-1)
        probs = F.sigmoid(logits)

        flat_target_ids = torch.flatten(target_ids)
        flat_logits = torch.flatten(logits)

        flat_logits = flat_logits[flat_target_ids!=-1]
        flat_target_ids = flat_target_ids[flat_target_ids!=-1]

        loss = loss_function(flat_logits, flat_target_ids)
        tr_loss += loss.item()

        n_correct += calcuate_exact_match_correct(probs, target_ids, obj_mask)

        nb_tr_steps += 1
        nb_tr_examples+=target_ids.size(0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = n_correct/nb_tr_examples
    print(f"Training Loss of Epoch {epoch}: {epoch_loss}")
    print(f"Training Accuracy of Epoch {epoch}: {epoch_accu}")

    return epoch_loss, epoch_accu

In [18]:
import torch.nn.functional as F

def evaluate(model, valid_dataloader, device):
    n_correct = 0
    nb_tr_examples = 0
    model.eval()
    for _,data in tqdm(enumerate(valid_dataloader, 0)):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        target_ids = data['target_ids'].to(device)
        textual_feature = data['textual_feature'].to(device)
        visual_feature = data['visual_feature'].to(device)
        bbox = data['bbox'].to(device)
        obj_mask = data['obj_mask'].to(device)

        logits = model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids, textual_feature=textual_feature,
                       visual_feats=visual_feature, obj_attention_mask=obj_mask, visual_pos=bbox).view(ids.shape[0],-1)
        probs = F.sigmoid(logits)

        n_correct += calcuate_exact_match_correct(probs, target_ids, obj_mask)
        nb_tr_examples+=target_ids.size(0)

    epoch_accu = n_correct/nb_tr_examples
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

In [None]:
best_val_acc = 0
patience_counter = 0
best_epoch = 0

for epoch in range(epochs):
    train_loss, train_acc = train(model, epoch, train_dataloader, loss_function, optimizer, scheduler, device)
    val_acc = evaluate(model, valid_dataloader, device)

    if early_stop:
        # Early stopping condition
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            best_epoch = epoch
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print("Early stopping triggered")
            break
        else:
            print("No Early stopping")
    else:
        best_epoch = epoch

save_model(model, best_epoch, model_path)

455it [07:37,  1.01s/it]


Training Loss of Epoch 0: 0.0752616440079042
Training Accuracy of Epoch 0: 0.3497733205110592


57it [00:29,  1.97it/s]


Validation Accuracy Epoch: 0.7057522123893806


455it [07:33,  1.00it/s]


Training Loss of Epoch 1: 0.021232556235966284
Training Accuracy of Epoch 1: 0.7740761093556807


57it [00:29,  1.95it/s]


Validation Accuracy Epoch: 0.9430309734513275


455it [07:34,  1.00it/s]


Training Loss of Epoch 2: 0.010551813896233728
Training Accuracy of Epoch 2: 0.9040390163483996


57it [00:29,  1.93it/s]


Validation Accuracy Epoch: 0.9823008849557522


455it [07:33,  1.00it/s]


Training Loss of Epoch 3: 0.004898644526822334
Training Accuracy of Epoch 3: 0.957755186151944


57it [00:29,  1.93it/s]


Validation Accuracy Epoch: 0.9950221238938053


455it [07:34,  1.00it/s]


Training Loss of Epoch 4: 0.003459507328919314
Training Accuracy of Epoch 4: 0.9694326143701057


57it [00:29,  1.96it/s]


Validation Accuracy Epoch: 0.9977876106194691


455it [07:34,  1.00it/s]


Training Loss of Epoch 5: 0.0026634992344842542
Training Accuracy of Epoch 5: 0.9774694326143701


57it [00:28,  1.98it/s]


Validation Accuracy Epoch: 0.9983407079646017


455it [07:35,  1.00s/it]


Training Loss of Epoch 6: 0.0015053450015709662
Training Accuracy of Epoch 6: 0.9872235197142465


57it [00:29,  1.91it/s]


Validation Accuracy Epoch: 0.9961283185840708


455it [07:32,  1.01it/s]


Training Loss of Epoch 7: 0.0008493812107040107
Training Accuracy of Epoch 7: 0.9921005632641846


57it [00:28,  1.98it/s]


Validation Accuracy Epoch: 0.9994469026548672


455it [07:33,  1.00it/s]


Training Loss of Epoch 8: 0.0004844627900343607
Training Accuracy of Epoch 8: 0.9964967715345514


57it [00:29,  1.94it/s]


Validation Accuracy Epoch: 1.0


455it [07:31,  1.01it/s]


Training Loss of Epoch 9: 0.00038774028817737576
Training Accuracy of Epoch 9: 0.9972523698310207


57it [00:28,  1.98it/s]


Validation Accuracy Epoch: 1.0
./models/VTFT/epoch9.pth
Model saved to ./models/VTFT_epoch9


## Evaluation

In [None]:
class formnlu_taskb_test(Dataset):
    def __init__(self, dataframe, doc_info, tokenizer, max_question_length=100, max_obj_num=35):
        self.doc = dataframe.file
        self.text = dataframe.key_fix_text
        self.tokenizer = tokenizer
        self.doc_info = doc_info
        self.max_question_length = max_question_length
        self.max_obj_num = max_obj_num
        self.feature_obj_num = max_obj_num*2

    def __len__(self):
        return len(self.doc)

    def __getitem__(self, idx):
        doc_id = self.doc[idx]
        doc_id = doc_id.replace('.png', '')

        if doc_id not in self.doc_info:
            return None

        text = self.text[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_question_length,
            padding='max_length',
            return_token_type_ids=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        page = self.doc_info[doc_id]
        textual_feature_list = []
        visual_feature_list = []
        bbox_list = []
        obj_mask = []

        for i, (obj_local_id, obj) in enumerate(page['objects'].items()):
            textual_feature_list.append(obj['textual_feature'])
            visual_feature_list.append(obj['visual_feature'])
            bbox_list.append(obj['bbox'])
            obj_mask.append(1)

        if len(textual_feature_list) >= self.max_obj_num:
            textual_feature_list = textual_feature_list[:self.max_obj_num]
            visual_feature_list = visual_feature_list[:self.max_obj_num]
            bbox_list = bbox_list[:self.max_obj_num]
            obj_mask = obj_mask[:self.max_obj_num]
        else:
            textual_feature_list.extend([[0.0]*1024]*(self.max_obj_num-len(textual_feature_list)))
            visual_feature_list.extend([[0.0]*2048]*(self.max_obj_num-len(visual_feature_list)))
            bbox_list.extend([[0.0]*4]*(self.max_obj_num-len(bbox_list)))
            obj_mask.extend([0]*(self.max_obj_num-len(obj_mask)))

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.float),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'textual_feature': torch.tensor(textual_feature_list, dtype=torch.float),
            'visual_feature': torch.tensor(visual_feature_list, dtype=torch.float),
            'bbox': torch.tensor(bbox_list, dtype=torch.float),
            'obj_mask': torch.tensor(obj_mask, dtype=torch.long),
            'file': self.doc[idx]
        }

In [None]:
test_df = pd.read_csv('data/test_printed_dataframe.csv')

In [None]:
import copy
import pickle

with open('doc_info/printed_doc_info.pkl', 'rb') as f:
    printed_doc_info = pickle.load(f)

with open('doc_info/test_doc_info.pkl', 'rb') as f:
    test_doc_info = pickle.load(f)

combined_doc_info = copy.deepcopy(test_doc_info)
combined_doc_info.update(printed_doc_info)

In [None]:
tokenizer_path = 'unc-nlp/lxmert-base-uncased'
model_class = VTFT
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

test_set = formnlu_taskb_test(test_df, combined_doc_info, tokenizer)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

model_name = 'unc-nlp/lxmert-base-uncased'
model_path = f'models/{model_class.__name__}'

model = model_class(model_name).to(device)
load_model_path = f'{model_path}/epoch{best_epoch}'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = model_class(model_name).to(device)
model.load_state_dict(torch.load(f'{load_model_path}.pth'))
model.to(device)
model.eval()

Some weights of the model checkpoint at unc-nlp/lxmert-base-uncased were not used when initializing LxmertModel: ['answer_head.logit_fc.0.bias', 'answer_head.logit_fc.0.weight', 'answer_head.logit_fc.2.bias', 'answer_head.logit_fc.2.weight', 'answer_head.logit_fc.3.bias', 'answer_head.logit_fc.3.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'obj_predict_head.decoder_dict.attr.bias', 'obj_predict_head.decoder_dict.attr.weight', 'obj_predict_head.decoder_dict.feat.bias', 'obj_predict_head.decoder_dict.feat.weight', 'obj_predict_head.decoder_dict.obj.bias', 'obj_predict_head.decoder_dict.obj.weight', 'obj_predict_head.transform.LayerNorm.bias', 'obj_predict_head.transform.LayerNorm.weight', 'obj_predict_head.transform.dense.bias', 'obj_pred

VTFT(
  (text_proj): Linear(in_features=1024, out_features=2048, bias=True)
  (lxmert): LxmertModel(
    (embeddings): LxmertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768, padding_idx=0)
      (token_type_embeddings): Embedding(2, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LxmertEncoder(
      (visn_fc): LxmertVisualFeatureEncoder(
        (visn_fc): Linear(in_features=2048, out_features=768, bias=True)
        (visn_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (box_fc): Linear(in_features=4, out_features=768, bias=True)
        (box_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (layer): ModuleList(
        (0-8): 9 x LxmertLayer(
          (attention): LxmertSelfAttentionLayer(
    

In [None]:
def inference(model, data_loader, doc_info):
    model.eval()
    predictions = []

    with torch.no_grad():
        for _, data in tqdm(enumerate(data_loader, 0)):
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            textual_feature = data['textual_feature'].to(device)
            visual_feature = data['visual_feature'].to(device)
            bbox = data['bbox'].to(device)
            obj_mask = data['obj_mask'].to(device)
            file_names = data['file']

            logits = model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids, textual_feature=textual_feature,
                       visual_feats=visual_feature, obj_attention_mask=obj_mask, visual_pos=bbox).view(ids.shape[0],-1)
            probs = F.sigmoid(logits)

            for i, file_name in enumerate(file_names):
                doc_id = file_name.replace('.png', '')
                page = doc_info[doc_id]
                valid_probs = probs[i][:len(page['objects'])]

                predicted_id = torch.argmax(valid_probs).item()

                global_id = list(page['objects'].values())[predicted_id]['global_id']
                predictions.append(global_id)

    return predictions

In [None]:
predictions = inference(model, test_loader, combined_doc_info)

73it [00:47,  1.55it/s]


In [None]:
output_path = f'./results/{model_class.__name__}'

os.makedirs(output_path, exist_ok=True)

file_name = output_path + f'/submission.csv'

test_df = test_df.drop(columns=['Source', 'file', 'key_fix_text'])
test_df['Label'] = predictions
test_df.to_csv(file_name, index=False)
print(f"Predictions saved to {file_name}")