# Downloads

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
file_id = '1-F07uiQtPtlISfml0Y_0xFxgirEmiXuq' # URL id.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('mask-rcnn-predict_pkl.zip')

In [None]:
file_id = '13Yq7zielAiyBZJu6OjIrj8orlPTQ0OQd' # URL id.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('text_feature_bert_ltfeat.zip')

In [None]:
file_id = '1wSLMZ-Qjoe6EoxYvW_4GBwwaXCE9xvf6' # URL id.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('OpenCQA_Graph.zip')

In [None]:
file_id = '1Zo7t0j2jZ2jzDa5Y0h7cCq5D6YVGKrKx' # URL id.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('OpenCQA_Graph_6_rels_pie.zip')

In [None]:
file_id = '1xYcRy1EMQF1Iyj0ZbIyEc-2b377n0KP4' # URL id.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('unichart_patch_object_pred_ae.zip')

In [None]:
file_id = '1_15A3I1he-SH_yS0vkZInDDEZakz-VLb' # URL id.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('unichart_patch_len_pred_ae.zip')

In [None]:
!unzip mask-rcnn-predict_pkl.zip

In [None]:
!unzip text_feature_bert_ltfeat.zip

In [None]:
!unzip OpenCQA_Graph.zip

In [None]:
!unzip OpenCQA_Graph_6_rels_pie.zip

In [None]:
!unzip unichart_patch_object_pred_ae.zip

In [None]:
!unzip unichart_patch_len_pred_ae.zip

In [None]:
!git clone https://github.com/vis-nlp/OpenCQA.git

In [None]:
!pip install sentencepiece

In [None]:
!pip install wandb

In [None]:
!pip install sacrebleu
!pip install sacremoses

# Helper Functions

In [None]:
import torch
torch.manual_seed(42)

In [None]:
import os
import pickle
def create_folder_if_not_exists(folder_path):
    """Create ``folder_path`` (and any missing parents) unless it already exists.

    The directory is created optimistically and an existing path is detected
    from the resulting ``FileExistsError``. This avoids the window between a
    separate existence check and the creation, in which another process could
    create the folder and make the call fail.

    Args:
        folder_path: Path of the folder to create.

    Returns:
        None. A message describing the outcome is printed to stdout.
    """
    try:
        os.makedirs(folder_path)
    except FileExistsError:
        print(f"Folder '{folder_path}' already exists.")
    else:
        print(f"Folder '{folder_path}' created successfully.")

In [None]:
import torch

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
def pad_zeros(matrix, desired_shape):
    """Zero-pad a 2-D tensor at the bottom/right up to ``desired_shape``.

    Args:
        matrix: Tensor to pad. When its first dimension is empty, a fresh
            all-zero tensor of ``desired_shape`` is returned instead.
        desired_shape: Target ``(rows, cols)``.

    Returns:
        A tensor whose top-left corner holds ``matrix`` and whose remaining
        entries are zero.
    """
    if matrix.shape[0] != 0:
        extra_rows = desired_shape[0] - matrix.shape[0]
        extra_cols = desired_shape[1] - matrix.shape[1]
        # F.pad lists pad widths starting from the last dimension:
        # (left, right) for columns, then (top, bottom) for rows.
        pad_widths = (0, extra_cols, 0, extra_rows)
        return torch.nn.functional.pad(matrix, pad_widths, value=0)

    # Nothing to copy: hand back a zero tensor of the requested shape.
    return torch.zeros(desired_shape)

In [None]:
from transformers.models.pix2struct.modeling_pix2struct import *

In [None]:

import torch

In [None]:
from torch.nn.utils.rnn import pad_sequence

In [None]:
from tqdm.autonotebook import tqdm

In [None]:
def evaluate(model, dataloader, evaluator, processor, criteria='bleu'):
    """Generate an answer for every batch in ``dataloader`` and score them.

    For each batch the decoder prompt (the ``input_ids`` tokens up to and
    including ``prompt_end_index``) is passed to ``model.generate`` together
    with the pixel values and the graph inputs. The text following the
    ``<s_answer>`` marker in each decoded sequence is stored as the prediction
    for the matching question id.

    Args:
        model: Model exposing ``generate``; it is called with the pixel values,
            the decoder prompts and the graph keyword arguments built below.
        dataloader: Sized iterable of dict batches holding the keys popped in
            the loop body.
        evaluator: Object exposing ``evaluate_raw(quesid2ans, criteria=...)``.
        processor: Processor exposing ``tokenizer`` and ``batch_decode``.
        criteria: Metric name forwarded to ``evaluator.evaluate_raw``.

    Returns:
        Whatever ``evaluator.evaluate_raw`` returns for the
        ``{question_id: predicted_answer}`` mapping.
    """
    answer_marker = "<s_answer>"
    tokenizer = processor.tokenizer
    quesid2ans = {}

    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader)):
            patch_objects = batch.pop('patch_objects')
            pixel_values = batch.pop('pixel_values')
            patch_lens = batch.pop('patch_lens')
            ids = batch.pop('question_ids')
            full_weights = batch.pop('full_weights').to(device)
            full_adj = batch.pop('full_adj').to(device)
            sem_adj = batch.pop('sem_adj').to(device)
            rtexts_feats = batch.pop('rtexts_feats').to(device)
            bboxes = batch.pop('bboxes').to(device)
            vis_feats = batch.pop('vis_feats').to(device)
            decoder_input_ids = batch.pop('input_ids').to(device)
            prompt_end_idxs = batch.pop('prompt_end_index')
            graph_mask = batch.pop('graph_mask').to(device)

            # Keep only the prompt part of each sequence; shorter prompts are
            # right-padded so they can be stacked into one tensor.
            # NOTE(review): pad_sequence pads with its default value (0), not
            # with tokenizer.pad_token_id - confirm this is intended.
            decoder_prompts = pad_sequence(
                [input_id[: end_idx + 1] for input_id, end_idx in zip(decoder_input_ids, prompt_end_idxs)],
                batch_first=True,
            )

            outputs = model.generate(
                pixel_values.to(device),
                decoder_input_ids=decoder_prompts.to(device),
                max_length=512,
                early_stopping=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
                num_beams=4,
                bad_words_ids=[[tokenizer.unk_token_id]],
                return_dict_in_generate=True,
                patch_objects=patch_objects,
                full_weights=full_weights, full_adj=full_adj, sem_adj=sem_adj, vis_feats=vis_feats,
                rtexts_feats=rtexts_feats, vis_pos=bboxes, patch_lens=patch_lens, graph_mask=graph_mask
            )

            decoded = processor.batch_decode(outputs.sequences, skip_special_tokens=True)
            for qid, text in zip(ids, decoded):
                pieces = text.split(answer_marker)
                # The answer is the segment right after the first marker. If
                # the marker is absent from the decoded text, fall back to the
                # whole text instead of aborting the evaluation run.
                answer = pieces[1] if len(pieces) > 1 else pieces[0]
                answer = answer.replace(tokenizer.eos_token, "").replace("<s>", "").strip(' ')
                quesid2ans[qid] = answer

    return evaluator.evaluate_raw(quesid2ans, criteria=criteria)

## Graph Layers

In [None]:
import math

import torch

from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module


class GraphConvolution(Module):
    """
    Single graph-convolution layer in the spirit of
    https://arxiv.org/abs/1609.02907.

    The layer projects node features with a learned weight matrix and then
    aggregates them through the adjacency matrix, which may be re-weighted
    element-wise by ``dis``. A learned bias is added when enabled.
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if not bias:
            # Register the name so `self.bias` exists and reads as None.
            self.register_parameter('bias', None)
        else:
            self.bias = Parameter(torch.FloatTensor(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Draw weight and bias uniformly from +-1/sqrt(out_features)."""
        bound = 1. / math.sqrt(self.weight.size(1))
        # Weight first, then bias, so the random stream is consumed in a
        # fixed order.
        for param in (self.weight, self.bias):
            if param is not None:
                param.data.uniform_(-bound, bound)

    def forward(self, input, adj, dis):
        """Return ``(adj [* dis]) @ (input @ weight) [+ bias]``.

        ``dis`` may be None, in which case the adjacency is used as is.
        """
        projected = torch.matmul(input, self.weight)
        propagation = adj if dis is None else adj * dis
        aggregated = torch.matmul(propagation, projected)
        if self.bias is None:
            return aggregated
        return aggregated + self.bias

    def __repr__(self):
        return f"{self.__class__.__name__} ({self.in_features} -> {self.out_features})"

In [None]:
import torch.nn as nn
import torch.nn.functional as F


class GCN(nn.Module):
    """Two stacked ``GraphConvolution`` layers with ReLU and dropout between them.

    Args:
        nfeat: Size of the input node features.
        nhid: Size of the hidden node features.
        ofeat: Size of the output node features.
        dropout: Dropout probability applied after the first layer.
    """

    def __init__(self, nfeat, nhid, ofeat, dropout):
        super().__init__()

        self.ofeat = ofeat
        self.dropout = dropout
        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, ofeat)

    def forward(self, x, adj, dis):
        """Run both graph convolutions; ``adj`` and ``dis`` are shared by the two layers."""
        hidden = F.relu(self.gc1(x, adj, dis))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=self.dropout, training=self.training)
        return self.gc2(hidden, adj, dis)

In [None]:
class GraphFuse(nn.Module):
    """Fuse a visual graph and a semantic graph into one feature per node.

    Each graph is encoded by its own 1024-wide ``GCN``; the two encodings are
    concatenated along the feature axis and projected back to 1024 features.
    """

    def __init__(self):
        super().__init__()
        self.fg_gcn = GCN(1024, 1024, 1024, 0.2)
        self.sem_gcn = GCN(1024, 1024, 1024, 0.2)

        self.fc = nn.Linear(1024 * 2, 1024)

    def forward(self, ful_adj, sem_adj, ful_weights, sem_weights, vis_feat, text_feat, batch_size, graph_mask):
        """Encode both graphs and return the fused node features.

        ``sem_weights`` and ``batch_size`` are accepted but not used: the
        semantic GCN always runs without edge weights.
        """
        visual_nodes = self.fg_gcn(vis_feat, ful_adj, ful_weights).clone()
        semantic_nodes = self.sem_gcn(text_feat, sem_adj, None).clone()

        # Mask the semantic nodes, then keep only as many of them as there
        # are visual nodes so the two tensors can be concatenated.
        semantic_nodes = semantic_nodes * graph_mask
        node_count = visual_nodes.shape[1]
        fused = torch.cat((visual_nodes, semantic_nodes[:, :node_count]), dim=2)

        fused = F.relu(self.fc(fused))
        return F.dropout(fused, 0.1, training=self.training)

## UniChart

In [None]:
from transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder import VisionEncoderDecoderConfig

In [None]:
VDCONFIG = VisionEncoderDecoderConfig.from_pretrained('ahmed-masry/unichart-base-960')

In [None]:
from transformers.models.donut.modeling_donut_swin import DonutSwinLayer, DonutSwinStage

In [None]:
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Build decoder inputs by moving every token one position to the right.

    The first column is filled with `decoder_start_token_id`, the last token
    of each row is dropped, and any `-100` entries are replaced by
    `pad_token_id`. `input_ids` itself is left untouched.

    Raises:
        ValueError: If `decoder_start_token_id` or `pad_token_id` is `None`.
    """
    if decoder_start_token_id is None:
        raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
    if pad_token_id is None:
        raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")

    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 0] = decoder_start_token_id
    shifted[:, 1:] = input_ids[:, :-1].clone()

    # Labels may carry -100 (ignored positions); those cannot be fed to the
    # decoder, so swap them for the padding id.
    ignored_positions = shifted == -100
    shifted.masked_fill_(ignored_positions, pad_token_id)
    return shifted

In [None]:
class DonutSwinPatchMerging(nn.Module):
    """
    Patch merging (downsampling) layer.

    Every 2x2 neighbourhood of patches is concatenated along the channel axis,
    normalised, and linearly reduced from `4 * dim` to `2 * dim` channels.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """

    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def maybe_pad(self, input_feature, height, width):
        """Append one zero row and/or column when `height`/`width` is odd."""
        extra_rows, extra_cols = height % 2, width % 2
        if not (extra_rows or extra_cols):
            return input_feature
        # Pad widths run from the last axis backwards: channels, width, height.
        return nn.functional.pad(input_feature, (0, 0, 0, extra_cols, 0, extra_rows))

    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions
        # The middle axis of the input is the flattened height * width grid.
        batch_size, _, num_channels = input_feature.shape

        grid = input_feature.view(batch_size, height, width, num_channels)
        # Pad the grid so that both height and width are divisible by 2.
        grid = self.maybe_pad(grid, height, width)

        # One strided view per position inside the 2x2 neighbourhood, each of
        # shape [batch_size, height/2, width/2, num_channels].
        offsets = ((0, 0), (1, 0), (0, 1), (1, 1))
        corners = [grid[:, row::2, col::2, :] for row, col in offsets]

        # [batch_size, height/2 * width/2, 4 * num_channels]
        merged = torch.cat(corners, -1).view(batch_size, -1, 4 * num_channels)

        return self.reduction(self.norm(merged))

In [None]:
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Donut Swin Transformer model.

This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden
states."""

import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from transformers.utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from transformers.models.donut.configuration_donut_swin import DonutSwinConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "DonutSwinConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "https://huggingface.co/naver-clova-ix/donut-base"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]

DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "naver-clova-ix/donut-base",
    # See all Donut Swin models at https://huggingface.co/models?filter=donut
]


@dataclass
# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput with Swin->DonutSwin
class DonutSwinEncoderOutput(ModelOutput):
    """
    DonutSwin encoder's outputs, with potential hidden states and attentions.

    Every field defaults to `None`. The encoder in this file fills the optional
    fields only when the matching `output_hidden_states` / `output_attentions`
    flag is set.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    # NOTE(review): field order is presumably significant for positional /
    # tuple-style access provided by ModelOutput - keep it stable.
    # Output of the final encoder stage.
    last_hidden_state: torch.FloatTensor = None
    # Per-stage hidden states as (batch, sequence, hidden) tensors.
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Per-stage attention tensors.
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Same content as `hidden_states`, rearranged to (batch, hidden, height, width).
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
# Copied from transformers.models.swin.modeling_swin.SwinModelOutput with Swin->DonutSwin
class DonutSwinModelOutput(ModelOutput):
    """
    DonutSwin model's outputs that also contains a pooling of the last hidden states.

    Every field defaults to `None`; the optional ones are only populated when
    the corresponding feature is requested, as described per field below.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    # NOTE(review): field order is presumably significant for positional /
    # tuple-style access provided by ModelOutput - keep it stable.
    # Output of the final encoder stage.
    last_hidden_state: torch.FloatTensor = None
    # Average-pooled summary of `last_hidden_state`, when a pooling layer is used.
    pooler_output: Optional[torch.FloatTensor] = None
    # Per-stage hidden states as (batch, sequence, hidden) tensors.
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Per-stage attention tensors.
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Same content as `hidden_states`, rearranged to (batch, hidden, height, width).
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


# Copied from transformers.models.swin.modeling_swin.window_partition
def window_partition(input_feature, window_size):
    """
    Cut a `(batch, height, width, channels)` tensor into square windows.

    Returns a tensor of shape `(num_windows * batch, window_size, window_size,
    channels)` in which the windows of one image are laid out row by row.
    """
    batch_size, height, width, num_channels = input_feature.shape
    rows, cols = height // window_size, width // window_size

    # Split both spatial axes into (window index, offset inside the window).
    tiled = input_feature.view(batch_size, rows, window_size, cols, window_size, num_channels)
    # Bring the two window-index axes next to each other before flattening.
    tiled = tiled.permute(0, 1, 3, 2, 4, 5).contiguous()
    return tiled.view(-1, window_size, window_size, num_channels)


# Copied from transformers.models.swin.modeling_swin.window_reverse
def window_reverse(windows, window_size, height, width):
    """
    Stitch windows back into `(batch, height, width, channels)` feature maps.

    This is the inverse of `window_partition` for the same `window_size`.
    """
    num_channels = windows.shape[-1]
    rows, cols = height // window_size, width // window_size

    # Recover the (window row, window column) grid of every image ...
    grid = windows.view(-1, rows, cols, window_size, window_size, num_channels)
    # ... then interleave window indices with the offsets inside each window.
    grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
    return grid.view(-1, height, width, num_channels)


# Copied from transformers.models.swin.modeling_swin.SwinEmbeddings with Swin->DonutSwin
class DonutSwinEmbeddings(nn.Module):
    """
    Patch embeddings followed by layer norm, optional mask-token substitution,
    optional absolute position embeddings, and dropout.
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__()

        self.patch_embeddings = DonutSwinPatchEmbeddings(config)
        patch_count = self.patch_embeddings.num_patches
        self.patch_grid = self.patch_embeddings.grid_size

        if use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim))
        else:
            self.mask_token = None

        # NOTE(review): the table holds `num_patches + 1` rows while forward()
        # adds it to the patch sequence - confirm the shapes line up when
        # `use_absolute_embeddings` is enabled.
        if not config.use_absolute_embeddings:
            self.position_embeddings = None
        else:
            self.position_embeddings = nn.Parameter(torch.zeros(1, patch_count + 1, config.embed_dim))

        self.norm = nn.LayerNorm(config.embed_dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None,
    ) -> Tuple[torch.Tensor]:
        """Return `(embeddings, output_dimensions)` for `pixel_values`.

        `output_dimensions` is passed through unchanged from the patch embedder.
        """
        patch_tokens, output_dimensions = self.patch_embeddings(pixel_values)
        patch_tokens = self.norm(patch_tokens)
        batch_size, seq_len, _ = patch_tokens.size()

        if bool_masked_pos is not None:
            # Blend in the learned mask token wherever a patch is masked.
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            patch_tokens = patch_tokens * (1.0 - mask) + mask_tokens * mask

        if self.position_embeddings is not None:
            patch_tokens = patch_tokens + self.position_embeddings

        return self.dropout(patch_tokens), output_dimensions


# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings
class DonutSwinPatchEmbeddings(nn.Module):
    """
    Convert `pixel_values` of shape `(batch_size, num_channels, height, width)` into patch embeddings of shape
    `(batch_size, seq_length, hidden_size)` using a strided convolution whose kernel and stride both equal the
    patch size.
    """

    def __init__(self, config):
        super().__init__()

        def as_pair(value):
            # Scalars describe a square; iterables are taken as given.
            return value if isinstance(value, collections.abc.Iterable) else (value, value)

        image_size = as_pair(config.image_size)
        patch_size = as_pair(config.patch_size)
        num_channels, hidden_size = config.num_channels, config.embed_dim
        grid_rows = image_size[0] // patch_size[0]
        grid_cols = image_size[1] // patch_size[1]

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = grid_cols * grid_rows
        self.grid_size = (grid_rows, grid_cols)

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def maybe_pad(self, pixel_values, height, width):
        """Zero-pad on the right/bottom so both sides are multiples of the patch size."""
        patch_height, patch_width = self.patch_size[0], self.patch_size[1]

        width_remainder = width % patch_width
        if width_remainder != 0:
            pixel_values = nn.functional.pad(pixel_values, (0, patch_width - width_remainder))

        height_remainder = height % patch_height
        if height_remainder != 0:
            pixel_values = nn.functional.pad(pixel_values, (0, 0, 0, patch_height - height_remainder))

        return pixel_values

    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
        """Return the flattened patch embeddings and the `(height, width)` of the patch grid."""
        _, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        # Pad first so the convolution covers the whole image.
        projected = self.projection(self.maybe_pad(pixel_values, height, width))
        _, _, grid_height, grid_width = projected.shape

        # (batch, hidden, h, w) -> (batch, h * w, hidden)
        embeddings = projected.flatten(2).transpose(1, 2)
        return embeddings, (grid_height, grid_width)

In [None]:
# Copied from transformers.models.swin.modeling_swin.SwinEncoder with Swin->DonutSwin
class DonutSwinEncoder(nn.Module):
    """
    Stack of `DonutSwinStage` modules, one per entry of `config.depths`.

    Stage `i` works on `config.embed_dim * 2**i` channels at a resolution of
    `grid_size // 2**i`; every stage except the last is given
    `DonutSwinPatchMerging` as its downsampling layer.
    """

    def __init__(self, config, grid_size):
        super().__init__()
        self.num_layers = len(config.depths)
        self.config = config
        # One drop-path rate per block, linearly spaced from 0 up to
        # `config.drop_path_rate` over all blocks of all stages.
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
        self.layers = nn.ModuleList(
            [
                DonutSwinStage(
                    config=config,
                    dim=int(config.embed_dim * 2**i_layer),
                    input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                    depth=config.depths[i_layer],
                    num_heads=config.num_heads[i_layer],
                    # Slice of `dpr` that belongs to the blocks of this stage.
                    drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                    downsample=DonutSwinPatchMerging if (i_layer < self.num_layers - 1) else None,
                )
                for i_layer in range(self.num_layers)
            ]
        )

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        always_partition: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, DonutSwinEncoderOutput]:
        """
        Run `hidden_states` through every stage in order.

        Args:
            hidden_states: Token sequence fed to the first stage.
            input_dimensions: `(height, width)` of the grid behind `hidden_states`.
            head_mask: Optional per-stage head masks, indexed by stage number.
            output_attentions: Collect the attention outputs of every stage.
            output_hidden_states: Collect hidden states (flat and reshaped) of
                the input and of every stage.
            output_hidden_states_before_downsampling: When collecting hidden
                states, take each stage's output before its downsampling step
                instead of after it.
            always_partition: Forwarded unchanged to every stage.
            return_dict: Return a `DonutSwinEncoderOutput` instead of a tuple.

        Returns:
            A `DonutSwinEncoderOutput`, or, when `return_dict` is false, a tuple
            of the non-`None` values among the last hidden state, the collected
            hidden states and the collected attentions (the reshaped hidden
            states are not part of the tuple).
        """
        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if output_hidden_states:
            # Record the encoder input itself as the first hidden state.
            batch_size, _, hidden_size = hidden_states.shape
            # rearrange b (h w) c -> b c h w
            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, layer_module in enumerate(self.layers):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                # NOTE(review): `_gradient_checkpointing_func` is not defined in
                # this class; presumably it is injected by the transformers
                # gradient-checkpointing machinery - confirm.
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    input_dimensions,
                    layer_head_mask,
                    output_attentions,
                    always_partition,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
                )

            # Stage output layout: [0] hidden states, [1] hidden states before
            # downsampling, [2] output dimensions, [3:] attentions.
            hidden_states = layer_outputs[0]
            hidden_states_before_downsampling = layer_outputs[1]
            output_dimensions = layer_outputs[2]

            # The last two entries of `output_dimensions` become the grid size
            # seen by the next stage.
            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if output_hidden_states and output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
                # rearrange b (h w) c -> b c h w
                # here we use the original (not downsampled) height and width
                reshaped_hidden_state = hidden_states_before_downsampling.view(
                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
                )
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            elif output_hidden_states and not output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states.shape
                # rearrange b (h w) c -> b c h w
                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

            if output_attentions:
                all_self_attentions += layer_outputs[3:]

        if not return_dict:
            # Drop the collections that were not requested.
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)

        return DonutSwinEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )

In [None]:
class DonutSwinPreTrainedModel(PreTrainedModel):
    """
    Abstract base for the DonutSwin models: it supplies the weight
    initialisation scheme and the class-level settings used when downloading
    and loading pretrained checkpoints.
    """

    config_class = DonutSwinConfig
    base_model_prefix = "swin"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialise one sub-module according to its layer type."""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Plain normal initialisation; the TF version uses truncated_normal
            # instead, cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.LayerNorm):
            # Start layer norm as the identity: unit scale, zero shift.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

In [None]:
SWIN_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`DonutSwinConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

SWIN_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`DonutImageProcessor.__call__`] for details.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

In [None]:
class DonutSwinModel(DonutSwinPreTrainedModel):
    """Image encoder built from `DonutSwinEmbeddings`, `DonutSwinEncoder` and an optional average pooler."""

    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
        """Create the embeddings, the encoder and (optionally) the pooling layer from `config`."""
        super().__init__(config)
        self.config = config

        depth_count = len(config.depths)
        self.num_layers = depth_count
        # embed_dim scaled by 2 ** (number of depth entries - 1).
        self.num_features = int(config.embed_dim * 2 ** (depth_count - 1))

        patch_embedder = DonutSwinEmbeddings(config, use_mask_token=use_mask_token)
        self.embeddings = patch_embedder
        self.encoder = DonutSwinEncoder(config, patch_embedder.patch_grid)

        if add_pooling_layer:
            self.pooler = nn.AdaptiveAvgPool1d(1)
        else:
            self.pooler = None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the patch-embedding module owned by `self.embeddings`."""
        embeddings_module = self.embeddings
        return embeddings_module.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prune attention heads. `heads_to_prune` maps a layer index to the list of heads to remove in that
        layer (see the base class `PreTrainedModel`).
        """
        for layer_index, head_ids in heads_to_prune.items():
            target_layer = self.encoder.layer[layer_index]
            target_layer.attention.prune_heads(head_ids)

    @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=DonutSwinModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, DonutSwinModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        cfg = self.config

        # Any flag left as None falls back to the value stored on the config.
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if return_dict is None:
            return_dict = cfg.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Expand the optional head mask to one entry per element of `config.depths`
        # (a value of 1.0 keeps the corresponding head).
        head_mask = self.get_head_mask(head_mask, len(cfg.depths))

        embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        if self.pooler is None:
            pooled_output = None
        else:
            # Pool over dim 1 of the sequence output (moved last by the transpose), then flatten from dim 1.
            pooled_output = torch.flatten(self.pooler(sequence_output.transpose(1, 2)), 1)

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return DonutSwinModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )

In [None]:
from transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.modeling_auto import AutoModelForCausalLM
import tempfile
import gc

In [None]:
import gc
import os
import tempfile
from typing import Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from transformers.models.auto.configuration_auto import AutoConfig
from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM
from transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder import VisionEncoderDecoderConfig


In [None]:
# Argument documentation for `VisionEncoderDecoderModel.forward`; it is attached to that
# method through `add_start_docstrings_to_model_forward`, so the text below is part of
# the generated runtime docstring.
# NOTE(review): the graph-related keyword arguments that `forward` also accepts in this
# notebook (`patch_objects`, `full_weights`, `full_adj`, `sem_adj`, `vis_feats`,
# `rtexts_feats`, `vis_pos`, `patch_lens`, `graph_mask`) are not described here.
VISION_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using an image processor (e.g. if you use ViT as the encoder,
            you should use [`AutoImageProcessor`]). See [`ViTImageProcessor.__call__`] for details.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For training, `decoder_input_ids` are automatically created by the model by shifting the `labels` to the
            right, replacing -100 by the `pad_token_id` and prepending them with the `decoder_start_token_id`.
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        encoder_outputs (`tuple(torch.FloatTensor)`, *optional*):
            This tuple must consist of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
            `last_hidden_state` (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) is a tensor
            of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the
            decoder.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
            representation. This is useful if you want more control over how to convert `decoder_input_ids` indices
            into associated vectors than the model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0,
            ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            If set to `True`, the model will return a [`~utils.Seq2SeqLMOutput`] instead of a plain tuple.
        kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:

            - Without a prefix which will be input as `**encoder_kwargs` for the encoder forward function.
            - With a *decoder_* prefix which will be input as `**decoder_kwargs` for the decoder forward function.
"""

In [None]:
class VisionEncoderDecoderModel(PreTrainedModel):
    r"""
    [`VisionEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with
    one of the base vision model classes of the library as encoder and another one as decoder when created with the
    :meth*~transformers.AutoModel.from_pretrained* class method for the encoder and
    :meth*~transformers.AutoModelForCausalLM.from_pretrained* class method for the decoder.
    """

    config_class = VisionEncoderDecoderConfig
    base_model_prefix = "vision_encoder_decoder"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def __init__(
        self,
        config: Optional[PretrainedConfig] = None,
        encoder: Optional[PreTrainedModel] = None,
        decoder: Optional[PreTrainedModel] = None,
    ):
        """Assemble the encoder/decoder pair plus the graph-fusion modules.

        Args:
            config: Instance of `config_class` (`VisionEncoderDecoderConfig`). When `None`, it is derived
                from `encoder.config` and `decoder.config`, so both sub-models must then be given.
            encoder: Ready-made encoder. When `None`, a `DonutSwinModel` is built from `config.encoder`.
            decoder: Ready-made decoder. When `None`, it is built with
                `AutoModelForCausalLM.from_config(config.decoder)`.

        Raises:
            ValueError: If neither a config nor both sub-models are supplied, if `config` is not an
                instance of `config_class`, if the decoder's `cross_attention_hidden_size` is set but
                differs from the encoder's `hidden_size`, or if the encoder exposes output embeddings
                (an LM head).
        """
        if config is None and (encoder is None or decoder is None):
            raise ValueError("Either a configuration or an encoder and a decoder has to be provided.")
        if config is None:
            config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
        elif not isinstance(config, self.config_class):
            raise ValueError(f"Config: {config} has to be of type {self.config_class}")

        if config.decoder.cross_attention_hidden_size is not None:
            if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
                raise ValueError(
                    "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
                    f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
                    f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
                    " `config.encoder.hidden_size`."
                )

        # initialize with config
        # make sure input & output embeddings is not tied
        config.tie_word_embeddings = False
        super().__init__(config)

        if encoder is None:
            # Build the encoder from the config that was actually passed in and validated above,
            # not from a notebook-level global, so that it always matches `self.config.encoder`.
            encoder = DonutSwinModel(config.encoder)
        if decoder is None:
            decoder = AutoModelForCausalLM.from_config(config.decoder)

        self.encoder = encoder
        self.decoder = decoder

        # Graph fusion module and a 768 -> 1024 linear projection; `forward` applies `proj` to
        # `rtexts_feats` before handing the result to `gf`.
        self.gf = GraphFuse()
        self.proj = nn.Linear(768, 1024)

        if self.encoder.config.to_dict() != self.config.encoder.to_dict():
            logger.warning(
                f"Config of the encoder: {self.encoder.__class__} is overwritten by shared encoder config:"
                f" {self.config.encoder}"
            )
        if self.decoder.config.to_dict() != self.config.decoder.to_dict():
            logger.warning(
                f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:"
                f" {self.config.decoder}"
            )

        # make sure that the individual model's config refers to the shared config
        # so that the updates to the config will be synced
        self.encoder.config = self.config.encoder
        self.decoder.config = self.config.decoder

        # encoder outputs might need to be projected to different dimension for decoder
        if (
            self.encoder.config.hidden_size != self.decoder.config.hidden_size
            and self.decoder.config.cross_attention_hidden_size is None
        ):
            self.enc_to_dec_proj = nn.Linear(self.encoder.config.hidden_size, self.decoder.config.hidden_size)

        if self.encoder.get_output_embeddings() is not None:
            raise ValueError(
                f"The encoder {self.encoder} should not have a LM Head. Please use a model without LM Head"
            )

    def get_encoder(self):
        """Return the encoder sub-model stored on this instance."""
        encoder_module = self.encoder
        return encoder_module

    def get_decoder(self):
        """Return the decoder sub-model stored on this instance."""
        decoder_module = self.decoder
        return decoder_module

    def get_output_embeddings(self):
        """Return whatever the decoder reports as its output embeddings."""
        decoder_module = self.decoder
        return decoder_module.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        """Forward `new_embeddings` to the decoder's setter and return that call's result."""
        decoder_module = self.decoder
        outcome = decoder_module.set_output_embeddings(new_embeddings)
        return outcome

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""
        Load a pretrained model, optionally converting it from a TensorFlow checkpoint (`from_tf=True`).

        Example:

        ```python
        >>> from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer
        >>> from PIL import Image
        >>> import requests

        >>> image_processor = AutoImageProcessor.from_pretrained("ydshieh/vit-gpt2-coco-en")
        >>> decoder_tokenizer = AutoTokenizer.from_pretrained("ydshieh/vit-gpt2-coco-en")
        >>> model = VisionEncoderDecoderModel.from_pretrained("ydshieh/vit-gpt2-coco-en")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> img = Image.open(requests.get(url, stream=True).raw)
        >>> pixel_values = image_processor(images=img, return_tensors="pt").pixel_values  # Batch size 1

        >>> output_ids = model.generate(
        ...     pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True
        ... ).sequences

        >>> preds = decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        >>> preds = [pred.strip() for pred in preds]

        >>> assert preds == ["a cat laying on top of a couch next to another cat"]
        ```"""

        if kwargs.pop("from_tf", False):
            from transformers import TFVisionEncoderDecoderModel

            def _index_by_name(variables, skipped_parts):
                # Key every variable by its name with the first `skipped_parts` "/"-separated pieces dropped.
                return {"/".join(var.name.split("/")[skipped_parts:]): var for var in variables}

            # Workaround for loading from a TensorFlow checkpoint.
            # The composite model cannot be saved component-wise directly: inside it, weight names carry an
            # extra `encoder`/`decoder` level (e.g. `[top model name]/encoder/vit` instead of
            # `[top model name]/vit`). The conversion method strips only `[top model name]`, so the extra
            # level would remain when the components are saved alone.
            # A (very) ugly potential fix was never integrated into `transformers`, see
            #   https://github.com/huggingface/transformers/pull/13222/commits/dbb3c9de76eee235791d2064094654637c99f36d#r697304245
            #   (the change in `src/transformers/modeling_tf_utils.py`)
            loaded_tf_model = TFVisionEncoderDecoderModel.from_pretrained(
                pretrained_model_name_or_path, *model_args, **kwargs
            )
            config = loaded_tf_model.config

            # Stand-alone copies of both components, called once on dummy inputs so they are built.
            encoder = loaded_tf_model.encoder.__class__(loaded_tf_model.config.encoder)
            decoder = loaded_tf_model.decoder.__class__(loaded_tf_model.config.decoder)
            encoder(encoder.dummy_inputs)
            decoder(decoder.dummy_inputs)

            # Match variables of the stand-alone copies (one leading name part dropped) with those of the
            # loaded composite model (two leading name parts dropped).
            fresh_encoder_vars = _index_by_name(encoder.trainable_variables + encoder.non_trainable_variables, 1)
            fresh_decoder_vars = _index_by_name(decoder.trainable_variables + decoder.non_trainable_variables, 1)
            loaded_encoder_vars = _index_by_name(
                loaded_tf_model.encoder.trainable_variables + loaded_tf_model.encoder.non_trainable_variables, 2
            )
            loaded_decoder_vars = _index_by_name(
                loaded_tf_model.decoder.trainable_variables + loaded_tf_model.decoder.non_trainable_variables, 2
            )

            # Copy the loaded weights into the stand-alone copies.
            for var_name, target in fresh_encoder_vars.items():
                target.assign(loaded_encoder_vars[var_name])
            for var_name, target in fresh_decoder_vars.items():
                target.assign(loaded_decoder_vars[var_name])

            rebuilt_tf_model = TFVisionEncoderDecoderModel(encoder=encoder, decoder=decoder)

            # Carry over `enc_to_dec_proj` when the loaded model has one.
            if hasattr(loaded_tf_model, "enc_to_dec_proj"):
                rebuilt_tf_model(rebuilt_tf_model.dummy_inputs)
                rebuilt_tf_model.enc_to_dec_proj.kernel.assign(loaded_tf_model.enc_to_dec_proj.kernel)
                rebuilt_tf_model.enc_to_dec_proj.bias.assign(loaded_tf_model.enc_to_dec_proj.bias)

            with tempfile.TemporaryDirectory() as scratch_dir:
                encoder_dir = os.path.join(scratch_dir, "encoder")
                decoder_dir = os.path.join(scratch_dir, "decoder")
                rebuilt_tf_model.encoder.save_pretrained(encoder_dir)
                rebuilt_tf_model.decoder.save_pretrained(decoder_dir)

                if hasattr(rebuilt_tf_model, "enc_to_dec_proj"):
                    proj_kernel = torch.from_numpy(rebuilt_tf_model.enc_to_dec_proj.kernel.numpy())
                    enc_to_dec_proj_weight = torch.transpose(proj_kernel, 1, 0)
                    enc_to_dec_proj_bias = torch.from_numpy(rebuilt_tf_model.enc_to_dec_proj.bias.numpy())

                # Drop the TensorFlow models before the PyTorch model is created.
                del loaded_tf_model
                del rebuilt_tf_model
                gc.collect()

                model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
                    encoder_dir, decoder_dir, encoder_from_tf=True, decoder_from_tf=True
                )
                # This is only for copying some specific attributes of this particular model.
                model.config = config

                if hasattr(model, "enc_to_dec_proj"):
                    model.enc_to_dec_proj.weight.data = enc_to_dec_proj_weight.contiguous()
                    model.enc_to_dec_proj.bias.data = enc_to_dec_proj_bias.contiguous()

                return model

        # At the moment fast initialization is not supported for composite models
        fast_init_requested = kwargs.get("_fast_init", False)
        if fast_init_requested:
            logger.warning(
                "Fast initialization is currently not supported for VisionEncoderDecoderModel. "
                "Falling back to slow initialization..."
            )
        kwargs["_fast_init"] = False

        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

    @classmethod
    def from_encoder_decoder_pretrained(
        cls,
        encoder_pretrained_model_name_or_path: str = None,
        decoder_pretrained_model_name_or_path: str = None,
        *model_args,
        **kwargs,
    ) -> PreTrainedModel:
        r"""
        Build the composite model from a separately pretrained encoder and decoder.

        The encoder is loaded with `AutoModel.from_pretrained` and the decoder with
        `AutoModelForCausalLM.from_pretrained`, unless ready-made models are supplied through the
        `encoder_model` / `decoder_model` keyword arguments.

        Params:
            encoder_pretrained_model_name_or_path (`str`, *optional*):
                Identifies the image encoder to load. Either:

                    - the *model id* of a pretrained model hosted on huggingface.co, e.g.
                      `google/vit-base-patch16-224-in21k`;
                    - a path to a *directory* written by [`~PreTrainedModel.save_pretrained`], e.g.
                      `./my_model_directory/`;
                    - a path or url to a *tensorflow index checkpoint file* (e.g. `./tf_model/model.ckpt.index`),
                      in which case `from_tf` should be `True` and a configuration object should be given as
                      the `config` argument.

            decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                Identifies the text decoder to load; accepts the same three forms as the encoder argument.

            model_args (remaining positional arguments, *optional*):
                Passed on when loading the encoder.

            kwargs (remaining dictionary of keyword arguments, *optional*):
                Used to update the configuration objects and to initiate the models (e.g.
                `output_attentions=True`).

                - Keys prefixed with *encoder_* are routed (prefix removed) to the encoder.
                - Keys prefixed with *decoder_* are routed (prefix removed) to the decoder.
                - Keys without a prefix update the parent model configuration.

        Raises:
            ValueError: If a sub-model is neither passed as `encoder_model` / `decoder_model` nor
                identified by its `*_pretrained_model_name_or_path` argument.

        Example:

        ```python
        >>> from transformers import VisionEncoderDecoderModel

        >>> # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
        >>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
        ...     "google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased"
        ... )
        >>> # saving model after fine-tuning
        >>> model.save_pretrained("./vit-bert")
        >>> # load fine-tuned model
        >>> model = VisionEncoderDecoderModel.from_pretrained("./vit-bert")
        ```"""

        # Move prefixed keyword arguments out of `kwargs` into per-component dictionaries (prefix stripped);
        # whatever is left afterwards belongs to the parent configuration.
        encoder_prefix = "encoder_"
        decoder_prefix = "decoder_"
        kwargs_encoder = {}
        kwargs_decoder = {}
        for argument in list(kwargs):
            if argument.startswith(encoder_prefix):
                kwargs_encoder[argument[len(encoder_prefix) :]] = kwargs.pop(argument)
            elif argument.startswith(decoder_prefix):
                kwargs_decoder[argument[len(decoder_prefix) :]] = kwargs.pop(argument)

        # Load and initialize the encoder and decoder
        # The distinction between encoder and decoder at the model level is made
        # by the value of the flag `is_decoder` that we need to set correctly.
        encoder = kwargs_encoder.pop("model", None)
        if encoder is None:
            if encoder_pretrained_model_name_or_path is None:
                raise ValueError(
                    "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
                    "to be defined."
                )

            if "config" not in kwargs_encoder:
                encoder_config, kwargs_encoder = AutoConfig.from_pretrained(
                    encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
                )

                configured_as_decoder = (
                    encoder_config.is_decoder is True or encoder_config.add_cross_attention is True
                )
                if configured_as_decoder:
                    logger.info(
                        f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
                        "from a decoder model. Cross-attention and casual mask are disabled."
                    )
                    encoder_config.is_decoder = False
                    encoder_config.add_cross_attention = False

                kwargs_encoder["config"] = encoder_config

            encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)

        decoder = kwargs_decoder.pop("model", None)
        if decoder is None:
            if decoder_pretrained_model_name_or_path is None:
                raise ValueError(
                    "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
                    "to be defined."
                )

            if "config" not in kwargs_decoder:
                decoder_config, kwargs_decoder = AutoConfig.from_pretrained(
                    decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
                )

                lacks_decoder_flags = (
                    decoder_config.is_decoder is False or decoder_config.add_cross_attention is False
                )
                if lacks_decoder_flags:
                    logger.info(
                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
                        f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
                        f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
                    )
                    decoder_config.is_decoder = True
                    decoder_config.add_cross_attention = True

                kwargs_decoder["config"] = decoder_config

            # A caller-supplied decoder config is left untouched; only warn when it is not set up as a decoder.
            effective_decoder_config = kwargs_decoder["config"]
            if effective_decoder_config.is_decoder is False or effective_decoder_config.add_cross_attention is False:
                logger.warning(
                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
                    f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
                    "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
                    "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
                    "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
                )

            decoder = AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)

        # instantiate config with corresponding kwargs
        config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)

        # make sure input & output embeddings is not tied
        config.tie_word_embeddings = False
        return cls(encoder=encoder, decoder=decoder, config=config)

    @add_start_docstrings_to_model_forward(VISION_ENCODER_DECODER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        patch_objects = None,
        full_weights=None,
        full_adj=None,
        sem_adj=None,
        vis_feats=None,
        rtexts_feats=None,
        vis_pos=None,
        patch_lens=None,
        graph_mask=None,
        **kwargs,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoProcessor, VisionEncoderDecoderModel
        >>> import requests
        >>> from PIL import Image
        >>> import torch

        >>> processor = AutoProcessor.from_pretrained("microsoft/trocr-base-handwritten")
        >>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

        >>> # load image from the IAM dataset
        >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

        >>> # training
        >>> model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
        >>> model.config.pad_token_id = processor.tokenizer.pad_token_id
        >>> model.config.vocab_size = model.config.decoder.vocab_size

        >>> pixel_values = processor(image, return_tensors="pt").pixel_values
        >>> text = "hello world"
        >>> labels = processor.tokenizer(text, return_tensors="pt").input_ids
        >>> outputs = model(pixel_values=pixel_values, labels=labels)
        >>> loss = outputs.loss

        >>> # inference (generation)
        >>> generated_ids = model.generate(pixel_values)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}

        kwargs_decoder = {
            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
        }

        if encoder_outputs is None:
            if pixel_values is None:
                raise ValueError("You have to specify pixel_values")

            encoder_outputs = self.encoder(
                pixel_values,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                **kwargs_encoder,
            )
        elif isinstance(encoder_outputs, tuple):
            encoder_outputs = BaseModelOutput(*encoder_outputs)

        encoder_hidden_states = encoder_outputs[0]

        # optionally project encoder_hidden_states
        if (
            self.encoder.config.hidden_size != self.decoder.config.hidden_size
            and self.decoder.config.cross_attention_hidden_size is None
        ):
            encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)

        obj_features = torch.full((vis_feats.shape[0], vis_feats.shape[1], 1024), float('-inf')).to(device)
        for i, patch in enumerate(patch_objects):
            for j, obj in enumerate(patch):
                new_obj = [ob for ob in obj if ob < 900]
                obj_features[i][j] = torch.mean(encoder_hidden_states[i][new_obj],dim=0)

        obj_features = torch.where(obj_features == float('-inf'), torch.tensor(0.0), obj_features)
        gf_feats = self.gf(full_adj, sem_adj, full_weights, None, obj_features, self.proj(rtexts_feats), full_adj.shape[0],graph_mask)
        gf_feats_input = torch.zeros_like(encoder_hidden_states)

        for b, objects in enumerate(patch_objects):
            for o, o_idx in enumerate(objects):
                new_o_idx = [od for od in o_idx if od < 900]
                if len(new_o_idx) == 0:
                    continue
                gf_feats_input[b][new_o_idx] += gf_feats[b][o]

        for b, plens in enumerate(patch_lens):
          if len(plens) == 0:
              continue
          for l, pidx in plens.items():
              gf_feats_input[b][pidx] = gf_feats_input[b][pidx]/int(l)

        encoder_hidden_states += gf_feats_input

        # else:
        encoder_attention_mask = None

        if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
            decoder_input_ids = shift_tokens_right(
                labels, self.config.pad_token_id, self.config.decoder_start_token_id
            )

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            past_key_values=past_key_values,
            return_dict=return_dict,
            **kwargs_decoder,
        )

        # Compute loss independent from decoder (as some shift the logits inside them)
        loss = None
        if labels is not None:
            logits = decoder_outputs.logits if return_dict else decoder_outputs[0]
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.reshape(-1, self.decoder.config.vocab_size), labels.reshape(-1))

        if not return_dict:
            if loss is not None:
                return (loss,) + decoder_outputs + encoder_outputs
            else:
                return decoder_outputs + encoder_outputs

        return Seq2SeqLMOutput(
            loss=loss,
            logits=decoder_outputs.logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        """Derive decoder input ids from ``labels`` via ``shift_tokens_right``.

        The pad token id and the decoder start token id are both read from
        ``self.config`` and forwarded unchanged.
        """
        pad_id = self.config.pad_token_id
        start_id = self.config.decoder_start_token_id
        return shift_tokens_right(labels, pad_id, start_id)

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        use_cache=None,
        encoder_outputs=None,
        patch_objects=None,
        full_weights=None,
        full_adj=None,
        sem_adj=None,
        vis_feats=None,
        rtexts_feats=None,
        vis_pos=None,
        patch_lens=None,
        graph_mask=None,
        **kwargs
    ):
        """Assemble the keyword arguments for one generation step.

        The wrapped decoder prepares its own inputs from ``input_ids`` and
        ``past_key_values``; those are merged with the encoder-side arguments
        and with the graph/patch arguments, which are passed through untouched.
        Extra ``kwargs`` are accepted and ignored.

        Returns:
            dict with the decoder inputs, ``encoder_outputs``, ``use_cache`` and
            every graph/patch argument under its own name.
        """
        prepared = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values)

        # The decoder may or may not report an attention mask of its own.
        if "attention_mask" in prepared:
            dec_mask = prepared["attention_mask"]
        else:
            dec_mask = None

        step_inputs = dict(
            attention_mask=attention_mask,
            decoder_attention_mask=dec_mask,
            decoder_input_ids=prepared["input_ids"],
            encoder_outputs=encoder_outputs,
            past_key_values=prepared["past_key_values"],
            use_cache=use_cache,
        )
        graph_inputs = dict(
            patch_objects=patch_objects,
            full_weights=full_weights,
            full_adj=full_adj,
            sem_adj=sem_adj,
            vis_feats=vis_feats,
            rtexts_feats=rtexts_feats,
            vis_pos=vis_pos,
            patch_lens=patch_lens,
            graph_mask=graph_mask,
        )
        return {**step_inputs, **graph_inputs}

    def resize_token_embeddings(self, *args, **kwargs):
        """Refuse to resize embeddings through this wrapper.

        Raises:
            NotImplementedError: always; the message points callers at the
                wrapped decoder's own ``resize_token_embeddings``.
        """
        message = (
            "Resizing the embedding layers via the VisionEncoderDecoderModel directly is not supported.Please use the"
            " respective methods of the wrapped decoder object (model.decoder.resize_token_embeddings(...))"
        )
        raise NotImplementedError(message)

    def _reorder_cache(self, past_key_values, beam_idx):
        # apply decoder cache reordering here
        return self.decoder._reorder_cache(past_key_values, beam_idx)

## Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image


class ChartQADataset(Dataset):
    """Map-style dataset that assembles one chart question/answer sample per row.

    Every row of ``df`` supplies file paths (chart image, patch-object JSON,
    patch-length JSON and several pickles) together with the question and the
    answer text. ``__getitem__`` loads those files and returns a dict of
    tensors and plain Python objects.
    """

    def __init__(self, df, processor, split):
        """Store the sample table, the processor and the split name.

        Args:
            df: table accessed positionally through ``.iloc``; the columns read
                are listed in ``__getitem__``.
            processor: callable on an image and exposing a ``tokenizer``.
            split: split name; ``random_padding`` is enabled only for ``"train"``.
        """
        self.df = df
        self.processor = processor
        self.split = split
        # Label value written over pad and prompt positions.
        self.ignore_id = -100
        self.max_length = 512
        self.prompt_end_token_id = self.processor.tokenizer.convert_tokens_to_ids('<s_answer>')

    def __len__(self):
        """Return the number of rows in ``df``."""
        return len(self.df)

    @staticmethod
    def _load_json(path):
        """Read a JSON file and close the handle before returning."""
        with open(path, 'r') as handle:
            return json.load(handle)

    @staticmethod
    def _load_pickle(path):
        """Read a pickle file and close the handle before returning."""
        # NOTE(review): unpickling runs arbitrary code; only point this at files
        # from a trusted source.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def __getitem__(self, idx):
        """Load and assemble the sample at positional index ``idx``.

        Reads the columns ``img_path``, ``patch_objects``, ``question_ids``,
        ``feature``, ``graph_base``, ``patch_lens``, ``graph_sem``, ``text``,
        ``queries`` and ``answers`` of the selected row.

        Returns:
            dict with keys ``pixel_values``, ``input_ids``, ``patch_object``,
            ``graph_mask``, ``answers``, ``label_ids``, ``question_id``,
            ``rtexts_feats``, ``vis_feats``, ``full_weights``, ``full_adj``,
            ``sem_adj``, ``bboxes``, ``prompt_end_index`` and ``patch_lens``.
        """
        item = self.df.iloc[idx]

        # convert() returns a new, fully loaded image, so the file can be
        # closed as soon as the block exits.
        with Image.open(item['img_path']) as raw_image:
            image = raw_image.convert("RGB")

        patch_data = self._load_json(item['patch_objects'])
        patch_objects = [o['patch_idx'] for o in patch_data['patch_object']]
        question_id = item['question_ids']
        feats = self._load_pickle(item['feature'])
        base_data = self._load_pickle(item['graph_base'])
        full_weights = torch.tensor(base_data['full_weights'])
        full_adj = torch.tensor(base_data['normalized_adj'])
        patch_lens = self._load_json(item['patch_lens'])
        sem_data = self._load_pickle(item['graph_sem'])
        sem_adj = torch.tensor(sem_data['normalized_adj'])
        text_data = self._load_pickle(item['text'])

        # Label features come first, followed by the remaining text features.
        label_feats = torch.tensor(text_data['label_feature'], dtype=torch.float32)
        rtexts_feats = torch.tensor(text_data['text_feature'], dtype=torch.float32)
        text_feats = torch.cat((label_feats, rtexts_feats), dim=0)

        bboxes = torch.tensor(feats['bboxes'], dtype=torch.float32)
        vis_feats = torch.tensor(feats['visual_feats'])

        # One row per node of sem_adj; the first full_adj.shape[0] rows are 1,
        # the rest stay 0.
        graph_mask = torch.zeros(sem_adj.shape[0], 1024)
        graph_mask[:full_adj.shape[0]] = 1

        pixel_values = self.processor(image, random_padding=self.split == "train", return_tensors="pt").pixel_values
        input_tensor = pixel_values.squeeze()

        # Prompt + answer are tokenized as one sequence, padded/truncated to max_length.
        processed_parse = "<opencqa>" + " " + item['queries'] + " " + '<s_answer>' + " " + item['answers'] + self.processor.tokenizer.eos_token
        input_ids = self.processor.tokenizer(
            processed_parse,
            add_special_tokens=False,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )["input_ids"].squeeze(0)

        labels = input_ids.clone()
        # The model doesn't need to predict pad tokens.
        labels[labels == self.processor.tokenizer.pad_token_id] = self.ignore_id
        # The model doesn't need to predict the prompt. The slice end is the sum
        # of the indices where the prompt-end token occurs, plus one; that is
        # the token's position + 1 when it occurs exactly once.
        prompt_cut = torch.nonzero(labels == self.prompt_end_token_id).sum() + 1
        labels[:prompt_cut] = self.ignore_id
        # Same index sum, computed on the unmasked ids, returned to the caller.
        prompt_end_index = torch.nonzero(input_ids == self.prompt_end_token_id).sum()

        ret = {'pixel_values': input_tensor, 'input_ids': input_ids, 'patch_object': patch_objects, 'graph_mask': graph_mask,
               'answers': item['answers'], 'label_ids': labels,  'question_id': question_id, 'rtexts_feats': text_feats, 'vis_feats': vis_feats,
               'full_weights': full_weights, 'full_adj': full_adj, 'sem_adj': sem_adj, 'bboxes': bboxes, 'prompt_end_index': prompt_end_index, 'patch_lens': patch_lens}

        return ret

In [None]:
def collator(batch):
  """Merge a list of dataset samples into one batch dict.

  Graph-related tensors are padded with ``pad_zeros`` to the largest size found
  in the batch and concatenated along a new leading dimension; fixed-size
  tensors are concatenated the same way without padding; the remaining
  per-sample objects are collected into plain lists.

  Args:
    batch: non-empty list of dicts as produced by ``ChartQADataset.__getitem__``.

  Returns:
    dict with the tensor keys ``pixel_values``, ``input_ids``, ``label_ids``,
    ``graph_mask``, ``full_weights``, ``full_adj``, ``sem_adj``,
    ``rtexts_feats``, ``bboxes``, ``vis_feats``, ``prompt_end_index`` and the
    list keys ``patch_objects``, ``answers``, ``question_ids``, ``patch_lens``.
  """
  new_batch = {'pixel_values': None, 'input_ids': None, 'patch_objects': [], 'answers': [], 'label_ids': None, 'question_ids': [], 'graph_mask': None,
               'full_weights': None, 'full_adj': None, 'sem_adj': None, 'rtexts_feats': None, 'bboxes': None, 'vis_feats': None, 'prompt_end_index': None,
               'patch_lens': []}

  max_objs = max(b['full_weights'].shape[0] for b in batch)
  max_texts = max(b['rtexts_feats'].shape[0] for b in batch)
  max_mask = max(b['graph_mask'].shape[0] for b in batch)
  max_sem = max(b['sem_adj'].shape[0] for b in batch)

  # key -> shape every sample is padded to before concatenation
  padded_shapes = {
    'graph_mask': (max_mask, 1024),
    'full_weights': (max_objs, max_objs),
    'full_adj': (max_objs, max_objs),
    'sem_adj': (max_sem, max_sem),
    'rtexts_feats': (max_texts, 768),
    'bboxes': (max_objs, 4),
    'vis_feats': (max_objs, 2048),
  }
  # keys concatenated as-is (no padding applied here)
  unpadded_keys = ('pixel_values', 'input_ids', 'label_ids', 'prompt_end_index')

  # Collect the per-sample rows first and concatenate once per key at the end,
  # instead of re-copying a growing tensor for every sample.
  rows = {key: [] for key in (*padded_shapes, *unpadded_keys)}
  for item in batch:
    for key, shape in padded_shapes.items():
      rows[key].append(pad_zeros(item[key], shape).unsqueeze(0))
    for key in unpadded_keys:
      rows[key].append(item[key].unsqueeze(0))

    # The dataset emits 'patch_object' / 'question_id' (singular); the batch
    # exposes them under plural keys.
    new_batch["patch_objects"].append(item["patch_object"])
    new_batch["answers"].append(item["answers"])
    new_batch['question_ids'].append(item['question_id'])
    new_batch["patch_lens"].append(item["patch_lens"])

  for key, tensors in rows.items():
    new_batch[key] = torch.cat(tensors)

  return new_batch

In [None]:
import os
import json

In [None]:
def create_df(split, root='/content'):
  """Build the per-sample table of file paths, questions and answers for a split.

  Reads ``{root}/OpenCQA/etc/data/{split}.json``, a mapping from sample id to a
  sequence in which element ``3`` is the question and element ``-2`` is the
  answer, and derives every per-sample file path from the sample id.

  Args:
    split: name of the split file to read (without the ``.json`` suffix).
    root: directory that contains the dataset folders; the default keeps the
      original hard-coded ``/content`` layout.

  Returns:
    pandas DataFrame with the columns ``img_path``, ``queries``, ``answers``,
    ``question_ids``, ``patch_objects``, ``graph_base``, ``graph_sem``,
    ``feature``, ``text`` and ``patch_lens``; one row per sample, in file order.
  """
  with open(f'{root}/OpenCQA/etc/data/{split}.json') as handle:
    pairs = json.load(handle)

  uids = list(pairs)
  return pd.DataFrame({
      'img_path': [f'{root}/OpenCQA/chart_images/{uid}.png' for uid in uids],
      'queries': [pairs[uid][3] for uid in uids],
      # Answers are always stored as strings, whatever their JSON type.
      'answers': [str(pairs[uid][-2]) for uid in uids],
      'question_ids': uids,
      'patch_objects': [f'{root}/patch_object_pred/{uid}.json' for uid in uids],
      'graph_base': [f'{root}/OpenCQA_Graph/{uid}.pkl' for uid in uids],
      'graph_sem': [f'{root}/OpenCQA_Graph_6_rels/{uid}.pkl' for uid in uids],
      'feature': [f'{root}/mask-rcnn-predict_pkl/{uid}.pkl' for uid in uids],
      'text': [f'{root}/text_feature_bert/{uid}.pkl' for uid in uids],
      'patch_lens': [f'{root}/patch_len_pred/{uid}.json' for uid in uids],
  })

## Evaluator

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from sacrebleu.metrics import BLEU, CHRF, TER
from sacremoses import MosesPunctNormalizer, MosesTokenizer, MosesDetokenizer
import pandas as pd

In [None]:
import csv
import json
from statistics import mean, stdev
import sys
import re

In [None]:
class VQAEvaluator:
    """Scores generated answers against the reference answers of a DataFrame.

    The normalisation tables and helpers follow the reference VQA evaluation
    script:
    https://github.com/GT-Vision-Lab/VQA/blob/master/PythonEvaluationTools/vqaEvaluation/vqaEval.py
    """

    def __init__(self, df):
        """Collect the reference answers and build the normalisation tables.

        Args:
            df: DataFrame with an ``answers`` column; the values are kept in
                row order in ``self.qidtoans``.
        """
        # Reference answers, in row order.
        self.qidtoans = [row['answers'] for _, row in df.iterrows()]

        # Contraction table copied from the reference VQA evaluation script.
        self.contractions = {
            "aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't",
            "couldn'tve": "couldn't've", "couldnt've": "couldn't've", "didnt": "didn't", "doesnt": "doesn't", "dont": "don't", "hadnt": "hadn't",
            "hadnt've": "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't", "havent": "haven't", "hed": "he'd", "hed've": "he'd've",
            "he'dve": "he'd've", "hes": "he's", "howd": "how'd", "howll": "how'll", "hows": "how's", "Id've": "I'd've", "I'dve": "I'd've",
            "Im": "I'm", "Ive": "I've", "isnt": "isn't", "itd": "it'd", "itd've": "it'd've", "it'dve": "it'd've", "itll": "it'll", "let's": "let's",
            "maam": "ma'am", "mightnt": "mightn't", "mightnt've": "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've",
            "mustnt": "mustn't", "mustve": "must've", "neednt": "needn't", "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't",
            "ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat": "'ow's'at", "shant": "shan't", "shed've": "she'd've", "she'dve": "she'd've",
            "she's": "she's", "shouldve": "should've", "shouldnt": "shouldn't", "shouldnt've": "shouldn't've", "shouldn'tve": "shouldn't've",
            "somebody'd": "somebodyd", "somebodyd've": "somebody'd've", "somebody'dve": "somebody'd've", "somebodyll": "somebody'll",
            "somebodys": "somebody's", "someoned": "someone'd", "someoned've": "someone'd've", "someone'dve": "someone'd've",
            "someonell": "someone'll", "someones": "someone's", "somethingd": "something'd", "somethingd've": "something'd've",
            "something'dve": "something'd've", "somethingll": "something'll", "thats": "that's", "thered": "there'd", "thered've": "there'd've",
            "there'dve": "there'd've", "therere": "there're", "theres": "there's", "theyd": "they'd", "theyd've": "they'd've",
            "they'dve": "they'd've", "theyll": "they'll", "theyre": "they're", "theyve": "they've", "twas": "'twas", "wasnt": "wasn't",
            "wed've": "we'd've", "we'dve": "we'd've", "weve": "we've", "werent": "weren't", "whatll": "what'll", "whatre": "what're",
            "whats": "what's", "whatve": "what've", "whens": "when's", "whered": "where'd", "wheres": "where's", "whereve": "where've",
            "whod": "who'd", "whod've": "who'd've", "who'dve": "who'd've", "wholl": "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll",
            "whyre": "why're", "whys": "why's", "wont": "won't", "wouldve": "would've", "wouldnt": "wouldn't", "wouldnt've": "wouldn't've",
            "wouldn'tve": "wouldn't've", "yall": "y'all", "yall'll": "y'all'll", "y'allll": "y'all'll", "yall'd've": "y'all'd've",
            "y'alld've": "y'all'd've", "y'all'dve": "y'all'd've", "youd": "you'd", "youd've": "you'd've", "you'dve": "you'd've",
            "youll": "you'll", "youre": "you're", "youve": "you've",
        }

        # Number words replaced by digits in processDigitArticle.
        self.manualMap = {
            'none': '0',
            'zero': '0',
            'one': '1',
            'two': '2',
            'three': '3',
            'four': '4',
            'five': '5',
            'six': '6',
            'seven': '7',
            'eight': '8',
            'nine': '9',
            'ten': '10',
        }

        # Words dropped by processDigitArticle.
        self.articles = ['a', 'an', 'the']

        # NOTE(review): "(?!<=\d)" is a negative look-ahead for the literal
        # text "<=" followed by a digit, not a look-behind; kept exactly as in
        # the reference script so results stay comparable - confirm intent.
        self.periodStrip = re.compile(r"(?!<=\d)(\.)(?!\d)")
        self.commaStrip = re.compile(r"(\d)(\,)(\d)")
        self.punct = [';', r"/", '[', ']', '"', '{', '}',
                      '(', ')', '=', '+', '\\', '_', '-',
                      '>', '<', '@', '`', ',', '?', '!']

        # Number of decimals used by the setEval* helpers.
        self.n = 2

    def dump_result(self, quesid2ans: dict, path):
        """
        Dump results to a json file, which could be submitted to the VQA online evaluation.
        VQA json file submission requirement:
            results = [result]
            result = {
                "question_id": int,
                "answer": str
            }
        :param quesid2ans: dict of quesid --> ans
        :param path: The desired path of saved file.
        """
        result = [
            {'question_id': ques_id, 'answer': ans}
            for ques_id, ans in quesid2ans.items()
        ]
        with open(path, 'w') as f:
            json.dump(result, f, indent=4, sort_keys=True)

    def evaluate_raw(self, quesid2ans: dict, is_topk_optimal=None, criteria='bleu'):
        """Score the predictions in ``quesid2ans`` with the chosen criterion.

        Predictions are taken in dict order and compared position by position
        with the references, so ``quesid2ans`` must be ordered like the
        DataFrame given to the constructor.

        Args:
            quesid2ans: mapping of question id to generated answer text.
            is_topk_optimal: accepted for interface compatibility; not used.
            criteria: ``'bleu'`` for corpus-level sacreBLEU against
                ``self.qidtoans``; ``'cs'`` for the content-selection ratio
                computed from ``/content/testData.txt``,
                ``/content/testTitles.txt`` and ``/content/targetAnswers.txt``.

        Returns:
            ``(self.accuracy, detokenized_predictions)`` for ``'bleu'``,
            ``(self.accuracy, None)`` for ``'cs'``, and ``None`` for any other
            ``criteria`` value. ``self.accuracy['overall']`` holds the score.
        """
        self.accuracy = {}
        self.evalQA = {}
        self.evalQuesType = {}
        self.evalAnsType = {}

        if criteria == 'bleu':
            mpn = MosesPunctNormalizer()
            mt = MosesTokenizer(lang="en")
            md = MosesDetokenizer(lang="en")

            def detokenize(sent):
                # Normalise punctuation, then tokenize/detokenize so both
                # sides share one surface form.
                return md.detokenize(mt.tokenize(mpn.normalize(sent)))

            model_output_summary = [
                self.normalize_answer(resAns)
                for resAns in tqdm(quesid2ans.values(), total=len(quesid2ans), ncols=80)
            ]
            # Use this instance's references rather than a global evaluator.
            test_summary = list(self.qidtoans)

            model_output_summary = list(map(detokenize, model_output_summary))
            test_summary = list(map(detokenize, test_summary))

            bleu = BLEU()
            bleuscore = bleu.corpus_score(model_output_summary, [test_summary]).score

            self.setAccuracy(bleuscore)

            return self.accuracy, model_output_summary
        elif criteria == 'cs':
            fillers = ['in', 'the', 'and', 'or', 'an', 'as', 'can', 'be', 'a', ':', '-',
                       'to', 'but', 'is', 'of', 'it', 'on', '.', 'at', '(', ')', ',', ';']

            gen_file = [
                self.normalize_answer(resAns)
                for resAns in tqdm(quesid2ans.values(), total=len(quesid2ans), ncols=80)
            ]

            generatedScores = []
            with open('/content/testData.txt', 'r', encoding='utf-8') as dataFile, open('/content/testTitles.txt', 'r', encoding='utf-8') as titleFile, \
                    open('/content/targetAnswers.txt', 'r', encoding='utf-8') as goldFile:
                for count, (datas, titles, gold) in enumerate(zip(dataFile, titleFile, goldFile)):
                    # Text in which a gold token must appear to count as a record.
                    haystack = " ".join([datas.replace("_", " "), titles]).lower()

                    # Distinct, non-filler gold tokens found in the data/title text.
                    recordList = []
                    for gld in gold.split():
                        gold_token = gld.lower()
                        if gold_token in haystack and gold_token not in fillers and gold_token not in recordList:
                            recordList.append(gold_token)
                    recordLength = len(recordList)

                    # Each record can be matched once: it is removed when found.
                    matched = 0
                    for token in gen_file[count].split():
                        token = token.lower()
                        if token in recordList:
                            recordList.remove(token)
                            matched += 1

                    generatedScores.append(matched / recordLength if recordLength else 0)

            self.setAccuracy(mean(generatedScores)*100)
            return self.accuracy, None

    def normalize_answer(self, resAns):
        """Replace ``<pad>`` and ``</s>`` by spaces and strip surrounding whitespace.

        Punctuation and digit/article processing are not applied here.
        """
        resAns = resAns.replace('<pad>', ' ')
        resAns = resAns.replace('</s>', ' ')
        return resAns.strip()

    def processPunctuation(self, inText):
        """Remove or blank out punctuation marks, then strip periods.

        A mark from ``self.punct`` is deleted when the input has it next to a
        space or when the input contains a digit,digit comma; otherwise it is
        replaced by a space. Afterwards every period matched by
        ``self.periodStrip`` is removed.
        """
        # Both tests look at the untouched input, so evaluate the comma check once.
        has_digit_comma = self.commaStrip.search(inText) is not None
        outText = inText
        for p in self.punct:
            if (p + ' ' in inText or ' ' + p in inText) or has_digit_comma:
                outText = outText.replace(p, '')
            else:
                outText = outText.replace(p, ' ')
        # No third positional argument: on a compiled pattern it is `count`,
        # so passing re.UNICODE there silently capped the replacements at 32.
        return self.periodStrip.sub("", outText)

    def processDigitArticle(self, inText):
        """Lower-case, map number words to digits, drop articles, fix contractions."""
        outText = []
        for word in inText.lower().split():
            # .get() instead of .setdefault(): lookups must not grow manualMap.
            word = self.manualMap.get(word, word)
            if word not in self.articles:
                outText.append(word)
        outText = [self.contractions.get(word, word) for word in outText]
        return ' '.join(outText)

    def setEvalQA(self, quesId, acc):
        """Store ``acc`` as a percentage rounded to ``self.n`` decimals for ``quesId``."""
        self.evalQA[quesId] = round(100*acc, self.n)

    def setEvalQuesType(self, quesId, quesType, acc):
        """Store the rounded percentage for ``quesId`` under its question type."""
        if quesType not in self.evalQuesType:
            self.evalQuesType[quesType] = {}
        self.evalQuesType[quesType][quesId] = round(100*acc, self.n)

    def setEvalAnsType(self, quesId, ansType, acc):
        """Store the rounded percentage for ``quesId`` under its answer type."""
        if ansType not in self.evalAnsType:
            self.evalAnsType[ansType] = {}
        self.evalAnsType[ansType][quesId] = round(100*acc, self.n)

    def setAccuracy(self, bleuscore):
        """Record ``bleuscore`` as the overall score in ``self.accuracy``."""
        self.accuracy['overall'] = bleuscore

    def within_percent(self, predicted, golden, tolerance=0.05):
        """Return True when ``predicted`` lies within ``tolerance`` (relative) of ``golden``."""
        # abs() keeps the interval well-formed for negative golden values;
        # without it the bounds swap and nothing is ever accepted.
        margin = abs(golden) * tolerance
        return golden - margin <= predicted <= golden + margin

    def relaxed_correctness(self, target: str,
                            prediction: str,
                            max_relative_change: float = 0.05) -> bool:
        """Calculates relaxed correctness.

        The correctness tolerates certain error ratio defined by max_relative_change.
        See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
        "Following Methani et al. (2020), we use a relaxed accuracy measure for the
        numeric answers to allow a minor inaccuracy that may result from the automatic
        data extraction process. We consider an answer to be correct if it is within
        5% of the gold answer. For non-numeric answers, we still need an exact match
        to consider an answer to be correct."

        Args:
          target: Target string.
          prediction: Predicted string.
          max_relative_change: Maximum relative change.

        Returns:
          Whether the prediction was correct given the specified tolerance.
        """
        prediction_float = self._to_float(prediction)
        target_float = self._to_float(target)
        # A target of 0 (falsy) takes the string comparison, which also avoids
        # dividing by zero.
        if prediction_float is not None and target_float:
            relative_change = abs(prediction_float - target_float) / abs(target_float)
            return relative_change <= max_relative_change
        return prediction.lower() == target.lower()

    def _to_float(self, text: str):
        """Parse ``text`` as a float (``'12%'`` -> ``0.12``); return None if not numeric."""
        try:
            if text.endswith('%'):
                # Convert percentages to floats.
                return float(text.rstrip('%')) / 100.0
            return float(text)
        except ValueError:
            return None

# Training

In [None]:
import pandas as pd
import os
import json
from torch.optim.lr_scheduler import LambdaLR

In [None]:
def cosine_scheduler(optimizer, training_steps, warmup_steps):
    """Return a ``LambdaLR`` with linear warm-up followed by half-cosine decay.

    The multiplier rises linearly from 0 over ``warmup_steps`` steps, then
    follows ``0.5 * (1 + cos(pi * progress))`` where ``progress`` is the
    fraction of the remaining ``training_steps`` already done; it is never
    negative.
    """
    warmup_span = max(1, warmup_steps)
    decay_span = max(1, training_steps - warmup_steps)

    def lr_lambda(current_step):
        if current_step >= warmup_steps:
            fraction = (current_step - warmup_steps) / decay_span
            cosine_factor = 0.5 * (1.0 + math.cos(math.pi * fraction))
            return max(0.0, cosine_factor)
        return current_step / warmup_span

    return LambdaLR(optimizer, lr_lambda)

In [None]:
# Table of file paths, questions and answers for the 'train' split.
train_df = create_df('train')

In [None]:
from transformers import DonutProcessor
# Hub id of the pretrained checkpoint that training starts from.
model_name = "ahmed-masry/unichart-base-960"

# NOTE(review): VisionEncoderDecoderModel is presumably the customised class
# defined earlier in this notebook (the one taking graph inputs) - confirm.
model = VisionEncoderDecoderModel.from_pretrained(model_name)
processor = DonutProcessor.from_pretrained(model_name)

In [None]:
# split='train' turns on random_padding inside ChartQADataset.__getitem__.
train_dataset = ChartQADataset(train_df,processor, 'train')

In [None]:
# Shuffled batches of 8 samples, merged by the custom collator, loaded by 12 workers.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=collator, num_workers=12)

In [None]:
# Table of file paths, questions and answers for the 'val' split.
val_df = create_df('val')

In [None]:
# Any split other than 'train' disables random_padding in the dataset.
val_dataset = ChartQADataset(val_df,processor, 'val')

In [None]:
# Unshuffled, one sample per batch, so predictions come out in val_df row order.
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=1, collate_fn=collator, num_workers=12)

In [None]:
# Evaluator holding the reference answers of the validation split.
evaluator = VQAEvaluator(val_df)

In [None]:
def cosine_scheduler(optimizer, training_steps, warmup_steps):
    """Build a ``LambdaLR``: linear warm-up, then cosine decay clamped at zero.

    NOTE(review): an identical ``cosine_scheduler`` is defined in an earlier
    cell of this notebook; this later definition is the one in effect once the
    cell runs.
    """
    def lr_lambda(current_step):
        in_warmup = current_step < warmup_steps
        if in_warmup:
            return current_step / max(1, warmup_steps)
        steps_after_warmup = current_step - warmup_steps
        progress = steps_after_warmup / max(1, training_steps - warmup_steps)
        cosine_factor = 0.5 * (1.0 + math.cos(math.pi * progress))
        return max(0.0, cosine_factor)

    return LambdaLR(optimizer, lr_lambda)

In [None]:
# AdamW over every model parameter; 5e-5 is the base rate that the scheduler scales.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model.train()

In [None]:
# 100 warm-up steps, cosine decay reaching zero at step 80000.
scheduler = cosine_scheduler(optimizer, 80000, 100)

In [None]:
# Number of passes over train_dataloader made by the training loop.
EPOCHS = 20

In [None]:
# Move the model parameters to the selected device before training.
model.to(device)

In [None]:
# Mixed-precision training loop: one pass over train_dataloader per epoch,
# validation after every epoch, a "best" checkpoint whenever the validation
# score improves and a numbered checkpoint every second epoch.
max_acc = 0
save_folder = '/content/drive/MyDrive/PL-NL/QA/OpenCQA/models/models_unichart_concate_fc_2048_inside_norm_scheduler_mean_feats_mix_prec_after_encoder_dropout4'
create_folder_if_not_exists(save_folder)
save_losses = []  # mean training loss of every finished epoch
save_acc = []     # validation result of every finished epoch
scaler = torch.cuda.amp.GradScaler()
from tqdm.autonotebook import tqdm
prompt_end_token_id = processor.tokenizer.convert_tokens_to_ids('<s_answer>')
for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0
    with tqdm(range(len(train_dataloader))) as pbar:
        for bidx, batch in enumerate(train_dataloader):
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                pixel_values = batch.pop('pixel_values')
                patch_objects = batch.pop('patch_objects')
                graph_mask = batch.pop('graph_mask').to(device)
                input_ids = batch.pop('input_ids')
                labels = batch.pop('label_ids')
                ids = batch.pop('question_ids')
                full_weights = batch.pop('full_weights').to(device)
                full_adj = batch.pop('full_adj').to(device)
                sem_adj = batch.pop('sem_adj').to(device)
                rtexts_feats = batch.pop('rtexts_feats').to(device)
                bboxes = batch.pop('bboxes').to(device)
                vis_feats = batch.pop('vis_feats').to(device)
                patch_lens = batch.pop('patch_lens')

                # Teacher forcing: the decoder sees tokens [0, n-1) and is
                # scored against tokens [1, n).
                outputs = model(
                    pixel_values.to(device),
                    labels=labels[:, 1:].to(device),
                    decoder_input_ids=input_ids[:, :-1].to(device),
                    patch_objects=patch_objects,
                    full_weights=full_weights, full_adj=full_adj, sem_adj=sem_adj, vis_feats=vis_feats,
                    rtexts_feats=rtexts_feats, vis_pos=bboxes, patch_lens=patch_lens, graph_mask=graph_mask
                )

                loss = outputs.loss

            scaler.scale(loss).backward()
            # Gradients are unscaled before the step; no clipping is applied
            # between the two calls.
            scaler.unscale_(optimizer)
            scaler.step(optimizer)
            scaler.update()
            # The learning-rate schedule advances only after the optimizer
            # step, as PyTorch requires; stepping it first skips the first
            # value of the schedule.
            scheduler.step()
            optimizer.zero_grad()

            pbar.update(1)
            total_loss += loss.detach().item()
    total_loss = total_loss / len(train_dataloader)
    # Keep the loss history that the numbered checkpoints store under 'loss'.
    save_losses.append(total_loss)
    print(f'Epoch {epoch} Loss: {total_loss}')
    # Evaluation interval in epochs; 1 means "after every epoch".
    if epoch % 1 == 0:
        model.eval()
        acc, _ = evaluate(model, val_dataloader, evaluator, processor)
        save_acc.append(acc)
        print(f'Epoch {epoch} acc: {acc}')
        if acc['overall'] > max_acc:
            max_acc = acc['overall']
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'acc': acc['overall'],
                "scaler": scaler.state_dict(),
            }, f'{save_folder}/best.pt')

    if (epoch) % 2 == 0:
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            "scaler": scaler.state_dict(),
            'loss': save_losses,
            'acc': save_acc,
        }, f'{save_folder}/model_{epoch}.pt')

# Test

In [None]:
import pandas as pd
import os
import json

In [None]:
import torch

In [None]:
# NOTE(review): this folder name lacks the "_dropout4" suffix used by
# save_folder in the training cell - confirm it points at the intended run.
PATH = '/content/drive/MyDrive/PL-NL/QA/OpenCQA/models/models_unichart_concate_fc_2048_inside_norm_scheduler_mean_feats_mix_prec_after_encoder/best.pt'
# map_location='cpu' lets a checkpoint saved on a GPU be read on a CPU-only
# runtime; load_state_dict later copies the weights onto the model's device.
checkpoint = torch.load(PATH, map_location='cpu')

In [None]:
# Notebook display: epoch at which this checkpoint was written.
checkpoint['epoch']

In [None]:
# Notebook display: validation score stored with the checkpoint.
checkpoint['acc']

In [None]:
# Table of file paths, questions and answers for the 'test' split.
test_df = create_df('test')

In [None]:
from transformers import DonutProcessor
# NOTE(review): training started from "ahmed-masry/unichart-base-960"; this
# cell builds the model from a different hub id before the checkpoint weights
# are loaded over it - confirm both share the same architecture/config.
model_name = "ahmed-masry/unichart-chartqa-960"

model = VisionEncoderDecoderModel.from_pretrained(model_name)
processor = DonutProcessor.from_pretrained(model_name)

In [None]:
# split='test' keeps random_padding off in the dataset.
test_dataset = ChartQADataset(test_df, processor, 'test')

In [None]:
# Unshuffled, one sample per batch, so predictions come out in test_df row order.
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=1, collate_fn=collator, num_workers=12)

In [None]:
# Rebinds the global evaluator to the reference answers of the test split.
evaluator = VQAEvaluator(test_df)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Move the freshly built model to the selected device.
model.to(device)

In [None]:
# Overwrite the pretrained weights with the fine-tuned ones from the checkpoint.
model.load_state_dict(checkpoint['model_state_dict'])

In [None]:
# Switch to inference mode, then score the test split.
model.eval()
# evaluate() is defined elsewhere in this notebook; the training cell unpacks
# its result as a 2-tuple, so `acc` here holds that whole tuple.
acc = evaluate(model, test_dataloader, evaluator, processor)

In [None]:
# Notebook display: first element of evaluate()'s result (the training cell
# reads its 'overall' entry as the score).
acc[0]