In [None]:
import torch
from tqdm import tqdm
import pickle
from transformers import AutoModel,AutoTokenizer
from torch.utils.data import Dataset
import argparse
import json
import pandas as pd

# Feature Extractor Class
Define a textual encoder (e.g., "bert-base-uncased"), encode the input text, and extract the [CLS] token embedding as the final textual representation.

In [None]:
class feature_extractor(torch.nn.Module):
    """Wrap a pretrained transformer encoder and return one vector per input.

    The vector is the hidden state at sequence position 0 of the encoder's
    first output (the [CLS] position for BERT-style models).

    Args:
        model_name: Checkpoint name or path passed to
            ``AutoModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
    """

    def __init__(self, model_name='bert-base-uncased'):
        super().__init__()
        # NOTE: the attribute keeps its historical name ``bigbird`` so that
        # existing state_dict keys and external references stay valid, even
        # though the default checkpoint is BERT.
        self.bigbird = AutoModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return the hidden state at position 0.

        Args:
            input_ids: Token ids; a singleton dimension 1 is squeezed away
                (presumably (batch, 1, seq_len) as produced by the dataset
                plus default collation -- confirm against the caller).
            attention_mask: Mask tensor laid out like ``input_ids``.

        Returns:
            ``outputs[0][:, 0, :]``: the position-0 slice of the encoder's
            first output, one row per batch element.
        """
        # Drop the extra per-sample dimension (no-op if dim 1 is not size 1).
        input_ids = input_ids.squeeze(1)
        attention_mask = attention_mask.squeeze(1)
        outputs = self.bigbird(input_ids=input_ids, attention_mask=attention_mask)
        # Index 0 of the model output is the sequence of hidden states.
        hidden_states = outputs[0]
        return hidden_states[:, 0, :]

# Dataset Class
Tokenize the input text and return the corresponding *input_ids* and *attention_mask*.

In [None]:
class data_prepare(Dataset):
    """Torch dataset that tokenizes one text segment per item.

    Args:
        sentences: Indexable collection of strings to encode.
        tokenizer: Callable tokenizer (e.g. from ``AutoTokenizer``). It is
            stored on the instance so the dataset does not depend on a
            global variable of the same name.
        max_length: Length every sequence is padded / truncated to.
            Defaults to 512.
    """

    def __init__(self, sentences, tokenizer, max_length=512):
        self.text = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of text segments."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize the segment at ``index``.

        Returns:
            dict with ``'ids'`` and ``'mask'`` as ``torch.long`` tensors taken
            from the tokenizer's ``input_ids`` and ``attention_mask``. With
            ``return_tensors='pt'`` these presumably have shape
            (1, max_length) -- confirm for the tokenizer in use.
        """
        text = self.text[index]
        encoded_dict = self.tokenizer(
            text,
            max_length=self.max_length,
            # Explicit replacements for the deprecated pad_to_max_length flag
            # and for the implicit truncation that max_length used to trigger.
            padding='max_length',
            truncation=True,
            return_tensors='pt',  # Return pytorch tensors
        )
        # as_tensor avoids re-wrapping an existing tensor with torch.tensor(),
        # which emits a copy-construct warning.
        return {
            'ids': torch.as_tensor(encoded_dict['input_ids'], dtype=torch.long),
            'mask': torch.as_tensor(encoded_dict['attention_mask'], dtype=torch.long),
        }

# Main


In [None]:
# Command-line option: which FUNSD split ("train" or "test") to process.
parser = argparse.ArgumentParser()
parser.add_argument("--split", type=str, default="train", choices=["train", "test"])
# parse_known_args() ignores arguments it does not recognise. Inside a Jupyter
# kernel, sys.argv carries launcher arguments such as "-f <connection file>",
# which make parse_args() exit with an "unrecognized arguments" error.
args, _unknown_args = parser.parse_known_args()
split = args.split

Read the entity-annotated FUNSD data. It can be downloaded from: https://guillaumejaume.github.io/FUNSD/download/

In [None]:
# Load the annotation file of the selected split and flatten it into two
# parallel lists: one text string and one bounding box per "form" entry.
with open("data/FUNSD/dataset/"+split+"ing_data/all_annotations.json", 'r') as f:
    data = json.load(f)

sentences = []
boxes = []
for document in data.values():
    # Each document stores its annotated entries under the "form" key.
    for entity in document["form"]:
        sentences.append(entity["text"])
        boxes.append(entity["box"])

Apply the feature extractor

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = feature_extractor()
model.to(device)
model.eval()  # inference mode for layers such as dropout
# NOTE: this rebinds `data` from the raw annotation dict to the torch dataset.
data = data_prepare(sentences, tokenizer)
# shuffle=False keeps the feature rows in the same order as `sentences`.
loader = torch.utils.data.DataLoader(data, batch_size=32, shuffle=False)


features = []
with torch.no_grad():  # no gradients are needed for feature extraction
    for batch in tqdm(loader):
        input_ids = batch['ids'].to(device)
        attention_mask = batch['mask'].to(device)
        feature = model(input_ids, attention_mask)
        # Move each batch off the device immediately so the results do not
        # accumulate in GPU memory until the final concatenation.
        features.append(feature.cpu())

# Stack all batches along the sample dimension and convert to a NumPy array.
features = torch.cat(features, dim=0).numpy()

Create a pandas DataFrame containing the computed textual features

In [None]:
# Assemble the feature table in one step: one row per text segment, holding
# its running id, raw text, bounding box and feature vector (as a list).
df = pd.DataFrame(
    {
        "id": list(range(len(sentences))),
        "text": sentences,
        "boxes": boxes,
        "features": features.tolist(),
    }
)

Save the features as a pickle file

In [None]:
# Write the feature table into the directory of the selected split.
output_path = "data/FUNSD/dataset/"+split+"ing_data/textual_features.pickle"
with open(output_path, 'wb') as out_file:
    pickle.dump(df, out_file)