In [None]:
"""BROS"""

'BROS'

In [1]:
!pip install transformers torch datasets evaluate seqeval

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downlo

In [2]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency re

In [4]:
import torch
import torch.nn as nn
from transformers import BrosPreTrainedModel, BrosModel, AutoConfig, AutoTokenizer
from PIL import Image,ImageDraw, ImageFont
from datasets import load_dataset, load_from_disk
import pandas as pd
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!cp -r /content/drive/MyDrive/THESIS/rvl_cdip_financial_subset /content

In [7]:
class BrosForDocumentClassification(BrosPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bros = BrosModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        bbox=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        **kwargs
    ):
        outputs = self.bros(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Use the [CLS] token's representation (first token)
        cls_output = outputs.last_hidden_state[:, 0, :]  # shape: (batch_size, hidden_size)

        cls_output = self.dropout(cls_output)
        logits = self.classifier(cls_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return {
            "loss": loss,
            "logits": logits,
        }

In [18]:
config = AutoConfig.from_pretrained(
    "naver-clova-ocr/bros-base-uncased",
    num_labels=5,
    id2label={0: "form", 1: "invoice", 2: "budget", 3: "file folder", 4: "questionnaire"},
    label2id={"form": 0, "invoice": 1, "budget": 2, "file folder": 3, "questionnaire": 4}
)

model = BrosForDocumentClassification.from_pretrained(
    "naver-clova-ocr/bros-base-uncased",
    config=config
)

def freeze_bros_layers(model,
                       num_encoder_layers_to_freeze = 6,
                       freeze_embeddings= True):
    """
    Args
    ----
    model  : your BrosForDocumentClassification instance
    num_encoder_layers_to_freeze : freeze layers 0 … N-1 (default 6 of 12)
    freeze_embeddings            : also freeze word/2-D/pos embeddings
    """
    # 1) embeddings
    if freeze_embeddings:
        for p in model.bros.embeddings.parameters():
            p.requires_grad = False

    # 2) first N encoder layers
    for layer in model.bros.encoder.layer[:num_encoder_layers_to_freeze]:
        for p in layer.parameters():
            p.requires_grad = False

    # 3) report
    tot = sum(p.numel() for p in model.parameters())
    train = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f" Froze embeddings={freeze_embeddings}, "
          f"layers 0-{num_encoder_layers_to_freeze-1}. "
          f"Trainable params: {train/1e6:.1f} M / {tot/1e6:.1f} M")

freeze_bros_layers(model, num_encoder_layers_to_freeze=6, freeze_embeddings=True)
model.to(device)

tokenizer = AutoTokenizer.from_pretrained("naver-clova-ocr/bros-base-uncased",do_lower_case=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# def freeze_bros_layers(model, num_layers_to_freeze=6):
#     # Freeze embeddings
#     for param in model.bros.embeddings.parameters():
#         param.requires_grad = False

#     # Freeze first N encoder layers
#     for i in range(num_layers_to_freeze):
#         for param in model.bros.encoder.layer[i].parameters():
#             param.requires_grad = False

#     print(f"Froze embeddings and encoder layers 0 to {num_layers_to_freeze - 1}")

# freeze_bros_layers(model, num_layers_to_freeze=6)

model.to(device)

Some weights of BrosForDocumentClassification were not initialized from the model checkpoint at naver-clova-ocr/bros-base-uncased and are newly initialized: ['bros.bbox_embeddings.bbox_projection.weight', 'bros.bbox_embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq', 'bros.bbox_embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Froze embeddings=True, layers 0-5. Trainable params: 43.1 M / 109.5 M


BrosForDocumentClassification(
  (bros): BrosModel(
    (embeddings): BrosTextEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (bbox_embeddings): BrosBboxEmbeddings(
      (bbox_sinusoid_emb): BrosPositionalEmbedding2D(
        (x_pos_emb): BrosPositionalEmbedding1D()
        (y_pos_emb): BrosPositionalEmbedding1D()
      )
      (bbox_projection): Linear(in_features=192, out_features=64, bias=False)
    )
    (encoder): BrosEncoder(
      (layer): ModuleList(
        (0-11): 12 x BrosLayer(
          (attention): BrosAttention(
            (self): BrosSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): 

In [10]:
def normalize_bbox(bbox, width, height):
    return [
        int(1000 * (bbox[0] / width)),
        int(1000 * (bbox[1] / height)),
        int(1000 * (bbox[2] / width)),
        int(1000 * (bbox[3] / height)),
    ]
def encode(batch):
    encodings = tokenizer(
        batch["words"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )

    batch_aligned_bboxes = []

    for idx, (bboxes, image) in enumerate(zip(batch["bboxes"], batch["image"])):
        # Get image size for normalization
        if isinstance(image, Image.Image):
            width, height = image.size
        else:
            image = Image.open(image)
            width, height = image.size

        # Normalize bounding boxes
        normalized_bboxes = [normalize_bbox(bbox, width, height) for bbox in bboxes]

        # Align bboxes to subword tokens
        aligned_bboxes = []
        word_ids = encodings.word_ids(batch_index=idx)

        for word_id in word_ids:
            if word_id is None:
                aligned_bboxes.append([0, 0, 0, 0])
            else:
                aligned_bboxes.append(normalized_bboxes[word_id])

        batch_aligned_bboxes.append(aligned_bboxes)

    encodings["bbox"] = batch_aligned_bboxes
    encodings["labels"] = batch["label"]  # document-level label per example

    return encodings


In [11]:
rvl = load_from_disk("/content/rvl_cdip_financial_subset")
dataset_split = rvl.train_test_split(test_size=0.2, seed=42)

train = dataset_split["train"]
val = dataset_split["test"]
print(len(train))
print(len(val))
train_dataset = train.map(encode, batched=True, remove_columns=train.column_names)
val_dataset = val.map(encode, batched=True, remove_columns=val.column_names)

4000
1000


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [12]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [13]:
from transformers import EarlyStoppingCallback

early_stopping_callback = EarlyStoppingCallback(
       early_stopping_patience=2,
   )

In [14]:
# login to hugging face
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `adam` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `adam`


In [19]:
from transformers import TrainingArguments, Trainer, DefaultDataCollator
training_args = TrainingArguments(
    output_dir= "./bros-docclass-finetuned-frz",
    learning_rate= 3e-5,
    weight_decay   = 0.01,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 16,
    gradient_accumulation_steps= 2,       # 8 × 2 = 16 eff. batch
    num_train_epochs= 7,
    lr_scheduler_type= "cosine",
    warmup_ratio= 0.06,
    fp16= True,
    eval_strategy= "steps",
    eval_steps= 250,     # ≈ 2 evals per epoch
    save_strategy= "steps",
    save_steps= 250,
    load_best_model_at_end= True,
    metric_for_best_model= "eval_accuracy",
    save_total_limit= 3,
    # --- logging -------------------------------------------------------------
    logging_strategy = "steps",
    logging_steps= 50,
    logging_dir  = "./logs",
    # --- misc ---------------------------------------------------------------
    seed= 42,
    report_to= "none",
    push_to_hub= True,
)

data_collator = DefaultDataCollator(return_tensors="pt")
# Data collator
data_collator = DefaultDataCollator(return_tensors="pt")

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=DefaultDataCollator(),
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
250,1.2994,0.692569,0.78,0.791938,0.78,0.776501
500,1.1597,0.597155,0.804,0.820894,0.804,0.799304
750,0.9832,0.598086,0.814,0.837093,0.814,0.81202
1000,0.8739,0.519959,0.84,0.844884,0.84,0.839023
1250,0.7388,0.538987,0.835,0.84095,0.835,0.835497
1500,0.7552,0.524938,0.843,0.847294,0.843,0.842996
1750,0.6218,0.524164,0.843,0.846483,0.843,0.842399




TrainOutput(global_step=1750, training_loss=1.0034516438075474, metrics={'train_runtime': 545.9753, 'train_samples_per_second': 51.284, 'train_steps_per_second': 3.205, 'total_flos': 7368364953600000.0, 'train_loss': 1.0034516438075474, 'epoch': 7.0})

In [21]:
tokenizer.save_pretrained('adamadam111/bros-docclass-finetuned-frz')
tokenizer.push_to_hub('adamadam111/bros-docclass-finetuned-frz')


CommitInfo(commit_url='https://huggingface.co/adamadam111/bros-docclass-finetuned-frz/commit/218a2969c51b5f81c3da736839fa809e11b4a167', commit_message='Upload tokenizer', commit_description='', oid='218a2969c51b5f81c3da736839fa809e11b4a167', pr_url=None, repo_url=RepoUrl('https://huggingface.co/adamadam111/bros-docclass-finetuned-frz', endpoint='https://huggingface.co', repo_type='model', repo_id='adamadam111/bros-docclass-finetuned-frz'), pr_revision=None, pr_num=None)