Importing IMP libraries

In [2]:
#Basic Data Processing
import numpy as np
import pandas as pd
import os
from pathlib import Path

In [3]:
#Deep Learning & Model:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import LambdaLR

import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_only
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger

from transformers import (
    VisionEncoderDecoderConfig,
    DonutProcessor,
    VisionEncoderDecoderModel, 
    BartConfig,
    get_scheduler
)

from peft import LoraConfig, get_peft_model

In [4]:
#Image Processing
from PIL import Image
from PIL.Image import Resampling

In [5]:
#Dataset Management
from datasets import load_dataset, DatasetDict

In [6]:
#Asynchronous Operations
import asyncio
import aiohttp
from urllib.parse import urlparse

In [7]:
#Text Processing & Utils
import json
import re
from nltk import edit_distance
from collections import Counter
from typing import Any, List
from tqdm import tqdm

In [8]:
#Optimization & Monitoring
import wandb
import bitsandbytes as bnb

In [9]:
#Math & Scientific
import math

In [11]:
dataset = load_dataset("Vk333ML/Amazon_ml_challenge_flitered_dataset")

In [12]:
# Check if a file for a given image link exists in the image directory
def file_exists(row):
    filename = Path(row['image_link']).name  # Extract filename from the URL
    file_path = Path(IMAGE_PATH) / filename
    return file_path.exists()

In [13]:
IMAGE_PATH ='./image'

# Assume `dataset_dict` is your DatasetDict and `train` is your split
# Filter rows in the 'train' split based on file existence
filtered_train = dataset['train'].filter(file_exists)

# Create a new DatasetDict with the filtered train split
filtered_dataset_dict = DatasetDict({'train': filtered_train})

print("Filtered dataset:", filtered_dataset_dict)

Filtered dataset: DatasetDict({
    train: Dataset({
        features: ['image_link', 'group_id', 'entity_value', 'query', 'ground_truth'],
        num_rows: 18708
    })
})


In [14]:
# Extract the queries from the dataset
queries = filtered_dataset_dict["train"]["query"]

# Count occurrences of each unique query
query_counts = Counter(queries)

# Print the counts for each unique query
for query, count in query_counts.items():
    print(f"{query}: {count}")

What is the voltage?: 2346
What is the item_volume?: 2333
What is the width?: 2318
What is the height?: 2320
What is the depth?: 2363
What is the item_weight?: 2339
What is the wattage?: 2334
What is the maximum_weight_recommendation?: 2355


In [15]:
# Assuming `filtered_dataset_dict` is your filtered dataset
# Split the 'train' dataset into 80% training and 20% testing
train_test_split = filtered_dataset_dict['train'].train_test_split(test_size= 616, seed=42)

# Create a new DatasetDict with 'train' and 'test' splits
dataset_dict_new = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

print("Split dataset:", dataset_dict_new)
print("Number of rows in train split:", dataset_dict_new['train'].num_rows)
print("Number of rows in test split:", dataset_dict_new['test'].num_rows)

Split dataset: DatasetDict({
    train: Dataset({
        features: ['image_link', 'group_id', 'entity_value', 'query', 'ground_truth'],
        num_rows: 18092
    })
    test: Dataset({
        features: ['image_link', 'group_id', 'entity_value', 'query', 'ground_truth'],
        num_rows: 616
    })
})
Number of rows in train split: 18092
Number of rows in test split: 616


Setting up the config for fine tuning

In [16]:
max_length = 32
image_size = [1280, 960]

config = VisionEncoderDecoderConfig.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
config.encoder.image_size = image_size

config.decoder.max_length = max_length

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa", config=config ).to(device)

Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    1280,
    960
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "path_norm": true,
  "qkv_bias": true,
  "transformers_version": "4.47.1",
  "use_absolute_embeddings": false,
  "window_size": 10
}

Config of the decoder: <class 'transformers.models.mbart.modeling_mbart.MBartForCausalLM'> is overwritten by shared decoder config: MBartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "add_

I will fine tune the model using LORA -- only fine - tuning the attention weights of Decoder . Since the vision encoder was already showing good OCR capabilities in Zero - shot .

In [18]:
lora_config = LoraConfig(
    r=32,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=['q_proj' , 'v_proj'],
    bias="none"
)
lora_model = get_peft_model(model, lora_config)

In [19]:
def count_parameters(model):
    # Count total parameters
    total_params = sum(p.numel() for p in model.parameters())

    # Count only trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"Total parameters: {total_params}")
    print(f"Trainable parameters: {trainable_params}")
count_parameters(lora_model)

Total parameters: 201465976
Trainable parameters: 1048576


Adding the question and answer token in the tokenizer

In [23]:
added_token = []

In [21]:
def add_tokens( list_of_tokens):
        new_tokens = list_of_tokens
        tokens_to_add = [token for token in new_tokens if token not in processor.tokenizer.get_vocab()]

        if tokens_to_add:
            newly_added_num = processor.tokenizer.add_tokens(tokens_to_add)
            if newly_added_num > 0:
                model.decoder.resize_token_embeddings(len(processor.tokenizer))
                added_tokens.extend(tokens_to_add)

In [24]:
add_tokens(list_of_tokens = ["<question>" , "<answer>"])

In [25]:
def processed_parse(example):
    
    ex = json.loads(example['ground_truth'])
    ex1 = ex['gt_parse'][0]
    processed_parse = f"<question>{ex1['question']}<question><answer>{ex1['answer']}<answer>"
    return {'parse':processed_parse}

In [26]:
parsed_dict = dataset_dict_new.map(processed_parse )

In [27]:
import os
from pathlib import Path
from typing import Any, List
import torch
from torch.utils.data import Dataset
from PIL import Image
from tqdm import tqdm 
from PIL.Image import Resampling

class Donut_Dataset(Dataset):
    def __init__(self, dataset, max_length: int, split: str = 'train', ignore_id: int = -100, prompt_end_token: str = '<question>'
                  ):
        super().__init__()
        self.split = split
        self.dataset = dataset[split]
        self.max_length = max_length
        self.ignore_id = ignore_id
        self.prompt_end_token = prompt_end_token
        self.prompt_end_token_id = processor.tokenizer.convert_tokens_to_ids(self.prompt_end_token)


    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        filename = Path(sample['image_link']).name
        file_path = os.path.join(IMAGE_PATH, filename)
        img = Image.open(file_path).convert("RGB")
        pixel_values = processor(images = img, do_resize = True , do_pad = True, do_normalize = True , resample = Resampling.LANCZOS , rescale_factor = 1/255 , return_tensors="pt" ).pixel_values
        input_tensor = pixel_values.squeeze()
        processed_parse = processor.tokenizer.bos_token + sample['parse'] + processor.tokenizer.eos_token
        input_ids = processor.tokenizer(
            processed_parse,
            add_special_tokens=False,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )["input_ids"].squeeze(0)

        if self.split == "train":
            labels = input_ids.clone()
            labels[labels == processor.tokenizer.pad_token_id] = self.ignore_id
            labels[:torch.max(torch.nonzero(labels == self.prompt_end_token_id)) + 1] = self.ignore_id
            return input_tensor, input_ids, labels
        else:
            prompt_end_index = torch.max(torch.nonzero(input_ids == self.prompt_end_token_id))
            return input_tensor, input_ids, prompt_end_index, processed_parse

In [28]:
processor.feature_extractor.size = image_size[::-1] 
processor.feature_extractor.do_align_long_axis = False



In [29]:
train_dataset = Donut_Dataset(parsed_dict, max_length=max_length,
                             split="train"
                             )

val_dataset = Donut_Dataset(parsed_dict, max_length=max_length,
                             split="test"
                             )

In [30]:
pixel_values , decoder_input , labels = train_dataset[290]

Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. In the new behavior, if both images and text are provided, the default value of `add_special_tokens` will be changed to `False` when calling the tokenizer if `add_special_tokens` is unset. To test the new behavior, set `legacy=False`as a processor call argument.


Loss is only computed against the answers generated

In [31]:
for decoder_input_id, label in zip(decoder_input.tolist()[:-1], labels.tolist()[1:]):
    if label != -100:
        print(processor.decode([decoder_input_id]), processor.decode([label]))
    else:
        print(processor.decode([decoder_input_id]), label)

<s> -100
<question> -100
What -100
is -100
the -100
maximum -100
_ -100
we -100
ight -100
_ -100
re -100
com -100
mend -100
ation -100
? -100
<question> <answer>
<answer> 150
150 kilogram
kilogram <answer>
<answer> </s>
</s> -100
<pad> -100
<pad> -100
<pad> -100
<pad> -100
<pad> -100
<pad> -100
<pad> -100
<pad> -100
<pad> -100
<pad> -100


In [32]:
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=0 , pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=0 , pin_memory=True)

In [33]:
class DonutModelPLModule(pl.LightningModule):
    def __init__(self, config, processor, model):
        super().__init__()
        self.config = config
        self.processor = processor
        self.model = model

    def training_step(self, batch, batch_idx):
        pixel_values, decoder_input_ids, labels = batch

        outputs = self.model(pixel_values,
                             decoder_input_ids=decoder_input_ids[:, :-1],
                             labels=labels[:, 1:])
        loss = outputs.loss
        self.log_dict({"train_loss": loss}, sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx, dataset_idx=0):
        pixel_values, decoder_input_ids, prompt_end_idxs, answers = batch
        decoder_prompts = pad_sequence(
            [input_id[: end_idx + 1] for input_id, end_idx in zip(decoder_input_ids, prompt_end_idxs)],
            batch_first=True,
        )

        outputs = self.model.generate(pixel_values,
                                   decoder_input_ids=decoder_prompts,
                                   max_length=max_length,
                                   early_stopping=True,
                                   pad_token_id=self.processor.tokenizer.pad_token_id,
                                   eos_token_id=self.processor.tokenizer.eos_token_id,
                                   use_cache=True,
                                   num_beams=1,
                                   bad_words_ids=[[self.processor.tokenizer.unk_token_id]],
                                   return_dict_in_generate=True,)

        predictions = []
        for seq in self.processor.tokenizer.batch_decode(outputs.sequences):
            seq = seq.replace(self.processor.tokenizer.eos_token, "").replace(self.processor.tokenizer.pad_token, "")
            seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
            predictions.append(seq)

        scores = list()
        for pred, answer in zip(predictions, answers):
            pred = re.sub(r"(?<=<) | (?=>)", "", pred, count=1)
            answer = answer.replace(self.processor.tokenizer.eos_token, "")
            scores.append(edit_distance(pred, answer) / max(len(pred), len(answer)))

            if self.config.get("verbose", False) and len(scores) == 1:
                print(f"Prediction: {pred}")
                print(f"    Answer: {answer}")
                print(f" Normed ED: {scores[0]}")

        return scores

    def on_validation_epoch_end(self):
        # I set this to 1 manually
        # (previously set to len(self.config.dataset_name_or_paths))
        validation_step_outputs = self.trainer.callback_metrics.get("validation_step", [])
        if not validation_step_outputs:
            return
        num_of_loaders = 1
        if num_of_loaders == 1:
            validation_step_outputs = [validation_step_outputs]
        assert len(validation_step_outputs) == num_of_loaders
        cnt = [0] * num_of_loaders
        total_metric = [0] * num_of_loaders
        val_metric = [0] * num_of_loaders
        for i, results in enumerate(validation_step_outputs):
            for scores in results:
                cnt[i] += len(scores)
                total_metric[i] += np.sum(scores)
            val_metric[i] = total_metric[i] / cnt[i]
            val_metric_name = f"val_metric_{i}th_dataset"
            self.log_dict({val_metric_name: val_metric[i]}, sync_dist=True)
        self.log_dict({"val_metric": np.sum(total_metric) / np.sum(cnt)}, sync_dist=True)

    def configure_optimizers(self):
    # Replace bnb.optim.Adam8bit with torch.optim.Adam
        num_epochs = self.config.get('max_epochs')
        num_training_steps = num_epochs * len(train_dataloader)

        optimizer = torch.optim.Adam(self.parameters(), lr=self.config.get("lr"))
        lr_scheduler = get_scheduler(
          "linear",
          optimizer=optimizer,
          num_warmup_steps=10,
          num_training_steps=num_training_steps,
        )

        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

    def train_dataloader(self):
        return train_dataloader

    def val_dataloader(self):
        return val_dataloader

In [34]:
config = {"max_epochs":5,
          "val_check_interval":0.5, # how many times we want to validate during an epoch
          "check_val_every_n_epoch":1,
          "gradient_clip_val":1.0,
          "num_training_samples_per_epoch": None ,
          "lr":3e-5,
          "train_batch_sizes": [8],
          "val_batch_sizes": [1],
          "num_nodes": 1,
          "warmup_steps": 300,
          "result_path": "/kaggle/working/wandb",
          "verbose": True,
          }

In [35]:
MODEL_PATH = Path('MODEL')
if not MODEL_PATH.exists():
    MODEL_PATH.mkdir()

In [36]:
checkpoint_callback = ModelCheckpoint(
    dirpath=MODEL_PATH,
    filename="model_{epoch}",   
    save_top_k=-1,                   
    every_n_epochs=1,                
    save_weights_only=True           
)

In [37]:
model_module = DonutModelPLModule(config, processor, model)

In [38]:
# Log in to WandB
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshivamsharma8816153[0m ([33mshivamsharma8816153-indian-institute-of-technology-kanpur[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

Fine-tuning using Mixed precision

In [39]:
wandb_logger = WandbLogger(project="Donut-VQA_Demo")

trainer = pl.Trainer(
        devices=1,
        max_epochs=config.get("max_epochs"),
        val_check_interval=config.get("val_check_interval"),
        check_val_every_n_epoch=config.get("check_val_every_n_epoch"),
        gradient_clip_val=config.get("gradient_clip_val"),
        precision='16-mixed', # we'll use mixed precision
        num_sanity_val_steps=0,
        logger=wandb_logger,
        callbacks=[checkpoint_callback],
        # callbacks=[lr_callback, checkpoint_callback],
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [42]:
trainer.fit(model_module)

c:\Users\shiva\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:654: Checkpoint directory C:\Users\shiva\Documents\PIVOT\MODEL exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                      | Params | Mode
-----------------------------------------------------------
0 | model | VisionEncoderDecoderModel | 201 M  | eval
-----------------------------------------------------------
1.0 M     Trainable params
200 M     Non-trainable params
201 M     Total params
805.872   Total estimated model params size (MB)
160       Modules in train mode
484       Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Prediction: <question> What is the depth?<question>.0 inch.</s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer> ounce ounce ounce ounce
    Answer: <s><question>What is the depth?<question><answer>37.0 millimetre<answer>
 Normed ED: 0.6351351351351351
Prediction: <question> What is the height?<question>.0</s_answer> centimetre.0</s_answer> centimetre</s_answer> centimetre</s_answer>
    Answer: <s><question>What is the height?<question><answer>86.0 centimetre<answer>
 Normed ED: 0.4628099173553719
Prediction: <question> What is the item_weight?<question> ounce. centimetre</s_answer>
    Answer: <s><question>What is the item_weight?<question><answer>15.0 gram<answer>
 Normed ED: 0.32432432432432434
Prediction: <question> What is the depth?<question>.0</s_answer> centimetre. centimetre</s_answer> centimetre</s_answer>
    Answer: <s><question>What is the depth?<question><answer>15.0 centimetre<answer>
 Normed ED: 0.4166666666666667
Prediction: <question> What is 

Validation: |          | 0/? [00:00<?, ?it/s]

Prediction: <question> What is the depth?<question>.0 inch.</s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer> ounce ounce ounce ounce
    Answer: <s><question>What is the depth?<question><answer>37.0 millimetre<answer>
 Normed ED: 0.6351351351351351
Prediction: <question> What is the height?<question>.0</s_answer> centimetre.0</s_answer> centimetre</s_answer> centimetre</s_answer>
    Answer: <s><question>What is the height?<question><answer>86.0 centimetre<answer>
 Normed ED: 0.4628099173553719
Prediction: <question> What is the item_weight?<question> ounce. centimetre</s_answer>
    Answer: <s><question>What is the item_weight?<question><answer>15.0 gram<answer>
 Normed ED: 0.32432432432432434
Prediction: <question> What is the depth?<question>.0</s_answer> centimetre. centimetre</s_answer> centimetre</s_answer>
    Answer: <s><question>What is the depth?<question><answer>15.0 centimetre<answer>
 Normed ED: 0.4166666666666667
Prediction: <question> What is 

Validation: |          | 0/? [00:00<?, ?it/s]

Prediction: <question> What is the depth?<question>.0 inch.</s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer> ounce ounce ounce
    Answer: <s><question>What is the depth?<question><answer>37.0 millimetre<answer>
 Normed ED: 0.6914285714285714
Prediction: <question> What is the height?<question>.0</s_answer> centimetre.0</s_answer> centimetre</s_answer> centimetre</s_answer>
    Answer: <s><question>What is the height?<question><answer>86.0 centimetre<answer>
 Normed ED: 0.4628099173553719
Prediction: <question> What is the item_weight?<question> ounce. centimetre</s_answer>
    Answer: <s><question>What is the item_weight?<question><answer>15.0 gram<answer>
 Normed ED: 0.32432432432432434
Prediction: <question> What is the depth?<question>.0</s_answer> centimetre. centimetre</s_answer> centimetre</s_answer>
    Answer: <s><question>What is the depth?<question><answer>15.0 centimetre<answer>
 Normed ED: 0.4166666666666667
Pred

Validation: |          | 0/? [00:00<?, ?it/s]

Prediction: <question> What is the depth?<question>.0 inch.</s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer>
    Answer: <s><question>What is the depth?<question><answer>37.0 millimetre<answer>
 Normed ED: 0.757847533632287
Prediction: <question> What is the height?<question>.0</s_answer> centimetre. centimetre</s_answer> centimetre</s_answer>
    Answer: <s><question>What is the height?<question><answer>86.0 centimetre<answer>
 Normed ED: 0.41284403669724773
Prediction: <question> What is the item_weight?<question> ounce. ounce ounce ounce ounce ounce ounce ounce ounce
    Answer: <s><question>What is the item_weight?<question><answer>15.0 gram<answer>
 Normed ED: 0.54
Prediction: <question> What is the depth?<question>.0</s_answer> centimetre. centimetre</s_answer> centimetre</s_answer>
    Answer: <s><question>What is the depth?<question><answer>15.0 centime

`Trainer.fit` stopped: `max_epochs=5` reached.


In [43]:
def evaluate_model(model_module, processor, test_dataset, device='cuda'):
    model_module.eval()  # Set model to evaluation mode
    model_module.to(device)
    
    results = []
    
    # Create a test dataloader
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
    
    with torch.no_grad():  # Disable gradient calculation
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            pixel_values, decoder_input_ids, prompt_end_idxs, answers = batch
            
            # Move inputs to device
            pixel_values = pixel_values.to(device)
            decoder_input_ids = decoder_input_ids.to(device)
            
            # Get decoder prompts
            decoder_prompts = pad_sequence(
                [input_id[: end_idx + 1] for input_id, end_idx in zip(decoder_input_ids, prompt_end_idxs)],
                batch_first=True,
            ).to(device)

            # Generate predictions
            outputs = model_module.model.generate(
                pixel_values,
                decoder_input_ids=decoder_prompts,
                max_length=max_length,
                early_stopping=True,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
                use_cache=True,
                num_beams=1,
                bad_words_ids=[[processor.tokenizer.unk_token_id]],
                return_dict_in_generate=True,
            )

            # Decode predictions
            predictions = []
            for seq in processor.tokenizer.batch_decode(outputs.sequences):
                seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
                seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
                predictions.append(seq)

            # Calculate scores
            for pred, answer in zip(predictions, answers):
                pred = re.sub(r"(?<=<) | (?=>)", "", pred, count=1)
                answer = answer.replace(processor.tokenizer.eos_token, "")
                score = edit_distance(pred, answer) / max(len(pred), len(answer))
                
                results.append({
                    'prediction': pred,
                    'ground_truth': answer,
                    'normalized_edit_distance': score
                })
                
    # Calculate overall metrics
    scores = [r['normalized_edit_distance'] for r in results]
    avg_score = sum(scores) / len(scores)
    
    print(f"\nEvaluation Results:")
    print(f"Average Normalized Edit Distance: {avg_score:.4f}")
    
    # Print some example predictions
    print("\nSample Predictions:")
    for i in range(min(5, len(results))):
        print(f"\nExample {i+1}:")
        print(f"Prediction: {results[i]['prediction']}")
        print(f"Ground Truth: {results[i]['ground_truth']}")
        print(f"Score: {results[i]['normalized_edit_distance']:.4f}")
    
    return results

# Load the saved checkpoint
checkpoint_path = "MODEL/model_epoch=4.ckpt"  # Adjust to your latest checkpoint
model_module = DonutModelPLModule.load_from_checkpoint(
    checkpoint_path,
    config=config,
    processor=processor,
    model=model
)

# Run evaluation
evaluation_results = evaluate_model(
    model_module=model_module,
    processor=processor,
    test_dataset=val_dataset,  # Using your validation dataset
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

# Optionally save results to a file
import json
with open('evaluation_results.json', 'w') as f:
    json.dump(evaluation_results, f, indent=2)

Evaluating: 100%|██████████| 616/616 [14:57<00:00,  1.46s/it]


Evaluation Results:
Average Normalized Edit Distance: 0.4997

Sample Predictions:

Example 1:
Prediction: <question> What is the depth?<question>.0 inch.</s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer>
Ground Truth: <s><question>What is the depth?<question><answer>37.0 millimetre<answer>
Score: 0.7578

Example 2:
Prediction: <question> What is the height?<question>.0</s_answer> centimetre. centimetre</s_answer> centimetre</s_answer>
Ground Truth: <s><question>What is the height?<question><answer>86.0 centimetre<answer>
Score: 0.4128

Example 3:
Prediction: <question> What is the item_weight?<question> ounce. ounce ounce ounce ounce ounce ounce ounce ounce
Ground Truth: <s><question>What is the item_weight?<question><answer>15.0 gram<answer>
Score: 0.5400

Example 4:
Prediction: <question> What is the depth?<question>.0</s_answer> centimetre. centimetre</s_ans




In [46]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def visualize_results(evaluation_results):
    # Convert results to DataFrame for easier plotting
    df = pd.DataFrame(evaluation_results)
    
    # Set style - using a standard matplotlib style instead of seaborn
    plt.style.use('default')  # Changed from 'seaborn' to 'default'
    
    # Create a figure with multiple subplots
    fig = plt.figure(figsize=(20, 15))
    
    # 1. Distribution of Edit Distances
    plt.subplot(2, 2, 1)
    plt.hist(df['normalized_edit_distance'], bins=30, edgecolor='black')  # Changed to plt.hist
    plt.title('Distribution of Normalized Edit Distances')
    plt.xlabel('Normalized Edit Distance')
    plt.ylabel('Count')
    
    # 2. Box Plot of Edit Distances
    plt.subplot(2, 2, 2)
    plt.boxplot(df['normalized_edit_distance'])  # Changed to plt.boxplot
    plt.title('Box Plot of Normalized Edit Distances')
    plt.ylabel('Normalized Edit Distance')
    
    # 3. Performance Categories
    plt.subplot(2, 2, 3)
    categories = pd.cut(df['normalized_edit_distance'], 
                       bins=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
                       labels=['Excellent', 'Good', 'Fair', 'Poor', 'Very Poor'])
    category_counts = categories.value_counts()
    plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%')
    plt.title('Distribution of Performance Categories')
    
    # 4. Cumulative Distribution
    plt.subplot(2, 2, 4)
    sorted_data = np.sort(df['normalized_edit_distance'])
    cumulative = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
    plt.plot(sorted_data, cumulative)  # Changed to plt.plot
    plt.title('Cumulative Distribution of Edit Distances')
    plt.xlabel('Normalized Edit Distance')
    plt.ylabel('Cumulative Proportion')
    
    plt.tight_layout()
    plt.savefig('evaluation_visualizations.png')
    plt.close()
    
    # Create a detailed analysis report
    print("\nDetailed Analysis Report:")
    print("-" * 50)
    print(f"Total samples evaluated: {len(df)}")
    print(f"Average edit distance: {df['normalized_edit_distance'].mean():.4f}")
    print(f"Median edit distance: {df['normalized_edit_distance'].median():.4f}")
    print(f"Standard deviation: {df['normalized_edit_distance'].std():.4f}")
    
    # Performance breakdown
    print("\nPerformance Categories:")
    print(category_counts.to_string())
    
    # Sample predictions at different performance levels
    print("\nExample Predictions by Performance Level:")
    for category in ['Excellent', 'Good', 'Fair', 'Poor', 'Very Poor']:
        samples = df[categories == category]
        if not samples.empty:
            sample = samples.iloc[0]
            print(f"\n{category} Example:")
            print(f"Prediction: {sample['prediction']}")
            print(f"Ground Truth: {sample['ground_truth']}")
            print(f"Edit Distance: {sample['normalized_edit_distance']:.4f}")
    
    # Save detailed results to CSV
    df.to_csv('detailed_results.csv', index=False)

# Run the visualization
visualize_results(evaluation_results)

# Optional: Create a simpler confusion matrix visualization
def create_answer_category_analysis(evaluation_results):
    def categorize_answer(answer):
        if answer.isdigit():
            return 'numeric'
        elif len(answer.split()) == 1:
            return 'single_word'
        else:
            return 'multi_word'
    
    df = pd.DataFrame(evaluation_results)
    df['pred_category'] = df['prediction'].apply(categorize_answer)
    df['true_category'] = df['ground_truth'].apply(categorize_answer)
    
    # Create confusion matrix
    categories = sorted(list(set(df['pred_category'].unique()) | set(df['true_category'].unique())))
    matrix = pd.DataFrame(0, index=categories, columns=categories)
    
    for true_cat, pred_cat in zip(df['true_category'], df['pred_category']):
        matrix.loc[true_cat, pred_cat] += 1
    
    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    plt.imshow(matrix, cmap='Blues')
    plt.colorbar()
    plt.xticks(range(len(categories)), categories, rotation=45)
    plt.yticks(range(len(categories)), categories)
    plt.title('Answer Category Confusion Matrix')
    plt.xlabel('Predicted Category')
    plt.ylabel('True Category')
    
    # Add text annotations
    for i in range(len(categories)):
        for j in range(len(categories)):
            plt.text(j, i, int(matrix.iloc[i, j]), 
                    ha='center', va='center')
    
    plt.tight_layout()
    plt.savefig('category_confusion_matrix.png')
    plt.close()
    
    # Print category-wise performance
    print("\nCategory-wise Performance:")
    for category in categories:
        correct = matrix.loc[category, category]
        total = matrix.loc[category].sum()
        accuracy = correct / total if total > 0 else 0
        print(f"{category}: {accuracy:.2%} ({correct}/{total})")

# Run the category analysis
create_answer_category_analysis(evaluation_results)

# Analyze performance by answer length
def analyze_by_answer_length(evaluation_results):
    df = pd.DataFrame(evaluation_results)
    df['ground_truth_length'] = df['ground_truth'].str.len()
    
    plt.figure(figsize=(12, 6))
    plt.scatter(df['ground_truth_length'], df['normalized_edit_distance'], alpha=0.5)
    
    # Add trend line
    z = np.polyfit(df['ground_truth_length'], df['normalized_edit_distance'], 1)
    p = np.poly1d(z)
    plt.plot(df['ground_truth_length'], p(df['ground_truth_length']), "r--", alpha=0.8)
    
    plt.title('Performance vs Answer Length')
    plt.xlabel('Ground Truth Length')
    plt.ylabel('Normalized Edit Distance')
    plt.savefig('performance_by_length.png')
    plt.close()
    
    # Calculate correlation
    correlation = df['ground_truth_length'].corr(df['normalized_edit_distance'])
    print(f"\nCorrelation between answer length and edit distance: {correlation:.4f}")

# Run the length analysis
analyze_by_answer_length(evaluation_results)


Detailed Analysis Report:
--------------------------------------------------
Total samples evaluated: 616
Average edit distance: 0.4997
Median edit distance: 0.4641
Standard deviation: 0.1663

Performance Categories:
normalized_edit_distance
Fair         253
Good         191
Poor         165
Excellent      6
Very Poor      1

Example Predictions by Performance Level:

Excellent Example:
Prediction: <question> What is the maximum_weight_recommendation?<question>.0 kilogram.0 kilogram</s_answer>
Ground Truth: <s><question>What is the maximum_weight_recommendation?<question><answer>100 kilogram<answer>
Edit Distance: 0.1875

Good Example:
Prediction: <question> What is the item_weight?<question>.0, ounce., ounce in inch.</s_answer>
Ground Truth: <s><question>What is the item_weight?<question><answer>4 kilogram<answer>
Edit Distance: 0.3780

Fair Example:
Prediction: <question> What is the height?<question>.0</s_answer> centimetre. centimetre</s_answer> centimetre</s_answer>
Ground Truth:

In [48]:
# 1. First load your saved model
checkpoint_path = "MODEL/model_epoch=4.ckpt"  # Adjust to your latest checkpoint
model_module = DonutModelPLModule.load_from_checkpoint(
    checkpoint_path,
    config=config,
    processor=processor,
    model=model
)

# 2. Run evaluation using your validation dataset (val_dataset)
evaluation_results = evaluate_model(
    model_module=model_module,
    processor=processor,
    test_dataset=val_dataset,  # Using val_dataset instead of test_dataset
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

# 3. Create visualizations
visualize_results(evaluation_results)

Evaluating: 100%|██████████| 616/616 [11:12<00:00,  1.09s/it]



Evaluation Results:
Average Normalized Edit Distance: 0.4997

Sample Predictions:

Example 1:
Prediction: <question> What is the depth?<question>.0 inch.</s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer></s_answer>
Ground Truth: <s><question>What is the depth?<question><answer>37.0 millimetre<answer>
Score: 0.7578

Example 2:
Prediction: <question> What is the height?<question>.0</s_answer> centimetre. centimetre</s_answer> centimetre</s_answer>
Ground Truth: <s><question>What is the height?<question><answer>86.0 centimetre<answer>
Score: 0.4128

Example 3:
Prediction: <question> What is the item_weight?<question> ounce. ounce ounce ounce ounce ounce ounce ounce ounce
Ground Truth: <s><question>What is the item_weight?<question><answer>15.0 gram<answer>
Score: 0.5400

Example 4:
Prediction: <question> What is the depth?<question>.0</s_answer> centimetre. centimetre</s_ans

In [49]:
visualize_results(evaluation_results)


Detailed Analysis Report:
--------------------------------------------------
Total samples evaluated: 616
Average edit distance: 0.4997
Median edit distance: 0.4641
Standard deviation: 0.1663

Performance Categories:
normalized_edit_distance
Fair         253
Good         191
Poor         165
Excellent      6
Very Poor      1

Example Predictions by Performance Level:

Excellent Example:
Prediction: <question> What is the maximum_weight_recommendation?<question>.0 kilogram.0 kilogram</s_answer>
Ground Truth: <s><question>What is the maximum_weight_recommendation?<question><answer>100 kilogram<answer>
Edit Distance: 0.1875

Good Example:
Prediction: <question> What is the item_weight?<question>.0, ounce., ounce in inch.</s_answer>
Ground Truth: <s><question>What is the item_weight?<question><answer>4 kilogram<answer>
Edit Distance: 0.3780

Fair Example:
Prediction: <question> What is the height?<question>.0</s_answer> centimetre. centimetre</s_answer> centimetre</s_answer>
Ground Truth: