In [1]:


# Mount Google Drive at /content/drive; the model archive and the test data
# used by the cells below are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import zipfile
import numpy as np
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import editdistance
from tqdm import tqdm

In [3]:
# 1) Unpack the zipped model export from Drive onto the local disk
ZIP_PATH = "/content/drive/MyDrive/NLP/200k/trocr_frozen_head.zip"
UNZIP_DIR = "/content/"

# exist_ok=True keeps this cell re-runnable when the target already exists.
os.makedirs(UNZIP_DIR, exist_ok=True)

# The context manager closes the archive handle once extraction is done.
with zipfile.ZipFile(ZIP_PATH, mode="r") as model_archive:
    model_archive.extractall(path=UNZIP_DIR)

In [4]:
# 2) Load processor + model
#
# The image processor / tokenizer are taken from the base checkpoint on the Hub.
# NOTE(review): an earlier revision of this cell loaded the processor from the
# fine-tuned directory instead. If the fine-tuned export ships its own
# processor/tokenizer files, confirm which one matches the trained weights.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-printed", use_fast=True)

# Resolve the device first so the weight precision can follow it:
# half precision on GPU, full precision on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fine-tuned weights (presumably the directory produced by extracting the
# model archive in the previous step).
model = VisionEncoderDecoderModel.from_pretrained(
    "/content/kaggle/working/trocr_frozen_head/",
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)

# Move the weights to the device and switch to eval mode. Both calls return
# the model itself, so it stays the cell's last expression and is displayed.
model.to(device).eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/327 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

VisionEncoderDecoderModel(
  (encoder): DeiTModel(
    (embeddings): DeiTEmbeddings(
      (patch_embeddings): DeiTPatchEmbeddings(
        (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): DeiTEncoder(
      (layer): ModuleList(
        (0-11): 12 x DeiTLayer(
          (attention): DeiTAttention(
            (attention): DeiTSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
            )
            (output): DeiTSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): DeiTIntermediate(
            (dense): Linear(in_features=384, out_features=1536, bias=True)
        

In [5]:
# 3) Load your test images + labels
#    (adapt these paths / loading calls to wherever your .npy lives)
# The images are memory-mapped read-only, so they are pulled from disk on demand.
X_test = np.load("/content/drive/MyDrive/NLP/test/x_testp.npy", mmap_mode="r")
# NOTE: allow_pickle=True can execute arbitrary code while loading; only use it
# on files you trust.
Y_test = np.load(
    "/content/drive/MyDrive/NLP/test/y_testp.npy",
    allow_pickle=True,
).squeeze()

# Placeholder tokens used in the mapping file and the literal characters they stand for.
special_tokens = {"<space>": " ", "<newline>": "\n", "<tab>": "\t"}

# Lookup table: label index -> character, filled from the mapping file below.
idx2char = {}

with open("/content/drive/MyDrive/NLP/test/mapping_labls", encoding="utf-8") as mapping_file:
    for raw_line in mapping_file:
        fields = raw_line.strip().split(maxsplit=1)
        if len(fields) != 2:
            # Lines that do not contain both an index and a symbol are skipped.
            continue
        index_text, symbol = fields
        # Placeholders such as <space> are replaced by the real character.
        idx2char[int(index_text)] = special_tokens.get(symbol, symbol)

def decode_indices(arr, pad_idx=0, eos_idx=1):
    """Turn a sequence of label indices into a string.

    Args:
        arr: iterable of label indices.
        pad_idx: index that marks padding; decoding stops at its first occurrence.
        eos_idx: index that is rendered as a single space.

    Returns:
        The decoded text. Indices missing from ``idx2char`` contribute nothing.

    NOTE(review): ``eos_idx`` is emitted as a space and decoding continues past
    it; confirm this matches how the labels were encoded.
    """
    pieces = []
    for idx in arr:
        if idx == pad_idx:
            break
        # The table lookup is only performed for indices other than eos_idx.
        pieces.append(" " if idx == eos_idx else idx2char.get(int(idx), ""))
    return "".join(pieces)


In [6]:
# 4) Metrics
def cer_score(preds, refs):
    """Corpus-level character error rate.

    Sums the edit distance of every (prediction, reference) pair and divides
    by the total number of reference characters.

    Args:
        preds: predicted strings.
        refs: reference strings, paired with ``preds`` by position. Iterated
            twice, so it must be a re-iterable sequence.

    Returns:
        ``total_edits / total_reference_chars`` as a float. When the references
        contain no characters at all, returns ``0.0`` if there were no edits
        and ``inf`` otherwise, instead of raising ``ZeroDivisionError``.
    """
    tot_edits = sum(editdistance.eval(p, r) for p, r in zip(preds, refs))
    tot_chars = sum(len(r) for r in refs)
    if tot_chars == 0:
        # Nothing to normalise by: the rate is perfect only if nothing was predicted either.
        return 0.0 if tot_edits == 0 else float("inf")
    return tot_edits / tot_chars

def wer_score(preds, refs):
    """Corpus-level word error rate.

    Each string is split on whitespace; the word-level edit distances of all
    (prediction, reference) pairs are summed and divided by the total number
    of reference words.

    Args:
        preds: predicted strings.
        refs: reference strings, paired with ``preds`` by position.

    Returns:
        ``total_word_edits / total_reference_words`` as a float. When the
        references contain no words at all, returns ``0.0`` if there were no
        edits and ``inf`` otherwise, instead of raising ``ZeroDivisionError``.
    """
    tot_edits = 0
    tot_words = 0
    for p, r in zip(preds, refs):
        pw = p.split()
        rw = r.split()
        tot_edits += editdistance.eval(pw, rw)
        tot_words += len(rw)
    if tot_words == 0:
        # Nothing to normalise by: the rate is perfect only if nothing was predicted either.
        return 0.0 if tot_edits == 0 else float("inf")
    return tot_edits / tot_words

def char_acc(preds, refs):
    """Position-wise character accuracy over a corpus.

    For every (prediction, reference) pair, characters are compared at the
    same position up to the length of the shorter string. Matches are summed
    and divided by the total number of reference characters, so reference
    characters with no counterpart in a shorter prediction count as misses and
    surplus prediction characters are ignored.

    Args:
        preds: predicted strings.
        refs: reference strings, paired with ``preds`` by position.

    Returns:
        ``matching_chars / total_reference_chars`` as a float, or ``0.0`` when
        the references contain no characters (instead of raising
        ``ZeroDivisionError``).
    """
    correct = 0
    total = 0
    for p, r in zip(preds, refs):
        correct += sum(pc == rc for pc, rc in zip(p, r))
        total += len(r)
    if total == 0:
        # No reference characters: report 0.0 by convention rather than dividing by zero.
        return 0.0
    return correct / total


In [7]:
# 5) Run inference in batches
batch_size = 16
pred_texts = []
true_texts = []

for i in tqdm(range(0, len(X_test), batch_size)):
    xb = X_test[i : i + batch_size]
    yb = Y_test[i : i + batch_size]

    # Prepare pixel values.
    # Each img is assumed to be an (H, W) single-channel float array in [0, 1]
    # (TODO confirm against how x_testp.npy was written); the processor is fed
    # uint8 RGB images.
    images = []
    for img in xb:
        # Clip before the uint8 cast: values outside [0, 1] would otherwise
        # wrap around (e.g. 1.01 * 255 -> 257 -> 1) and corrupt the image.
        im_uint8 = (np.clip(img, 0.0, 1.0) * 255).astype("uint8")
        rgb = np.repeat(im_uint8[:, :, None], 3, axis=-1)  # now (H, W, 3)
        images.append(rgb)

    inputs = processor(images=images, return_tensors="pt")
    # Match the model's device AND dtype explicitly: the model is loaded in
    # float16 on GPU while the processor returns float32 tensors.
    pixel_values = inputs.pixel_values.to(device=device, dtype=model.dtype)

    # Generate token ids and decode them to text.
    generated_ids = model.generate(pixel_values, max_length=128)
    batch_preds = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Decode the ground-truth label indices with the notebook's own decoder.
    batch_trues = [decode_indices(arr, pad_idx=0) for arr in yb]

    pred_texts.extend(batch_preds)
    true_texts.extend(batch_trues)

# then you can compute CER/WER/accuracy over pred_texts vs. true_texts


100%|██████████| 182/182 [01:47<00:00,  1.69it/s]


In [8]:
# 6) Compute metrics
# Every metric takes (predictions, references); print them in a fixed order.
for label, metric_fn in (
    ("CER:", cer_score),
    ("WER:", wer_score),
    ("Char Acc:", char_acc),
):
    print(label, metric_fn(pred_texts, true_texts))


CER: 0.9142629270237937
WER: 1.001440283625083
Char Acc: 0.0033801105182203147
