In [2]:
!pip install -q transformers datasets


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.9.0.13 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu12 9.3.0.75 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cufft-cu12==11.2.1.3; platform_system == "Linux" and platform_machine == "x86_64", but you have nv

**Creating a dataframe from the dataset**
This code reads label.txt, which contains tab-separated image filenames and their corresponding text labels. It builds full image paths, checks if they exist, and stores valid (image_path, text) pairs in a Pandas DataFrame.

Output: A DataFrame with columns:

image_path: Full path to each image

text: Corresponding label (transcription)


In [3]:
import os
import pandas as pd

# Parse label.txt: every non-blank line is "<image filename>\t<transcription>".
label_file = "/kaggle/input/labeldata/label.txt"
image_dir = "/kaggle/input/studydata/input/dataset"

data = []
with open(label_file, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        record = line.strip()
        if not record:
            # Blank lines (for example a trailing newline at the end of the
            # file) carry no record and are skipped.
            continue

        # Split on the first tab only, so a tab inside the transcription
        # cannot break the two-value unpacking.
        fname, sep, text = record.partition("\t")
        if not sep:
            # Same exception type the plain unpacking raised, but the message
            # now points at the offending line.
            raise ValueError(
                f"{label_file}:{line_no}: expected '<filename>\\t<text>', got {record!r}"
            )

        img_path = os.path.join(image_dir, fname)
        # Keep only the rows whose image is actually present on disk.
        if os.path.exists(img_path):
            data.append({"image_path": img_path, "text": text})

# Naming the columns keeps the schema stable even if no row was kept.
df = pd.DataFrame(data, columns=["image_path", "text"])


**Checking the dataframe**

In [79]:
# Display up to the first 50 rows to check the parsed image paths and labels.
df.head(50)


Unnamed: 0,image_path,text
0,/kaggle/input/studydata/input/dataset/Picture1...,Why is it?
1,/kaggle/input/studydata/input/dataset/Picture1...,How are you?
2,/kaggle/input/studydata/input/dataset/Picture1...,Where are you going?
3,/kaggle/input/studydata/input/dataset/Picture1...,This is questionable.
4,/kaggle/input/studydata/input/dataset/Picture1...,Are you sure?
5,/kaggle/input/studydata/input/dataset/Picture1...,Cutie!!!!
6,/kaggle/input/studydata/input/dataset/Picture1...,How are you?
7,/kaggle/input/studydata/input/dataset/Picture1...,It is gonna rain today.
8,/kaggle/input/studydata/input/dataset/Picture1...,Are you going to library?
9,/kaggle/input/studydata/input/dataset/Picture1...,Automatically crops and resizes images.


**This code loads the pretrained TrOCR model and its processor for handwritten text recognition.**
The TrOCRProcessor and VisionEncoderDecoderModel are key components used for handwriting recognition with the TrOCR model from Hugging Face. The TrOCRProcessor acts as a unified interface that combines a feature extractor and a tokenizer. It preprocesses input images by converting them into pixel values suitable for the vision encoder and also handles the tokenization of text for the decoder. The VisionEncoderDecoderModel is an end-to-end architecture that integrates a vision encoder (like ViT) and a text decoder (such as RoBERTa). In this setup, the encoder processes the input image to extract visual features, and the decoder generates the corresponding text output. Specifically, the "microsoft/trocr-base-handwritten" model is a pretrained version fine-tuned for recognizing handwritten text. Together, the processor and model enable seamless conversion of handwritten images into digital text through an image-to-sequence generation pipeline.

In [5]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# The processor and the model are both loaded from the same checkpoint id.
checkpoint = "microsoft/trocr-base-handwritten"
processor = TrOCRProcessor.from_pretrained(checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(checkpoint)


2025-05-31 13:14:21.287097: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748697261.504515      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748697261.575958      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

This code converts a Pandas DataFrame containing image paths and corresponding text labels into a Hugging Face Dataset and applies preprocessing suitable for training a TrOCR model. Each image is loaded using PIL and processed into pixel_values using the TrOCRProcessor, which prepares the image input for the vision encoder. The associated text is tokenized into input_ids for the decoder, with padding token IDs replaced by -100 to ensure they are ignored during loss computation. The map() function applies this transformation to the entire dataset, and set_format(type="torch") ensures that the output is returned as PyTorch tensors, ready for model training.

In [6]:
from datasets import Dataset
from PIL import Image
import torch

# Wrap the (image_path, text) DataFrame in a Hugging Face Dataset so the
# preprocessing below can be applied with Dataset.map().
dataset = Dataset.from_pandas(df)

# Preprocessing
def preprocess(example, max_length=128):
    """Turn one ``{"image_path", "text"}`` row into model inputs.

    Args:
        example: Mapping with an ``"image_path"`` entry (path of the image
            file to open) and a ``"text"`` entry (its transcription).
        max_length: Length every label sequence is padded or truncated to.
            Defaults to 128, the value that used to be hard-coded here.

    Returns:
        dict with ``"pixel_values"`` (the processor's image tensor with its
        leading batch axis squeezed away) and ``"labels"`` (token ids in
        which every padding position has been replaced by -100).
    """
    # Open inside a context manager so the file is closed as soon as the
    # RGB copy has been made; the copy remains usable afterwards.
    with Image.open(example["image_path"]) as raw_image:
        image = raw_image.convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.squeeze(0)

    labels = processor.tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).input_ids.squeeze(0)
    # Padding positions get -100 so that they are ignored by the loss.
    labels[labels == processor.tokenizer.pad_token_id] = -100

    return {"pixel_values": pixel_values, "labels": labels}

# Run preprocess over every row, keeping only the columns it returns,
# and have the dataset hand back torch tensors when indexed.
original_columns = dataset.column_names
dataset = dataset.map(preprocess, remove_columns=original_columns)
dataset.set_format(type="torch")


Map:   0%|          | 0/86 [00:00<?, ? examples/s]

This code sets up a PyTorch DataLoader to feed preprocessed image-text data into the TrOCR model during training. A custom collate_fn is defined to correctly batch the pixel_values and labels from individual examples by stacking them into tensors. The DataLoader is then created using the processed Hugging Face dataset, with a batch size of 4, shuffling enabled for training randomness, and the custom collate function to handle batching. This setup ensures efficient and correctly formatted input to the model.

In [7]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Stack per-example tensors into one batch dictionary.

    Args:
        batch: Sequence of dicts, each holding a ``"pixel_values"`` tensor
            and a ``"labels"`` tensor of matching shapes across examples.

    Returns:
        dict with the same two keys, each mapped to the examples' tensors
        stacked along a new leading dimension.
    """
    return {
        key: torch.stack([example[key] for example in batch])
        for key in ("pixel_values", "labels")
    }

# Shuffled mini-batches of 4 examples, assembled by collate_fn.
train_dataloader = DataLoader(
    dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=4,
)


In [8]:
from datasets import Dataset
from PIL import Image
import torch

**Configuring the decoder's special tokens**

In [9]:
# Special-token ids on the model config: decoding starts from the tokenizer's
# CLS token and padding uses the tokenizer's PAD token.
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

# generate() takes its defaults from model.generation_config, which is loaded
# from its own generation_config.json rather than from model.config. Mirror
# the ids there so that decoding starts from the same token as training.
model.generation_config.decoder_start_token_id = model.config.decoder_start_token_id
model.generation_config.pad_token_id = model.config.pad_token_id

**Training the model on our dataset**

In [55]:
from torch.optim import AdamW
import torch

# Train on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

optimizer = AdamW(model.parameters(), lr=5e-5)

# Lowest epoch-average training loss seen so far; infinity until the first epoch ends.
best_loss = float("inf")

for epoch in range(100):  # Adjust as needed
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Labels are passed along with the pixels so that outputs.loss is available.
        outputs = model(
            pixel_values=batch["pixel_values"].to(device),
            labels=batch["labels"].to(device),
        )
        batch_loss = outputs.loss

        # Backpropagate, update the weights, then clear the gradients
        # before the next batch.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += batch_loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")

    # Only epochs that beat the best training loss so far trigger a checkpoint.
    if not (avg_loss < best_loss):
        continue

    best_loss = avg_loss
    print(f"✅ New best model found at epoch {epoch+1} with loss {avg_loss:.4f}. Saving model...")
    # Model first, then processor, both into the same directory.
    for artifact in (model, processor):
        artifact.save_pretrained("best-trocr-model")


Epoch 1 Average Loss: 0.3380
✅ New best model found at epoch 1 with loss 0.3380. Saving model...
Epoch 2 Average Loss: 0.3558
Epoch 3 Average Loss: 0.2544
✅ New best model found at epoch 3 with loss 0.2544. Saving model...
Epoch 4 Average Loss: 0.1165
✅ New best model found at epoch 4 with loss 0.1165. Saving model...
Epoch 5 Average Loss: 0.1051
✅ New best model found at epoch 5 with loss 0.1051. Saving model...
Epoch 6 Average Loss: 0.1782
Epoch 7 Average Loss: 0.1780
Epoch 8 Average Loss: 0.4440
Epoch 9 Average Loss: 0.2282
Epoch 10 Average Loss: 0.1371
Epoch 11 Average Loss: 0.2170
Epoch 12 Average Loss: 0.1921
Epoch 13 Average Loss: 0.1088
Epoch 14 Average Loss: 0.0927
✅ New best model found at epoch 14 with loss 0.0927. Saving model...
Epoch 15 Average Loss: 0.1640
Epoch 16 Average Loss: 0.2278
Epoch 17 Average Loss: 0.1174
Epoch 18 Average Loss: 0.1250
Epoch 19 Average Loss: 0.0448
✅ New best model found at epoch 19 with loss 0.0448. Saving model...
Epoch 20 Average Loss: 0.0725

**Saving the model**

In [11]:
# Write the model weights/config and the processor files side by side so the
# directory can be reloaded with from_pretrained().
output_dir = "finetuned-trocr-handwriting"
model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)


[]

**Prediction**

In [82]:
model.eval()

# Open inside a context manager so the file is closed once the RGB copy exists.
with Image.open("/kaggle/input/studydata/input/dataset/Picture23.jpg") as raw_image:
    test_image = raw_image.convert("RGB")

# Move the pixels to whichever device the model is currently on, instead of
# relying on the `device` variable left over from the training cell; the two
# would disagree once the model has been reloaded from disk.
pixel_values = processor(images=test_image, return_tensors="pt").pixel_values.to(model.device)

generated_ids = model.generate(pixel_values)
predicted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("Predicted Text:", predicted_text)


Predicted Text: Note that line


**Saving the model to the Kaggle working directory**

In [30]:
# Save the model and the processor into one directory under /kaggle/working.
model_dir = "/kaggle/working/finetuned-trocr-handwriting"
model.save_pretrained(model_dir)
processor.save_pretrained(model_dir)


[]

In [31]:
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
from PIL import Image
import torch

# Reload the fine-tuned model and its processor from the saved directory
# (a relative path, so it is resolved against the current working directory).
finetuned_dir = "finetuned-trocr-handwriting"
model = VisionEncoderDecoderModel.from_pretrained(finetuned_dir)
processor = TrOCRProcessor.from_pretrained(finetuned_dir)


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod