In [1]:
!pip install --upgrade --quiet transformers==4.41.2 peft==0.10.0 accelerate bitsandbytes


In [None]:
.

import time
import math
import numpy as np
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch
import evaluate
from tqdm import tqdm


In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
base_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
base_model.to(device)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],   # works for TrOCR-base
    lora_dropout=0.05,
    bias="none",
)

# Prepare for k-bit training if using 8-bit/4-bit
# (if not using quantization, skip the next line)
# base_model = prepare_model_for_kbit_training(base_model)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# ---- REQUIRED FIX ----
model.config.decoder_start_token_id = processor.tokenizer.bos_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.eos_token_id = processor.tokenizer.eos_token_id

# Avoid losses becoming NaN
model.config.vocab_size = model.decoder.config.vocab_size
# ----------------------

model.to(device)




Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 761,856 || all params: 334,683,648 || trainable%: 0.22763466472075744


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fea

In [23]:
!pip install jiwer



In [24]:

# If model is PEFT-wrapped, ensuring it reports trainable params:
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / Total params: {total:,}")

# training hyperparams (tweak as needed)
num_epochs = 5
accum_steps = 4               # if using gradient accumulation
learning_rate = 5e-5
grad_clip = 1.0
save_every_n_steps = 500
output_dir = "./trocr_lora_manual"

# optimizer (only the trainable parameters are included automatically)
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)

# mixed precision scaler
use_amp = torch.cuda.is_available()
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# metrics
wer_metric = evaluate.load("wer")


Trainable params: 333,921,792 / Total params: 333,921,792


  scaler = torch.cuda.amp.GradScaler(enabled=use_amp)


In [25]:

# helper: generate predictions for a batch of pixel_values (tensor on device)
def generate_preds(batch_pixel_values, max_length=64, num_beams=4):
    # batch_pixel_values: tensor (B, C, H, W)
    model.eval()
    with torch.no_grad():
        # model.generate expects pixel_values on the device
        generated_ids = model.generate(
            pixel_values=batch_pixel_values,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            decoder_start_token_id=processor.tokenizer.bos_token_id,
        )
        preds = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return preds

# eval function (computes wer + exact match)
def evaluate_epoch(eval_loader, max_length=64, num_beams=4, limit=None):
    model.eval()
    pred_texts = []
    true_texts = []
    with torch.no_grad():
        for i, batch in enumerate(tqdm(eval_loader, desc="Eval")):
            # batch keys: pixel_values, labels (as in collate_fn)
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)

            # decode true labels to text (convert -100 -> pad token id first)
            labels_for_decoding = labels.clone()
            labels_for_decoding[labels_for_decoding == -100] = processor.tokenizer.pad_token_id
            label_strs = processor.batch_decode(labels_for_decoding, skip_special_tokens=True)
            true_texts.extend([s.strip() for s in label_strs])

            preds = generate_preds(pixel_values, max_length=max_length, num_beams=num_beams)
            pred_texts.extend([s.strip() for s in preds])

            if limit and i >= limit:
                break

    # compute metrics
    wer = wer_metric.compute(predictions=pred_texts, references=true_texts)
    exact = sum(1 for a,b in zip(pred_texts, true_texts) if a.strip()==b.strip()) / max(1, len(pred_texts))
    return {"wer": wer, "exact_match": exact, "preds": pred_texts[:5], "refs": true_texts[:5]}


In [26]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import TrOCRProcessor

class HandwritingDataset(Dataset):
    def __init__(self, csv_file, img_dir, processor, max_target_length=64):
        self.df = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.processor = processor
        self.max_target_length = max_target_length

        # ensure filenames are strings
        self.df["FILENAME"] = self.df["FILENAME"].astype(str)
        self.df["IDENTITY"] = self.df["IDENTITY"].astype(str)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):

        row = self.df.iloc[idx]

        img_path = os.path.join(self.img_dir, row["FILENAME"])
        text = row["IDENTITY"]

        # Load image
        image = Image.open(img_path).convert("RGB")

        # Convert image → pixel_values
        pixel_values = self.processor(
            images=image,
            return_tensors="pt"
        ).pixel_values.squeeze(0)

        # Convert text → labels
        labels = self.processor.tokenizer(
            text,
            padding="max_length",
            max_length=self.max_target_length,
            truncation=True,
            return_tensors="pt"
        ).input_ids.squeeze(0)

        # Replace padding with -100
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        return {
            "pixel_values": pixel_values,
            "labels": labels
        }



In [27]:
path = "/kaggle/input/handwriting-recognitionocr"

train_csv = f"{path}/CSV/written_name_train.csv"
valid_csv = f"{path}/CSV/written_name_test.csv"
test_csv  = f"{path}/CSV/written_name_validation.csv"

train_img_dir = f"{path}/train_v2/train"
valid_img_dir = f"{path}/train_v2/test"
test_img_dir  = f"{path}/train_v2/validation"

train_dataset = HandwritingDataset(train_csv, train_img_dir, processor)
valid_dataset = HandwritingDataset(valid_csv, valid_img_dir, processor)
test_dataset  = HandwritingDataset(test_csv,  test_img_dir, processor)



train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

eval_loader = DataLoader(
    valid_dataset,
    batch_size=8,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)


In [28]:

# ---------- Training Loop ----------
global_step = 0
best_wer = float("inf")

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    start = time.time()
    pbar = tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    optimizer.zero_grad()

    for step, batch in enumerate(train_loader):
        pixel_values = batch["pixel_values"].to(device)     # (B, C, H, W)
        labels = batch["labels"].to(device)                 # (B, L) with -100 for padding

        with torch.cuda.amp.autocast(enabled=use_amp):
            # forward pass (VisionEncoderDecoder model accepts pixel_values and labels)
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss / accum_steps  # average for accumulation

        scaler.scale(loss).backward()
        epoch_loss += loss.item() * accum_steps  # multiply back

        if (step + 1) % accum_steps == 0:
            # gradient clipping (unscale first)
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1

            if global_step % 50 == 0:
                pbar.set_postfix({"loss": f"{epoch_loss/(global_step+1):.4f}"})

            # periodic eval + save
            if global_step % save_every_n_steps == 0:
                metrics = evaluate_epoch(eval_loader, max_length=64, num_beams=4, limit=100)  # limit to 100 batches for speed
                print(f"\nStep {global_step}: eval wer={metrics['wer']:.4f}, exact={metrics['exact_match']:.4f}")
                # save checkpoint
                ckpt_dir = os.path.join(output_dir, f"ckpt-step-{global_step}")
                os.makedirs(ckpt_dir, exist_ok=True)
                # save peft adapter + base model weights (if small)
                try:
                    model.save_pretrained(ckpt_dir)
                    processor.save_pretrained(ckpt_dir)
                    print("Saved checkpoint to", ckpt_dir)
                except Exception as e:
                    print("Warning: failed to save checkpoint:", e)

        pbar.update(1)

    pbar.close()
    epoch_time = time.time() - start
    avg_loss = epoch_loss / max(1, len(train_loader))
    print(f"Epoch {epoch+1} done — avg_loss: {avg_loss:.4f} — time: {epoch_time/60:.2f} min")

    # evaluate on full validation set (or a subset)
    metrics = evaluate_epoch(eval_loader, max_length=64, num_beams=4, limit=None)
    print(f"Validation — WER: {metrics['wer']:.4f}  Exact: {metrics['exact_match']:.4f}")
    print("Sample preds:", metrics["preds"])
    print("Sample refs :", metrics["refs"])

    # save best
    if metrics["wer"] < best_wer:
        best_wer = metrics["wer"]
        os.makedirs(output_dir, exist_ok=True)
        print("New best — saving model to", output_dir)
        try:
            model.save_pretrained(output_dir)
            processor.save_pretrained(output_dir)
        except Exception as e:
            print("Failed to save model:", e)


Epoch 1/5:   0%|          | 0/41371 [06:56<?, ?batch/s]


  self.pid = os.fork()
  with torch.cuda.amp.autocast(enabled=use_amp):
  self.pid = os.fork()


KeyboardInterrupt: 

In [None]:

print("Training finished. Best WER:", best_wer)
