In [14]:
!pip install --upgrade --quiet transformers==4.41.2 peft==0.10.0 accelerate bitsandbytes
!pip install evaluate


  pid, fd = os.forkpty()




In [15]:


import time
import math
import numpy as np
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch
import evaluate
from tqdm import tqdm


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-handwritten")
base_model.to(device)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],   # works for TrOCR-base
    lora_dropout=0.05,
    bias="none",
)

# Prepare for k-bit training if using 8-bit/4-bit
# (if not using quantization, skip the next line)
# base_model = prepare_model_for_kbit_training(base_model)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-handwritten")

# ---- REQUIRED FIX ----
model.config.decoder_start_token_id = processor.tokenizer.bos_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.eos_token_id = processor.tokenizer.eos_token_id

# Avoid losses becoming NaN
model.config.vocab_size = model.decoder.config.vocab_size
# ----------------------

model.to(device)




config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/246M [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-small-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

trainable params: 104,448 || all params: 61,701,120 || trainable%: 0.1692805576300722


preprocessor_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/327 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-small-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VisionEncoderDecoderModel(
  (encoder): DeiTModel(
    (embeddings): DeiTEmbeddings(
      (patch_embeddings): DeiTPatchEmbeddings(
        (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): DeiTEncoder(
      (layer): ModuleList(
        (0-11): 12 x DeiTLayer(
          (attention): DeiTAttention(
            (attention): DeiTSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): DeiTSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): DeiTIntermediate(
            (dense): Linear(

In [17]:
!pip install jiwer



In [30]:

# If model is PEFT-wrapped, ensuring it reports trainable params:
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / Total params: {total:,}")

# training hyperparams (tweak as needed)
num_epochs = 2
accum_steps = 8               # if using gradient accumulation
learning_rate = 5e-5
grad_clip = 1.0
save_every_n_steps = 500
output_dir = "./trocr_lora_manual"

# optimizer (only the trainable parameters are included automatically)
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)

# mixed precision scaler
use_amp = True
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# metrics
wer_metric = evaluate.load("wer")


Trainable params: 61,596,672 / Total params: 61,596,672


  scaler = torch.cuda.amp.GradScaler(enabled=use_amp)


In [31]:

# helper: generate predictions for a batch of pixel_values (tensor on device)
def generate_preds(batch_pixel_values, max_length=64, num_beams=4):
    # batch_pixel_values: tensor (B, C, H, W)
    model.eval()
    with torch.no_grad():
        # model.generate expects pixel_values on the device
        generated_ids = model.generate(
            pixel_values=batch_pixel_values,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            decoder_start_token_id=processor.tokenizer.bos_token_id,
        )
        preds = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return preds

# eval function (computes wer + exact match)
def evaluate_epoch(eval_loader, max_length=64, num_beams=4, limit=None):
    model.eval()
    pred_texts = []
    true_texts = []
    with torch.no_grad():
        for i, batch in enumerate(tqdm(eval_loader, desc="Eval")):
            # batch keys: pixel_values, labels (as in collate_fn)
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)

            # decode true labels to text (convert -100 -> pad token id first)
            labels_for_decoding = labels.clone()
            labels_for_decoding[labels_for_decoding == -100] = processor.tokenizer.pad_token_id
            label_strs = processor.batch_decode(labels_for_decoding, skip_special_tokens=True)
            true_texts.extend([s.strip() for s in label_strs])

            preds = generate_preds(pixel_values, max_length=max_length, num_beams=num_beams)
            pred_texts.extend([s.strip() for s in preds])

            if limit and i >= limit:
                break

    # compute metrics
    wer = wer_metric.compute(predictions=pred_texts, references=true_texts)
    exact = sum(1 for a,b in zip(pred_texts, true_texts) if a.strip()==b.strip()) / max(1, len(pred_texts))
    return {"wer": wer, "exact_match": exact, "preds": pred_texts[:5], "refs": true_texts[:5]}


In [59]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import TrOCRProcessor

class HandwritingDataset(Dataset):
    def __init__(self, csv_file, img_dir, processor, max_target_length=64):
        
        # Accept either path or DataFrame
        if isinstance(csv_file, str):
            self.df = pd.read_csv(csv_file)
        else:
            self.df = csv_file

        self.img_dir = img_dir
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        img_path = row["full_path"]
        label = row["IDENTITY"]

        # load image
        try:
            image = Image.open(img_path).convert("RGB")
        except:
            return None

        encoding = self.processor(
            image,
            text=label,
            padding="max_length",
            max_length=self.max_target_length,
            truncation=True,
            return_tensors="pt"
        )

        return {
            "pixel_values": encoding["pixel_values"].squeeze(0),
            "labels": encoding["labels"].squeeze(0)
        }


In [63]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

def load_and_clean_csv(csv_path, img_dir):
    df = pd.read_csv(csv_path)

    # Precompute full paths
    df["full_path"] = df["FILENAME"].apply(lambda x: os.path.join(img_dir, x))

    # Fast parallel file existence check
    with ThreadPoolExecutor(max_workers=32) as exe:
        exists = list(exe.map(os.path.isfile, df["full_path"]))
    df["IDENTITY"] = df["IDENTITY"].astype(str)

    # Remove "nan", "None", "NULL", "" etc
    df = df[df["IDENTITY"].str.strip().notnull()]
    df = df[df["IDENTITY"].str.strip() != ""]
    df = df[df["IDENTITY"].str.lower() != "nan"]


    df = df[exists].reset_index(drop=True)

    print(f"{csv_path} -> {len(df)} valid images")
    return df






path = "/kaggle/input/handwriting-recognitionocr"

train_csv = f"{path}/CSV/written_name_train.csv"
valid_csv = f"{path}/CSV/written_name_test.csv"
test_csv  = f"{path}/CSV/written_name_validation.csv"

train_img_dir = f"{path}/train_v2/train"
valid_img_dir = f"{path}/train_v2/test"
test_img_dir  = f"{path}/train_v2/validation"

# Cleaned DataFrames
train_df = load_and_clean_csv(train_csv, train_img_dir)
valid_df = load_and_clean_csv(valid_csv, valid_img_dir)
test_df  = load_and_clean_csv(test_csv,  test_img_dir)


train_dataset = HandwritingDataset(train_df, train_img_dir, processor)
valid_dataset = HandwritingDataset(valid_df, valid_img_dir, processor)
test_dataset  = HandwritingDataset(test_df,  test_img_dir, processor)

def collate_skip_none(batch):
    batch = [b for b in batch if b is not None]
    return torch.utils.data.default_collate(batch)



train_loader = DataLoader(
    train_dataset,
    batch_size=5,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    collate_fn=collate_skip_none,
)

eval_loader = DataLoader(
    valid_dataset,
    batch_size=5,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    collate_fn=collate_skip_none,
)


ValueError: Item wrong length 330961 instead of 330396.

In [None]:

# ---------- Training Loop ----------
global_step = 0
best_wer = float("inf")

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    start = time.time()
    pbar = tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    optimizer.zero_grad()

    for step, batch in enumerate(train_loader):
        pixel_values = batch["pixel_values"].to(device)     # (B, C, H, W)
        labels = batch["labels"].to(device)                 # (B, L) with -100 for padding

        with torch.cuda.amp.autocast(enabled=use_amp):
            # forward pass (VisionEncoderDecoder model accepts pixel_values and labels)
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss / accum_steps  # average for accumulation

        scaler.scale(loss).backward()
        epoch_loss += loss.item() * accum_steps  # multiply back

        if (step + 1) % accum_steps == 0:
            # gradient clipping (unscale first)
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1

            if global_step % 50 == 0:
                pbar.set_postfix({"loss": f"{epoch_loss/(global_step+1):.4f}"})

            # periodic eval + save
            if global_step % save_every_n_steps == 0:
                metrics = evaluate_epoch(eval_loader, max_length=64, num_beams=4, limit=100)  # limit to 100 batches for speed
                print(f"\nStep {global_step}: eval wer={metrics['wer']:.4f}, exact={metrics['exact_match']:.4f}")
                # save checkpoint
                ckpt_dir = os.path.join(output_dir, f"ckpt-step-{global_step}")
                os.makedirs(ckpt_dir, exist_ok=True)
                # save peft adapter + base model weights (if small)
                try:
                    model.save_pretrained(ckpt_dir)
                    processor.save_pretrained(ckpt_dir)
                    print("Saved checkpoint to", ckpt_dir)
                except Exception as e:
                    print("Warning: failed to save checkpoint:", e)

        pbar.update(1)

    pbar.close()
    epoch_time = time.time() - start
    avg_loss = epoch_loss / max(1, len(train_loader))
    print(f"Epoch {epoch+1} done â€” avg_loss: {avg_loss:.4f} â€” time: {epoch_time/60:.2f} min")

    # evaluate on full validation set (or a subset)
    metrics = evaluate_epoch(eval_loader, max_length=64, num_beams=4, limit=None)
    print(f"Validation â€” WER: {metrics['wer']:.4f}  Exact: {metrics['exact_match']:.4f}")
    print("Sample preds:", metrics["preds"])
    print("Sample refs :", metrics["refs"])

    # save best
    if metrics["wer"] < best_wer:
        best_wer = metrics["wer"]
        os.makedirs(output_dir, exist_ok=True)
        print("New best â€” saving model to", output_dir)
        try:
            model.save_pretrained(output_dir)
            processor.save_pretrained(output_dir)
        except Exception as e:
            print("Failed to save model:", e)


Epoch 1/2:   6%|â–Œ         | 3999/66193 [33:54<8:47:21,  1.97batch/s, loss=3.1841]
  child_process = getattr(os, original_name)()  # fork
  child_process = getattr(os, original_name)()  # fork
Unused or unrecognized kwargs: truncation, padding, max_length.
Unused or unrecognized kwargs: truncation, padding, max_length.
Unused or unrecognized kwargs: truncation, padding, max_length.
Unused or unrecognized kwargs: truncation, padding, max_length.
Unused or unrecognized kwargs: truncation, padding, max_length.
Unused or unrecognized kwargs: truncation, padding, max_length.
Unused or unrecognized kwargs: truncation, padding, max_length.
Unused or unrecognized kwargs: truncation, padding, max_length.
Unused or unrecognized kwargs: truncation, padding, max_length.
Unused or unrecognized kwargs: truncation, padding, max_length.
Unused or unrecognized kwargs: truncation, padding, max_length.
Unused or unrecognized kwargs: truncation, padding, max_length.
Unused or unrecognized kwargs: truncat

ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 309, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/tmp/ipykernel_38/4277509258.py", line 37, in __getitem__
    encoding = self.processor(
  File "/opt/conda/lib/python3.10/site-packages/transformers/models/trocr/processing_trocr.py", line 86, in __call__
    encodings = self.tokenizer(text, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2883, in __call__
    encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
  File "/opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2941, in _call_one
    raise ValueError(
ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).


In [None]:

print("Training finished. Best WER:", best_wer)
