In [6]:
import kagglehub

# Fetch the latest published version of the handwriting dataset through
# kagglehub; the call returns the local directory holding the files.
path = kagglehub.dataset_download("ssarkar445/handwriting-recognitionocr")

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/ssarkar445/handwriting-recognitionocr?dataset_version_number=1...


100%|██████████| 1.26G/1.26G [00:25<00:00, 53.5MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/ssarkar445/handwriting-recognitionocr/versions/1


In [2]:
import cv2
import os
import pandas as pd
import random
from google.colab.patches import cv2_imshow

# Preview a handful of random training images together with their labels.
folder_path = f"{path}/train_v2/train"
csv_file = f"{path}/CSV/written_name_train.csv"
df = pd.read_csv(csv_file)
# Filename -> transcription lookup built from the two CSV columns.
label_dict = dict(zip(df["FILENAME"], df["IDENTITY"]))
files = os.listdir(folder_path)
# Cap the sample size: random.sample raises ValueError when asked for more
# items than the population holds.
random_files = random.sample(files, min(10, len(files)))

for name in random_files:
    img_path = os.path.join(folder_path, name)
    img = cv2.imread(img_path)
    if img is None:
        # Report which file could not be decoded and move on to the next one.
        print(f"cant load the image due to error: {img_path}")
        continue
    # These statements belong to the loop body, not to the failure branch:
    # placed after `continue` inside the `if`, they would never execute.
    label = label_dict.get(name, "NO LABEL")
    cv2_imshow(img)
    print(label)

In [5]:
import numpy as np
import pandas as pd
from PIL import Image
import torch

from datasets import Dataset
from transformers import (
    TrOCRProcessor,
    VisionEncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

In [8]:

# --- Run configuration -------------------------------------------------------

# The training images are read from <path>/train_v2/train by the other cells
# of this notebook, so the image directory has to include the inner "train"
# folder; joining FILENAME onto "<path>/train_v2" alone is expected to miss.
images_dir = os.path.join(path, "train_v2", "train")

df_train_csv = f"{path}/CSV/written_name_train.csv"

pretrained_model_name = 'microsoft/trocr-base-handwritten'
output_dir = "./trocr-finetuned"
seed = 42
num_train_epochs = 12
train_batch_size = 8
# Original (misspelled) name kept as an alias so existing references still work.
train_bach_size = train_batch_size
eval_batch_size = 8
learning_rate = 5e-5
max_target_length = 64
max_input_pixels = (384, 384)


In [9]:
# Seed the Python and NumPy global RNGs first, then PyTorch. The torch call
# stays last so its return value remains the expression the cell displays.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7a62482d5330>

In [13]:
# Read every column of the training CSV as text, then replace missing cells
# with empty strings.
df = pd.read_csv(df_train_csv, dtype=str)
df = df.fillna("")


In [7]:
import os
import pandas as pd
from PIL import Image
import torch
from datasets import Dataset
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
!pip install evaluate
import evaluate

# -----------------------
# PATHS — CHANGE ONLY THIS
# -----------------------
   # your base path

# Image folders: each split lives in "<split>_v2/<split>" below the dataset root.
train_img_dir, valid_img_dir, test_img_dir = (
    f"{path}/{split_name}_v2/{split_name}"
    for split_name in ("train", "valid", "test")
)

# Label files: one CSV per split inside the "CSV" folder of the dataset root.
train_csv, valid_csv, test_csv = (
    f"{path}/CSV/written_name_{csv_suffix}.csv"
    for csv_suffix in ("train", "validation", "test")
)


# ------------------------
# LOAD CSV INTO DATASETS
# ------------------------
def load_dataset(csv_file, img_dir):
    """Build a Dataset of image paths and label text from a label CSV.

    Args:
        csv_file: Path to a CSV providing FILENAME and IDENTITY columns.
        img_dir: Directory against which each FILENAME is resolved.

    Returns:
        A Dataset with one {"image_path", "text"} record per CSV row whose
        image file exists on disk; rows without a file are skipped.
    """
    frame = pd.read_csv(csv_file, dtype={'IDENTITY': str})
    # IDENTITY is read as text; missing labels become empty strings.
    frame['IDENTITY'] = frame['IDENTITY'].fillna("")

    # Pair every resolved image path with its label, in CSV order.
    candidates = (
        (os.path.join(img_dir, fname), text)
        for fname, text in zip(frame["FILENAME"], frame["IDENTITY"])
    )

    return Dataset.from_list([
        {"image_path": img_path, "text": text}
        for img_path, text in candidates
        if os.path.isfile(img_path)
    ])

# Build one Dataset per split from its label CSV and image folder.
train_ds = load_dataset(csv_file=train_csv, img_dir=train_img_dir)
valid_ds = load_dataset(csv_file=valid_csv, img_dir=valid_img_dir)
test_ds = load_dataset(csv_file=test_csv, img_dir=test_img_dir)

# Report how many usable records each split ended up with.
print(f"Train: {len(train_ds)} Valid: {len(valid_ds)} Test: {len(test_ds)}")


# ------------------------
# LOAD PROCESSOR & MODEL
# ------------------------
# The processor and the encoder-decoder model come from the same checkpoint.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    model.to("cuda")
else:
    model.to("cpu")


# ------------------------
# PREPROCESSING
# ------------------------
max_target_length = 64


def preprocess(batch):
    """Turn a batch of image paths and texts into model inputs.

    Args:
        batch: Mapping with an "image_path" list and a "text" list.

    Returns:
        Dict with "pixel_values" (the processor's output for the RGB images)
        and "labels" (token ids padded/truncated to max_target_length, with
        pad positions replaced by -100 — the sentinel compute_metrics maps
        back to the pad id).
    """
    tokenizer = processor.tokenizer

    # Open every image and force three channels before handing it to the processor.
    rgb_images = []
    for image_file in batch["image_path"]:
        rgb_images.append(Image.open(image_file).convert("RGB"))
    encoded_images = processor(images=rgb_images, return_tensors="pt")

    encoded_text = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
        return_tensors="pt",
    )
    labels = encoded_text.input_ids
    # Swap pad ids for the -100 sentinel in place.
    labels[labels == tokenizer.pad_token_id] = -100

    return {"pixel_values": encoded_images.pixel_values, "labels": labels}

# Run preprocess over every split in batches, dropping the original columns
# so only the fields returned by preprocess remain.
# NOTE(review): this materialises the processed images for every record up
# front — confirm that fits the available disk/RAM for the full dataset.
train_ds, valid_ds, test_ds = (
    split_ds.map(preprocess, batched=True, remove_columns=split_ds.column_names)
    for split_ds in (train_ds, valid_ds, test_ds)
)


# ------------------------
# TRAINER CONFIG
# ------------------------
# Batches are assembled with the library's default collator. preprocess()
# already pads every label sequence to max_target_length, so the features only
# need to be stacked. DataCollatorForSeq2Seq is deliberately not used here: its
# first positional parameter is the tokenizer (passing a feature extractor
# positionally plus `tokenizer=` by keyword supplies that argument twice), and
# it pads through the tokenizer, which does not apply to image features.
from transformers import default_data_collator

data_collator = default_data_collator
# Word error rate metric used by compute_metrics.
wer_metric = evaluate.load("wer")

def compute_metrics(eval_pred):
    """Compute the word error rate between predictions and reference labels.

    Args:
        eval_pred: Pair of (predictions, labels). With
            predict_with_generate=True the predictions are generated token
            ids of shape (batch, seq); raw logits of shape
            (batch, seq, vocab) are also accepted and reduced with argmax.

    Returns:
        Dict with a single "wer" entry.
    """
    predictions, labels = eval_pred
    # Some models hand back a tuple of outputs; the first entry is the one to decode.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = processor.tokenizer.pad_token_id

    pred_ids = torch.as_tensor(predictions)
    if pred_ids.dim() == 3:
        # Logits: pick the most likely token per position. Generated ids are
        # already 2-D and must NOT be argmax-ed.
        pred_ids = pred_ids.argmax(dim=-1)
    # -100 marks padding in both arrays; map it back to the pad id so decoding
    # works. masked_fill returns a new tensor, leaving the caller's data intact.
    pred_ids = pred_ids.masked_fill(pred_ids == -100, pad_id)

    label_ids = torch.as_tensor(labels)
    label_ids = label_ids.masked_fill(label_ids == -100, pad_id)

    pred_str = processor.batch_decode(pred_ids.tolist(), skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids.tolist(), skip_special_tokens=True)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Evaluate and checkpoint once per epoch, and reload the checkpoint with the
# lowest WER when training finishes.
training_args = Seq2SeqTrainingArguments(
    output_dir="./trocr_finetuned",
    # NOTE(review): newer transformers releases call this `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    learning_rate=5e-5,
    # Evaluation decodes with generate(), so compute_metrics receives token ids.
    predict_with_generate=True,
    # Mixed precision is only requested when a CUDA device is available,
    # matching the CPU fallback used when the model is moved to a device.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
)

# Wire the model, data, batching and metric together in a single trainer.
trainer = Seq2SeqTrainer(
    # what to train and how
    args=training_args,
    model=model,
    tokenizer=processor.tokenizer,
    # data and batching
    eval_dataset=valid_ds,
    train_dataset=train_ds,
    data_collator=data_collator,
    # evaluation
    compute_metrics=compute_metrics,
)


# ------------------------
# TRAIN
# ------------------------
# Run the fine-tuning loop.
trainer.train()

# Store the model and the processor side by side so the folder can be loaded
# again with from_pretrained.
trainer.save_model("./trocr_finetuned")
processor.save_pretrained("./trocr_finetuned")

# The emoji was stored as mis-decoded UTF-8 bytes; restored to the intended character.
print("🔥 Training complete!")




KeyboardInterrupt: 

In [None]:
# Collect an {"image_path", "text"} record for every CSV row whose image exists.
# The label column in the CSV is IDENTITY (there is no TEXT column).
records = []
for fname, text in zip(df["FILENAME"], df["IDENTITY"]):
    img_path = os.path.join(images_dir, fname)
    if not os.path.isfile(img_path):
        continue

    records.append({"image_path": img_path, "text": text})

# Fail early with a clear message instead of splitting an empty dataset.
if not records:
    raise FileNotFoundError(f"No image listed in the CSV was found under {images_dir}")

dataset = Dataset.from_list(records)

dataset = dataset.shuffle(seed=seed)
# Hold out 10% of the records for evaluation; the original referenced an
# undefined `test_size`, so the fraction is stated explicitly here.
split = dataset.train_test_split(test_size=0.1, seed=seed)



In [None]:
from transformers import TrOCRProcessor , VisionEncoderDecoderModel
import torch



# Load the processor first, then the model, both from the same checkpoint.
processor, model = (
    loader.from_pretrained('microsoft/trocr-base-handwritten')
    for loader in (TrOCRProcessor, VisionEncoderDecoderModel)
)

preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]