In [None]:


#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
TrOCR-based Glyph Classifier (Optimized for Faster Training on T4 GPU)
=====================================================================

Features:
- Uses **microsoft/trocr-base-handwritten** encoder for glyph classification (75 classes).
- **Train / Validation / Test split = 70 / 20 / 10**.
- **Data augmentation** (flip, rotation, color jitter).
- **Configurable image size** (currently 384 to match TrOCR pretraining; a smaller size such as 256 trains faster).
- **Mixed Precision (AMP)** training for speed & reduced memory.
- **Gradient Accumulation** option to simulate larger batch sizes.
- **Encoder Freeze / Unfreeze** strategy: freeze for first N epochs, then fine-tune.
- **Dual Learning Rates:** low LR for encoder, higher LR for classifier head.
- **Early Stopping** on validation accuracy (patience configurable).
- **Learning Rate Scheduler (ReduceLROnPlateau)** triggered by validation accuracy plateau.
- **Saves:** best_model.pth (best val acc), final_model.pth (last/early-stop), checkpoints, metrics, plots, confusion matrix, classification report.

This script is designed to **reduce training time** on a T4 GPU while retaining the TrOCR backbone.
"""

import os
import json
import random
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from typing import Tuple, Dict, Any

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms

from transformers import VisionEncoderDecoderModel, TrOCRProcessor

# =============================================================
# 0. Configuration
# =============================================================
# Input dataset root and the directory that receives every artifact of this run.
DATA_DIR = "/content/data/Final DATASET"  # <<< update if needed
OUTPUT_DIR = "/content/trocr_glyph_out"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Speed / memory controls ---------------------------------------------------
IMG_SIZE = (384, 384)   # (H, W) passed to Resize; 384 matches TrOCR pretraining, smaller values train faster
BATCH_SIZE = 16         # lower on OOM, raise if GPU memory allows
ACCUM_STEPS = 1         # micro-batches accumulated per optimizer step (>1 simulates a larger batch)
NUM_WORKERS = 2         # DataLoader worker processes; tune for Colab
PIN_MEMORY = True

# --- Training hyper-parameters -------------------------------------------------
EPOCHS = 50
FREEZE_EPOCHS = 5       # encoder stays frozen for the first N epochs (classifier warm-up)
LR_ENCODER = 1e-5       # small LR for the pretrained encoder
LR_CLASSIFIER = 1e-4    # larger LR for the freshly initialised classifier head
PATIENCE = 5            # early stopping: epochs without val_acc improvement
LR_SCHED_FACTOR = 0.5   # ReduceLROnPlateau multiplicative factor
LR_SCHED_PATIENCE = 2   # plateau epochs tolerated before the LR is reduced
MIN_LR = 1e-6
PRINT_EVERY = 100       # training steps between progress lines
SEED = 42

# --- Model ---------------------------------------------------------------------
BASE_MODEL = "microsoft/trocr-base-handwritten"
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# =============================================================
# 1. Label Map
# =============================================================
# Class id -> Tamil glyph. The ids are also the names of the per-class dataset folders
# (GlyphDataset looks for DATA_DIR/<id>).
# NOTE(review): ids 32 and 69 appear to map to the same string, and the label for id 7
# carries a trailing space — confirm both against the dataset before relying on label text.
label_map = {
    0: 'மூ', 1: 'ா', 2: 'ட', 3: 'ம', 4: 'ப', 5: 'ெ', 6: 'ய', 7: 'ல ', 8: 'ன',
    9: 'ற', 10: 'க', 11: 'வ', 12: 'ண', 13: 'த', 14: 'ச', 15: 'ங', 16: 'ள',
    17: 'தி', 18: 'வி', 19: 'றி', 20: 'லி', 21: 'னி', 22: 'யி', 23: 'ழி',
    24: 'பி', 25: 'ரி', 26: 'சி', 27: 'ணி', 28: 'மி', 29: 'கி', 30: 'டு',
    31: 'கு', 32: 'ளு', 33: 'லு', 34: 'மு', 35: 'ணு', 36: 'னு', 37: 'ஞ',
    38: 'பு', 39: 'று', 40: 'ரு', 41: 'சு', 42: 'து', 43: 'வு', 44: 'யு',
    45: 'ழு', 46: 'டி', 47: 'ளி', 48: 'எ', 49: 'ழ', 50: 'கீ', 51: 'றூ',
    52: 'மீ', 53: 'வீ', 54: 'நூ', 55: 'றீ', 56: 'தீ', 57: 'கூ', 58: 'தூ',
    59: 'சூ', 60: 'யீ', 61: 'லூ', 62: 'உ', 63: 'அ', 64: 'ழீ', 65: 'யூ',
    66: 'சீ', 67: 'ணீ', 68: 'ஆ', 69: 'ளு', 70: 'இ', 71: 'ை', 72: 'ர', 73: 'ந', 74: 'ஒ'
}
# Number of classes drives both the classifier width and the folder scan in GlyphDataset.
NUM_CLASSES = len(label_map)

# Persist the mapping next to the model weights. JSON object keys are strings, so a
# reader has to convert them back to int (the inference cell does this).
with open(os.path.join(OUTPUT_DIR, "id2label.json"), "w", encoding="utf-8") as f:
    json.dump(label_map, f, ensure_ascii=False, indent=2)

# =============================================================
# 2. Reproducibility
# =============================================================
def set_seed(seed=SEED):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs with ``seed``.

    cuDNN is configured for speed rather than determinism: ``benchmark`` is
    enabled and ``deterministic`` is disabled.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = False  # True slows but deterministic
    cudnn.benchmark = True


set_seed()
print(f"Using device: {DEVICE}")

# =============================================================
# 3. Processor & Normalization Stats
# =============================================================
# We use the TrOCRProcessor only to pull image mean/std so that our manual transforms
# match the normalization the encoder expects. This avoids per-batch processor overhead.
# The processor is loaded only for its normalization statistics; images are
# transformed with torchvision, not with the processor itself.
processor = TrOCRProcessor.from_pretrained(BASE_MODEL)
image_mean, image_std = (
    processor.image_processor.image_mean,
    processor.image_processor.image_std,
)

# Normalize runs on 3-channel tensors here, so single-value stats are repeated per channel.
if len(image_mean) == 1:  # fallback safety
    image_mean, image_std = image_mean * 3, image_std * 3

# =============================================================
# 4. Dataset
# =============================================================
class GlyphDataset(Dataset):
    """Image-classification dataset read from ``data_dir/<class id>/<image file>``.

    Class ids ``0 .. NUM_CLASSES - 1`` are looked up as sub-directories of
    ``data_dir``; missing class directories are skipped. Files are kept when
    their lower-cased name ends with one of ``IMAGE_EXTENSIONS``.

    Args:
        data_dir: root directory containing one sub-directory per class id.
        transform: optional callable applied to the RGB PIL image in ``__getitem__``.

    Raises:
        RuntimeError: if no image file is found under ``data_dir``.
    """

    IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff")

    def __init__(self, data_dir: str, transform=None):
        self.samples = []
        self.transform = transform
        for label in range(NUM_CLASSES):
            folder = os.path.join(data_dir, str(label))
            # isdir (not exists): a stray file named like a class id must not reach listdir.
            if not os.path.isdir(folder):
                continue
            # os.listdir order is arbitrary; sorting makes the seeded shuffle below
            # (and any later seeded split) reproducible across machines and runs.
            for file in sorted(os.listdir(folder)):
                if file.lower().endswith(self.IMAGE_EXTENSIONS):
                    self.samples.append((os.path.join(folder, file), label))
        random.shuffle(self.samples)
        if len(self.samples) == 0:
            raise RuntimeError(f"No images found in {data_dir}.")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label)``; the image is RGB and transformed if a transform is set."""
        path, label = self.samples[idx]
        # Context manager closes the underlying file once the pixels are converted.
        with Image.open(path) as raw:
            img = raw.convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, label

# Data augmentation pipeline ---------------------------------------------------
# Random augmentations, used for the training split only.
# NOTE(review): a horizontal flip mirrors the glyph; confirm that is desirable for character data.
_train_augmentations = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
]

# Training pipeline: resize -> augment -> tensor -> normalize.
train_transform = transforms.Compose(
    [transforms.Resize(IMG_SIZE)]
    + _train_augmentations
    + [
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std),
    ]
)

# Validation / test pipeline: deterministic, no augmentation.
eval_transform = transforms.Compose(
    [
        transforms.Resize(IMG_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std),
    ]
)

# The base dataset only indexes (path, label) pairs; transforms are attached per split later.
dataset_full = GlyphDataset(DATA_DIR, transform=None)

# We'll apply different transforms per split by wrapping Subset -> custom dataset ----------
class SubsetWithTransform(Dataset):
    """View over a ``torch.utils.data.Subset`` that applies its own transform.

    The wrapped subset's underlying dataset must expose ``samples`` as a
    sequence of ``(path, label)`` pairs; images are opened here, converted to
    RGB and passed through ``transform``.
    """

    def __init__(self, subset, transform):
        self.subset = subset
        self.transform = transform
        self.dataset = subset.dataset  # original dataset holding the (path, label) list
        self.indices = subset.indices  # positions of this split inside that list

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, i):
        path, label = self.dataset.samples[self.indices[i]]
        return self.transform(Image.open(path).convert("RGB")), label

# 70 / 20 / 10 split; the test share takes whatever the two truncations leave over.
n_total = len(dataset_full)
n_train, n_val = int(0.7 * n_total), int(0.2 * n_total)
n_test = n_total - n_train - n_val

train_subset, val_subset, test_subset = random_split(dataset_full, [n_train, n_val, n_test])

# Only the training split is augmented.
train_data = SubsetWithTransform(train_subset, train_transform)
val_data, test_data = (
    SubsetWithTransform(val_subset, eval_transform),
    SubsetWithTransform(test_subset, eval_transform),
)

# Settings shared by all three loaders; only the training loader shuffles.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
train_loader = DataLoader(train_data, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_data, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_data, shuffle=False, **_loader_kwargs)

print(f"Dataset sizes -> Train: {len(train_data)} | Val: {len(val_data)} | Test: {len(test_data)}")

# =============================================================
# 5. Model Definition (TrOCR Encoder + Classifier Head)
# =============================================================
class TrOCRClassifier(nn.Module):
    """Encoder of a pretrained VisionEncoderDecoderModel followed by a linear classifier.

    Args:
        base_model_name: checkpoint name passed to ``VisionEncoderDecoderModel.from_pretrained``.
        num_classes: number of output logits.
    """

    def __init__(self, base_model_name: str, num_classes: int):
        super().__init__()
        # Only the encoder half of the checkpoint is kept on the module.
        self.encoder = VisionEncoderDecoderModel.from_pretrained(base_model_name).encoder
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_classes)

    def forward(self, pixel_values):
        """Return class logits: mean over dim 1 of the encoder's last hidden state, then linear."""
        token_states = self.encoder(pixel_values=pixel_values).last_hidden_state
        return self.classifier(token_states.mean(dim=1))

model = TrOCRClassifier(BASE_MODEL, NUM_CLASSES).to(DEVICE)

# Start with the encoder frozen; train_model() unfreezes it after FREEZE_EPOCHS.
model.encoder.requires_grad_(False)

# Two parameter groups so encoder and head can use different learning rates.
# The encoder group holds only parameters that currently require gradients.
_trainable_encoder_params = list(filter(lambda p: p.requires_grad, model.encoder.parameters()))
optimizer = torch.optim.Adam(
    [
        {"params": _trainable_encoder_params, "lr": LR_ENCODER},
        {"params": model.classifier.parameters(), "lr": LR_CLASSIFIER},
    ]
)

criterion = nn.CrossEntropyLoss()

# Scheduler is stepped once per epoch with validation accuracy, hence mode="max".
# NOTE(review): `verbose` is presumably deprecated in recent PyTorch releases — confirm for the installed version.
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="max",
    factor=LR_SCHED_FACTOR,
    patience=LR_SCHED_PATIENCE,
    min_lr=MIN_LR,
    verbose=True,
)

# Loss scaler for mixed precision; disabled when CUDA is unavailable.
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

# =============================================================
# 6. Training Loop with Early Stopping, Freeze/Unfreeze, AMP, Accumulation
# =============================================================

def run_epoch(loader, train_mode: bool, epoch: int) -> Tuple[float, float]:
    """Run one pass over ``loader`` and return ``(mean loss, accuracy)``.

    Uses the module-level ``model``, ``criterion``, ``optimizer`` and ``scaler``.

    Args:
        loader: iterable yielding ``(images, labels)`` batches.
        train_mode: if True the model is put in train mode and the optimizer is
            stepped every ``ACCUM_STEPS`` batches; if False the model is put in
            eval mode and gradients are disabled.
        epoch: epoch index supplied by the caller; not used inside this function.

    Returns:
        Tuple of sample-weighted mean loss and accuracy (fraction in [0, 1]).
        Both are 0 when the loader yields no samples.
    """
    if train_mode:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    correct = 0
    total = 0
    # True while gradients have been accumulated but not yet applied.
    pending_grads = False

    if train_mode:
        optimizer.zero_grad(set_to_none=True)

    for step, (imgs, labels) in enumerate(loader):
        imgs = imgs.to(DEVICE, non_blocking=True)
        labels = labels.to(DEVICE, non_blocking=True)

        with torch.set_grad_enabled(train_mode):
            with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
                logits = model(imgs)
                loss = criterion(logits, labels)

            if train_mode:
                # Divide so the accumulated gradient averages over the micro-batches.
                scaler.scale(loss / ACCUM_STEPS).backward()
                pending_grads = True
                if (step + 1) % ACCUM_STEPS == 0:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad(set_to_none=True)
                    pending_grads = False

        # metrics
        total_loss += loss.item() * imgs.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        if train_mode and (step + 1) % PRINT_EVERY == 0:
            print(f"  step {step+1}/{len(loader)} | loss {loss.item():.4f}")

    # Flush a trailing partial accumulation group. Without this, when the number
    # of batches is not a multiple of ACCUM_STEPS the last gradients are never
    # applied and are discarded by the next epoch's zero_grad.
    if train_mode and pending_grads:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

    avg_loss = total_loss / max(total, 1)
    acc = correct / max(total, 1)
    return avg_loss, acc


def train_model():
    """Train with a frozen-encoder warm-up, LR-on-plateau scheduling and early stopping.

    Uses the module-level ``model``, ``optimizer``, ``lr_scheduler`` and data
    loaders. At epoch index ``FREEZE_EPOCHS`` the encoder is unfrozen and the
    optimizer's parameter groups are rebuilt with the configured learning rates.

    Side effects:
        Writes ``best_model.pth`` (whenever validation accuracy improves),
        ``final_model.pth`` and ``training_history.csv`` into ``OUTPUT_DIR``.

    Returns:
        ``(history, best_val_acc)`` where ``history`` maps metric names to
        per-epoch lists and ``best_val_acc`` is a fraction in [0, 1].
    """
    # Sentinel below any real accuracy: the first completed epoch always writes
    # best_model.pth, so the later load cannot fail even if val_acc stays at 0.0.
    best_val_acc = -1.0
    patience_counter = 0
    history = {"epoch": [], "train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}

    for epoch in range(EPOCHS):
        # Unfreeze encoder at start of fine-tuning phase -----------------------
        if epoch == FREEZE_EPOCHS:
            print("Unfreezing encoder for fine-tuning...")
            for p in model.encoder.parameters():
                p.requires_grad = True
            # Rebuild the two parameter groups in place so the scheduler, which
            # holds a reference to this optimizer, keeps working.
            optimizer.param_groups.clear()
            optimizer.add_param_group({"params": model.encoder.parameters(), "lr": LR_ENCODER})
            optimizer.add_param_group({"params": model.classifier.parameters(), "lr": LR_CLASSIFIER})

        train_loss, train_acc = run_epoch(train_loader, train_mode=True, epoch=epoch)
        val_loss, val_acc = run_epoch(val_loader, train_mode=False, epoch=epoch)

        history["epoch"].append(epoch + 1)
        history["train_loss"].append(train_loss)
        history["train_acc"].append(train_acc)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        print(f"Epoch {epoch+1:03d}/{EPOCHS} | TLoss {train_loss:.4f} Acc {train_acc*100:5.2f}% | VLoss {val_loss:.4f} Acc {val_acc*100:5.2f}%")

        # LR Scheduler step ---------------------------------------------------
        lr_scheduler.step(val_acc)

        # Early stopping ------------------------------------------------------
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "best_model.pth"))
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print("Early stopping triggered!")
                break

    # Always save final model (may be early-stopped)
    torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "final_model.pth"))

    # Save history ------------------------------------------------------------
    import pandas as pd
    hist_path = os.path.join(OUTPUT_DIR, "training_history.csv")
    pd.DataFrame(history).to_csv(hist_path, index=False)

    # Clamp so the sentinel never leaks to callers when no epoch ran.
    return history, max(best_val_acc, 0.0)


history, best_val_acc = train_model()
print(f"Best validation accuracy: {best_val_acc*100:.2f}%")

# =============================================================
# 7. Test Evaluation
# =============================================================
# Evaluate the checkpoint with the best validation accuracy, not the last epoch's weights.
best_weights = torch.load(os.path.join(OUTPUT_DIR, "best_model.pth"), map_location=DEVICE)
model.load_state_dict(best_weights)
model.to(DEVICE)
model.eval()

all_labels, all_preds = [], []

with torch.no_grad():
    for imgs, labels in test_loader:
        imgs = imgs.to(DEVICE, non_blocking=True)
        logits = model(imgs)
        preds = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

# Metrics: overall accuracy plus macro-averaged precision / recall / F1 -----------
_macro_kwargs = dict(average='macro', zero_division=0)
acc = accuracy_score(all_labels, all_preds)
prec = precision_score(all_labels, all_preds, **_macro_kwargs)
rec = recall_score(all_labels, all_preds, **_macro_kwargs)
f1 = f1_score(all_labels, all_preds, **_macro_kwargs)

print("\nTest Set Evaluation:")
print(f"Accuracy       : {acc*100:.2f}%")
print(f"Precision (avg): {prec:.4f}")
print(f"Recall    (avg): {rec:.4f}")
print(f"F1 Score  (avg): {f1:.4f}")

# Classification report -------------------------------------------------------
# Report over every class id, whether or not it occurs in this test split.
# Without `labels=`, a class missing from both labels and predictions makes the
# number of target names disagree with the number of classes found.
_class_ids = list(range(NUM_CLASSES))
_raw_names = [label_map[i] for i in _class_ids]
# With output_dict=True the result is keyed by target name, so duplicate names
# would collapse into a single row; suffix the class id on names that repeat.
_report_names = [
    f"{name} [{idx}]" if _raw_names.count(name) > 1 else name
    for idx, name in zip(_class_ids, _raw_names)
]
_report_kwargs = dict(labels=_class_ids, target_names=_report_names, zero_division=0)

# Plain-text report -------------------------------------------------------------
report = classification_report(all_labels, all_preds, **_report_kwargs)

with open(os.path.join(OUTPUT_DIR, "classification_report.txt"), "w", encoding="utf-8") as f:
    f.write(report)

# Same report as a table, one row per class plus the summary rows -----------------
import pandas as pd
rep_dict = classification_report(all_labels, all_preds, output_dict=True, **_report_kwargs)
pd.DataFrame(rep_dict).transpose().to_csv(os.path.join(OUTPUT_DIR, "classification_report.csv"), index=True)

# Confusion Matrix ------------------------------------------------------------
# Confusion matrix heat-map over all class ids ----------------------------------
# NOTE(review): the repeated tight_layout/savefig warnings in the cell output are
# presumably missing-glyph warnings for the Tamil tick labels — confirm and set a
# font that covers Tamil if so.
cm = confusion_matrix(all_labels, all_preds, labels=list(range(NUM_CLASSES)))
plt.figure(figsize=(12, 10))
plt.imshow(cm, cmap='Blues', aspect='auto')
plt.title('Confusion Matrix - Test Set')
plt.colorbar()
step = max(1, NUM_CLASSES // 25)  # thin the ticks so labels stay readable
_tick_positions = range(0, NUM_CLASSES, step)
_tick_labels = [label_map[i] for i in _tick_positions]
plt.xticks(_tick_positions, _tick_labels, rotation=90)
plt.yticks(_tick_positions, _tick_labels)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=150)
plt.close()

# Training curves: one figure for loss, one for accuracy (in percent) -------------
_curve_specs = (
    (
        "loss_curve.png",
        'Loss',
        (('Train Loss', history['train_loss']), ('Val Loss', history['val_loss'])),
    ),
    (
        "accuracy_curve.png",
        'Accuracy (%)',
        (
            ('Train Acc', np.array(history['train_acc'])*100),
            ('Val Acc', np.array(history['val_acc'])*100),
        ),
    ),
)
for _file_name, _y_label, _series in _curve_specs:
    plt.figure(figsize=(6,4))
    for _legend, _values in _series:
        plt.plot(history['epoch'], _values, label=_legend)
    plt.xlabel('Epoch')
    plt.ylabel(_y_label)
    plt.legend()
    plt.grid(True, ls='--', alpha=0.4)
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, _file_name), dpi=150)
    plt.close()

print("\nDone. Outputs saved in:", OUTPUT_DIR)


Using device: cuda
Dataset sizes -> Train: 10796 | Val: 3084 | Test: 1543


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


  step 100/675 | loss 4.2788
  step 200/675 | loss 4.1865
  step 300/675 | loss 4.1670
  step 400/675 | loss 4.0897
  step 500/675 | loss 4.1143
  step 600/675 | loss 4.0310
Epoch 001/50 | TLoss 4.1636 Acc  5.24% | VLoss 3.9803 Acc 10.96%
  step 100/675 | loss 3.9023
  step 200/675 | loss 3.8717
  step 300/675 | loss 3.8628
  step 400/675 | loss 3.9113
  step 500/675 | loss 3.9352
  step 600/675 | loss 3.7052
Epoch 002/50 | TLoss 3.8524 Acc 16.98% | VLoss 3.6889 Acc 24.03%
  step 100/675 | loss 3.6724
  step 200/675 | loss 3.4314
  step 300/675 | loss 3.7306
  step 400/675 | loss 3.6039
  step 500/675 | loss 3.5320
  step 600/675 | loss 3.5645
Epoch 003/50 | TLoss 3.6004 Acc 27.51% | VLoss 3.4531 Acc 33.14%
  step 100/675 | loss 3.3826
  step 200/675 | loss 3.3110
  step 300/675 | loss 3.6094
  step 400/675 | loss 3.4567
  step 500/675 | loss 3.5571
  step 600/675 | loss 3.2726
Epoch 004/50 | TLoss 3.3933 Acc 34.09% | VLoss 3.2599 Acc 35.51%
  step 100/675 | loss 3.3281
  step 200/675 

  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=150)
  plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=150)
  plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=150)
  plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=150)
  plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=150)
  plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=150)
  plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.p


Done. Outputs saved in: /content/trocr_glyph_out


In [None]:
# Extract the test-datasets archive stored on Google Drive into /content/testdata (-q: quiet).
# NOTE(review): the archive path is under /content/drive, so Drive presumably has to be
# mounted before this cell runs — confirm the intended cell order.
!unzip -q "/content/drive/MyDrive/Colab Notebooks/test-datasets.zip" -d /content/testdata

In [1]:
# Mount Google Drive at /content/drive; other cells read archives and the model from there.
# NOTE(review): the saved output of this cell is "ValueError: mount failed" — re-run and
# clear the stale error before sharing the notebook.
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
# Extract the training archive from Google Drive into /content/data (-q: quiet).
# NOTE(review): the training and split cells read "/content/data/Final DATASET", presumably
# the top-level folder inside this archive — confirm.
!unzip -q "/content/drive/MyDrive/Colab Notebooks/DATASET200.zip" -d /content/data

In [None]:
import os
import shutil
import random
from tqdm import tqdm

source_dir = '/content/data/Final DATASET'  # original dataset folder with subfolders 0 to 74
target_base = '/content/drive/MyDrive/split_dataset'  # target folder for train/val/test

train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1  # informational: the test split receives whatever train and val leave over
split_seed = 42   # fixed seed so re-running the cell reproduces the same assignment

# Only real directories are classes; stray files in source_dir are ignored.
# Sorted because os.listdir order is arbitrary and would change the seeded shuffle.
class_names = sorted(
    name for name in os.listdir(source_dir)
    if os.path.isdir(os.path.join(source_dir, name))
)

# Create target folders
for split in ['train', 'val', 'test']:
    os.makedirs(os.path.join(target_base, split), exist_ok=True)
    for class_name in class_names:
        os.makedirs(os.path.join(target_base, split, class_name), exist_ok=True)

# Private RNG: the result does not depend on how much of the global `random`
# stream earlier cells consumed. An unseeded shuffle would, on a re-run, copy
# the same file into a different split and leak data between train and test.
split_rng = random.Random(split_seed)

# For each class folder
for class_name in tqdm(class_names, desc="Splitting data"):
    class_path = os.path.join(source_dir, class_name)

    # Regular files only; sub-directories cannot be copied with shutil.copy.
    images = sorted(
        entry for entry in os.listdir(class_path)
        if os.path.isfile(os.path.join(class_path, entry))
    )
    split_rng.shuffle(images)

    total = len(images)
    train_end = int(train_ratio * total)
    val_end = train_end + int(val_ratio * total)

    split_files = (
        ('train', images[:train_end]),
        ('val', images[train_end:val_end]),
        ('test', images[val_end:]),
    )
    for split, file_names in split_files:
        for img in file_names:
            shutil.copy(os.path.join(class_path, img), os.path.join(target_base, split, class_name, img))


Splitting data: 100%|██████████| 75/75 [02:05<00:00,  1.67s/it]


In [None]:
# Count the immediate sub-directories (class folders) of the source dataset.
!find "/content/data/Final DATASET" -mindepth 1 -maxdepth 1 -type d | wc -l


75


In [None]:
# Count the class folders of the test split.
# NOTE(review): the split cell writes to /content/drive/MyDrive/split_dataset, while this
# path is /content/split_dataset — confirm which location is meant.
!find "/content/split_dataset/test" -mindepth 1 -maxdepth 1 -type d | wc -l


75


In [None]:
import torch
from PIL import Image
from torchvision import transforms
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
import json
import os

# =========================
# Configuration
# =========================
# Artifacts written by the training cell: weights and the id -> label mapping.
OUTPUT_DIR = "/content/trocr_glyph_out"  # directory containing best_model.pth and id2label.json
MODEL_PATH = os.path.join(OUTPUT_DIR, "best_model.pth")
BASE_MODEL = "microsoft/trocr-base-handwritten"
IMG_SIZE = (384, 384)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# =========================
# Load label map
# =========================
# JSON object keys are strings; convert them back to the integer class ids.
with open(os.path.join(OUTPUT_DIR, "id2label.json"), encoding="utf-8") as f:
    label_map = {int(class_id): glyph for class_id, glyph in json.load(f).items()}

# =========================
# Define the Model
# =========================
class TrOCRClassifier(torch.nn.Module):
    """Pretrained VisionEncoderDecoderModel encoder with a linear classification head.

    Attribute names (``encoder``, ``classifier``) define the state-dict keys,
    so they must match the class that produced the saved weights.
    """

    def __init__(self, base_model_name: str, num_classes: int):
        super().__init__()
        self.encoder = VisionEncoderDecoderModel.from_pretrained(base_model_name).encoder
        self.classifier = torch.nn.Linear(self.encoder.config.hidden_size, num_classes)

    def forward(self, pixel_values):
        """Return class logits from the mean (over dim 1) of the encoder's last hidden state."""
        token_states = self.encoder(pixel_values=pixel_values).last_hidden_state
        logits = self.classifier(token_states.mean(dim=1))
        return logits

# Rebuild the architecture, then restore the trained weights onto the target device.
model = TrOCRClassifier(BASE_MODEL, len(label_map))
_state_dict = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(_state_dict)
model.to(device)
model.eval()

# =========================
# Image Preprocessing
# =========================
# Normalization statistics come from the pretrained processor.
processor = TrOCRProcessor.from_pretrained(BASE_MODEL)
image_mean, image_std = (
    processor.image_processor.image_mean,
    processor.image_processor.image_std,
)

# Repeat single-value stats for the three RGB channels.
if len(image_mean) == 1:
    image_mean, image_std = image_mean * 3, image_std * 3

transform = transforms.Compose(
    [
        transforms.Resize(IMG_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std),
    ]
)

# =========================
# Prediction Function
# =========================
def predict_image(image_path: str):
    """Classify one image file and return its label string.

    The image is opened as RGB, passed through the module-level ``transform``
    and fed to ``model`` as a batch of one. If the predicted index has no entry
    in ``label_map`` the string ``"Unknown (<index>)"`` is returned instead.
    """
    batch = transform(Image.open(image_path).convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        pred_idx = model(batch).argmax(dim=1).item()
    if pred_idx in label_map:
        return label_map[pred_idx]
    return f"Unknown ({pred_idx})"

# =========================
# Example Usage
# =========================
if __name__ == "__main__":
    img_path = "/content/t1.png"  # Replace with path to test image
    pred_char = predict_image(img_path)
    print(f"Predicted Character: {pred_char}")


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted Character: ண


In [None]:
import os
import json
import torch
import numpy as np
from PIL import Image
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import matplotlib.pyplot as plt




# Configuration
# NOTE(review): the split cell writes to /content/drive/MyDrive/split_dataset while DATA_DIR
# points at /content/split_dataset — confirm which copy is meant to be evaluated.
DATA_DIR = "/content/split_dataset/test"  # <<< UPDATE this to your separate test dataset path
MODEL_PATH = "/content/drive/MyDrive/Tamil NLP Project/TR OCR/best_model.pth"       # <<< UPDATE this to saved model path
OUTPUT_DIR = "/content/drive/MyDrive/eval_output"          # <<< OUTPUT directory
os.makedirs(OUTPUT_DIR, exist_ok=True)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 16

# Label Map (must match original training): class id -> Tamil glyph.
# NOTE(review): ids 32 and 69 appear to map to the same string, and the label for id 7
# carries a trailing space — confirm against the training label map.
label_map = {
    0: 'மூ', 1: 'ா', 2: 'ட', 3: 'ம', 4: 'ப', 5: 'ெ', 6: 'ய', 7: 'ல ', 8: 'ன',
    9: 'ற', 10: 'க', 11: 'வ', 12: 'ண', 13: 'த', 14: 'ச', 15: 'ங', 16: 'ள',
    17: 'தி', 18: 'வி', 19: 'றி', 20: 'லி', 21: 'னி', 22: 'யி', 23: 'ழி',
    24: 'பி', 25: 'ரி', 26: 'சி', 27: 'ணி', 28: 'மி', 29: 'கி', 30: 'டு',
    31: 'கு', 32: 'ளு', 33: 'லு', 34: 'மு', 35: 'ணு', 36: 'னு', 37: 'ஞ',
    38: 'பு', 39: 'று', 40: 'ரு', 41: 'சு', 42: 'து', 43: 'வு', 44: 'யு',
    45: 'ழு', 46: 'டி', 47: 'ளி', 48: 'எ', 49: 'ழ', 50: 'கீ', 51: 'றூ',
    52: 'மீ', 53: 'வீ', 54: 'நூ', 55: 'றீ', 56: 'தீ', 57: 'கூ', 58: 'தூ',
    59: 'சூ', 60: 'யீ', 61: 'லூ', 62: 'உ', 63: 'அ', 64: 'ழீ', 65: 'யூ',
    66: 'சீ', 67: 'ணீ', 68: 'ஆ', 69: 'ளு', 70: 'இ', 71: 'ை', 72: 'ர', 73: 'ந', 74: 'ஒ'
}
NUM_CLASSES = len(label_map)
# Glyph -> class id. When two ids share a glyph string only the later id survives the
# inversion. Not referenced in the visible part of this cell.
reverse_label_map = {v: k for k, v in label_map.items()}

# Check class coverage from the test directory itself. At this point in the cell
# `all_labels` has not been assigned yet, so reading it would either raise
# NameError on a fresh kernel or silently reuse labels left over from another cell.
# A class counts as present when DATA_DIR/<id> exists and holds at least one file
# with an image extension accepted by the training dataset.
_image_exts = ('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')
present_classes = {
    label
    for label in range(NUM_CLASSES)
    if os.path.isdir(os.path.join(DATA_DIR, str(label)))
    and any(
        fname.lower().endswith(_image_exts)
        for fname in os.listdir(os.path.join(DATA_DIR, str(label)))
    )
}
expected_classes = set(range(NUM_CLASSES))
missing_classes = expected_classes - present_classes

if missing_classes:
    print(" Missing label indices in test set:", missing_classes)
    print(" Missing characters:", [label_map[i] for i in missing_classes])
else:
    print(" All 75 classes are present in the test set.")

# Preprocessing
# Normalization statistics come from the pretrained processor so evaluation
# inputs are scaled the same way as during training.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
image_mean = processor.image_processor.image_mean
image_std = processor.image_processor.image_std

# Same fallback as the training and inference cells: Normalize runs on 3-channel
# tensors, so single-value stats are repeated per channel.
if len(image_mean) == 1:
    image_mean = image_mean * 3
    image_std = image_std * 3

# Deterministic pipeline (no augmentation): resize -> tensor -> normalize.
eval_transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    transforms.Normalize(mean=image_mean, std=image_std),
])


# Dataset
class GlyphTestDataset(Dataset):
    """Evaluation dataset read from ``root_dir/<class id>/<image file>``.

    Class ids ``0 .. NUM_CLASSES - 1`` are looked up as sub-directories of
    ``root_dir``; missing ones are skipped. Samples are ordered by class id and
    then by file name.

    Args:
        root_dir: directory containing one sub-directory per class id.
        transform: optional callable applied to the RGB PIL image.
    """

    # Same extensions the training dataset accepts, so images the model was
    # trained on are not silently dropped at evaluation time.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')

    def __init__(self, root_dir, transform=None):
        self.samples = []
        for label in range(NUM_CLASSES):
            label_dir = os.path.join(root_dir, str(label))
            if not os.path.isdir(label_dir):
                continue
            # Sorted: os.listdir order is arbitrary, and a stable sample order
            # keeps per-sample outputs comparable between runs.
            for fname in sorted(os.listdir(label_dir)):
                if fname.lower().endswith(self.IMAGE_EXTENSIONS):
                    self.samples.append((os.path.join(label_dir, fname), label))
        self.transform = transform

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label)``; the image is RGB and transformed if a transform is set."""
        img_path, label = self.samples[idx]
        # Context manager closes the underlying file once the pixels are converted.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, label

# Model
class TrOCRClassifier(torch.nn.Module):
    """Pretrained VisionEncoderDecoderModel encoder followed by a linear classifier.

    The attribute names ``encoder`` and ``classifier`` determine the state-dict
    keys and therefore have to match the class used when the weights were saved.
    """

    def __init__(self, base_model_name, num_classes):
        super().__init__()
        pretrained = VisionEncoderDecoderModel.from_pretrained(base_model_name)
        self.encoder = pretrained.encoder
        hidden_size = self.encoder.config.hidden_size
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, pixel_values):
        """Return class logits from the mean (over dim 1) of the encoder's last hidden state."""
        last_hidden = self.encoder(pixel_values=pixel_values).last_hidden_state
        return self.classifier(last_hidden.mean(dim=1))

# Load model
# Rebuild the architecture on the target device, then restore the trained weights.
model = TrOCRClassifier("microsoft/trocr-base-handwritten", NUM_CLASSES).to(DEVICE)
_state_dict = torch.load(MODEL_PATH, map_location=DEVICE)
model.load_state_dict(_state_dict)
model.eval()

# Inference: collect predicted and true class ids over the whole test directory.
dataset = GlyphTestDataset(DATA_DIR, transform=eval_transform)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)
all_preds = []
all_labels = []

with torch.no_grad():
    for imgs, labels in loader:
        imgs = imgs.to(DEVICE)
        logits = model(imgs)
        preds = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

# With output_dict=True the report is keyed by target name, so two classes that
# share a label string would collapse into a single row. Names that occur more
# than once get their class id appended; unique names are left unchanged.
_class_ids = list(range(NUM_CLASSES))
_raw_names = [label_map[i] for i in _class_ids]
_report_names = [
    f"{name} [{idx}]" if _raw_names.count(name) > 1 else name
    for idx, name in zip(_class_ids, _raw_names)
]

report_dict = classification_report(
    all_labels,
    all_preds,
    labels=_class_ids,  # Force all 75 classes
    target_names=_report_names,
    zero_division=0,
    output_dict=True
)

# One row per class plus the summary rows.
report_df = pd.DataFrame(report_dict).transpose()
report_df.to_csv(os.path.join(OUTPUT_DIR, "per_class_metrics.csv"))

# Force Pandas to show all rows (disable truncation)
pd.set_option('display.max_rows', None)

print("\nPer-Class Metrics (saved to CSV):")
print(report_df[['precision', 'recall', 'f1-score', 'support']])

# Optional: Confusion matrix
# Confusion matrix over every class id, saved as a heat-map image.
_cm_labels = list(range(NUM_CLASSES))
cm = confusion_matrix(all_labels, all_preds, labels=_cm_labels)
_cm_path = os.path.join(OUTPUT_DIR, "confusion_matrix.png")

plt.figure(figsize=(12, 10))
plt.imshow(cm, cmap='Blues', aspect='auto')
plt.title('Confusion Matrix')
plt.colorbar()
plt.tight_layout()
plt.savefig(_cm_path)
plt.close()  # release the figure; it is only needed on disk

print(f"\n Evaluation Complete. Results saved to: {OUTPUT_DIR}")


 All 75 classes are present in the test set.


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Per-Class Metrics (saved to CSV):
              precision    recall  f1-score      support
மூ             1.000000  1.000000  1.000000    20.000000
ா              1.000000  0.950000  0.974359    40.000000
ட              1.000000  1.000000  1.000000    20.000000
ம              1.000000  0.960000  0.979592    25.000000
ப              1.000000  1.000000  1.000000    22.000000
ெ              1.000000  1.000000  1.000000    28.000000
ய              1.000000  1.000000  1.000000    20.000000
ல              1.000000  1.000000  1.000000    21.000000
ன              1.000000  1.000000  1.000000    25.000000
ற              1.000000  0.950000  0.974359    20.000000
க              1.000000  0.960000  0.979592    25.000000
வ              0.952381  1.000000  0.975610    20.000000
ண              1.000000  1.000000  1.000000    20.000000
த              1.000000  1.000000  1.000000    20.000000
ச              0.950000  0.950000  0.950000    20.000000
ங              1.000000  1.000000  1.000000    20.000

In [None]:
# TROCR
# Section marker. As a bare identifier this cell raised NameError when executed;
# a markdown heading cell would be the better home for it.

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m434.2 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
