# In [1]:  (notebook cell marker)
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image, ImageTk
import os
import csv
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from typing import Dict, Tuple

class Config:
    """Static settings for the CRNN handwriting recogniser."""

    # Alphabet the recogniser can emit: lower case, upper case, digits, punctuation.
    CHARS = (
        "abcdefghijklmnopqrstuvwxyz"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "0123456789"
        " .,-?!&"
    )
    # One extra class on top of the alphabet (index 0 is used for the CTC blank).
    VOCAB_SIZE = 1 + len(CHARS)

    # Size every input image is resized to, and the recurrent hidden width.
    IMG_HEIGHT, IMG_WIDTH = 64, 256
    HIDDEN_SIZE = 256

    # The default checkpoint is looked up in the current working directory.
    BASE_DIR = os.getcwd()
    MINI_MODEL_PATH = os.path.join(BASE_DIR, "htr_crnn_mini.pth")

def create_char_to_int_mapping(chars: str) -> Tuple[Dict[str, int], Dict[int, str]]:
    """Build the character <-> class-index lookup tables.

    Characters are numbered from 1 in the order they appear in ``chars``;
    index 0 is reserved for the CTC blank.

    Args:
        chars: The alphabet, one symbol per character.

    Returns:
        ``(char_to_int, int_to_char)``. ``char_to_int`` additionally maps the
        key ``'CTC_BLANK'`` to 0, and ``int_to_char`` maps 0 to the empty string.
    """
    char_to_int: Dict[str, int] = {}
    int_to_char: Dict[int, str] = {}
    for index, symbol in enumerate(chars, start=1):
        char_to_int[symbol] = index
        int_to_char[index] = symbol

    # The blank entries are added last, after every real symbol.
    char_to_int['CTC_BLANK'] = 0
    int_to_char[0] = ''
    return char_to_int, int_to_char

# Module-level lookup tables for the alphabet in Config.CHARS, built once at import time.
CHAR_TO_INT, INT_TO_CHAR = create_char_to_int_mapping(Config.CHARS)

class CRNN(nn.Module):
    """Convolutional-recurrent network producing per-timestep log-probabilities.

    A stack of 3x3 convolutions (each followed by batch norm and ReLU) reduces
    the image, adaptive average pooling collapses the remaining height to 1,
    every resulting column is projected to ``hidden_size`` features, fed
    through a 3-layer bidirectional LSTM, and classified into ``vocab_size``
    classes.

    Args:
        img_height: Accepted but not used by any layer in this class.
        vocab_size: Number of output classes per timestep.
        hidden_size: LSTM hidden width and size of the CNN-to-LSTM projection.
    """

    def __init__(self, img_height, vocab_size, hidden_size):
        super().__init__()

        def conv_block(c_in, c_out):
            # 3x3 convolution with padding 1 -> batch norm -> ReLU.
            return [
                nn.Conv2d(c_in, c_out, 3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
            ]

        # The layer order (and therefore each layer's index inside the
        # Sequential) must stay fixed: state-dict keys are derived from it.
        layers = []
        layers += conv_block(3, 64)
        layers.append(nn.MaxPool2d(2, 2))
        layers += conv_block(64, 128)
        layers.append(nn.MaxPool2d(2, 2))
        layers += conv_block(128, 256)
        layers += conv_block(256, 256)
        layers.append(nn.MaxPool2d((2, 1), (2, 1)))  # halves height only
        layers += conv_block(256, 512)
        layers += conv_block(512, 512)
        layers.append(nn.AdaptiveAvgPool2d((1, None)))  # height -> 1, width kept
        self.cnn = nn.Sequential(*layers)

        self.map_to_rnn = nn.Linear(512, hidden_size)
        self.rnn = nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers=3,
            bidirectional=True,
            dropout=0.3,
            batch_first=False,
        )
        # Forward and backward LSTM outputs are concatenated, hence 2 * hidden.
        self.linear = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, x):
        """Return log-softmax scores laid out as (width steps, batch, vocab)."""
        features = self.cnn(x).squeeze(2)      # drop the height axis pooled to 1
        columns = features.transpose(1, 2)     # channels last, one row per column
        sequence = self.map_to_rnn(columns).transpose(0, 1)  # time-major for the LSTM
        recurrent, _ = self.rnn(sequence)
        logits = self.linear(recurrent)
        return nn.functional.log_softmax(logits, dim=2)

def decode_ctc(output: torch.Tensor, int_to_char: Dict[int, str]) -> tuple:
    """Greedy CTC decoding of one sequence of per-frame log-probabilities.

    The best class of every frame is taken; a frame emits a character only
    when that class is not the blank (index 0) and differs from the previous
    frame's class.

    Args:
        output: 2-D tensor of log-probabilities, frames along dim 0 and
            classes along dim 1.
        int_to_char: Class index -> character; unknown indices decode to '?'.

    Returns:
        ``(text, confidence)`` where ``confidence`` is the mean best-class
        probability over the emitting frames, or 0.0 when nothing is emitted.
    """
    frame_classes = output.argmax(dim=1).cpu().tolist()
    frame_probs = output.exp().max(dim=1).values

    pieces, confidences = [], []
    previous = -1
    for position, current in enumerate(frame_classes):
        is_blank = current == 0
        is_repeat = current == previous
        previous = current
        if is_blank or is_repeat:
            continue
        pieces.append(int_to_char.get(current, '?'))
        confidences.append(frame_probs[position].item())

    if confidences:
        mean_conf = sum(confidences) / len(confidences)
    else:
        mean_conf = 0.0
    return "".join(pieces), mean_conf

def predict_handwritten_text(image_path: str, model: nn.Module, device) -> tuple:
    """Recognise the text in one image file with the given model.

    The image is converted to RGB, resized to ``Config.IMG_HEIGHT`` x
    ``Config.IMG_WIDTH``, normalised with mean 0.5 / std 0.5 per channel and
    run through ``model`` as a batch of one; the result is decoded greedily.

    Args:
        image_path: Path of the image to read.
        model: Network called on the preprocessed batch; its output is passed
            to ``decode_ctc`` after dim 1 is squeezed.
        device: Device the input tensor is moved to.

    Returns:
        ``(text, confidence)`` as produced by ``decode_ctc``.
    """
    steps = [
        transforms.Resize((Config.IMG_HEIGHT, Config.IMG_WIDTH)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
    preprocess = transforms.Compose(steps)

    rgb_image = Image.open(image_path).convert("RGB")
    batch = preprocess(rgb_image).unsqueeze(0).to(device)  # add the batch axis

    with torch.no_grad():
        log_probs = model(batch).squeeze(1)  # drop the size-1 batch axis
        text, conf = decode_ctc(log_probs, INT_TO_CHAR)
    return text, conf

class HTR_GUI:
    """Tkinter window for running the CRNN handwriting recogniser on one image.

    The user picks a ``.pth`` checkpoint and an image; pressing "Predict Text"
    writes the decoded text and its confidence into the output box.
    """

    # Patterns are kept as a tuple (Tk's pattern-list form). A single
    # ';'-joined string is only understood by the native Windows dialog and
    # can hide every image on other platforms.
    IMAGE_PATTERNS = ("*.png", "*.jpg", "*.jpeg", "*.bmp")
    # Bounding box of the in-window preview, in pixels.
    PREVIEW_SIZE = (400, 300)

    def __init__(self, root):
        """Configure the root window, initialise state and build the widgets."""
        self.root = root
        self.root.title("HTR CRNN - Handwriting Recognition")
        self.root.geometry("900x650")
        self.root.resizable(False, False)
        self.model_path = tk.StringVar()
        self.model_path.set(Config.MINI_MODEL_PATH)
        self.image_path = None
        # (cache key, model) of the most recently loaded checkpoint.
        self._model_cache = None
        self.create_widgets()

    def create_widgets(self):
        """Create and pack every widget of the window, top to bottom."""
        tk.Label(self.root, text="Handwritten Text Recognition (CRNN)", font=("Arial", 18, "bold")).pack(pady=10)
        frame_model = tk.Frame(self.root)
        frame_model.pack(pady=10)
        tk.Label(frame_model, text="Select Model:", font=("Arial", 12)).pack(side=tk.LEFT)
        self.model_combo = ttk.Combobox(frame_model, width=50, textvariable=self.model_path, values=[Config.MINI_MODEL_PATH, "Select custom model"])
        self.model_combo.pack(side=tk.LEFT, padx=10)
        self.model_combo.bind("<<ComboboxSelected>>", self.select_model)
        frame_img = tk.Frame(self.root)
        frame_img.pack(pady=10)
        self.img_label = tk.Label(frame_img, text="No Image Selected", width=60, height=15, bg="#f0f0f0")
        self.img_label.pack()
        self.path_label = tk.Label(self.root, text="", font=("Arial", 10), fg="gray")
        self.path_label.pack(pady=5)
        tk.Button(self.root, text="Select Image", command=self.select_image, font=("Arial", 12), width=20).pack(pady=10)
        tk.Button(self.root, text="Predict Text", command=self.predict, font=("Arial", 14), width=20, bg="#4CAF50", fg="white").pack(pady=15)
        self.output_box = tk.Text(self.root, height=8, width=80, font=("Courier", 12))
        self.output_box.pack(pady=10)

    def select_model(self, event=None):
        """Ask for a checkpoint file when the "Select custom model" entry is chosen."""
        if self.model_combo.get() == "Select custom model":
            model = filedialog.askopenfilename(title="Choose Model File", filetypes=[("PyTorch Model", "*.pth")])
            if model:
                self.model_path.set(model)

    def open_full_image(self, img_path):
        """Show the image at its original size in a separate window."""
        # Decode first so a failure does not leave an empty window behind.
        with Image.open(img_path) as img:
            img_tk = ImageTk.PhotoImage(img)
        win = tk.Toplevel(self.root)
        win.title("Full Image View")
        lbl = tk.Label(win, image=img_tk)
        lbl.image = img_tk  # keep a reference alive for as long as the label exists
        lbl.pack()

    def select_image(self):
        """Let the user choose an image, preview it and remember its path."""
        img = filedialog.askopenfilename(
            title="Select Handwritten Image",
            filetypes=[("Image Files", self.IMAGE_PATTERNS)],
        )
        if not img:
            return
        try:
            with Image.open(img) as image:
                image.thumbnail(self.PREVIEW_SIZE)
                img_tk = ImageTk.PhotoImage(image)
        except (OSError, ValueError) as exc:
            # Unreadable or non-image file: report it and keep the previous selection.
            messagebox.showerror("Error", f"Could not open image:\n{exc}")
            return

        self.image_path = img
        self.path_label.config(text=img)
        # A label showing an image interprets width/height in pixels, so the
        # text-unit size given at creation (60 x 15) would crop the preview.
        self.img_label.configure(
            image=img_tk, text="", width=img_tk.width(), height=img_tk.height()
        )
        self.img_label.image = img_tk
        self.open_full_image(img)

    def _load_model(self, model_file):
        """Return ``(model, device)`` for ``model_file``, reusing the last load.

        The cache key includes the file's modification time, so a checkpoint
        overwritten on disk is picked up on the next call.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        key = (os.path.abspath(model_file), os.path.getmtime(model_file), str(device))
        cached = getattr(self, "_model_cache", None)
        if cached is not None and cached[0] == key:
            return cached[1], device

        # weights_only=True keeps torch.load from executing arbitrary pickled
        # code in a user-chosen file (requires a torch version that supports it).
        state = torch.load(model_file, map_location=device, weights_only=True)
        model = CRNN(Config.IMG_HEIGHT, Config.VOCAB_SIZE, Config.HIDDEN_SIZE).to(device)
        model.load_state_dict(state)
        model.eval()
        self._model_cache = (key, model)
        return model, device

    def predict(self):
        """Run recognition on the selected image and display text and confidence."""
        if not self.image_path:
            messagebox.showerror("Error", "Please select an image first.")
            return
        model_file = self.model_path.get()
        if not os.path.isfile(model_file):
            messagebox.showerror("Error", "Model file not found.")
            return
        try:
            model, device = self._load_model(model_file)
            text, conf = predict_handwritten_text(self.image_path, model, device)
        except Exception as exc:
            # GUI callback boundary: show the failure instead of letting Tk
            # print a traceback the user never sees.
            messagebox.showerror("Error", f"Prediction failed:\n{exc}")
            return
        self.output_box.delete("1.0", tk.END)
        self.output_box.insert(tk.END, f"Predicted Text: {text}\n")
        self.output_box.insert(tk.END, f"Confidence: {conf:.4f}\n")

# Script entry point: build the root window, attach the HTR GUI and block in
# the Tk event loop until the window is closed.
if __name__ == "__main__":
    root = tk.Tk()
    app = HTR_GUI(root)
    root.mainloop()


# Notebook output fragment (presumably the source line echoed by a torch.load warning):
#   model.load_state_dict(torch.load(model_file, map_location=device))


# In [3]:  (notebook cell marker)
import os
import torch
from PIL import Image, ImageTk
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer


class Config:
    """Default locations used by the table-structure-recognition GUI."""

    # Directory the process was started from.
    BASE_DIR = os.getcwd()
    # Model directory pre-selected in the GUI's model combobox.
    DEFAULT_MODEL_DIR = os.path.join(BASE_DIR, "tsr_vit_tablebank_v_hr")


class TSR_GUI:
    """Tkinter window for table structure recognition on a single image.

    The user picks a model directory and an image; pressing "Extract Table
    Structure" runs the vision encoder-decoder model and writes the decoded
    sequence into the output box.
    """

    # Patterns are kept as a tuple (Tk's pattern-list form). A single
    # ';'-joined string is only understood by the native Windows dialog and
    # can hide every image on other platforms.
    IMAGE_PATTERNS = ("*.png", "*.jpg", "*.jpeg", "*.bmp", "*.tif", "*.tiff")
    # Bounding box of the in-window preview, in pixels.
    PREVIEW_SIZE = (400, 300)

    def __init__(self, root):
        """Configure the root window, initialise state and build the widgets."""
        self.root = root
        self.root.title("TSR - Table Structure Recognition (ViT-BART)")
        self.root.geometry("900x650")
        self.root.resizable(False, False)

        self.model_dir = tk.StringVar()
        self.model_dir.set(Config.DEFAULT_MODEL_DIR)
        self.image_path = None
        # (cache key, (processor, tokenizer, model)) of the last successful load.
        self._loaded = None

        self.create_widgets()

    def create_widgets(self):
        """Create and pack every widget of the window, top to bottom."""
        tk.Label(self.root, text="Table Structure Recognition (ViT-BART)", font=("Arial", 18, "bold")).pack(pady=10)

        frame_model = tk.Frame(self.root)
        frame_model.pack(pady=10)

        tk.Label(frame_model, text="Select TSR Model:", font=("Arial", 12)).pack(side=tk.LEFT)
        self.model_combo = ttk.Combobox(
            frame_model,
            width=50,
            textvariable=self.model_dir,
            values=[Config.DEFAULT_MODEL_DIR, "Select custom model"]
        )
        self.model_combo.pack(side=tk.LEFT, padx=10)
        self.model_combo.bind("<<ComboboxSelected>>", self.select_model)

        frame_img = tk.Frame(self.root)
        frame_img.pack(pady=10)

        self.img_label = tk.Label(frame_img, text="No Image Selected", width=60, height=15, bg="#f0f0f0")
        self.img_label.pack()

        self.path_label = tk.Label(self.root, text="", font=("Arial", 10), fg="gray")
        self.path_label.pack(pady=5)

        tk.Button(self.root, text="Select Image", command=self.select_image, font=("Arial", 12), width=20).pack(pady=10)

        tk.Button(self.root, text="Extract Table Structure", command=self.predict, font=("Arial", 14),
                  width=25, bg="#4CAF50", fg="white").pack(pady=15)

        self.output_box = tk.Text(self.root, height=10, width=80, font=("Courier", 11))
        self.output_box.pack(pady=10)

    def select_model(self, event=None):
        """Ask for a model directory when the "Select custom model" entry is chosen."""
        if self.model_combo.get() == "Select custom model":
            directory = filedialog.askdirectory(title="Choose Model Directory")
            if directory:
                self.model_dir.set(directory)

    def open_full_image(self, path):
        """Show the image at its original size in a separate window."""
        # Decode first so a failure does not leave an empty window behind.
        with Image.open(path) as img:
            tk_img = ImageTk.PhotoImage(img)
        win = tk.Toplevel(self.root)
        win.title("Full Image View")
        lbl = tk.Label(win, image=tk_img)
        lbl.image = tk_img  # keep a reference alive for as long as the label exists
        lbl.pack()

    def select_image(self):
        """Let the user choose an image, preview it and remember its path."""
        img = filedialog.askopenfilename(
            title="Select Table Image",
            filetypes=[("Image Files", self.IMAGE_PATTERNS)]
        )
        if not img:
            return
        try:
            with Image.open(img) as image:
                image.thumbnail(self.PREVIEW_SIZE)
                tk_img = ImageTk.PhotoImage(image)
        except (OSError, ValueError) as exc:
            # Unreadable or non-image file: report it and keep the previous selection.
            messagebox.showerror("Error", f"Could not open image:\n{exc}")
            return

        self.image_path = img
        self.path_label.config(text=img)
        # A label showing an image interprets width/height in pixels, so the
        # text-unit size given at creation (60 x 15) would crop the preview.
        self.img_label.configure(
            image=tk_img, text="", width=tk_img.width(), height=tk_img.height()
        )
        self.img_label.image = tk_img
        self.open_full_image(img)

    def load_model(self):
        """Load the processor, tokenizer and model from the selected directory.

        A successful load is cached per (directory, device), so repeated
        predictions do not reload the weights from disk. Files replaced inside
        the same directory are not detected; restart the GUI to pick them up.

        Returns:
            ``(processor, tokenizer, model)``, or ``(None, None, None)`` after
            an error dialog when the directory is missing or loading fails.
        """
        model_dir = self.model_dir.get()
        if not os.path.isdir(model_dir):
            messagebox.showerror("Error", "Model directory not found.")
            return None, None, None

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        cache_key = (os.path.abspath(model_dir), str(device))
        cached = getattr(self, "_loaded", None)
        if cached is not None and cached[0] == cache_key:
            return cached[1]

        try:
            processor = ViTImageProcessor.from_pretrained(model_dir)
            tokenizer = AutoTokenizer.from_pretrained(model_dir)
            model = VisionEncoderDecoderModel.from_pretrained(model_dir).to(device)
        except Exception as exc:
            # GUI callback boundary: an incomplete or incompatible directory
            # is reported to the user rather than raised into the Tk loop.
            messagebox.showerror("Error", f"Could not load model:\n{exc}")
            return None, None, None
        model.eval()

        bundle = (processor, tokenizer, model)
        self._loaded = (cache_key, bundle)
        return bundle

    def predict(self):
        """Run the model on the selected image and show the decoded sequence."""
        if not self.image_path:
            messagebox.showerror("Error", "Please select an image first.")
            return

        processor, tokenizer, model = self.load_model()
        if model is None:
            return

        # Run on whatever device the model's parameters live on.
        device = next(model.parameters()).device

        try:
            with Image.open(self.image_path) as raw:
                img = raw.convert("RGB")
            pixel_values = processor(img, return_tensors="pt")["pixel_values"].to(device)

            with torch.no_grad():
                out = model.generate(
                    pixel_values=pixel_values,
                    max_length=512,
                    num_beams=4,
                    early_stopping=True
                )

            text = tokenizer.decode(out[0], skip_special_tokens=True).strip()
        except Exception as exc:
            # GUI callback boundary: show the failure instead of a silent traceback.
            messagebox.showerror("Error", f"Table extraction failed:\n{exc}")
            return

        self.output_box.delete("1.0", tk.END)
        self.output_box.insert(tk.END, text)


# Script entry point: build the root window, attach the TSR GUI and block in
# the Tk event loop until the window is closed.
if __name__ == "__main__":
    root = tk.Tk()
    app = TSR_GUI(root)
    root.mainloop()
