In [5]:
import re
from collections import defaultdict
from typing import List
import numpy as np
import nltk
from nltk import edit_distance

# Regular expressions to match LaTeX patterns.
# Each pattern captures the content between its delimiters in group 1.
# The `(?<!\\)` lookbehind rejects a closing delimiter whose backslash is
# itself preceded by a backslash.
# Inline math: \( ... \)
inline_reg = re.compile(r"\\\((.*?)(?<!\\)\\\)")
# Display math: \[ ... \]  (the closing delimiter must be `\]`, not `\)`,
# otherwise well-formed display math is never matched on its own).
display_reg = re.compile(r"\\\[(.+?)(?<!\\)\\\]")
# Tables: \begin{tabular} ... \end{tabular}; an unterminated table runs to the
# end of the string. re.S lets `.` span newlines inside the table body.
table_reg = re.compile(r"\\begin\{tabular\}(.+?)(?:\\end\{tabular\}|$)", re.S)


def compute_metrics(pred, gt, minlen=4):
    """Score one predicted string against its ground-truth string.

    Args:
        pred: Predicted text.
        gt: Ground-truth (reference) text.
        minlen: Minimum character length both strings must reach; if either
            is shorter, no metrics are computed.

    Returns:
        A dict that is empty when either input is shorter than ``minlen``.
        Otherwise it holds, in this order:
        ``edit_dist`` (edit distance divided by the longer string's length),
        ``bleu`` and ``meteor`` (computed on whitespace-split tokens), and
        ``precision``, ``recall``, ``f_measure`` (computed on the *sets* of
        whitespace-split tokens).
    """
    too_short = len(pred) < minlen or len(gt) < minlen
    if too_short:
        return {}

    # Character-level score, normalised by the longer of the two strings.
    scores = {"edit_dist": edit_distance(pred, gt) / max(len(pred), len(gt))}

    # Token-level scores work on whitespace-separated words.
    ref_tokens = gt.split()
    hyp_tokens = pred.split()
    scores["bleu"] = nltk.translate.bleu([ref_tokens], hyp_tokens)
    try:
        meteor_score = nltk.translate.meteor([ref_tokens], hyp_tokens)
    except LookupError:
        # Presumably raised when an NLTK resource needed by METEOR is not
        # installed (TODO confirm); the score is recorded as NaN instead.
        meteor_score = np.nan
    scores["meteor"] = meteor_score

    # Set-based scores ignore token order and repetition.
    ref_vocab = set(ref_tokens)
    hyp_vocab = set(hyp_tokens)
    scores["precision"] = nltk.scores.precision(ref_vocab, hyp_vocab)
    scores["recall"] = nltk.scores.recall(ref_vocab, hyp_vocab)
    scores["f_measure"] = nltk.scores.f_measure(ref_vocab, hyp_vocab)
    return scores


def split_text(pages: List[str]):
    """Separate each page into plain text, math and table content.

    For every page the inline-math, display-math and table patterns are
    applied in that order; each pattern's captured groups are joined with
    newlines and the matched spans are removed from the page before the next
    pattern runs.

    Args:
        pages: Page strings to split.

    Returns:
        A ``(text, math, table)`` tuple of three lists, each with one entry
        per page: the stripped remainder of the page, the math content and
        the table content.
    """
    text, math, table = [], [], []
    for page in pages:
        # Pass 1: inline math.
        inline_found = "\n".join(inline_reg.findall(page))
        page = inline_reg.sub("", page)

        # Pass 2: display math, searched in what is left after pass 1.
        display_found = "\n".join(display_reg.findall(page))
        page = display_reg.sub("", page)

        # Pass 3: tables, searched in what is left after pass 2.
        table_found = "\n".join(table_reg.findall(page))
        page = table_reg.sub("", page)

        # NOTE: display math is appended directly after the inline math with
        # no separator between the two groups.
        math.append(inline_found + display_found)
        table.append(table_found)
        text.append(page.strip())
    return text, math, table


def get_metrics(gt: List[str], pred: List[str]):
    """Collect per-sample metrics into one list per metric name.

    Ground-truth and predicted strings are paired by position with ``zip``,
    so any surplus entries in the longer list are ignored. Pairs for which
    ``compute_metrics`` returns an empty dict contribute nothing.

    Args:
        gt: Ground-truth strings.
        pred: Predicted strings.

    Returns:
        A plain dict mapping each metric name to the list of its values, in
        the order the pairs were processed.
    """
    collected = {}
    for predicted, reference in zip(pred, gt):
        sample_scores = compute_metrics(predicted, reference)
        for name, score in sample_scores.items():
            collected.setdefault(name, []).append(score)
    return collected


def prompt_user_input():
    """Ask on the console for the two input file locations.

    Returns:
        A ``(gt_file, pred_file)`` tuple with the text entered for the
        ground-truth file and the predicted file, in that order.
    """
    prompts = (
        "Enter the file location of the ground truth LaTeX file:\n",
        "Enter the file location of the predicted LaTeX file:\n",
    )
    # The list is built left to right, so the ground-truth prompt comes first.
    answers = [input(message) for message in prompts]
    return answers[0], answers[1]


def read_latex_file(file_path, chunk_size=1024*1024):
    """Lazily read a UTF-8 text file in pieces.

    Args:
        file_path: Location of the file to open for reading.
        chunk_size: Size argument passed to each ``read`` call.

    Yields:
        Successive non-empty strings read from the file, until ``read``
        returns an empty string.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Two-argument iter() calls read() repeatedly and stops as soon as it
        # returns the "" sentinel, i.e. at end of file.
        yield from iter(lambda: handle.read(chunk_size), "")


def process_latex_content(file_content):
    """Turn the pieces of a file into per-page text, math and table lists.

    Args:
        file_content: Iterable of strings that are concatenated to form the
            whole document.

    Returns:
        Whatever ``split_text`` returns for the pages, where a page is any
        run of the stripped document delimited by a blank line (``"\\n\\n"``).
    """
    whole_document = "".join(file_content).strip()
    return split_text(whole_document.split("\n\n"))


if __name__ == "__main__":
    # Script entry point: compare a ground-truth file with a predicted file
    # and print the metric lists for text, math and tables.
    gt_file, pred_file = prompt_user_input()

    # Materialise the chunk generators before processing.
    gt_content = [*read_latex_file(gt_file)]
    pred_content = [*read_latex_file(pred_file)]

    gt_text, gt_math, gt_table = process_latex_content(gt_content)
    pred_text, pred_math, pred_table = process_latex_content(pred_content)

    metrics_text = get_metrics(gt_text, pred_text)
    metrics_math = get_metrics(gt_math, pred_math)
    metrics_table = get_metrics(gt_table, pred_table)

    # One (heading, metrics) pair per report section, printed in this order.
    report_sections = (
        ("Metrics for Text:", metrics_text),
        ("\nMetrics for Math:", metrics_math),
        ("\nMetrics for Tables:", metrics_table),
    )
    for heading, section_metrics in report_sections:
        print(heading)
        for key, value in section_metrics.items():
            print(f"{key}: {value}")


Enter the file location of the ground truth LaTeX file:
C:\Users\Acer\Desktop\docs\nagykiss24.txt
Enter the file location of the predicted LaTeX file:
C:\Users\Acer\Desktop\docs\naggykiss_detected.txt
Metrics for Text:
edit_dist: [0.9782245887308273, 0.9312790413303986, 0.9556129638327854, 0.9975417032484636, 0.9988031119090365, 0.9974232841414852, 0.9488859764089121]
bleu: [0.013093748045135224, 0.012537873076227573, 2.235342075756402e-79, 0, 0, 0, 1.88409334569858e-79]
meteor: [0.13112170075786553, 0.21028787569816623, 0.12711866096201596, 0.0, 0.0, 0.0, 0.11496419148134186]
precision: [0.07462686567164178, 0.1504424778761062, 0.05537459283387622, 0.0, 0.0, 0.0, 0.07837837837837838]
recall: [0.9375, 0.7285714285714285, 0.38636363636363635, 0.0, 0.0, 0.0, 0.3258426966292135]
f_measure: [0.1382488479262673, 0.24938875305623473, 0.09686609686609686, 0, 0, 0, 0.12636165577342048]

Metrics for Math:

Metrics for Tables:
