In [13]:
## Inspect the errors: compare HTR and corrected lines against the ground truth

import os
import xml.etree.ElementTree as ET
from difflib import SequenceMatcher
from evaluate import load

# Folder whose directory listing drives the comparison; its .xml files are
# parsed as ALTO documents.
ground_truth_folder = "manuscript_groundtruth"
# Folder expected to contain a file with the same name as each ground-truth
# file (raw HTR output, judging by the name — TODO confirm).
htr_folder = "manuscript_htr24"
# Folder expected to contain a file with the same name as each ground-truth
# file (post-corrected output, judging by the name — TODO confirm).
corrected_folder = "manuscript_final"

def extract_lines_from_alto(path):
    """Return the text carried by every ALTO v4 ``String`` element in a file.

    Args:
        path: Anything ``ET.parse`` accepts as the XML source.

    Returns:
        A list with one entry per ``String`` element in the ALTO v4
        namespace, in document order. Each entry is the element's
        ``CONTENT`` attribute with surrounding whitespace stripped, or an
        empty string when the attribute is absent.
    """
    alto_ns = {'a': 'http://www.loc.gov/standards/alto/ns-v4#'}
    document_root = ET.parse(path).getroot()

    contents = []
    for string_el in document_root.findall(".//a:String", alto_ns):
        raw_text = string_el.attrib.get("CONTENT", "")
        contents.append(raw_text.strip())
    return contents

def diff_chars(s1, s2):
    """List the character-level edits that turn *s1* into *s2*.

    Args:
        s1: Source string.
        s2: Target string.

    Returns:
        A list of ``(tag, s1_fragment, s2_fragment)`` tuples, one per
        non-``'equal'`` opcode reported by ``SequenceMatcher``. The list is
        empty when no differing opcode is reported.
    """
    matcher = SequenceMatcher(None, s1, s2)
    differences = []
    for op, lo1, hi1, lo2, hi2 in matcher.get_opcodes():
        if op == 'equal':
            # Matching runs carry no error information; skip them.
            continue
        differences.append((op, s1[lo1:hi1], s2[lo2:hi2]))
    return differences

# Load both metrics once; they are reused for every compared line.
wer = load("wer")
cer = load("cer")

# Sorted so the report order is the same on every run.
files = sorted(os.listdir(ground_truth_folder))
total_lines = 0  # number of line triples reported below

for filename in files:
    if not filename.endswith(".xml"):
        continue

    # The same file name is looked up in all three folders.
    gt_path = os.path.join(ground_truth_folder, filename)
    htr_path = os.path.join(htr_folder, filename)
    corr_path = os.path.join(corrected_folder, filename)

    gt_lines = extract_lines_from_alto(gt_path)
    htr_lines = extract_lines_from_alto(htr_path)
    corr_lines = extract_lines_from_alto(corr_path)

    # zip() stops at the shortest list, so a differing line count would
    # silently drop lines (and the remaining ones may be paired wrongly).
    # Make that visible instead of reporting misleading scores quietly.
    if not (len(gt_lines) == len(htr_lines) == len(corr_lines)):
        compared = min(len(gt_lines), len(htr_lines), len(corr_lines))
        print(
            f" WARNING: {filename}: line counts differ "
            f"(GT={len(gt_lines)}, HTR={len(htr_lines)}, "
            f"corrected={len(corr_lines)}); only the first {compared} "
            f"lines are compared and they may be misaligned"
        )
        print()

    for line_no, (gt, htr, corr) in enumerate(
        zip(gt_lines, htr_lines, corr_lines), start=1
    ):
        htr_diff = diff_chars(gt, htr)
        corr_diff = diff_chars(gt, corr)

        if gt:
            wer_score = wer.compute(predictions=[corr], references=[gt])
            cer_score = cer.compute(predictions=[corr], references=[gt])
            scores = f"   CER corr vs GT: {cer_score:.4f}, WER: {wer_score:.4f}"
        else:
            # An error rate relative to an empty reference is undefined.
            # NOTE(review): the jiwer backend is expected to raise on empty
            # references — confirm against the installed version.
            scores = "   CER corr vs GT: n/a, WER: n/a"

        print(f" File: {filename} | Line {line_no}")
        print(f"   Ground Truth: {gt}")
        print(f"   HTR         : {htr}")
        print(f"   Dif. HTR    : {htr_diff}")
        print(f"   Corrected   : {corr}")
        print(f"   Dif. Corr   : {corr_diff}")
        print(scores)
        print()
        total_lines += 1

 File: ARCH-DIV-000494_0037.xml | Line 1
   Ground Truth: 29
   HTR         : 29
   Dif. HTR    : []
   Corrected   : 29
   Dif. Corr   : []
   CER corr vs GT: 0.0000, WER: 0.0000

 File: ARCH-DIV-000494_0037.xml | Line 2
   Ground Truth: Chapitre II
   HTR         : Chapitre "1
   Dif. HTR    : [('replace', 'II', '"1')]
   Corrected   : Chapitre " 1
   Dif. Corr   : [('replace', 'II', '" 1')]
   CER corr vs GT: 0.2727, WER: 1.0000

 File: ARCH-DIV-000494_0037.xml | Line 3
   Ground Truth: I
   HTR         : 7
   Dif. HTR    : [('replace', 'I', '7')]
   Corrected   : du
   Dif. Corr   : [('replace', 'I', 'du')]
   CER corr vs GT: 2.0000, WER: 1.0000

 File: ARCH-DIV-000494_0037.xml | Line 4
   Ground Truth: Jusque vers le milieu du XI^e siècle, les princes
   HTR         : pesique ver la Vi li en du Il sécle ls prunees
   Dif. HTR    : [('replace', 'Ju', 'pe'), ('insert', '', 'i'), ('delete', 's', ''), ('replace', 'e', 'a'), ('replace', 'm', 'V'), ('insert', '', ' '), ('insert', '', ' 