# Extract text and images (Classical Chinese)

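This notebook reads the text regions and text lines that were manually labelled in Transkribus (PAGE XML export). For each text region it saves the transcribed text (its lines joined in reading order, one CSV per page) and a cropped image (one PNG per region) to separate files, which can then be used to create a Hugging Face (HF) dataset.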

In [2]:
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw
import pandas as pd
import glob
import itertools
import unicodedata
import os 
from pathlib import Path

In [3]:
# Look up and display the current working directory of this notebook session.
cwd = os.getcwd()
print(f"Current working directory: {cwd}")

Current working directory: c:\Users\alexm\NUS Dropbox\Alexander Mozdzen\ocr\gongguan-ocr-1\src


In [140]:
# Root data directory, given relative to the current working directory.
output_folder = "../data"
# Input subfolders, each located directly under the data directory.
input_folders = [
    f"{output_folder}/01_11",
    f"{output_folder}/12_21", 
    f"{output_folder}/22_31",
    f"{output_folder}/32_41"
]
# Create the output folders for texts and images;
# exist_ok=True makes re-running this cell safe when they already exist.
os.makedirs(f"{output_folder}/texts", exist_ok=True)
os.makedirs(f"{output_folder}/images", exist_ok=True)

In [5]:
def normalize_chinese_text(text):
    """
    Return ``text`` after Unicode NFKC normalization.

    NFKC replaces compatibility characters with their standard equivalents,
    so variant code points such as Kangxi Radicals (e.g. U+2F00) come back
    as the ordinary CJK ideographs (U+4E00).
    """
    normalization_form = "NFKC"
    normalized = unicodedata.normalize(normalization_form, text)
    return normalized

New version: the reading order of the text lines is taken from the index stored in each line's `custom` attribute in the XML file.

In [145]:
def parse_xml_index_ordering(file_path):
    """
    Read a PAGE XML file and return the text of each region in reading order.

    For every ``TextRegion`` that has a ``Coords`` child, the transcriptions
    of its ``TextLine`` elements are normalized, sorted by the reading-order
    index found in each line's ``custom`` attribute (``{index:N;}``) and
    joined with single spaces.

    Lines without a ``Unicode`` element or without a parsable index are left
    out. Regions without ``Coords`` are skipped entirely.

    Args:
        file_path: Path or file object of the PAGE XML file; handed to
            ``xml.etree.ElementTree.parse``.

    Returns:
        A dict of three parallel lists: ``"region_id"`` (the region ``id``
        attribute), ``"coord_str"`` (the ``points`` attribute of the region's
        ``Coords``) and ``"ordered_text"`` (the joined line texts).
    """
    tree = ET.parse(file_path)
    root = tree.getroot()
    namespace = {
        'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
    }

    # initialize a dict of empty lists
    regions = {
        "region_id": [],
        "coord_str": [],
        "ordered_text": []
    }

    # iterate through each TextRegion
    for text_region in root.findall(".//ns:TextRegion", namespace):
        coords_elem = text_region.find("ns:Coords", namespace)
        if coords_elem is None:
            continue

        # pull out the region ID & its coords
        region_id = text_region.get("id")
        coords_str = coords_elem.get("points")

        text_lines = []
        for line in text_region.findall(".//ns:TextLine", namespace):
            # Prefer the line's own transcription. A bare descendant search
            # returns the first Unicode element in document order, which is a
            # Word's text whenever the line contains Word elements ahead of
            # its own TextEquiv. Fall back to the descendant search so lines
            # that only carry nested text are still picked up as before.
            unicode_elem = line.find("ns:TextEquiv/ns:Unicode", namespace)
            if unicode_elem is None:
                unicode_elem = line.find(".//ns:Unicode", namespace)
            if unicode_elem is None:
                continue

            index = _reading_order_index(line.get("custom"))
            if index is None:
                continue  # skip if index is missing or malformed

            # An empty <Unicode/> has text None; keep it as an empty string.
            raw_text = unicode_elem.text
            normalized = normalize_chinese_text(raw_text) if raw_text else ""
            text_lines.append((normalized, index))

        # sort by the index and join into one string
        text_lines.sort(key=lambda x: x[1])
        ordered_text = " ".join(text for text, _ in text_lines)

        # append into each list in our dict
        regions["region_id"].append(region_id)
        regions["coord_str"].append(coords_str)
        regions["ordered_text"].append(ordered_text)

    return regions


def _reading_order_index(custom):
    """Return the integer following ``{index:`` in a ``custom`` attribute, or None."""
    if not custom:
        return None
    try:
        return int(custom.split("{index:")[1].split(";}")[0])
    except (IndexError, ValueError):
        # IndexError: no "{index:" marker; ValueError: value is not an integer.
        return None

In [142]:
def create_images_from_regions_subfolders(subfolder_name, page_name, image_path, regions,
                                          buffer_above=10, buffer_below=10,
                                          buffer_left=10, buffer_right=10):
    """
    Crop every region out of a page image and save each crop as a PNG.

    The crop box is the bounding box of the region's polygon points, grown by
    the given buffers and clamped to the image bounds. Each crop is written to
    ``{output_folder}/images/{subfolder_name}_{page_name}_{region_id}.png``,
    where ``output_folder`` is read from module level.

    Args:
        subfolder_name: Name used as the first part of the output file name.
        page_name: Name used as the second part of the output file name.
        image_path: Path of the page image, opened with ``PIL.Image.open``.
        regions: Dict with parallel lists under ``"region_id"`` and
            ``"coord_str"``; each coord string holds whitespace-separated
            ``x,y`` integer pairs.
        buffer_above: Pixels added above the bounding box.
        buffer_below: Pixels added below the bounding box.
        buffer_left: Pixels added to the left of the bounding box.
        buffer_right: Pixels added to the right of the bounding box.

    Returns:
        None. Regions without points, or whose clamped box is empty, are
        skipped.
    """
    # Context manager so the image file handle is released once the page is done.
    with Image.open(image_path) as image:
        width, height = image.size

        for region_id, coords_str in zip(regions["region_id"], regions["coord_str"]):
            # A Coords element without usable points cannot be cropped.
            if not coords_str:
                continue
            points = [tuple(map(int, point.split(','))) for point in coords_str.split()]
            if not points:
                continue
            x_coords, y_coords = zip(*points)

            # Clamp to the image: a box reaching past the edge would otherwise
            # add blank padding to the crop.
            min_x = max(min(x_coords) - buffer_left, 0)
            max_x = min(max(x_coords) + buffer_right, width)
            min_y = max(min(y_coords) - buffer_above, 0)
            max_y = min(max(y_coords) + buffer_below, height)

            # Nothing left to crop when the region lies entirely outside the image.
            if min_x >= max_x or min_y >= max_y:
                continue

            cropped_image = image.crop((min_x, min_y, max_x, max_y))
            cropped_image.save(f"{output_folder}/images/{subfolder_name}_{page_name}_{region_id}.png")

In [146]:
# Option A: iterdir + is_dir
# For every subfolder of the data directory, pair each page image with the
# PAGE XML of the same name, then export the region texts (one CSV per page)
# and the region crops (one PNG per region).

output_folder = Path("../data/")
for subfolder in sorted(output_folder.iterdir()):
    # iterdir() also yields plain files; only directories can hold page images.
    if not subfolder.is_dir():
        continue

    # sorted() keeps the processing order deterministic.
    for image_path in sorted(subfolder.glob("*.jpg")):
        page_name = image_path.stem

        # Match the XML to the image by name instead of zipping two
        # independently ordered listings: zip mis-pairs pages as soon as one
        # file is missing or the two listings come back in different orders.
        # NOTE(review): assumes each page's XML is stored as
        # page/<image stem>.xml - confirm against the export layout.
        xml_path = subfolder / "page" / f"{page_name}.xml"
        if not xml_path.is_file():
            print(f"Skipping {image_path}: no PAGE XML found at {xml_path}")
            continue

        regions = parse_xml_index_ordering(xml_path)

        # Save text information as dataframe:
        identifiers = [
            f"{subfolder.name}_{page_name}_{region_id}"
            for region_id in regions["region_id"]
        ]
        pd.DataFrame({
            "text": regions["ordered_text"],
            "identifier": identifiers,
        }).to_csv(f"{output_folder}/texts/{subfolder.name}_{page_name}.csv", index=False)

        # Create images from the text regions
        create_images_from_regions_subfolders(subfolder.name, page_name, image_path, regions,
                                              buffer_above=0, buffer_below=0,
                                              buffer_left=0, buffer_right=0)