# Extract text and images (Classical Chinese)

This notebook reads the text regions and text lines that were manually labelled in Transkribus (exported as PAGE XML). For each text region it saves the transcribed text and a cropped image to separate files, which can then be used to create a Hugging Face (HF) dataset.

In [11]:
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw
import pandas as pd
import glob
import itertools
import unicodedata
import os 

In [3]:
# Directory layout: every path is derived from a single dataset root.
output_folder = "../data/Chinese"
input_folder = output_folder + "/7030191/Gongguan_sample"  # page images plus page/*.xml
text_output_folder = output_folder + "/texts"  # one CSV per page
image_output_folder = output_folder + "/images"  # one PNG per cropped region

In [18]:
def normalize_chinese_text(text):
    """
    Return ``text`` converted to Unicode normalization form NFKC.

    NFKC replaces compatibility characters with their standard equivalents,
    so variant code points (e.g., Kangxi Radicals) become the ordinary
    characters.
    """
    normal_form = "NFKC"
    normalized = unicodedata.normalize(normal_form, text)
    return normalized

In [19]:
def reorder_vertical_text(text_lines):
    """
    Reorder Chinese text from row-major to column-major for vertical text.

    The lines are treated as rows of a character grid. The grid is read
    column by column: the first character of every line, then the second
    character of every line, and so on. Lines shorter than the longest one
    are padded with spaces, so padding can appear inside the result; only
    leading and trailing whitespace is stripped.

    Args:
        text_lines: Iterable of strings, one per row.

    Returns:
        The transposed text as a single string, or "" when ``text_lines``
        is empty.
    """
    rows = list(text_lines)
    if not rows:
        # max() over an empty sequence would raise ValueError.
        return ""

    # zip_longest pads the short rows with the fill value itself, so no
    # separate ljust() pass is needed.
    columns = itertools.zip_longest(*rows, fillvalue=" ")
    return "".join("".join(column) for column in columns).strip()

In [16]:
#parse XML
def parse_xml(file_path):
    """
    Extract the text regions of one PAGE XML file.

    For every TextRegion that has a Coords element and at least one
    non-empty text line, the line texts are normalized with
    normalize_chinese_text() and merged with reorder_vertical_text().

    Args:
        file_path: Path of the PAGE XML file to read.

    Returns:
        A list of (region_id, coords_str, vertical_text) tuples, where
        coords_str is the raw "points" attribute of the region's Coords
        element.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()
    namespace = {'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}

    regions = []

    for text_region in root.findall(".//ns:TextRegion", namespace):
        coords_elem = text_region.find("ns:Coords", namespace)
        if coords_elem is None:
            continue

        # Collect the text of each TextLine (each one is a vertical string).
        text_lines = []
        for line in text_region.findall(".//ns:TextLine", namespace):
            unicode_elem = line.find(".//ns:Unicode", namespace)
            # An empty <Unicode/> element has text None, which would make
            # unicodedata.normalize() raise TypeError, so skip such lines.
            if unicode_elem is not None and unicode_elem.text:
                text_lines.append(unicode_elem.text)

        if not text_lines:
            continue  # Skip regions without any transcribed text

        normalized_lines = [normalize_chinese_text(line_text) for line_text in text_lines]
        vertical_text = reorder_vertical_text(normalized_lines)

        regions.append((text_region.get("id"), coords_elem.get("points"), vertical_text))

    return regions

In [None]:
#parse XML with index ordering
def parse_xml_index_ordering(file_path):
    """
    Extract the text regions of one PAGE XML file, ordering the lines of
    each region by their reading-order index.

    The index is read from the "custom" attribute of each TextLine, which
    has the form "readingOrder {index:3;}". Lines without a parsable index
    get index 0. Line texts are normalized with normalize_chinese_text()
    and joined with a single space.

    Args:
        file_path: Path of the PAGE XML file to read.

    Returns:
        A list of (region_id, coords_str, ordered_text) tuples, where
        coords_str is the raw "points" attribute of the region's Coords
        element. Regions without Coords or without any text are omitted.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()
    namespace = {'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
    index_marker = "readingOrder {index:"

    regions = []

    for text_region in root.findall(".//ns:TextRegion", namespace):
        region_id = text_region.get("id")
        coords_elem = text_region.find("ns:Coords", namespace)

        if coords_elem is None:
            continue

        text_lines_with_indices = []
        for line in text_region.findall(".//ns:TextLine", namespace):
            custom_attr = line.get("custom", "")
            index = 0  # Default index if not found

            if index_marker in custom_attr:
                # The value after the marker is "<number>;" followed by "}".
                # Cut at the first "}" and then at the first ";" so that
                # int() receives only the digits; passing "3;" to int()
                # raises ValueError and would silently reset every index to 0.
                after_marker = custom_attr.split(index_marker, 1)[1]
                index_str = after_marker.split("}", 1)[0].split(";", 1)[0]
                try:
                    index = int(index_str)
                except ValueError:
                    pass  # Best effort: keep the default index for malformed values

            unicode_elem = line.find(".//ns:Unicode", namespace)
            if unicode_elem is not None and unicode_elem.text:
                normalized_text = normalize_chinese_text(unicode_elem.text)
                text_lines_with_indices.append((index, normalized_text))

        if not text_lines_with_indices:
            continue  # Skip empty regions

        # list.sort() is stable, so lines sharing an index keep document order.
        text_lines_with_indices.sort(key=lambda pair: pair[0])

        ordered_text = " ".join(text for _, text in text_lines_with_indices)

        coords_str = coords_elem.get("points")
        regions.append((region_id, coords_str, ordered_text))

    return regions

In [21]:
# Draw each text line with buffer and save as image
def create_images_from_regions(page_name, image_path, regions, buffer_above=10, buffer_below=10, buffer_left=10, buffer_right=10):
    """
    Crop every region out of a page image and save the crops and their texts.

    Each region is cropped to the bounding box of its polygon, enlarged by
    the given buffers and clamped to the image bounds. The crop is saved as
    "{image_output_folder}/{page_name}_{region_id}.png". The texts of all
    regions are written to "{text_output_folder}/{page_name}.csv" with the
    columns "text" and "identifier". No CSV is written when there is no
    region to save.

    Args:
        page_name: Name used as prefix for the output files.
        image_path: Path of the page image to crop.
        regions: Iterable of (region_id, coords_str, region_text) tuples;
            coords_str is a whitespace-separated list of "x,y" points.
        buffer_above, buffer_below, buffer_left, buffer_right: Number of
            pixels added on each side of the bounding box.
    """
    # Saving fails if the target folders are missing, so create them first.
    os.makedirs(text_output_folder, exist_ok=True)
    os.makedirs(image_output_folder, exist_ok=True)

    data = []

    # The context manager closes the image file once all crops are saved.
    with Image.open(image_path) as image:
        width, height = image.size

        for region_id, coords_str, region_text in regions:
            if not coords_str:
                continue  # No polygon points, nothing to crop

            points = [tuple(map(int, point.split(','))) for point in coords_str.split()]
            x_coords = [p[0] for p in points]
            y_coords = [p[1] for p in points]

            # Clamp the buffered box to the image so that a crop never
            # extends beyond the page.
            left = max(min(x_coords) - buffer_left, 0)
            upper = max(min(y_coords) - buffer_above, 0)
            right = min(max(x_coords) + buffer_right, width)
            lower = min(max(y_coords) + buffer_below, height)

            cropped_image = image.crop((left, upper, right, lower))
            cropped_image.save(f'{image_output_folder}/{page_name}_{region_id}.png')

            data.append([region_text, f'{page_name}_{region_id}'])

    # Write the CSV once, after the loop, instead of rewriting it per region.
    if data:
        df = pd.DataFrame(data, columns=['text', 'identifier'])
        df.to_csv(f'{text_output_folder}/{page_name}.csv', index=False)

In [39]:
# Example: parse a single XML file and show the text lines of its first
# region together with their reading-order indices.
print("Example of parse_xml() results:")
print("-" * 50)

# glob returns files in arbitrary order; sort so that the "first" file is
# the same on every run.
xml_files = sorted(glob.glob(f"{input_folder}/page/*.xml"))
if xml_files:
    # Normalize path and convert backslashes to forward slashes
    example_xml_path = os.path.normpath(xml_files[0]).replace("\\", "/")
    print(f"Processing file: {example_xml_path}")

    tree = ET.parse(example_xml_path)
    root = tree.getroot()
    namespace = {'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}

    regions = []

    for text_region in root.findall(".//ns:TextRegion", namespace):
        region_id = text_region.get("id")
        coords_elem = text_region.find("ns:Coords", namespace)

        if coords_elem is None:
            continue

        # Collect (text, index) tuples, one per TextLine of this region.
        text_lines = []
        for line in text_region.findall(".//ns:TextLine", namespace):
            unicode_elem = line.find(".//ns:Unicode", namespace)
            if unicode_elem is None:
                continue

            # The custom attribute looks like "readingOrder {index:3;}".
            # Lines without the attribute, or with a malformed value, get
            # index 0 instead of raising an exception.
            custom_attr = line.get("custom") or ""
            index = 0
            if "{index:" in custom_attr:
                index_str = custom_attr.split("{index:", 1)[1].split("}", 1)[0].split(";", 1)[0]
                try:
                    index = int(index_str)
                except ValueError:
                    pass

            text_lines.append((unicode_elem.text or "", index))

        # Order the lines by their reading-order index (second tuple element).
        sorted_lines = sorted(text_lines, key=lambda pair: pair[1])

        # Only print the lines of the first region that was collected.
        if len(regions) == 0:
            print(f"\nText lines for region {region_id}:")
            for text, idx in sorted_lines:
                print(f"Index {idx}: {text}")
            print("-" * 50)

        regions.append((region_id, sorted_lines))

Example of parse_xml() results:
--------------------------------------------------
Processing file: ../data/Chinese/7030191/Gongguan_sample/page/0001_p001.xml

Text lines for region r_96:
Index 0: 王蹇觀叫
Index 1: 陳情觀叫
Index 2: 羅章觀 
Index 3: 陳天雨
Index 4: 和息
--------------------------------------------------


The following code extracts the coordinates of the text regions from the XML files and crops the corresponding page image into one image per region.


In [22]:
# Process every PAGE XML file of the sample. glob returns files in arbitrary
# order, so sort them to process the pages in a reproducible order.
for xml_file_path in sorted(glob.glob(f"{input_folder}/page/*.xml")):
    # Normalize path and convert backslashes to forward slashes
    xml_file_path = os.path.normpath(xml_file_path).replace("\\", "/")

    # splitext removes only the final extension, whereas replace(".xml", "")
    # would also strip ".xml" from the middle of a file name.
    page_name = os.path.splitext(os.path.basename(xml_file_path))[0]

    # The page image is looked up next to the "page" folder, with a .jpg suffix.
    image_file_path = f"{input_folder}/{page_name}.jpg"

    # Parse XML and extract text regions
    regions = parse_xml(xml_file_path)

    # Create images from the text regions
    create_images_from_regions(page_name, image_file_path, regions, buffer_above=0, buffer_below=0, buffer_left=0, buffer_right=0)


At first we got an error because, on Windows, glob.glob returns paths that contain backslashes, for example:
'../data/Chinese/7030191/Gongguan_sample/page\\0001_p001.xml'
We fix this by normalizing the path with os.path.normpath and then replacing the backslashes with forward slashes.

In [24]:
# Check the path fix: take the first XML path that glob returns for the sample pages.
xml_file_path = glob.glob(f"{input_folder}/page/*.xml")[0]

print(os.path.normpath(xml_file_path).replace("\\", "/"))

# NOTE: indexing with [0] raises IndexError when no XML file matches the pattern.

'../data/Chinese/7030191/Gongguan_sample/page\\0001_p001.xml'