# Extract text and images

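This notebook reads the text regions that were manually labelled in Transkribus (exported as PAGE XML). For each region it saves the transcription (the region's text lines joined with newlines, collected in one CSV file per page) and a cropped PNG image of the region to separate folders, which can then be used to create a Hugging Face (HF) dataset.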

In [2]:
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw
import pandas as pd
import glob

In [13]:
# Directory layout: everything lives below a single dataset root.
output_folder = "../data/Chinese"

# Source material: page images, plus a "page/" subfolder holding the XML files.
input_folder = output_folder + "/7030191/Gongguan_sample"

# Destinations for the per-page CSV transcriptions and the cropped PNG images.
text_output_folder, image_output_folder = (
    f"{output_folder}/{subfolder}" for subfolder in ("texts", "images")
)

In [14]:
#parse XML files
def parse_xml(file_path):
    """Extract the text regions of one PAGE XML file.

    Args:
        file_path: Path of a PAGE XML file using the 2013-07-15 PAGE
            namespace.

    Returns:
        A list of ``(region_id, coords_str, region_text)`` tuples, one per
        ``TextRegion`` element that has a ``Coords`` child. ``coords_str`` is
        the raw ``points`` attribute of that child and ``region_text`` is the
        transcription of the region's ``TextLine`` elements joined with
        newlines. Lines without a transcription are left out.
    """
    namespace = {'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
    root = ET.parse(file_path).getroot()

    regions = []

    # Iterate over TextRegion elements instead of TextLine
    for text_region in root.findall(".//ns:TextRegion", namespace):
        coords_elem = text_region.find("ns:Coords", namespace)
        if coords_elem is None:
            # Without an outline the region cannot be cropped later on.
            continue

        line_texts = []
        for line in text_region.findall(".//ns:TextLine", namespace):
            unicode_elem = line.find(".//ns:Unicode", namespace)
            # An empty <Unicode/> element has text None, which str.join
            # rejects; treat it like a line that has no Unicode element.
            if unicode_elem is not None and unicode_elem.text is not None:
                line_texts.append(unicode_elem.text)

        regions.append(
            (text_region.get("id"), coords_elem.get("points"), "\n".join(line_texts))
        )

    return regions

In [16]:
# Crop each text region (with optional buffer) and save it as image + text
def create_images_from_regions(page_name, image_path, regions, buffer_above=10, buffer_below=10, buffer_left=10, buffer_right=10):
    """Crop every region out of a page image and save the crops and texts.

    For each region the bounding box of its outline is enlarged by the
    buffers, clipped to the image, cropped and written to
    ``{image_output_folder}/{page_name}_{region_id}.png``. The texts of all
    saved regions are written once to ``{text_output_folder}/{page_name}.csv``
    with the columns ``text`` and ``identifier``; no CSV is written when no
    region was saved.

    Args:
        page_name: Prefix for the identifiers and output file names.
        image_path: Path of the page image to crop from.
        regions: Iterable of ``(region_id, coords_str, region_text)`` tuples,
            where ``coords_str`` holds space-separated ``x,y`` integer pairs.
        buffer_above: Extra pixels added above the bounding box.
        buffer_below: Extra pixels added below the bounding box.
        buffer_left: Extra pixels added left of the bounding box.
        buffer_right: Extra pixels added right of the bounding box.
    """
    import os  # local import: the notebook's import cell does not provide it

    # Saving fails if the target folders are missing, so create them up front.
    os.makedirs(text_output_folder, exist_ok=True)
    os.makedirs(image_output_folder, exist_ok=True)

    data = []
    # The context manager releases the image file handle when we are done.
    with Image.open(image_path) as image:
        width, height = image.size

        for region_id, coords_str, region_text in regions:
            # A missing or empty "points" value gives nothing to crop.
            if not coords_str:
                continue

            # Parse the coordinates and find the bounding box
            points = [tuple(map(int, point.split(','))) for point in coords_str.split()]
            if not points:
                continue
            x_coords = [p[0] for p in points]
            y_coords = [p[1] for p in points]

            # Clip the buffered box to the image: a crop box reaching outside
            # the picture would otherwise be padded with empty pixels.
            min_x = max(0, min(x_coords) - buffer_left)
            max_x = min(width, max(x_coords) + buffer_right)
            min_y = max(0, min(y_coords) - buffer_above)
            max_y = min(height, max(y_coords) + buffer_below)
            if max_x <= min_x or max_y <= min_y:
                # The region lies completely outside the image.
                continue

            identifier = f'{page_name}_{region_id}'

            # Save the cropped image
            image.crop((min_x, min_y, max_x, max_y)).save(f'{image_output_folder}/{identifier}.png')

            # Keep text and identifier together for the CSV file
            data.append([region_text, identifier])

    # Write the CSV once per page instead of rewriting it for every region.
    if data:
        df = pd.DataFrame(data, columns=['text', 'identifier'])
        df.to_csv(f'{text_output_folder}/{page_name}.csv', index=False)

In [18]:
import os  # portable path handling for the page names below

# Process every XML file of the export; sorted() makes the order
# reproducible, since glob returns matches in arbitrary order.
for xml_file_path in sorted(glob.glob(f"{input_folder}/page/*.xml")):
    # Page name = file name without directory and extension. os.path also
    # handles the backslash separators glob yields on Windows, and splitext
    # strips only the trailing ".xml" (str.replace removed every occurrence).
    page_name = os.path.splitext(os.path.basename(xml_file_path))[0]
    # The page image is expected next to the "page" folder, as <page_name>.jpg.
    image_file_path = f"{input_folder}/{page_name}.jpg"

    # Parse XML and extract text regions
    regions = parse_xml(xml_file_path)

    # Create images from the text regions (no extra margin around the outline)
    create_images_from_regions(page_name, image_file_path, regions, buffer_above=0, buffer_below=0, buffer_left=0, buffer_right=0)
