In [2]:
!pip install opencv-python
!pip install -q transformers
!pip install -q sentencepiece
!pip install -q jiwer
!pip install -q datasets
!pip install -q evaluate
!pip install -q -U accelerate

!pip install -q matplotlib
!pip install -q protobuf==3.20.1
!pip install -q tensorboard
!pip install elementpath
!pip install scikit-learn
!pip install numpy==1.26.4
!pip install transformers==4.45.2
!pip install git+https://github.com/huggingface/transformers@muellerzr-more-models-sadface

Collecting opencv-python
  Using cached opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Using cached opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl (39.5 MB)
Installing collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86
Collecting git+https://github.com/huggingface/transformers@muellerzr-more-models-sadface
  Cloning https://github.com/huggingface/transformers (to revision muellerzr-more-models-sadface) to c:\users\dhlabadmin\appdata\local\temp\pip-req-build-y38i65r2
  Resolved https://github.com/huggingface/transformers to commit f8a963c116e6df9fb44f48da6875c12392e6e787
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting tokenizers<0.22,>=0.21 (from tran

  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers 'C:\Users\dhlabadmin\AppData\Local\Temp\pip-req-build-y38i65r2'
  Running command git checkout -b muellerzr-more-models-sadface --track origin/muellerzr-more-models-sadface
  branch 'muellerzr-more-models-sadface' set up to track 'origin/muellerzr-more-models-sadface'.
  Switched to a new branch 'muellerzr-more-models-sadface'
  You can safely remove it manually.


In [None]:
import os
import numpy as np
import pandas as pd
import glob as glob
import matplotlib.pyplot as plt

block_plot = False
plt.rcParams['figure.figsize'] = (12, 9)
bold = f"\033[1m"
reset = f"\033[0m"



In [62]:
from xml.etree import ElementTree as ET

def get_text_regions(xml_file):
    try:
        tree = ET.parse(xml_file)
        root = tree.getroot()

        namespaces = {'ns': root.tag.split('}')[0].strip('{')}

        regions = []

        for region in root.findall(".//ns:TextRegion", namespaces):

            for textline in region.findall(".//ns:TextLine", namespaces):
                text_id = textline.get('id')
                text_coords = textline.find(".//ns:Coords", namespaces).get('points')
                unicode_elem = textline.find(".//ns:TexEquiv/ns:Unicode", namespaces)
                if unicode_elem is not None and unicode_elem.text:
                    unicode_text = unicode_elem.text
                else:
                    words = textline.findall(".//ns:Word/ns:TextEquiv/ns:Unicode", namespaces)
                    unicode_text = " ".join(w.text for w in words if w.text)

                if text_coords:
                    regions.append({'id': text_id, 'coords': text_coords, 'text': unicode_text})
        return regions
    
    except Exception as e:
        print(f"Failed processing get_text_regions {xml_file}: {e}")
        return []

In [None]:
import cv2

def polygon_crop(image, points):
    try:
        if not points:
            return None

        mask = np.zeros(image.shape[:2], dtype=np.uint8)
        points = np.array([list(map(int, p.split(','))) for p in points.split()], dtype=np.int32)
        if len(points) < 3:
            print("Not enough points", points)
        
        cv2.fillPoly(mask, [points], 255)

        res = cv2.bitwise_and(image, image, mask=mask)

        x, y, w, h = cv2.boundingRect(points)
        if w==0 or h==0:
            print("w or h is zero", w, h)
        cropped_img = res[y:y+h, x:x+w]
        return cropped_img
    
    except Exception as e:
        print(f"Failed processing polygon_crop: {e}")
        return None

In [65]:
def process_images(image_dir, xml_dir, output_dir, output_csv):
    os.makedirs(output_dir, exist_ok=True)
    data = []

    for image_file in os.listdir(image_dir):
        if not image_file.endswith((".jpg", ".png", ".tif", "jpeg")):
            continue

        image = cv2.imread(os.path.join(image_dir, image_file))
        if image is None:
            print("image is None", os.path.join(image_dir, image_file))

        name = os.path.splitext(image_file)[0]
        xml_path = os.path.join(xml_dir, name + ".xml")

        text_regions = get_text_regions(xml_path)

        for i, region in enumerate(text_regions):
            cropped_img = polygon_crop(image, region['coords'])

            if cropped_img is None or cropped_img.size == 0:
                print(f"Failed processing {name}_{i:02}.png")
                continue

            filename = f"{name}_{i:02}.png"
            save_path = os.path.join(output_dir, filename)
            cv2.imwrite(save_path, cropped_img)

            data.append({'filename': filename, 'text': region['text']})

    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False, encoding="utf-8")

In [None]:
# image_dir = "C:/Users/dhlabadmin/Desktop/m-test/full-datasets/unpacked-datasets/6470048"
# image_file = "C:/Users/dhlabadmin/Desktop/m-test/full-datasets/unpacked-datasets/6470048/shchodennyk-0050.jpg"
# xml_dir = "C:/Users/dhlabadmin/Desktop/m-test/full-datasets/unpacked-datasets/6470048/page/shchodennyk-0050.xml"
# output_dir = os.path.join(image_dir, "cropped_test")
# output_csv = os.path.join(image_dir, "test.txt")
# os.makedirs(output_dir, exist_ok=True)

# data = []
# image = cv2.imread(image_file)
# if image is None:
#     print("image is None", image_file)

# text_regions = get_text_regions(xml_dir)

# for i, region in enumerate(text_regions):
#     cropped_img = polygon_crop(image, region['coords'])

#     if cropped_img is None or cropped_img.size == 0:
#         print(f"Failed processing test_{i:02}.png")
#         continue

#     filename = f"test_{i:02}.png"
#     save_path = os.path.join(output_dir, filename)
#     cv2.imwrite(save_path, cropped_img)

#     data.append({'filename': filename, 'text': region['text']})

# df = pd.DataFrame(data)
# df.to_csv(output_csv, index=False, encoding="utf-8")

## Dataset 6470048

In [66]:
image_dir = "C:/Users/dhlabadmin/Desktop/m-test/full-datasets/unpacked-datasets/6470048/"
xml_dir = "C:/Users/dhlabadmin/Desktop/m-test/full-datasets/unpacked-datasets/6470048/page/"
output_dir = os.path.join(image_dir, "cropped_6470048")
output_csv = os.path.join(image_dir, "cropped_6470048.txt")
process_images(image_dir, xml_dir, output_dir, output_csv)

Failed processing Akhtyrs'ka-0002_00.png
Failed processing Akhtyrs'ka-0003_19.png
Failed processing Akhtyrs'ka-0004_00.png
Failed processing Letters_Shwedowa_11_2022-0001_00.png
Failed processing Letters_Shwedowa_11_2022-0015_00.png
Failed processing Letters_Shwedowa_11_2022-0032_00.png
Failed processing Letters_Shwedowa_11_2022-0040_00.png
Failed processing Letters_Shwedowa_11_2022-0042_00.png
Failed processing Letters_Shwedowa_11_2022-0042_01.png
Failed processing Letters_Shwedowa_11_2022-0042_02.png
Failed processing Letters_Shwedowa_11_2022-0043_00.png
Failed processing Letters_Shwedowa_11_2022-0043_01.png
Failed processing Moroz_Dudyk-0004_00.png
Failed processing Moroz_Dudyk-0004_17.png
Failed processing Moroz_Dudyk-0007_00.png
Failed processing Moroz_Dudyk-0007_04.png
Failed processing Moroz_Dudyk-0007_15.png
Failed processing Moroz_Dudyk-0009_00.png
Failed processing Moroz_Dudyk-0009_05.png
Failed processing Moroz_Dudyk-0009_06.png
Failed processing Moroz_Dudyk-0009_10.png
Fail

In [None]:
# dataset = pd.read_csv("C:/Users/dhlabadmin/Desktop/m-test/full-datasets/unpacked-datasets/6470048/cropped_6470048.txt", delimiter=",", header=None, names=["file_name", "text"], on_bad_lines="skip")
# dataset_prev = pd.read_csv("C:/Users/dhlabadmin/Desktop/m-test/full-datasets/unpacked-datasets/6470048/tmp/cropped_6470048.txt", delimiter=",", header=None, names=["file_name", "text"], on_bad_lines="skip")

# old_text_dict = dict(zip(dataset_prev["file_name"], dataset_prev["text"]))
# dataset["text"] = dataset.apply(lambda row: old_text_dict.get(row["file_name"], row["text"]) if row["text"] == "" else row["text"], axis=1)
# dataset.to_csv("C:/Users/dhlabadmin/Desktop/m-test/full-datasets/unpacked-datasets/6470048/df_crops_6470048.txt")

## Dataset 14484847

In [None]:
image_dir = "C:/Users/dhlabadmin/Desktop/m-test/full-datasets/unpacked-datasets/14484847/"
xml_dir = "C:/Users/dhlabadmin/Desktop/m-test/full-datasets/unpacked-datasets/14484847/page/"
output_dir = os.path.join(image_dir, "cropped_14484847")
output_csv = os.path.join(image_dir, "cropped_14484847.txt")
process_images(image_dir, xml_dir, output_dir, output_csv)

In [None]:
from sklearn.model_selection import train_test_split

dataset = pd.read_csv(os.path.join(image_dir, "cropped_14484847.txt"), delimiter=",", header=None, names=["file_name", "text"], on_bad_lines="skip")

train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [75]:
train_df.head()

Unnamed: 0,file_name,text
0,Ukrainka-0229_06.png,54. Славен у Бога Марисін по¬
1,Luk'anenko_last_pages-0003_09.png,по мамованно був зайтотий і я дітий.
2,Shev_Kobzar-0048_43.png,Господа молити.
3,shchodennyk-0024_37.png,"в нае почачась рознова щю Сжищів, в Якому я"
4,Luk'anenko_last_pages-0007_26.png,"зе паротя до Лохвиці, а там може спаю¬"


In [76]:
test_df.head()

Unnamed: 0,file_name,text
0,Ukrainka-0126_04.png,"на одну підводу скрині та перини,"
1,Luk'anenko_last_pages-0042_26.png,"загубив мене, відбив від життя."
2,Ukrainka-0080_16.png,-В Іванихи породіллі дитя калихала.
3,shchodennyk-0026_08.png,-Все розповів до найметшія дрібниць... чого же...
4,Luk'anenko_last_pages-0024_23.png,бону стилістики і навіть зміста


In [None]:
train_df.to_csv(os.path.join(image_dir, "14484847_train.txt"), index=False, encoding="utf-8")
test_df.to_csv(os.path.join(image_dir, "14484847_test.txt"), index=False, encoding="utf-8")

In [None]:
# split cropped images in different directories

test_images = f"{image_dir}14484847_test/"
train_images = f"{image_dir}14484847_train/"
os.makedirs(test_images, exist_ok=True)
os.makedirs(train_images, exist_ok=True)

for _, row in test_df.iterrows():
    os.rename(os.path.join(image_dir, "cropped_14484847/") + row['file_name'], test_images + row['file_name'])

for _, row in train_df.iterrows():
    os.rename(os.path.join(image_dir, "cropped_14484847/") + row['file_name'], train_images + row['file_name'])