## File conversion from .gif to .jpg

### Setup folders

In [1]:
import os

def setup_folders(input: str, output):
    input_folders = [
        f"./{input}/FormatA",
        f"./{input}/FormatB"
    ]
    output_folders = [
        f"./{output}/FormatA",
        f"./{output}/FormatB",
    ]

    if not os.path.isdir(f"./{output}"):
        os.mkdir(f"./{output}")
    if not os.path.isdir(f"./{output}/FormatA"):
        os.mkdir(f"./{output}/FormatA")
    if not os.path.isdir(f"./{output}/FormatB"):
        os.mkdir(f"./{output}/FormatB")

    return input_folders, output_folders


input_folders, output_folders = setup_folders("MINSKI_ZAPISNICI_SORTIRANO_ZAJEDNO","jpg_minski_zapisnici")

### Convert images

In [2]:
from utils import convert_gif_to_jpg
from time import time

num_threads = 5

folders = list(zip(input_folders, output_folders))
for input_folder, output_folder in folders:
    print(f"Folder {input_folder}")
    print("Start convert...")
    s = time()
    convert_gif_to_jpg(input_folder, output_folder, num_threads)
    e = time()
    print(f"End convert, time took {e-s}")

Folder ./MINSKI_ZAPISNICI_SORTIRANO_ZAJEDNO/FormatA
Start convert...
Converted 30519.gif to JPG
Converted 30520.gif to JPG
Converted 30706.gif to JPG
Converted 30535.gif to JPG
Converted 30543.gif to JPG
Converted 30960.gif to JPG
Converted 30926.gif to JPG
Converted 30967.gif to JPG
Converted 30925.gif to JPG
Converted 30922.gif to JPG
Converted 30969.gif to JPG
Converted 30972.gif to JPG
Converted 30977a.gif to JPG
Converted 31131.gif to JPG
Converted 31094.gif to JPG
Converted 31142.gif to JPG
Converted 31139.gif to JPG
Converted 31150.gif to JPG
Converted 31136.gif to JPG
Converted 31138.gif to JPG
Converted 33021a.gif to JPG
Converted 32862.gif to JPG
Converted 33021b.gif to JPG
Converted 32860.gif to JPG
Converted 31190.gif to JPG
Converted 40688.gif to JPG
Converted 50677.gif to JPG
Converted 50678.gif to JPG
Converted 50676.gif to JPG
Converted 33238.gif to JPG
Converted 50690.gif to JPG
Converted 50685.gif to JPG
Converted 50684.gif to JPG
Converted 50682.gif to JPG
Converted 

## Cleaning images

In [4]:
input_folders, output_folders = setup_folders("jpg_minski_zapisnici", "jpg_minski_zapisnici_clean")

In [4]:
from utils.clean_image_pipeline import apply_image_cleaning_pipeline

input_path = "./jpg_minski_zapisnici/FormatA/30543.jpg"
output_path = "./jpg_minski_zapisnici_clean/FormatA/30543.jpg"

apply_image_cleaning_pipeline(input_path, output_path)


In [7]:
from time import time
from utils.clean_image_pipeline import apply_image_cleaning_pipeline
import threading

num_threads = 5
folders = list(zip(input_folders, output_folders))
for input_folder, output_folder in folders:
    print(f"Folder {input_folder}")
    print("Start clean...")
    s = time()
    images = os.listdir(input_folder)
    threads = []

    for in_image in images:
        image1, image2 = f"{input_folder}/{in_image}", f"{output_folder}/{in_image}"
        print("In: ", image1)
        print("Out: ", image2)
        thread = threading.Thread(target=apply_image_cleaning_pipeline, args=(image1, image2))
        thread.start()
        threads.append(thread)
        if len(threads) >= num_threads:
            for thread in threads:
                thread.join()
            threads = []
        #apply_image_cleaning_pipeline(image1, image2)
    e = time()
    print(f"End clean, time took {e-s}")

Folder ./jpg_minski_zapisnici/FormatA
Start clean...
In:  ./jpg_minski_zapisnici/FormatA/30519.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30519.jpg
In:  ./jpg_minski_zapisnici/FormatA/30520.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30520.jpg
In:  ./jpg_minski_zapisnici/FormatA/30535.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30535.jpg
In:  ./jpg_minski_zapisnici/FormatA/30543.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30543.jpg
In:  ./jpg_minski_zapisnici/FormatA/30706.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30706.jpg
In:  ./jpg_minski_zapisnici/FormatA/30922.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30922.jpg
In:  ./jpg_minski_zapisnici/FormatA/30925.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30925.jpg
In:  ./jpg_minski_zapisnici/FormatA/30926.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30926.jpg
In:  ./jpg_minski_zapisnici/FormatA/30960.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30960.jpg
In:  ./jpg_minski_zapisnici/FormatA/30967.jpg
Out:  ./jp

## Region detection

### Manual region detection

In [2]:
from utils.extract_regions import extract_interesting_regions
import cv2
image_path = "./jpg_minski_zapisnici_clean/FormatA/30543.jpg"
out_path = "./tmp/region_detection12_"

image = cv2.imread(image_path)
regions = extract_interesting_regions(image)

for i, region in enumerate(regions):
    a = f"{out_path}{i}.jpg"
    print(a)
    if len(region) > 0:
        print("write")
        cv2.imwrite(a, region)

./tmp/region_detection12_0.jpg
write
./tmp/region_detection12_1.jpg
write
./tmp/region_detection12_2.jpg
write
./tmp/region_detection12_3.jpg
write
./tmp/region_detection12_4.jpg
write
./tmp/region_detection12_5.jpg
write
./tmp/region_detection12_6.jpg
write
./tmp/region_detection12_7.jpg
write


In [6]:
input_folders, output_folders = setup_folders("jpg_minski_zapisnici_clean","jpg_minski_zapisnici_clean_manual_regions")

In [9]:
from time import time
from utils.extract_regions import extract_interesting_regions
import threading

def extract_interesting_regions_threading(src: str, dest: str):
    empty, dest, ext = dest.split('.')
    image = cv2.imread(src)
    regions = extract_interesting_regions(image)

    for i, region in enumerate(regions):
        a = f".{dest}_{i}.{ext}"
        if len(region) > 0:
            cv2.imwrite(a, region)

num_threads = 5
folders = list(zip(input_folders, output_folders))
for input_folder, output_folder in folders:
    print(f"Folder {input_folder}")
    print("Start clean...")
    s = time()
    images = os.listdir(input_folder)
    threads = []

    for in_image in images:
        image1, image2 = f"{input_folder}/{in_image}", f"{output_folder}/{in_image}"
        print("In: ", image1)
        print("Out: ", image2)
        #extract_interesting_regions_threading(image1, image2)
        #"""
        thread = threading.Thread(target=extract_interesting_regions_threading, args=(image1, image2))
        thread.start()
        threads.append(thread)
        if len(threads) >= num_threads:
            for thread in threads:
                thread.join()
            threads = []
        #"""
    e = time()
    print(f"End clean, time took {e-s}")

Results not good, because of the state of the images the line detection, which is predominantly used for this action, cannot work correctly most of the time.
Best way is to use ML assistance for this work.

### ML region detection

In [4]:
!pip install ultralytics==8.0.227

In [2]:
from ultralytics import YOLO

model_path = "./models/region_detection_model.pt"
img_path = "./jpg_minski_zapisnici_clean/FormatA/30519.jpg"
model = YOLO(model_path)
import cv2
image = cv2.imread(img_path)

results = model.predict(source=image, imgsz=640,conf=0.25,iou=0.45)
results = results[0]  
for i in range(len(results.boxes)):
        box = results.boxes[i]
        cls = results.cls
        tensor = box.xyxy[0]
        x1 = int(tensor[0].item())
        y1 = int(tensor[1].item())
        x2 = int(tensor[2].item())
        y2 = int(tensor[3].item())
        cv2.rectangle(image,(x1,y1),(x2,y2),(255,0,0),3) 

a= "./tmp/30519-regions-drawings.jpg"
cv2.imwrite(a,image)



0: 640x480 1 overhead, 1 record, 1 map, 1 legend, 1 structure, 1 detailed information, 1 unit record, 1 demainer information, 1 demining information, 179.8ms
Speed: 10.2ms preprocess, 179.8ms inference, 8.1ms postprocess per image at shape (1, 3, 640, 480)


AttributeError: 'Results' object has no attribute 'cls'. See valid attributes below.

    A class for storing and manipulating inference results.

    Args:
        orig_img (numpy.ndarray): The original image as a numpy array.
        path (str): The path to the image file.
        names (dict): A dictionary of class names.
        boxes (torch.tensor, optional): A 2D tensor of bounding box coordinates for each detection.
        masks (torch.tensor, optional): A 3D tensor of detection masks, where each mask is a binary image.
        probs (torch.tensor, optional): A 1D tensor of probabilities of each class for classification task.
        keypoints (List[List[float]], optional): A list of detected keypoints for each object.

    Attributes:
        orig_img (numpy.ndarray): The original image as a numpy array.
        orig_shape (tuple): The original image shape in (height, width) format.
        boxes (Boxes, optional): A Boxes object containing the detection bounding boxes.
        masks (Masks, optional): A Masks object containing the detection masks.
        probs (Probs, optional): A Probs object containing probabilities of each class for classification task.
        keypoints (Keypoints, optional): A Keypoints object containing detected keypoints for each object.
        speed (dict): A dictionary of preprocess, inference, and postprocess speeds in milliseconds per image.
        names (dict): A dictionary of class names.
        path (str): The path to the image file.
        _keys (tuple): A tuple of attribute names for non-empty attributes.
    

In [3]:
from ultralytics import YOLO
model_path = "./models/region_detection_model.pt"
extract_interesting_regions_ml_model = YOLO(model_path)


def extract_interesting_regions_ml(image):
    results = extract_interesting_regions_ml_model.predict(source=image,imgsz=640,conf=0.25,iou=0.45)
    results = results[0]
    boxes = results.boxes
    classes = boxes.cls
    zipped = list(zip(classes, boxes))
    rectangles = []
    classes = []
    for cls, box in zipped:
        tensor = box.xyxy[0]
        x1 = int(tensor[0].item())
        y1 = int(tensor[1].item())
        x2 = int(tensor[2].item())
        y2 = int(tensor[3].item())
        rectangles.append([[x1,y1],[x2, y2]])
        classes.append(int(cls))
    return rectangles, classes

In [17]:
input_folders, output_folders = setup_folders("slike_ciste","slike_ciste_custom_ime")

def save_image(src, dest):
    image = cv2.imread(src)
    cv2.imwrite(dest, image)
    pass

num_threads = 5
folders = list(zip(input_folders, output_folders))
for input_folder, output_folder in folders:
    print(f"Folder {input_folder}")
    print("Start clean...")
    s = time()
    images = os.listdir(input_folder)
    threads = []

    for idx, in_image in enumerate(images):
        out_image = f"form_{idx}.jpg"
        image1, image2 = f"{input_folder}/{in_image}", f"{output_folder}/{out_image}"
        print("In: ", image1)
        print("Out: ", image2)
        #extract_interesting_regions_threading(image1, image2)
        #"""
        thread = threading.Thread(target=save_image, args=(image1, image2))
        thread.start()
        threads.append(thread)
        if len(threads) >= num_threads:
            for thread in threads:
                thread.join()
            threads = []
        #"""
    e = time()
    print(f"End clean, time took {e-s}")

Folder ./slike_ciste/FormatA
Start clean...
In:  ./slike_ciste/FormatA/30519.jpg
Out:  ./slike_ciste_custom_ime/FormatA/form_0.jpg
In:  ./slike_ciste/FormatA/30520.jpg
Out:  ./slike_ciste_custom_ime/FormatA/form_1.jpg
In:  ./slike_ciste/FormatA/30535.jpg
Out:  ./slike_ciste_custom_ime/FormatA/form_2.jpg
In:  ./slike_ciste/FormatA/30543-denoise-sharpen.jpg
Out:  ./slike_ciste_custom_ime/FormatA/form_3.jpg
In:  ./slike_ciste/FormatA/30706-denoise.jpg
Out:  ./slike_ciste_custom_ime/FormatA/form_4.jpg
In:  ./slike_ciste/FormatA/30922-denoise.jpg
Out:  ./slike_ciste_custom_ime/FormatA/form_5.jpg
In:  ./slike_ciste/FormatA/30925-denoise.jpg
Out:  ./slike_ciste_custom_ime/FormatA/form_6.jpg
In:  ./slike_ciste/FormatA/30926-denoise.jpg
Out:  ./slike_ciste_custom_ime/FormatA/form_7.jpg
In:  ./slike_ciste/FormatA/30960-denoise-sharpen.jpg
Out:  ./slike_ciste_custom_ime/FormatA/form_8.jpg
In:  ./slike_ciste/FormatA/30967-denoise.jpg
Out:  ./slike_ciste_custom_ime/FormatA/form_9.jpg
In:  ./slike_c

In [23]:
input_folders, output_folders = setup_folders("slike_ciste_custom_ime","jpg_minski_zapisnici_clean_ml_regions_2")

In [24]:
from time import time
import threading
import cv2
from utils.extract_regions import extract_rectangle

def extract_interesting_regions_threading(src: str, dest: str):
    empty, dest, ext = dest.split('.')
    image = cv2.imread(src)
    regions, classes = extract_interesting_regions_ml(image)
    regions = [extract_rectangle(image, region[0], region[1], wiggle_room=15) for region in regions]
    regions = list(zip(classes, regions))
    for i, region in regions:
        a = f".{dest}_{i}.{ext}"
        if len(region) > 0:
            written = cv2.imwrite(a, region)
            print("Is written", written)

num_threads = 5
folders = list(zip(input_folders, output_folders))
for input_folder, output_folder in folders:
    print(f"Folder {input_folder}")
    print("Start clean...")
    s = time()
    images = os.listdir(input_folder)
    threads = []

    for in_image in images:
        image1, image2 = f"{input_folder}/{in_image}", f"{output_folder}/{in_image}"
        print("In: ", image1)
        print("Out: ", image2)
        #extract_interesting_regions_threading(image1, image2)
        #"""
        thread = threading.Thread(target=extract_interesting_regions_threading, args=(image1, image2))
        thread.start()
        threads.append(thread)
        if len(threads) >= num_threads:
            for thread in threads:
                thread.join()
            threads = []
        #"""
    e = time()
    print(f"End clean, time took {e-s}")

Folder ./slike_ciste_custom_ime/FormatA
Start clean...
In:  ./slike_ciste_custom_ime/FormatA/form_0.jpg
Out:  ./jpg_minski_zapisnici_clean_ml_regions_2/FormatA/form_0.jpg
In:  ./slike_ciste_custom_ime/FormatA/form_1.jpg
Out:  ./jpg_minski_zapisnici_clean_ml_regions_2/FormatA/form_1.jpg
In:  ./slike_ciste_custom_ime/FormatA/form_10.jpg
Out:  ./jpg_minski_zapisnici_clean_ml_regions_2/FormatA/form_10.jpg
In:  ./slike_ciste_custom_ime/FormatA/form_100.jpg
Out:  ./jpg_minski_zapisnici_clean_ml_regions_2/FormatA/form_100.jpg
In:  ./slike_ciste_custom_ime/FormatA/form_101.jpg
Out:  ./jpg_minski_zapisnici_clean_ml_regions_2/FormatA/form_101.jpg





0: 640x480 1 overhead, 1 record, 1 map, 1 legend, 1 structure, 1 detailed information, 1 unit record, 1 demainer information, 1 demining information, 155.8ms
Speed: 31.0ms preprocess, 155.8ms inference, 10.2ms postprocess per image at shape (1, 3, 640, 480)
Is written True
Is written True
Is written True
Is written True
Is written True
0: 640x480 1



0: 640x480 1 overhead, 1 record, 1 map, 1 legend, 229.2ms
Speed: 10.2ms preprocess, 229.2ms inference, 4.5ms postprocess per image at shape (1, 3, 640, 480)
Is written True
Is written True
Is written True
0: 640x480 1 overhead, 1 record, 1 map, 1 legend, 2 structures, 1 detailed information, 1 unit record, 1 demainer information, 1 demining information, 143.1ms
Speed: 11.8ms preprocess, 143.1ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 480)
Is written True
Is written True
Is written True
Is written True
Is written True
Is written True
Is written True
Is written True
Is written True
Is written True
Is written True


The results with the trained model is much better in all cases, so it will be used for further implementations

In [25]:
import os
output = "./jpg_minski_zapisnici_clean_ml_regions_separated_2"
if not os.path.isdir(f"./{output}"):
    os.mkdir(f"./{output}")
outputs = []
for i in range(9):
    output = f"./jpg_minski_zapisnici_clean_ml_regions_separated_2/reg_{i}"
    if not os.path.isdir(f"./{output}"):
        os.mkdir(f"./{output}")
    outputs.append(output)

From now on only Format A will be considered as it is the one that has latin text

In [1]:
import cv2
input = "./jpg_minski_zapisnici_clean_ml_regions_2/FormatA"
files = os.listdir(input)
for i, out in enumerate(outputs):
    suffix = f"_{i}.jpg"
    for file in files:
        if file.endswith(suffix):
            img = cv2.imread(f"{input}/{file}")
            cv2.imwrite(f"{out}/{file}", img)

NameError: name 'os' is not defined

In [1]:
import os
from utils.label_generator import (
    generate_word_labels,
    generate_line_labels,
    generate_region_labels
)

path = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_5_PASCALVOC_LABELE"
labels = os.listdir(path)
labels = [l for l in labels if l.endswith(".xml")]

generate_word_labels(path, labels)
generate_line_labels(path, labels)
#generate_region_labels(path, labels)
#

In [3]:
#region 0
import os
from utils.label_generator import (
    generate_word_labels,
    generate_line_labels,
    generate_region_labels
)

path = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_0_PASCALVOC_LABELE"
labels = os.listdir(path)
labels = [l for l in labels if l.endswith(".xml")]

generate_word_labels(path, labels)

In [10]:
import os
minski_zapisnici = "./minski_zapisnici_only_word_reg5"
if not os.path.isdir(minski_zapisnici):
    os.mkdir(minski_zapisnici)
outputs = []
for i in ["word", "line"]:
    output = f"./{minski_zapisnici}/{i}"
    if not os.path.isdir(f"./{output}"):
        os.mkdir(f"./{output}")

    outputs.append(output)

In [9]:
from utils.extract_regions import extract_rectangle
import cv2
import os

line_labels_5 = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_5_PASCALVOC_LABELE/line_labels.txt"
word_labels_5 = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_5_PASCALVOC_LABELE/word_labels.txt"
word_labels = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_0_PASCALVOC_LABELE/word_labels.txt"

path1 = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_5_PASCALVOC_LABELE"
path2 = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_0_PASCALVOC_LABELE"


labels = os.listdir(path)
labels = [l for l in labels if l.endswith(".xml")]

def generate_line_images(line_labels, src, dest):
    with open(line_labels, "r", encoding="utf8") as read:
        with open(f"{dest}/line.txt", "w", encoding="utf8") as write:
            while line := read.readline():
                line = line.removesuffix("\n").split("|")
                filename, l_idx, w, h, l_x_min, l_y_min, l_x_max, l_y_max, text = line
                img = cv2.imread(f"{src}/{filename}")
                line_img = extract_rectangle(img, [int(l_x_min), int(l_y_min)], [int(l_x_max), int(l_y_max)])
                filename = filename.removesuffix(".jpg")
                img_path = f"{dest}/line/{filename}_{l_idx}.jpg"
                try:
                    cv2.imwrite(img_path, line_img)
                    line[0] = f"{filename}_{l_idx}.jpg"
                    write.write(f"{'|'.join(line)}\n")
                except Exception as e:
                    print(e)

def generate_word_images(word_labels, src, dest):
    with open(word_labels, "r", encoding="utf8") as read:
        with open(f"{dest}/word.txt", "w", encoding="utf8") as write:
            while line := read.readline():
                line = line.removesuffix("\n").split("|")
                filename, word_idx, w, h, xmin, ymin, xmax, ymax, word = line
                img = cv2.imread(f"{src}/{filename}")
                if img is None:
                    print(f"{src}/{filename}")
                word_img = extract_rectangle(img, [int(xmin), int(ymin)], [int(xmax), int(ymax)])
                filename = filename.removesuffix(".jpg")
                img_path = f"{dest}/word/{filename}_{word_idx}.jpg"
                try:
                    cv2.imwrite(img_path, word_img)
                    line[0] = f"{filename}_{word_idx}.jpg"
                    write.write(f"{'|'.join(line)}\n")
                except Exception as e:
                    print(e)


generate_line_images(line_labels_5,path1,"./a/1")
generate_word_images(word_labels_5,path1,"./a/1")
generate_word_images(word_labels,path2,"./a/2")




OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\imgcodecs\src\loadsave.cpp:786: error: (-215:Assertion failed) !_img.empty() in function 'cv::imwrite'



In [34]:
generate_line_images("./test_dataset_1/line_labels.txt", "./test_dataset_1/regions", "./test_dataset_1/labels_l")

OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\imgcodecs\src\loadsave.cpp:786: error: (-215:Assertion failed) !_img.empty() in function 'cv::imwrite'

OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\imgcodecs\src\loadsave.cpp:786: error: (-215:Assertion failed) !_img.empty() in function 'cv::imwrite'

OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\imgcodecs\src\loadsave.cpp:786: error: (-215:Assertion failed) !_img.empty() in function 'cv::imwrite'

OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\imgcodecs\src\loadsave.cpp:786: error: (-215:Assertion failed) !_img.empty() in function 'cv::imwrite'

OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\imgcodecs\src\loadsave.cpp:786: error: (-215:Assertion failed) !_img.empty() in function 'cv::imwrite'

OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\imgcodecs\src\loadsave.cpp:786: error: (-215:Assertion failed) !_img.empty() in function 'cv::imwrite'

Open

In [29]:
generate_word_images("./test_dataset_1/word_labels.txt", "./test_dataset_1/regions", "./test_dataset_1/labels_w")

FileNotFoundError: [Errno 2] No such file or directory: './test_dataset_1/labels_w/word.txt'

In [None]:
generate_line_images(line_labels, path, )

In [41]:
#region 0
word_labels = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_0_PASCALVOC_LABELE/word_labels.txt"

path = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_0_PASCALVOC_LABELE"
minski_zapisnici = "./minski_zapisnici_0"
if not os.path.isdir(minski_zapisnici):
    os.mkdir(minski_zapisnici)

minski_zapisnici1 = "./minski_zapisnici_0/word"
if not os.path.isdir(minski_zapisnici1):
    os.mkdir(minski_zapisnici1)

generate_word_images(word_labels,path,minski_zapisnici)

OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\imgcodecs\src\loadsave.cpp:786: error: (-215:Assertion failed) !_img.empty() in function 'cv::imwrite'



In [15]:
from sklearn.model_selection import train_test_split
import os


def split_train_set(src_txt: str, out: str, train_size=0.7, val_size=0.2, test_size=0.1, random_state=None):
    assert abs(train_size+val_size+test_size-1.0) < 1e-5, "Train, validation, and test sizes must sum to 1.0"
    elements = []
    with open(src_txt, "r", encoding="utf8") as read:
        while line := read.readline(): 
            elements.append(line)

    if not elements:
        return 0,0,0

    train_val_data, test_data = train_test_split(elements, test_size=test_size, random_state=random_state)
    val_relative_size = val_size / (train_size + val_size)
    train_data, val_data = train_test_split(train_val_data, test_size=val_relative_size, random_state=random_state)
    with open(f"{out}/train.txt", "w", encoding="utf8") as train:
        for el in train_data:
            train.write(el)

    with open(f"{out}/test.txt", "w", encoding="utf8") as test:
        for el in test_data:
            test.write(el)

    with open(f"{out}/val.txt", "w", encoding="utf8") as val:
        for el in val_data:
            val.write(el)

    return len(train_data), len(val_data), len(test_data)


"""
data = "./htr_data/v8/ml_word_data"
if not os.path.isdir(data):
    os.mkdir(data)

r = split_train_set("./htr_data/v8/word.txt", data)
print(r)
print(sum(r))
"""
"""
data = "./minski_zapisnici/ml_word_data"
if not os.path.isdir(data):
    os.mkdir(data)

r = split_train_set("./minski_zapisnici/word.txt", data)
print(r)
"""

'\ndata = "./minski_zapisnici/ml_word_data"\nif not os.path.isdir(data):\n    os.mkdir(data)\n\nr = split_train_set("./minski_zapisnici/word.txt", data)\nprint(r)\n'

5161


In [20]:
base = "./htr_data/v6"

t = f"{base}/word"

ff = list(os.listdir(t))

print(len(ff))
out = []

with open(f"{base}/word.txt", "r", encoding="utf8") as rf:
    with open(f"{base}/word_unique.txt", "w", encoding="utf8") as wf:
        while line := rf.readline():
            line = line.split("|")
            name = line[0]

            if name in ff:
                ff.remove(name)
                wf.write(f"{'|'.join(line)}")
            else: out.append(name)

print(len(out))

5529
92


In [22]:
t = "./htr_data/v5/line"

ff = list(os.listdir(t))


with open("./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/a.txt", "r", encoding="utf8") as rf:
    with open("./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/aa.txt", "w", encoding="utf8") as wf:
        while line := rf.readline():
            line = line.split("|")
            name = line[0]

            if name in ff:
                ff.remove(name)
                wf.write(f"{'|'.join(line)}")
            else: out.append(name)





In [11]:
import os
from shutil import copyfile

imgs = [i for i in os.listdir("./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_0_PASCALVOC_LABELE") if i.endswith(".jpg")]
a = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_0_PASCALVOC_LABELE"

for i in imgs:
    copyfile(f"{a}/{i}", f"./htr_data/v2/line/{i}")

In [5]:
import os
from utils.pascal_voc_to_yolo import pascalVOC2yolo

path = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_5_PASCALVOC_LABELE"

labels = os.listdir(path)
labels = [l for l in labels if l.endswith(".xml")]

for l in labels:
    pascalVOC2yolo(l.removesuffix(".xml"), path)

In [1]:
import os
path = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_5_PASCALVOC_LABELE"

labels = os.listdir(path)
labels = [l for l in labels if l.endswith(".xml")]

minski_zapisnici = "./MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI_YOLO"
if not os.path.isdir(minski_zapisnici):
    os.mkdir(minski_zapisnici)

outputs = []
for i in ["img", "lbl"]:
    output = f"{minski_zapisnici}/{i}"
    if not os.path.isdir(f"./{output}"):
        os.mkdir(f"./{output}")

    outputs.append(output)

import shutil 
for l in labels:
    l = l.removesuffix(".xml")
    jpg_src = f"{path}/{l}.jpg"
    jpg_dest = f"{outputs[0]}/{l}.jpg"

    txt_src = f"{path}/{l}.txt"
    txt_dest = f"{outputs[1]}/{l}.txt"

    shutil.copyfile(jpg_src, jpg_dest)
    shutil.copyfile(txt_src, txt_dest)


FileNotFoundError: [WinError 3] The system cannot find the path specified: './MINSKI_ZAPISNICI_IMENOVANI_REGIJE_DOKUMENTI/REGIJA_5_PASCALVOC_LABELE_FULL'

In [6]:
import cv2
from pipeline.handwritten_text_recognition import ExtractText


def build_ds(out_file: str, img_path: str, lbl_file_path: str):
    print("Start building ds", out_file)
    extract_text = ExtractText()
    with open(out_file, "w", encoding="utf8") as ds_file:
        with open(lbl_file_path, "r", encoding="utf8") as lbl_file:
            while line := lbl_file.readline():
                line = line.split("|")
                img_name = line[0]
                lbl = line[-1].removesuffix("\n")
                img = f"{img_path}/{img_name}"
                text = extract_text.execute(img)[0][0]
                ds_file.write(f'"{text}","{lbl}"\n')
    print("Finish building ds", out_file)


build_ds("./llm_ds_test.csv", "./mine_logs2/line", "./mine_logs2/ml_line_data/test.txt")
build_ds(
    "./llm_ds_train.csv", "./mine_logs2/line", "./mine_logs2/ml_line_data/train.txt"
)
build_ds("./llm_ds_val.csv", "./mine_logs2/line", "./mine_logs2/ml_line_data/val.txt")

Start building ds ./llm_ds_test.csv
Finish building ds ./llm_ds_test.csv
Start building ds ./llm_ds_train.csv
Finish building ds ./llm_ds_train.csv
Start building ds ./llm_ds_val.csv
Finish building ds ./llm_ds_val.csv


In [4]:
with open("./llm_ds.csv", "r", encoding="utf8") as rf:
    with open("./llm_lama2_ds.csv", "w", encoding="utf8") as wf:
        while line := rf.readline():
            line = line.split("\",\"")
            _in = line[0].removeprefix("\"")
            out = line[1].removesuffix("\"\n")
            text = f"<s>[INST]:Prepravi sljedeći tekst u smislenu i gramatički tačnu rečenicu: {_in}[/INST]{out}</s>\n"
            wf.write(text)


In [9]:
d = {
     "features": [
        {
            "feature_idx": 0,
            "name": "text",
            "type": {
                "dtype": "string",
                "_type": "Value"
            }
        }
    ]
}
rows = []
with open("./llm_lama2_ds.csv", "r", encoding="utf8") as rf:
    idx = 0
    while row := rf.readline():
        rows.append({
            "row_idx": idx,
            "text": row,
            "truncated_cells": []
        })
        idx += 1


d["text"] = rows

import json
with open("./llm_lama2_ds.json", "w", ) as wf:
    wf.write(json.dumps(d))


In [11]:
with open("./llm_lama2_ds.csv", "r", encoding="utf8") as rf:
    idx = 0
    with open("./llm_lama2_ds1.json", "w", ) as wf:
        while row := rf.readline():
            a = {"text": row, "idx": idx}
            wf.write(f"{json.dumps(a)}\n")
            idx += 1

In [5]:
a = ["ds/llm_ds_test.csv", "ds/llm_ds_train.csv", "ds/llm_ds_val.csv" ]

with open("../ds/llm_ds_faulty.csv", "w", encoding="utf8") as wf:
    for i in a:
        with open(f"../{i}", "r", encoding="utf8") as rf:
            while line:= rf.readline():
                line = line.split("\",\"")
                det = line[0].removeprefix("\"")
                lbl = line[1].removesuffix("\"\n")
                if det != lbl:
                    wf.write(f'"{det}","{lbl}"\n')
        

In [10]:
import os

imgs = "./test_dataset_1/regions"


names_a = set()

with open("./test_dataset_1/word_labels.txt", "r", encoding="utf8") as rf:
    while l := rf.readline():
        l = l.removesuffix("\n").split("|")
        n = l[0]
        names_a.add(n)


names_b = set(os.listdir(imgs))


missing = names_b.difference(names_a)

print(len(names_a))
print(len(names_b))
print(len(missing))


for i in missing:
    os.remove(f"{imgs}/{i}")






303
366
63


In [18]:
from sklearn.model_selection import train_test_split


regions = [i.removesuffix(".jpg") for i in os.listdir("./test_dataset_1/regions")]

words = []
with open("./test_dataset_1/word_labels.txt", "r", encoding="utf8") as rf:
    while w:=rf.readline():
        words.append(w.split("|")[0].removesuffix(".jpg"))
reg_histogram = list(sorted([(r, len([i for i in words if i.startswith(r)])) for r in regions], key= lambda el: el[1], reverse=True))


In [21]:
total_words = len(words)
train_target_words = total_words * 0.7
val_target_words = total_words * 0.2
test_target_words = total_words * 0.1

print(
    int(train_target_words),
    int(val_target_words),
    int(test_target_words)+1)


3935 1124 563


In [22]:
train_set, val_set, test_set = [], [], []
train_words, val_words, test_words = 0, 0, 0

for image_path, num_words in reg_histogram:
    if train_words + num_words <= train_target_words:
        train_set.append(image_path)
        train_words += num_words
    elif val_words + num_words <= val_target_words:
        val_set.append(image_path)
        val_words += num_words
    else:
        test_set.append(image_path)
        test_words += num_words

In [23]:
print(f"Train set: {len(train_set)} images, {train_words} words")
print(f"Validation set: {len(val_set)} images, {val_words} words")
print(f"Test set: {len(test_set)} images, {test_words} words")

Train set: 97 images, 3935 words
Validation set: 64 images, 1124 words
Test set: 142 images, 563 words


In [26]:
with open("./test_dataset_1/regions_sets/train_set.txt", "w") as wf:
    for i in train_set:
        wf.write(f"{i}\n")

with open("./test_dataset_1/regions_sets/val_set.txt", "w") as wf:
    for i in val_set:
        wf.write(f"{i}\n")

with open("./test_dataset_1/regions_sets/test_set.txt", "w") as wf:
    for i in test_set:
        wf.write(f"{i}\n")


In [28]:
wl = None
with open("./test_dataset_1/word_labels.txt", "r", encoding="utf8") as r:
    wl=r.readlines()

ll = None
with open("./test_dataset_1/line_labels.txt", "r", encoding="utf8") as r:
    ll=r.readlines()


def write(check, file_write_to, lbl):
    with open(file_write_to, "w", encoding="utf8") as wf:
        wf.writelines([l for c in check for l in lbl if l.startswith(c)])
        



write(train_set, "./test_dataset_1/word_ttv/train.txt", wl)
write(val_set, "./test_dataset_1/word_ttv/val.txt", wl)
write(test_set, "./test_dataset_1/word_ttv/test.txt", wl)


write(train_set, "./test_dataset_1/line_ttv/train.txt", ll)
write(val_set, "./test_dataset_1/line_ttv/val.txt", ll)
write(test_set, "./test_dataset_1/line_ttv/test.txt", ll)

# Kako se ds napravio 

1. kreirani datasetovi koji medjusobno nemaju istu raspodjelu slika
    a. model treniran na linijama (recenicama)
    b. model treniran samo na rijecima (ciframa)
2. uporedjeni rezultati tih modela
3. Odabrani najbolji rezultati
4. Podjeljen dataset na tran, val, test tako da se moze trenirati model nad istim podacima s istom raspodjelom


In [36]:
lines = None
with open("./test_dataset_1/a.txt", "r", encoding="utf8") as rf:
    lines = rf.readlines()

words = None
with open("./test_dataset_1/b.txt", "r", encoding="utf8") as rf:
    words = rf.readlines()

test = None
with open("./test_dataset_1/region_sets/test_set.txt", "r") as rf:
    test = rf.readlines()


val = None
with open("./test_dataset_1/region_sets/val_set.txt", "r") as rf:
    val = rf.readlines()


train = None
with open("./test_dataset_1/region_sets/train_set.txt", "r") as rf:
    train = rf.readlines()


w = {
    "train": [i for i in words for j in train if i.startswith(j.removesuffix("\n"))],
    "test": [i for i in words for j in test if i.startswith(j.removesuffix("\n"))],
    "val": [i for i in words for j in val if i.startswith(j.removesuffix("\n"))],
}

l = {
    "train": [i for i in lines for j in train if i.startswith(j.removesuffix("\n"))],
    "test": [i for i in lines for j in test if i.startswith(j.removesuffix("\n"))],
    "val": [i for i in lines for j in val if i.startswith(j.removesuffix("\n"))],
}


for key, val in l.items():
    with open(f"./test_dataset_1/line_1/{key}.txt", "w", encoding="utf8") as f:
        f.writelines(val)

for key, val in w.items():
    with open(f"./test_dataset_1/word_1/{key}.txt", "w", encoding="utf8") as f:
        f.writelines(val)






In [38]:
import os

regions = [r.removesuffix(".jpg") for r in os.listdir("./test_dataset_1/regions")]

lines = None
with open("./test_dataset_1/a.txt", "r", encoding="utf8") as rf:
    lines = rf.readlines()

words = None
with open("./test_dataset_1/b.txt", "r", encoding="utf8") as rf:
    words = rf.readlines()



res = [(r, sum([1 for i in lines if i.startswith(r)]), sum([1 for i in words if i.startswith(r)])) for r in regions]




In [43]:
res = list(sorted(res, key=lambda x: (x[1], x[2]), reverse=True))

In [45]:
# Example: assuming `dataset` is a list of tuples where each tuple is (image_path, num_lines, num_words)
total_lines = len(lines)
total_words = len(words)

train_target_lines = int(total_lines * 0.7)
val_target_lines = int(total_lines * 0.2)
test_target_lines = int(total_lines * 0.1) + 1 

train_target_words = int(total_words * 0.7)
val_target_words = int(total_words * 0.2)
test_target_words = int(total_words * 0.1) + 1

print(
    train_target_lines,
    val_target_lines,
    test_target_lines
)

print(
    train_target_words,
    val_target_words,
    test_target_words
)

1163 332 167
4010 1145 573


In [46]:
train_set, val_set, test_set = [], [], []
train_lines, val_lines, test_lines = 0, 0, 0
train_words, val_words, test_words = 0, 0, 0

for image_path, num_lines, num_words in res:
    if train_lines + num_lines <= train_target_lines and train_words + num_words <= train_target_words:
        train_set.append(image_path)
        train_lines += num_lines
        train_words += num_words
    elif val_lines + num_lines <= val_target_lines and val_words + num_words <= val_target_words:
        val_set.append(image_path)
        val_lines += num_lines
        val_words += num_words
    else:
        test_set.append(image_path)
        test_lines += num_lines
        test_words += num_words


In [47]:
print(f"Train set: {len(train_set)} images, {train_lines} lines, {train_words} words")
print(f"Validation set: {len(val_set)} images, {val_lines} lines, {val_words} words")
print(f"Test set: {len(test_set)} images, {test_lines} lines, {test_words} words")

Train set: 116 images, 868 lines, 4010 words
Validation set: 65 images, 332 lines, 1086 words
Test set: 122 images, 262 lines, 433 words


In [54]:
t = 868+332+262
print(868/t, 332/t, 262/t)
t=4010+1086+433
print(4010/t, 1086/t, 433/t)

0.5937072503419972 0.2270861833105335 0.17920656634746923
0.7252667751853862 0.19641888225718937 0.07831434255742449


In [49]:
with open("./test_dataset_1/regions_sets_1/train_set.txt", "w") as wf:
    wf.writelines([f"{t}\n" for t in train_set])

with open("./test_dataset_1/regions_sets_1/val_set.txt", "w") as wf:
    wf.writelines([f"{t}\n" for t in val_set])

with open("./test_dataset_1/regions_sets_1/test_set.txt", "w") as wf:
    wf.writelines([f"{t}\n" for t in test_set])

In [52]:
lines = None
with open("./test_dataset_1/a.txt", "r", encoding="utf8") as rf:
    lines = rf.readlines()

words = None
with open("./test_dataset_1/b.txt", "r", encoding="utf8") as rf:
    words = rf.readlines()

test = None
with open("./test_dataset_1/regions_sets_1/test_set.txt", "r") as rf:
    test = rf.readlines()


val = None
with open("./test_dataset_1/regions_sets_1/val_set.txt", "r") as rf:
    val = rf.readlines()


train = None
with open("./test_dataset_1/regions_sets_1/train_set.txt", "r") as rf:
    train = rf.readlines()


w = {
    "train": [i for i in words for j in train if i.startswith(j.removesuffix("\n"))],
    "test": [i for i in words for j in test if i.startswith(j.removesuffix("\n"))],
    "val": [i for i in words for j in val if i.startswith(j.removesuffix("\n"))],
}

l = {
    "train": [i for i in lines for j in train if i.startswith(j.removesuffix("\n"))],
    "test": [i for i in lines for j in test if i.startswith(j.removesuffix("\n"))],
    "val": [i for i in lines for j in val if i.startswith(j.removesuffix("\n"))],
}


for key, val in l.items():
    with open(f"./test_dataset_1/line_2/{key}.txt", "w", encoding="utf8") as f:
        f.writelines(val)

for key, val in w.items():
    with open(f"./test_dataset_1/word_2/{key}.txt", "w", encoding="utf8") as f:
        f.writelines(val)






In [10]:
import os

regions = [r.removesuffix(".jpg") for r in os.listdir("./test_dataset_1/regions")]

lines = None
with open("./a/a.txt", "r", encoding="utf8") as rf:
    lines = rf.readlines()

words = None
with open("./a/b.txt", "r", encoding="utf8") as rf:
    words = rf.readlines()

In [11]:
from sklearn.model_selection import train_test_split

rest_of_data, test_data = train_test_split(regions, test_size=0.1)

print(len(rest_of_data), len(test_data))

272 31


In [12]:
print("words", len(words))
print("lines", len(lines))
words1 = [j for i in rest_of_data for j in words if j.startswith(i)]
lines1 = [j for i in rest_of_data for j in lines if j.startswith(i)]
print("words", len(words1))
print("lines", len(lines1))
words2 = [j for i in test_data for j in words if j.startswith(i)]
lines2 = [j for i in test_data for j in lines if j.startswith(i)]
print("words", len(words2))
print("lines", len(lines2))

words 5621
lines 1476
words 5047
lines 1309
words 574
lines 167


In [16]:
with open("./tmp.txt", "w", encoding="utf8") as wf:
    wf.writelines(lines1)
split_train_set("./tmp.txt", "./htr_data/v15/ml_line_data")




with open("./tmp.txt", "w", encoding="utf8") as wf:
    wf.writelines(words1)
split_train_set("./tmp.txt", "./htr_data/v16/ml_word_data")


os.remove("./tmp.txt")

In [17]:
with open("./htr_data/v17/lines.txt", "w", encoding="utf8") as wf:
    wf.writelines(lines2)


with open("./htr_data/v17/words.txt", "w", encoding="utf8") as wf:
    wf.writelines(words2)


In [18]:
import os

nums = list(os.listdir("./test_dataset_1/nums_ttv"))

ttv_nums = {
}
for n in nums:
    with open(f"./test_dataset_1/nums_ttv/{n}", "r", encoding="utf8") as rf:
        ttv_nums[n.removesuffix(".txt")] = rf.readlines()

for key, val in ttv_nums.items():
    with open(f"./htr_data/v15/ml_line_data/{key}.txt", "a", encoding="utf8") as af:
        af.writelines(val)
    with open(f"./htr_data/v16/ml_word_data/{key}.txt", "a", encoding="utf8") as af:
        af.writelines(val)