## File conversion from .gif to .jpg

### Setup folders

In [1]:
import os

def setup_folders(input: str, output: str):
    """Build the folder lists for one pipeline stage and create the output folders.

    Args:
        input: Name of the stage's source directory, relative to the current
            working directory. The parameter name shadows the builtin
            ``input`` inside this function; it is kept so existing callers
            (including keyword callers) continue to work.
        output: Name of the stage's destination directory, relative to the
            current working directory.

    Returns:
        A tuple ``(input_folders, output_folders)``. Each is a list of
        ``./``-prefixed paths with one entry for the ``FormatA`` and one for
        the ``FormatB`` sub-folder.

    Only the output folders are created. The input folders are returned
    without being created or checked for existence.
    """
    subfolders = ("FormatA", "FormatB")
    input_folders = [f"./{input}/{name}" for name in subfolders]
    output_folders = [f"./{output}/{name}" for name in subfolders]

    # makedirs creates missing parents too (so a nested ``output`` works), and
    # exist_ok=True makes re-running the cell a harmless no-op.
    for folder in output_folders:
        os.makedirs(folder, exist_ok=True)

    return input_folders, output_folders


# Folders for the GIF -> JPG conversion stage.
input_folders, output_folders = setup_folders(
    "MINSKI_ZAPISNICI_SORTIRANO_ZAJEDNO",
    "jpg_minski_zapisnici",
)

### Convert images

In [2]:
from utils import convert_gif_to_jpg
from time import time

# Passed to convert_gif_to_jpg as its third argument (presumably the number of
# worker threads -- confirm against utils.convert_gif_to_jpg).
num_threads = 5

# Pair every source folder with its destination folder and convert the pairs
# one after another, printing the wall-clock time spent on each.
folders = list(zip(input_folders, output_folders))
for src_dir, dst_dir in folders:
    print(f"Folder {src_dir}")
    print("Start convert...")
    started_at = time()
    convert_gif_to_jpg(src_dir, dst_dir, num_threads)
    elapsed = time() - started_at
    print(f"End convert, time took {elapsed}")

Folder ./MINSKI_ZAPISNICI_SORTIRANO_ZAJEDNO/FormatA
Start convert...
Converted 30519.gif to JPG
Converted 30520.gif to JPG
Converted 30706.gif to JPG
Converted 30535.gif to JPG
Converted 30543.gif to JPG
Converted 30960.gif to JPG
Converted 30926.gif to JPG
Converted 30967.gif to JPG
Converted 30925.gif to JPG
Converted 30922.gif to JPG
Converted 30969.gif to JPG
Converted 30972.gif to JPG
Converted 30977a.gif to JPG
Converted 31131.gif to JPG
Converted 31094.gif to JPG
Converted 31142.gif to JPG
Converted 31139.gif to JPG
Converted 31150.gif to JPG
Converted 31136.gif to JPG
Converted 31138.gif to JPG
Converted 33021a.gif to JPG
Converted 32862.gif to JPG
Converted 33021b.gif to JPG
Converted 32860.gif to JPG
Converted 31190.gif to JPG
Converted 40688.gif to JPG
Converted 50677.gif to JPG
Converted 50678.gif to JPG
Converted 50676.gif to JPG
Converted 33238.gif to JPG
Converted 50690.gif to JPG
Converted 50685.gif to JPG
Converted 50684.gif to JPG
Converted 50682.gif to JPG
Converted 

## Cleaning images

In [4]:
# Folders for the cleaning stage: converted JPGs in, cleaned JPGs out.
input_folders, output_folders = setup_folders(
    "jpg_minski_zapisnici",
    "jpg_minski_zapisnici_clean",
)

In [4]:
from utils.clean_image_pipeline import apply_image_cleaning_pipeline

# Run the cleaning pipeline on one sample image before processing whole folders.
sample_name = "30543.jpg"
input_path = f"./jpg_minski_zapisnici/FormatA/{sample_name}"
output_path = f"./jpg_minski_zapisnici_clean/FormatA/{sample_name}"

apply_image_cleaning_pipeline(input_path, output_path)


In [7]:
from time import time
from utils.clean_image_pipeline import apply_image_cleaning_pipeline
import threading

# Upper bound on the number of cleaning threads running at the same time.
num_threads = 5
folders = list(zip(input_folders, output_folders))
for input_folder, output_folder in folders:
    print(f"Folder {input_folder}")
    print("Start clean...")
    s = time()
    images = os.listdir(input_folder)
    threads = []

    for in_image in images:
        image1, image2 = f"{input_folder}/{in_image}", f"{output_folder}/{in_image}"
        print("In: ", image1)
        print("Out: ", image2)
        thread = threading.Thread(target=apply_image_cleaning_pipeline, args=(image1, image2))
        thread.start()
        threads.append(thread)
        # Once a full batch is running, wait for it before starting more work.
        if len(threads) >= num_threads:
            for running in threads:
                running.join()
            threads = []

    # Also wait for the last, partially filled batch. Without this the timing
    # below is printed (and the next folder or cell starts) while images are
    # still being written.
    for running in threads:
        running.join()
    e = time()
    print(f"End clean, time took {e-s}")

Folder ./jpg_minski_zapisnici/FormatA
Start clean...
In:  ./jpg_minski_zapisnici/FormatA/30519.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30519.jpg
In:  ./jpg_minski_zapisnici/FormatA/30520.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30520.jpg
In:  ./jpg_minski_zapisnici/FormatA/30535.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30535.jpg
In:  ./jpg_minski_zapisnici/FormatA/30543.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30543.jpg
In:  ./jpg_minski_zapisnici/FormatA/30706.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30706.jpg
In:  ./jpg_minski_zapisnici/FormatA/30922.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30922.jpg
In:  ./jpg_minski_zapisnici/FormatA/30925.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30925.jpg
In:  ./jpg_minski_zapisnici/FormatA/30926.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30926.jpg
In:  ./jpg_minski_zapisnici/FormatA/30960.jpg
Out:  ./jpg_minski_zapisnici_clean/FormatA/30960.jpg
In:  ./jpg_minski_zapisnici/FormatA/30967.jpg
Out:  ./jp

## Region detection

### Manual region detection

In [2]:
import os

import cv2
from utils.extract_regions import extract_interesting_regions

image_path = "./jpg_minski_zapisnici_clean/FormatA/30543.jpg"
out_path = "./tmp/region_detection12_"

# cv2.imwrite does not raise when the target folder is missing, so make sure
# the folder exists before writing any region.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

image = cv2.imread(image_path)
# cv2.imread signals failure by returning None instead of raising; stop here
# with a clear message rather than failing inside the region extractor.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
regions = extract_interesting_regions(image)

# Write every non-empty region to its own numbered file.
for i, region in enumerate(regions):
    a = f"{out_path}{i}.jpg"
    print(a)
    if len(region) > 0:
        print("write")
        cv2.imwrite(a, region)

./tmp/region_detection12_0.jpg
write
./tmp/region_detection12_1.jpg
write
./tmp/region_detection12_2.jpg
write
./tmp/region_detection12_3.jpg
write
./tmp/region_detection12_4.jpg
write
./tmp/region_detection12_5.jpg
write
./tmp/region_detection12_6.jpg
write
./tmp/region_detection12_7.jpg
write


In [6]:
# Folders for manual region extraction: cleaned JPGs in, cropped regions out.
input_folders, output_folders = setup_folders(
    "jpg_minski_zapisnici_clean",
    "jpg_minski_zapisnici_clean_manual_regions",
)

In [9]:
from time import time
from utils.extract_regions import extract_interesting_regions
import threading

def extract_interesting_regions_threading(src: str, dest: str):
    """Extract the regions of one image and save each one next to ``dest``.

    Region ``i`` is written to ``<dest without extension>_<i><extension>``.
    Empty regions are not written.

    NOTE(review): a later cell in this notebook defines another function with
    this same name (the ML variant); whichever cell ran last wins.

    Args:
        src: Path of the image to read.
        dest: Path used as the naming template for the region files.
    """
    # splitext only splits off the final extension, so dots elsewhere in the
    # path or file name no longer break the unpacking.
    stem, ext = os.path.splitext(dest)
    image = cv2.imread(src)
    # cv2.imread returns None for missing or non-image files (anything that
    # os.listdir happened to return); skip those instead of crashing the thread.
    if image is None:
        print(f"Skipping unreadable image: {src}")
        return
    regions = extract_interesting_regions(image)

    for i, region in enumerate(regions):
        if len(region) > 0:
            cv2.imwrite(f"{stem}_{i}{ext}", region)

# Upper bound on the number of extraction threads running at the same time.
num_threads = 5
folders = list(zip(input_folders, output_folders))
for input_folder, output_folder in folders:
    print(f"Folder {input_folder}")
    print("Start clean...")
    s = time()
    images = os.listdir(input_folder)
    threads = []

    for in_image in images:
        image1, image2 = f"{input_folder}/{in_image}", f"{output_folder}/{in_image}"
        print("In: ", image1)
        print("Out: ", image2)
        thread = threading.Thread(target=extract_interesting_regions_threading, args=(image1, image2))
        thread.start()
        threads.append(thread)
        # Once a full batch is running, wait for it before starting more work.
        if len(threads) >= num_threads:
            for running in threads:
                running.join()
            threads = []

    # Also wait for the last, partially filled batch so the timing below
    # covers every image and no thread is still writing when the loop moves on.
    for running in threads:
        running.join()
    e = time()
    print(f"End clean, time took {e-s}")

The results are not good: because of the poor state of the scanned images, line detection, which this approach predominantly relies on, fails most of the time.
The best way forward is to use an ML model for this task.

### ML region detection

In [4]:
!pip install ultralytics==8.0.227

In [5]:
import os

import cv2
from ultralytics import YOLO

model_path = "./models/region_detection_model.pt"
img_path = "./jpg_minski_zapisnici_clean/FormatA/30519.jpg"
model = YOLO(model_path)

image = cv2.imread(img_path)
# cv2.imread returns None instead of raising when the file cannot be read.
if image is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")

results = model.predict(source=image, imgsz=640, conf=0.25, iou=0.45)
results = results[0]  # a single image was passed in, so use its result

# Draw every detected box onto the image (modifies `image` in place).
# The former `cls = results.cls` line was dropped: the value was never used
# and class ids are exposed on the boxes (`results.boxes.cls`), not on the
# result object itself.
for i in range(len(results.boxes)):
    box = results.boxes[i]
    tensor = box.xyxy[0]
    x1 = int(tensor[0].item())
    y1 = int(tensor[1].item())
    x2 = int(tensor[2].item())
    y2 = int(tensor[3].item())
    # OpenCV colours are BGR, so (255, 0, 0) is blue; 3 is the line thickness.
    cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 3)

a = "./tmp/30519-regions-drawings.jpg"
# cv2.imwrite does not raise when the folder is missing, so create it first.
os.makedirs(os.path.dirname(a), exist_ok=True)
cv2.imwrite(a, image)


AttributeError: partially initialized module 'pandas' has no attribute '_pandas_parser_CAPI' (most likely due to a circular import)

In [11]:
from ultralytics import YOLO
# Weights file of the region-detection model, relative to the working directory.
model_path = "./models/region_detection_model.pt"
# Loaded once at module level; extract_interesting_regions_ml reads this
# global on every call.
# NOTE(review): that function is later called from several threads at once --
# confirm against the Ultralytics thread-safety guidance that sharing a single
# model instance between threads is acceptable.
extract_interesting_regions_ml_model = YOLO(model_path)


def extract_interesting_regions_ml(image):
    """Run the region-detection model on ``image`` and collect its detections.

    Args:
        image: The image handed to the model's ``predict`` call as ``source``.

    Returns:
        A tuple ``(rectangles, classes)`` of parallel lists. Each rectangle is
        ``[[x1, y1], [x2, y2]]`` with integer coordinates taken from the
        box's ``xyxy`` entry; each class is the detection's class value
        converted to ``int``.
    """
    prediction = extract_interesting_regions_ml_model.predict(
        source=image, imgsz=640, conf=0.25, iou=0.45
    )[0]
    detected = prediction.boxes

    rectangles, classes = [], []
    for class_id, box in list(zip(detected.cls, detected)):
        corners = box.xyxy[0]
        x1, y1, x2, y2 = (int(corners[k].item()) for k in range(4))
        rectangles.append([[x1, y1], [x2, y2]])
        classes.append(int(class_id))
    return rectangles, classes

In [3]:
# Folders for ML region extraction: cleaned JPGs in, cropped regions out.
input_folders, output_folders = setup_folders(
    "jpg_minski_zapisnici_clean",
    "jpg_minski_zapisnici_clean_ml_regions",
)

In [13]:
from time import time
import threading
import cv2
from utils.extract_regions import extract_rectangle

def extract_interesting_regions_threading(src: str, dest: str):
    """Detect regions in one image with the ML model and save each crop.

    A crop of class ``c`` is written to
    ``<dest without extension>_<c><extension>``. If the same class is detected
    more than once in an image, the additional crops are written to
    ``<dest without extension>-<k>_<c><extension>`` (``k`` = 1, 2, ...) so they
    no longer overwrite each other. Every file name still ends in
    ``_<c><extension>``, which is what the later sorting cell matches on.
    Empty crops are not written.

    NOTE(review): an earlier cell in this notebook defines another function
    with this same name (the manual variant); whichever cell ran last wins.

    Args:
        src: Path of the image to read.
        dest: Path used as the naming template for the crop files.
    """
    # splitext only splits off the final extension, so dots elsewhere in the
    # path or file name no longer break the unpacking.
    stem, ext = os.path.splitext(dest)
    image = cv2.imread(src)
    # cv2.imread returns None for missing or non-image files; skip those
    # instead of crashing the worker thread.
    if image is None:
        print(f"Skipping unreadable image: {src}")
        return
    rectangles, classes = extract_interesting_regions_ml(image)
    crops = [extract_rectangle(image, rect[0], rect[1]) for rect in rectangles]

    written_per_class = {}
    for cls, crop in zip(classes, crops):
        if len(crop) == 0:
            continue
        repeat = written_per_class.get(cls, 0)
        written_per_class[cls] = repeat + 1
        if repeat == 0:
            target = f"{stem}_{cls}{ext}"
        else:
            target = f"{stem}-{repeat}_{cls}{ext}"
        written = cv2.imwrite(target, crop)
        print("Is written", written)

# Upper bound on the number of extraction threads running at the same time.
num_threads = 5
folders = list(zip(input_folders, output_folders))
for input_folder, output_folder in folders:
    print(f"Folder {input_folder}")
    print("Start clean...")
    s = time()
    images = os.listdir(input_folder)
    threads = []

    for in_image in images:
        image1, image2 = f"{input_folder}/{in_image}", f"{output_folder}/{in_image}"
        print("In: ", image1)
        print("Out: ", image2)
        thread = threading.Thread(target=extract_interesting_regions_threading, args=(image1, image2))
        thread.start()
        threads.append(thread)
        # Once a full batch is running, wait for it before starting more work.
        if len(threads) >= num_threads:
            for running in threads:
                running.join()
            threads = []

    # Also wait for the last, partially filled batch so the timing below
    # covers every image and no thread is still writing when the loop moves on.
    for running in threads:
        running.join()
    e = time()
    print(f"End clean, time took {e-s}")

Folder ./jpg_minski_zapisnici_clean/FormatA
Start clean...
In:  ./jpg_minski_zapisnici_clean/FormatA/30519.jpg
Out:  ./jpg_minski_zapisnici_clean_ml_regions/FormatA/30519.jpg
In:  ./jpg_minski_zapisnici_clean/FormatA/30520.jpg
Out:  ./jpg_minski_zapisnici_clean_ml_regions/FormatA/30520.jpg
In:  ./jpg_minski_zapisnici_clean/FormatA/30535.jpg
Out:  ./jpg_minski_zapisnici_clean_ml_regions/FormatA/30535.jpg
In:  ./jpg_minski_zapisnici_clean/FormatA/30543.jpg
Out:  ./jpg_minski_zapisnici_clean_ml_regions/FormatA/30543.jpg
In:  ./jpg_minski_zapisnici_clean/FormatA/30706.jpg
Out:  ./jpg_minski_zapisnici_clean_ml_regions/FormatA/30706.jpg





0: 640x480 1 overhead, 1 record, 1 map, 1 legend, 1 structure, 1 detailed information, 1 unit record, 1 demainer information, 1 demining information, 90.4ms
Speed: 20.9ms preprocess, 90.4ms inference, 4.0ms postprocess per image at shape (1, 3, 640, 480)
Is written True
Is written True
Is written True
Is written True
Is written True
Is written True
0: 64



0: 640x416 1 overhead, 1 record, 2 maps, 1 legend, 1 structure, 1 detailed information, 1 unit record, 1 demainer information, 1 demining information, 73.4ms
Speed: 7.1ms preprocess, 73.4ms inference, 4.0ms postprocess per image at shape (1, 3, 640, 416)
Is written True
Is written True
Is written True
Is written True
Is written True
Is written True
Is written True
Is written True
0: 640x448 1 record, 1 map, 1 legend, 110.4ms
Speed: 5.1ms preprocess, 110.4ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 448)
Is written True
Is written True
Is written True
Is written True
Is written True


The results with the trained model are much better in all cases, so it will be used for all further steps.

In [18]:
import os
# Root folder that receives one sub-folder per region class.
separated_root = "./jpg_minski_zapisnici_clean_ml_regions_separated"

# One folder per class index 0..8; the detection log earlier in this notebook
# lists nine class names.
outputs = []
for i in range(9):
    output = f"{separated_root}/reg_{i}"
    # makedirs creates the root on the first pass and is a no-op on re-runs.
    os.makedirs(output, exist_ok=True)
    outputs.append(output)

From now on only Format A will be considered, as it is the one that contains text in Latin script.

In [20]:
import cv2
import shutil

# NOTE(review): `input` shadows the builtin for the rest of the kernel
# session. The name is kept in case other cells read it; consider renaming.
input = "./jpg_minski_zapisnici_clean_ml_regions/FormatA"
files = os.listdir(input)

# Region files end in "_<class index>.jpg"; put each one into the folder of
# its class.
for i, out in enumerate(outputs):
    suffix = f"_{i}.jpg"
    for file in files:
        if file.endswith(suffix):
            # A plain file copy keeps the JPEG bytes intact. Decoding and
            # re-encoding through OpenCV would recompress every image and
            # would fail on files it cannot read.
            shutil.copyfile(f"{input}/{file}", f"{out}/{file}")