### Prepare Train and Validation Sets

In [1]:
import os
import pandas as pd
import shutil
from pathlib import Path
from PIL import Image

In [2]:
def denormalise_box_coordinates(start_x_norm, start_y_norm, end_x_norm, end_y_norm, doc_width, doc_height):
    """Scale normalised box corners to integer pixel coordinates.

    The x values are multiplied by ``doc_width`` and the y values by
    ``doc_height``; each product is truncated with ``int()``.

    Returns:
        A tuple ``(start_x, start_y, end_x, end_y)`` of ints.
    """
    left, right = (int(fraction * doc_width) for fraction in (start_x_norm, end_x_norm))
    top, bottom = (int(fraction * doc_height) for fraction in (start_y_norm, end_y_norm))

    return left, top, right, bottom

def get_box_coords(start_x, start_y, end_x, end_y):
    """Return the four corners of an axis-aligned box as ``[x, y]`` pairs.

    The corners are listed starting at ``(start_x, start_y)`` and continuing
    via ``(end_x, start_y)``, ``(end_x, end_y)`` and ``(start_x, end_y)``.

    Returns:
        A list of four ``[x, y]`` lists.
    """
    # The last two corners must use end_y as their y coordinate; using end_x
    # there would produce a box whose height depends on its x position.
    return [[start_x, start_y], [end_x, start_y], [end_x, end_y], [start_x, end_y]]

### Read Samples and Labels

In [8]:
# Numeric id of the document to process; it is interpolated into the sample,
# label and fine-tuning dataset paths.
document_id = 5
# Fraction of the samples assigned to the training set; the rest become the
# validation set.
train_test_split = 0.7

In [9]:
# Both the sample images and their labels live under the same per-document folder.
document_samples_dir = Path("../data/samples") / f"document_{document_id}"
sample_path = document_samples_dir / "images"
labels_path = document_samples_dir / "labels.csv"

In [10]:
# Output folder that receives the copied images and the label files for this document.
fine_tuning_dataset_path = Path("../data/fine_tuning") / f"document_{document_id}"

In [11]:
# os.listdir returns entries in arbitrary order; sort them so the
# train/validation split derived from this list is the same on every run.
samples = sorted(os.listdir(sample_path))

In [12]:
# Number of entries listed in the sample image folder.
num_samples = len(samples)
num_samples

10

### Split Train and Validation

In [13]:
# Index of the first validation sample: everything before it is used for training.
split = int(num_samples * train_test_split)

# Extract the numeric id from each file name: drop everything after the first
# ".", then keep the part after the last "_".
sample_id_list = [int(file_name.split(".")[0].split("_")[-1]) for file_name in samples]

train_samples, val_samples = sample_id_list[:split], sample_id_list[split:]

len(train_samples), len(val_samples)

(7, 3)

In [14]:
# Load the label table for this document from labels.csv into a DataFrame.
labels = pd.read_csv(labels_path)

In [15]:
# Preview the first rows to check the column layout of the label table.
labels.head()

Unnamed: 0,id,name,label,start_x_norm,start_y_norm,end_x_norm,end_y_norm,sample_document,template_box
0,28,cognome,NZAEXZ,0.081159,0.165049,0.23913,0.184466,113,18
1,29,cognome_b,WYLHGZXXYX,0.728986,0.165049,0.942029,0.183495,113,19
2,30,cognome,VNY,0.081159,0.165049,0.146377,0.184466,114,18
3,31,cognome_b,DJJIWZHCFQP,0.728986,0.165049,0.976812,0.182524,114,19
4,32,cognome,DVUOM,0.081159,0.165049,0.198551,0.184466,115,18


In [16]:
# Total number of label rows, and how many distinct sample documents they cover.
len(labels), labels["sample_document"].nunique()

(20, 10)

In [17]:
# Read the pixel dimensions of the first sample. The context manager closes the
# underlying file, which a bare Image.open() would leave open.
# NOTE(review): these dimensions are later used to denormalise the boxes of
# every sample, which assumes all samples share one image size. Confirm.
with Image.open(sample_path / samples[0]) as image:
    image_width = image.width
    image_height = image.height

### Copy images

In [18]:
# Copy every sample image into the folder of the split it belongs to.
for images_dir_name, split_sample_ids in (("train_images/", train_samples), ("val_images/", val_samples)):
    for sample_id in split_sample_ids:
        image_name = f"sample_{sample_id}.png"
        target_dir = fine_tuning_dataset_path / images_dir_name
        # exist_ok=True makes repeating this call for each sample harmless.
        target_dir.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(sample_path / image_name, target_dir / image_name)

### Write labels

In [26]:
def _box_annotation(row):
    """Build the annotation dict for one row of the labels table.

    The row's normalised corners are scaled to pixels using the notebook-level
    ``image_width`` / ``image_height`` and expanded to four corner points.
    """
    points = get_box_coords(*denormalise_box_coordinates(row["start_x_norm"], row["start_y_norm"],
                                                         row["end_x_norm"], row["end_y_norm"],
                                                         image_width, image_height))
    return {"transcription": row["label"], "points": points}


def _build_annotations(sample_ids, images_dir_name):
    """Return one annotation line per sample id.

    Each line is ``<image path>\\t<box annotations>``, where the image path points
    into ``images_dir_name`` under ``fine_tuning_dataset_path``.
    """
    annotation_lines = []
    for sample_id in sample_ids:
        sample_labels = labels[labels["sample_document"] == sample_id]
        boxes_annotation = sample_labels.apply(_box_annotation, axis=1).values

        # NOTE(review): boxes are serialised with str() (Python repr) and joined
        # with spaces. If the consumer of these files expects a JSON list per
        # image, this needs json.dumps instead. Confirm against the training tool.
        boxes_annotation = " ".join(map(str, boxes_annotation))
        image_file = fine_tuning_dataset_path / images_dir_name / f"sample_{sample_id}.png"
        annotation_lines.append("\t".join([str(image_file), boxes_annotation]))
    return annotation_lines


# The directory names must match the folders the images were copied into
# ("train_images" / "val_images"), and validation lines must not point at the
# training folder.
train_annotations = "\n".join(_build_annotations(train_samples, "train_images"))
validation_annotations = "\n".join(_build_annotations(val_samples, "val_images"))

In [27]:
### Write annotations
fine_tuning_dataset_path.mkdir(parents=True, exist_ok=True)

# Pin the encoding: without it open() uses the platform locale, so the same
# labels could be written differently (or fail on non-ASCII text) on another machine.
with open(fine_tuning_dataset_path / 'train_labels.txt', "w", encoding="utf-8") as file:
    file.write(train_annotations)

with open(fine_tuning_dataset_path / 'val_labels.txt', "w", encoding="utf-8") as file:
    file.write(validation_annotations)