# Import

In [1]:
import ast
import cv2 as cv
import numpy as np
import os
import pandas as pd

# Config

## Data Path

In [2]:
# Root folder that receives everything this notebook produces.
OUTPUT_ROOT_DIR = 'augmented-data'

# Output locations: one PNG per cropped sample (the '{}' placeholder is
# filled with the sample id) plus a single CSV listing ids and labels.
OUTPUT_IMAGE_PATH = f'{OUTPUT_ROOT_DIR}/images/{{}}.png'
OUTPUT_METADATA_PATH = f'{OUTPUT_ROOT_DIR}/metadata.csv'

# Input locations: each template takes the page number and zero-pads it to
# five digits for both the folder name and the file name.
IMAGE_PATH_TEMPLATE = 'raw-data/Arshasb_7k/{0:05d}/page_{0:05d}.png'
LABEL_PATH_TEMPLATE = 'raw-data/Arshasb_7k/{0:05d}/label_{0:05d}.xlsx'

# Make sure the images folder exists before anything is written into it.
os.makedirs(os.path.dirname(OUTPUT_IMAGE_PATH), exist_ok=True)

# Main

In [3]:
# Build the in-memory dataset: for every page, read the grayscale page image
# and its label spreadsheet, then cut one randomly jittered crop per labelled
# word. Each dataset entry is a dict with the word ('label') and its crop
# ('image').
NUM_OF_PAGES = 6000
dataset = []
# Largest jitter, in pixels, applied to each crop edge in either direction.
max_random_horizontal_shift_amount = 3
max_random_vertical_shift_amount = 5


def _random_shift(max_amount):
    """Return a random integer offset in the closed range [-max_amount, max_amount]."""
    # np.random.randint excludes its upper bound, hence the + 1; without it
    # the jitter would be skewed and never reach +max_amount.
    return np.random.randint(-max_amount, max_amount + 1)


# Page numbers are 1-based in the file names.
for page_number in range(1, NUM_OF_PAGES + 1):
    image_path = IMAGE_PATH_TEMPLATE.format(page_number)
    label_path = LABEL_PATH_TEMPLATE.format(page_number)

    image = cv.imread(image_path, cv.IMREAD_GRAYSCALE)
    if image is None:
        # cv.imread signals a missing/unreadable file by returning None
        # instead of raising; report it and move on to the next page.
        print('Failure in reading image: {}'.format(image_path))
        continue

    label_pdf = pd.read_excel(label_path)
    # The corner points are stored as text in the spreadsheet; parse them
    # back into Python literals so they can be indexed.
    for point_column in ('point1', 'point2', 'point3', 'point4'):
        label_pdf[point_column] = label_pdf[point_column].apply(ast.literal_eval)

    for _, row in label_pdf.iterrows():
        # Index 1 of a point is used along the image rows and index 0 along
        # the image columns. Each edge is jittered independently and clamped
        # at 0, because a negative index would wrap around to the opposite
        # side of the page and yield an empty or wrong crop.
        top = max(0, row.point1[1] + _random_shift(max_random_vertical_shift_amount))
        bottom = max(0, row.point2[1] + _random_shift(max_random_vertical_shift_amount))
        left = max(0, row.point1[0] + _random_shift(max_random_horizontal_shift_amount))
        right = max(0, row.point3[0] + _random_shift(max_random_horizontal_shift_amount))

        dataset.append({
            'label': row.word,
            # Copy the crop: a plain slice is a view that would keep the
            # whole page image alive for as long as the dataset exists.
            'image': image[top:bottom, left:right].copy(),
        })

# Write every non-empty crop to disk and record its id and label in the
# metadata table. The id is the sample's position in `dataset`, which is also
# the number used in the image file name.
metadata_dict = {'id': [], 'label': []}
for sample_id, sample in enumerate(dataset):
    crop = sample['image']
    # Crops with a zero-length dimension contain no pixels; skip them.
    if min(crop.shape) == 0:
        continue

    if not cv.imwrite(OUTPUT_IMAGE_PATH.format(sample_id), crop):
        print('Failure in writing image with label: {}'.format(sample['label']))
        # Do not list a sample whose image file was never written, so the
        # metadata only references files that exist.
        continue

    metadata_dict['id'].append(sample_id)
    metadata_dict['label'].append(sample['label'])

metadata_pdf = pd.DataFrame(metadata_dict)
metadata_pdf.to_csv(OUTPUT_METADATA_PATH, index=False)