## Import 

In [1]:
import cv2 as cv
import numpy as np
import os
import pandas as pd

## Config

In [17]:
# Font name; used to build both the input ('raw-dataset/<FONT>') and
# output ('dataset/<FONT>') directory names.
FONT = 'nazanin'
# Number of pages to process; page folders are numbered from 1 up to this
# value (zero-padded to five digits in the path templates).
NUM_OF_PAGES = 20
# Bound on the random integer offset added to the horizontal (x1/x2) crop
# coordinates of every word box.
MAX_RANDOM_HORIZONTAL_SHIFT_AMOUNT = 1
# Bound on the random integer offset added to the vertical (y1/y2) crop
# coordinates of every word box.
MAX_RANDOM_VERTICAL_SHIFT_AMOUNT = 1

## Data Path

In [5]:
# --- Input side: one sub-folder per page, named by its zero-padded number ---
INPUT_ROOT_DIR = 'raw-dataset/%s' % FONT
# The '{0:05d}' placeholder is filled in later with the page number.
IMAGE_PATH_TEMPLATE = '/'.join((INPUT_ROOT_DIR, '{0:05d}', 'resized_page.png'))
LABEL_PATH_TEMPLATE = '/'.join((INPUT_ROOT_DIR, '{0:05d}', 'label_cleaned.csv'))

# --- Output side: cropped word images plus a single metadata CSV ---
OUTPUT_ROOT_DIR = 'dataset/%s' % FONT
# The '{}' placeholder is filled in later with the sample id.
OUTPUT_IMAGE_PATH = '/'.join((OUTPUT_ROOT_DIR, 'images', '{}.png'))
OUTPUT_METADATA_PATH = '/'.join((OUTPUT_ROOT_DIR, 'metadata.csv'))

# Create the directories up front (no error if they already exist).
os.makedirs(INPUT_ROOT_DIR, exist_ok=True)
os.makedirs('/'.join((OUTPUT_ROOT_DIR, 'images')), exist_ok=True)

## Main

In [21]:
def _random_shift(max_amount):
    """Return a random integer drawn uniformly from [-max_amount, max_amount]."""
    # np.random.randint excludes its upper bound, so `+ 1` is required to make
    # the jitter symmetric (and to keep max_amount == 0 from raising).
    return int(np.random.randint(-max_amount, max_amount + 1))


def _jittered_slice(start, stop, max_amount):
    """Return slice(start, stop) with each end independently jittered.

    Both ends are clamped to be non-negative: a negative bound would be
    interpreted by NumPy as an index counted from the far edge of the image
    and would silently produce a wrong (or empty) crop.
    """
    lower = max(int(start) + _random_shift(max_amount), 0)
    upper = max(int(stop) + _random_shift(max_amount), 0)
    return slice(lower, upper)


# Collect one {'label', 'image'} record per labelled word on every page.
dataset = []

for page_number in range(1, NUM_OF_PAGES + 1):
    image_path = IMAGE_PATH_TEMPLATE.format(page_number)
    label_path = LABEL_PATH_TEMPLATE.format(page_number)

    image = cv.imread(image_path, cv.IMREAD_GRAYSCALE)
    if image is None:
        # cv.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than on the first slice.
        raise OSError('Could not read image: {}'.format(image_path))
    label_pdf = pd.read_csv(label_path)

    for row in label_pdf.itertuples(index=False):
        row_span = _jittered_slice(row.y1, row.y2, MAX_RANDOM_VERTICAL_SHIFT_AMOUNT)
        col_span = _jittered_slice(row.x1, row.x2, MAX_RANDOM_HORIZONTAL_SHIFT_AMOUNT)
        dataset.append({
            'label': row.word,
            'image': image[row_span, col_span],
        })

# Write every non-empty crop to disk; the sample id is its position in
# `dataset`, so ids of skipped (empty) crops are simply absent.
metadata_dict = {'id': [], 'label': []}
for i, sample in enumerate(dataset):
    crop = sample['image']
    if crop.size == 0:
        continue
    if cv.imwrite(OUTPUT_IMAGE_PATH.format(i), crop):
        # Only list samples whose image file was actually written, so the
        # metadata never references a missing image.
        metadata_dict['id'].append(i)
        metadata_dict['label'].append(sample['label'])
    else:
        print('Failure in writing image with label: {}'.format(sample['label']))

metadata_pdf = pd.DataFrame(metadata_dict)
metadata_pdf.to_csv(OUTPUT_METADATA_PATH, index=False)