# Synthetic Data
In our case the priority is precision, so we use random insertion.
To generate noisy variations of the images derived from the blank PDF forms, we:
* [Fill in the inputs](#fill)
* [Add visual noise](#noise)

In [None]:
import pandas as pd
import numpy as np

from PIL import Image, ImageOps
from matplotlib import pyplot as plt
from matplotlib import patches
from pathlib import Path
from fitz import fitz

In [None]:
# local libs
from scripts import simulate as sim
from scripts import prep

In [None]:
# Page-level reference table (the source documents can be multipage).
pages = pd.read_csv(Path('./data') / 'page-summary.csv.gz')
# Show which columns are available.
pages.columns

In [None]:
# Extracted page-images: file names without the '.png' extension.
# Path.stem is used instead of splitting the string on '/', so this also
# works on platforms whose path separator is not '/'.
images = [x.stem for x in Path('./data/images').glob('*.png')]

# Sanity check: prints True when every extracted image is listed in the
# 'source' column of the page summary (reuses the `pages` table loaded
# above instead of reading the CSV a second time).
print(set(images).issubset(pages['source']))

# Extracted page-inputs: file names without the '.csv.gz' double extension
# (Path.stem would only strip the trailing '.gz').
suffix = '.csv.gz'
forms = [x.name[:-len(suffix)] for x in Path('./data/inputs').glob(f'*{suffix}')]
# Keep only forms that also have a page-image; sort so the order (and thus
# any seeded random choice from this list) is reproducible between runs.
forms = sorted(set(forms).intersection(images))
len(forms)

<a name="fill"></a>

## Simulate fill-in
A better version would need more fonts and more advanced logic for generating the fill-in text based on the input label.

In [None]:
# Pick a random page that has both an image and extracted inputs.
source = np.random.choice(forms)
print(source)
# NOTE: the original cell also loaded './data/images/{source}.png' here, but
# that array was overwritten by the rendered page below before being used,
# so the redundant load was dropped.
# load form-inputs info
inputs = pd.read_csv(f'./data/inputs/{source}.csv.gz')
# load textual content
content = pd.read_csv(f'./data/info/{source}.csv.gz')[['left','top','right','bottom','text']]
matrix = sim.layout_matrix(content)

# `source` has the form '<document-name>-<page-index>': split on the last '-'.
doc_name, _, page_index = source.rpartition('-')
index = int(page_index)
with fitz.open(f'./data/forms/{doc_name}.pdf') as doc:
    page = doc.load_page(index)
    # 200 is presumably the render resolution (dpi) -- TODO confirm against
    # sim.fill_in_blanks.
    image, info = sim.fill_in_blanks(page, 200, matrix)

fig, ax = plt.subplots(figsize=(10,10))
ax.imshow(image, cmap='gray')
# Box coordinates are scaled by the shorter image side; presumably they are
# stored normalised by that side -- TODO confirm against the inputs schema.
s = min(image.shape)
box_columns = ['left','top','right','bottom','field_type_string','field_display']
for left, top, right, bottom, field_type, display in inputs[box_columns].values:
    left, top, right, bottom = float(left), float(top), float(right), float(bottom)
    x, y = left * s, top * s
    w, h = (right - left) * s, (bottom - top) * s
    # Widget fields get a colour from the matplotlib cycle keyed by their
    # display flag; every other field type is drawn in gray.
    c = f'C{display}' if field_type in sim.WIDGETS else 'gray'
    ax.add_patch(patches.Rectangle((x, y), w, h, linewidth=1, edgecolor=c, facecolor='none'))
plt.title('Highlighted boxes with simulated input')
plt.show()


In [None]:
# Preview the first few simulated fields together with their generated values.
(
    pd.DataFrame.from_dict(info)
    .loc[:, ['field_name','field_display','field_type_string','text_maxlen','value']]
    .head()
)

<a name="noise"></a>

## Simulate noisy data
Let's add some skew (a small rotation angle), a random orientation (0°, 90°, 180°, or 270°), uneven lighting, and noise.

In [None]:
# Show two generated noise maps side by side.
# NOTE(review): `level` is assigned but never used in this cell, and both
# panels call prep.generate_noise with the same keyword arguments (only the
# first argument differs: 1000 vs 1024), so the 'light + noise' panel may not
# differ from 'light only' in the way its title suggests -- confirm against
# prep.generate_noise.
level = 0.5
fig, ax = plt.subplots(1, 2, figsize=(8,8))
for panel, size, title in zip(ax, (1000, 1024), ('light only', 'light + noise')):
    # generate_noise output is scaled by 255 and cast to 8-bit for display
    noise_map = prep.generate_noise(size, scale=2)
    panel.imshow((noise_map * 255).astype(np.uint8), 'gray')
    panel.set_title(title)
plt.show()

In [None]:
# Compare the random transform of the filled-in page without and with the
# perspective option enabled.
# `level` is the noise amount passed to both calls; it was previously
# assigned but unused while 0.5 was hard-coded twice.
level = 0.5
fig, ax = plt.subplots(1, 2, figsize=(10,10))
panels = (
    (False, 'Noise + rotation'),
    (True, 'Noise + rotation + distortion'),
)
for axis, (use_perspective, title) in zip(ax, panels):
    # random_transform returns an indexable result; element 0 is what is displayed
    transformed = prep.random_transform(image, noise=level, perspective=use_perspective)[0]
    axis.imshow(transformed, 'gray')
    axis.set_title(title)
plt.show()