# Data extracted from PDF forms
For this project we need
* `layout`: hierarchy of blocks defined by bounding box coordinates and `type`
* `text`: words and `phrases` (linked sequences)
* `inputs`: fields where certain data should be entered, each defined by a text `label` and a data-type spec.
* `images`: some are `logos` we want to recognize; some contain text we want to be aware of for our vision model

There are several extraction options; we use the `PyMuPDF` package. `scripts/parse.py` performs the initial bulk extraction for exploration.

For our doc-indexing pipeline we need a refined version based on the representation model that our exploration produces. We also need to choose embedding models (text and image) for similarity queries.

* For the text embeddings we go with a pretrained model, possibly with minimal fine-tuning.
* For the image embedding we are going to train our own model based on either `ResNet` or `ViT` architecture adapted to grayscale.

The [single-source-batch data-loaders](#loader) we use to simplify training could make learning very sensitive to data quality: we need a way to assess whether each source is fit to be a learning sample.

In [None]:
import re
import os
import json
import torch
import numpy as np
import pandas as pd

from time import time
from pathlib import Path
from PIL import Image, ImageOps
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
from matplotlib import pyplot as plt
from matplotlib import patches, colormaps

In [None]:
# load local notebook-utils
from scripts import simulate as sim
from scripts import parse, render
from scripts.backbone import Encoder

In [None]:
def _taxonomy(r):
    # taxonomy key: type and sub-type glued together, trimmed and upper-cased
    return f"{r['type']}{r['sub']}".strip().upper()


# doc-level lookup table, restricted to the listed languages;
# missing values become empty strings
docs = pd.read_csv('./data/forms.csv.gz')
listed_lang = docs['lang'].isin(['en','fr','sp'])
docs = docs.loc[listed_lang].fillna('')
docs['taxonomy'] = docs.apply(_taxonomy, axis=1)

# page-level reference (multipage docs)
pages = pd.read_csv('./data/page-summary.csv.gz')
# `file` is the page `source` with its last dash-separated token removed
pages['file'] = pages['source'].map(lambda src: src.rpartition('-')[0])

In [None]:
# list the columns available in the page-level table
pages.columns

In [None]:
# names of the four bounding-box edges (not referenced in the cells visible here)
BOX = ['top','left','bottom','right']

In [None]:
# names of the rendered page images: file name without the `.png` suffix.
# `Path.stem` does not depend on the OS path separator, unlike splitting on '/'.
images = [x.stem for x in Path('./data/images').glob('*.png')]

In [None]:
# look into data where the inputs info is available
samples = pages.loc[(pages['source'].isin(images))&(~pages['source'].str.startswith('que-')),'source'].to_list()

In [None]:
# word-count spread (min, quartiles, max) over the selected sample pages
in_samples = pages['source'].isin(samples)
pages.loc[in_samples, 'word-count'].quantile([0.,.25,.5,.75,1.])

In [None]:
def _total_inputs(r):
    # rows with a negative `text-input` count as having no inputs
    # NOTE(review): a negative value presumably marks missing inputs info - confirm
    if r['text-input'] >= 0:
        return r['text-input'] + r['check-box'] + r['radio-button']
    return 0


# total number of input fields per page (text + check-box + radio-button)
pages['inputs'] = pages.apply(_total_inputs, axis=1)
# spread of the input counts (min, quartiles, max) over the sample pages
pages.loc[pages['source'].isin(samples), 'inputs'].quantile([0.,.25,.5,.75,1.])

<a name="loader"></a>

Let's explore the page-view data.

In [None]:
class PageViewPortDataset(Dataset):
    """
    Single-source batch: every item is a random view-port (pan, rotation, zoom)
    rendered from the same page image, and all items share the page labels.

    The noise step of the original design (`make_noisy_sample`) is currently
    not applied.
    """
    def __init__(self, source: str, max_samples: int = 8):
        self.max_samples = max_samples
        # read the page as grayscale and invert it before handing it to the renderer
        page = Image.open(f'./data/images/{source}.png')
        pixels = np.array(ImageOps.grayscale(page))
        inverted = (255. - pixels).astype(np.uint8)
        self.view = render.AgentView(inverted)
        to_tensor = transforms.ToTensor()
        normalize = transforms.Normalize((0.5), (0.5))
        self.transform = transforms.Compose([to_tensor, normalize])
        # page-level binary labels: [word-count > 500, inputs > 0]
        row = pages.loc[pages['source'] == source, ['word-count','inputs']].values[0]
        many_words, has_inputs = row[0] > 500, row[1] > 0
        self.labels = np.array([many_words, has_inputs]).astype(int)

    def __len__(self):
        # the dataset is synthetic: its size is whatever the caller asked for
        return self.max_samples

    def random_viewport(self):
        """Draw a random `(center, rotation, zoom)` triple for the renderer."""
        # pan: scale the page center by a random factor in [0.5, 1.5) per axis
        pan = 0.5 + np.random.rand(2)
        center = (np.array(self.view.space.center) * pan).astype(int)
        # rotation: integer drawn from [0, 360)
        rotation = np.random.randint(0, 360)
        # zoom in [-3.5, -2.5): keep sufficient field in view to hint layout
        zoom = np.random.rand() - 3.5
        return center, rotation, zoom

    def __getitem__(self, idx):
        # `idx` is ignored: each access renders a fresh random view-port
        center, rotation, zoom = self.random_viewport()
        rendered = self.view.set_state(center, rotation, zoom)
        X = self.transform(rendered.astype(np.float32))
        return X, self.labels


# show example: a single random view-port of one fixed page
sample = 'cnd-5000-s5.fr-5'  # or: np.random.choice(samples)
viewport_set = PageViewPortDataset(sample, max_samples=1)
loader = DataLoader(viewport_set, batch_size=1, shuffle=False)
for X, Y in loader:
    print(f'source: {sample}\nbatch:  X:{X.shape}  Y:{Y.shape}')
    # the batch holds exactly one item
    i = 0
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.imshow(X[i,:].squeeze(), 'gray')
    ax.axis('off')
    # title shows the label pair of the page
    ax.set_title(list(Y[i,:].numpy()))
    plt.show()
        

In [None]:
class PageViewDataset(Dataset):
    """
    render pages top-view

    Item `idx` is the normalized full-page rendering of `samples[idx]` together
    with a pair of binary labels `[word-count > 700, inputs > 0]` taken from the
    `pages` row of that same source.
    """
    def __init__(self, samples: list):
        self.samples = samples
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5), (0.5))])
        # labels must follow the order of `samples` (not the row order of `pages`),
        # otherwise a shuffled sample list pairs images with foreign labels
        self.labels = self.ordered_labels(pages, samples)

    @staticmethod
    def ordered_labels(table, sources, columns=('word-count', 'inputs')):
        """
        Return the `columns` values of `table`, one row per entry of `sources`
        and in the same order.

        Only the first row is used when a source appears more than once in
        `table`; a source missing from `table` raises `KeyError`.
        """
        lookup = table.drop_duplicates('source').set_index('source')
        return lookup.loc[list(sources), list(columns)].values

    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        source = self.samples[idx]
        # load source image as grayscale
        view = np.array(ImageOps.grayscale(Image.open(f'./data/images/{source}.png')))
        # render the inverted page as a full-page view
        view = render.AgentView((255. - view).astype(np.uint8)).top()
        X = self.transform(view.astype(np.float32))
        labels = self.labels[idx,:]
        # NOTE(review): PageViewPortDataset uses a 500-word threshold - confirm 700 is intended here
        Y = np.array([labels[0] > 700, labels[1] > 0]).astype(int)
        return X, Y


# show one batch
loader = DataLoader(PageViewDataset(samples), batch_size=4, shuffle=False)
for X, Y in loader:
    # shuffling is off, so the first batch holds the leading entries of `samples`;
    # it can be shorter than the batch size when few samples exist
    names = samples[:X.shape[0]]
    print(f'sources: {names}\nbatch:  X:{X.shape}  Y:{Y.shape}')
    for i, name in enumerate(names):
        fig, ax = plt.subplots(figsize=(4, 4))
        ax.imshow(X[i,:].squeeze(), 'gray')
        ax.axis('off')
        # title: the page actually shown and its label pair
        ax.set_title(f'{name}: {list(Y[i,:].numpy())}')
        plt.show()
    # only the first batch is displayed
    break
        

In [None]:
# the four combinations of the two binary page labels; the position is the class id
classes = [(0,0), (0,1), (1,0), (1,1)]
X, Y = [], []
# random subset of pages without repetition, capped by the number of available samples
test = np.random.choice(samples, min(1024, len(samples)), replace=False)
for inputs, labels in DataLoader(PageViewDataset(test), batch_size=8, shuffle=False):
    # walk the rows actually present: the last batch may be shorter than 8
    for image, pair in zip(inputs, labels):
        # flatten each page view into one feature vector
        X.append(image.squeeze().numpy().flatten())
        # encode the label pair as its position in `classes`
        Y.append(classes.index(tuple(pair.tolist())))
        

In [None]:
# project the flattened page views onto two discriminant axes
lda = LDA(n_components=2)
T = lda.fit_transform(X, Y)

# per-class median of the projected points
targets = np.array(Y)
centers = np.array([np.median(T[targets == k], axis=0) for k in range(4)])


fig, ax = plt.subplots(figsize=(6, 6))
cmap = colormaps['rainbow']
# class ids are rescaled to [0, 1] to pick colors from the colormap
top = len(classes) - 1
ax.scatter(T[:,0], T[:,1], c=targets/top, cmap=cmap, s=5, alpha=0.3)
for k, (pair, center) in enumerate(zip(classes, centers)):
    # a star marks the median of each class
    ax.scatter(center[0], center[1], color=cmap(k/top),
               s=75, marker='*', edgecolor='black', label=pair)
ax.set_title('LDA separated clusters')
ax.legend()
plt.show()

In [None]:
# projected coordinates and class id of every page in the fitted subset
df = pd.DataFrame(T, columns=['x1','x2'], index=test).assign(label=Y)
df

In [None]:
# project every page that was NOT used to fit the LDA
data = []
# set membership keeps the filter linear in the number of samples
fitted = set(test)
sources = [s for s in samples if s not in fitted]
batch_size = 64
batches = DataLoader(PageViewDataset(sources), batch_size=batch_size, shuffle=False)
for b, (inputs, labels) in enumerate(batches):
    X = [image.squeeze().numpy().flatten() for image in inputs]
    Y = [classes.index(tuple(pair.tolist())) for pair in labels]
    # shuffling is off, so batch `b` covers this contiguous slice of `sources`
    # (the last batch may be shorter than `batch_size`)
    start = b * batch_size
    D = pd.DataFrame(lda.transform(X), columns=['x1','x2'],
                     index=sources[start:start + len(X)])
    D['label'] = Y
    data.append(D)

# drop rows that repeat the same (x1, x2, label) values
data = pd.concat(data).drop_duplicates()
# True when no projected page was dropped as a duplicate
len(data) == len(sources)

In [None]:
# scatter of the projected pages, with the per-class medians computed earlier
fig, ax = plt.subplots(figsize=(6, 6))
top = len(classes) - 1
# the first three columns of `data`, by position
first, second, third = (data.iloc[:, col] for col in range(3))
ax.scatter(first, second, c=third/top, cmap=cmap, s=1, alpha=0.3)
for k, (pair, center) in enumerate(zip(classes, centers)):
    ax.scatter(center[0], center[1], color=cmap(k/top),
               s=100, marker='*', edgecolor='black', label=pair)
ax.set_title('LDA separated clusters')
# fixed window on both axes
ax.set_xlim([-20, 20])
ax.set_ylim([-20, 20])
ax.legend()
plt.show()

In [None]:
# replace each class id with its label pair from `classes`
data['label'] = data['label'].map(lambda k: classes[k])

In [None]:
def _mask_shares(source):
    # NOTE(review): `ORDER` is not defined in the cells visible here; the mask
    # pixel values presumably index into it - confirm
    mask = np.array(Image.open(f'data/masks/{source}'))
    # one-hot encode the pixel values and drop channel 0
    onehot = np.eye(len(ORDER))[mask]
    counts = (onehot[:,:,1:] > 0).astype(int).sum(axis=(0, 1))
    # shares of the remaining channels (they add up to 1)
    return list(counts/np.sum(counts))


# one row of channel shares per sample page
data = np.array([_mask_shares(source) for source in samples])

In [None]:
# pairwise scatter of the first three share columns; the third one sets the color
# NOTE(review): if `data` was built with channel 0 dropped, column i may correspond
# to ORDER[i + 1] rather than ORDER[i] - confirm the axis names
fig, ax = plt.subplots(1, 3, figsize=(10, 3))
for i in range(3):
    # cycle through the columns: (x, y, color) = (i, i+1, i+2) modulo 3
    j, c = (i + 1) % 3, (i + 2) % 3
    panel = ax[i]
    panel.scatter(data[:,i], data[:,j], c=data[:,c], cmap='rainbow', s=3, alpha=0.3)
    panel.set_xlabel(ORDER[i].upper())
    panel.set_ylabel(ORDER[j].upper())
    panel.set_xticks([])
    panel.set_yticks([])
    panel.set_title(f'Color: {ORDER[c].upper()}', fontsize=10)
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# standardize the features, then fit a two-component PCA on the scaled data
norm = StandardScaler()
norm.fit(data)
pca = PCA(n_components=2).fit(norm.transform(data))
# share of variance captured by each component
pca.explained_variance_ratio_

In [None]:
# project the standardized features onto the two fitted principal components
Y = pca.transform(norm.transform(data))

fig, ax = plt.subplots(1, 3, figsize=(10, 3))
# top two components colored by feature value: the PCA keeps only two
# components, so every panel shows the same projection and only the
# coloring feature changes
for c, panel in zip(range(data.shape[1]), ax):
    panel.scatter(Y[:,0], Y[:,1], c=data[:,c], cmap='rainbow', s=3, alpha=0.1)
    panel.set_xlabel('PC1')
    panel.set_ylabel('PC2')
    panel.set_xticks([])
    panel.set_yticks([])
    panel.set_title(f'Color: feature {c}', fontsize=10)
plt.show()