# Doc-Indexing Pipeline


* Data
* Preprocessing and normalization
* Phrase/Pattern search with `ElasticSearch`

    * [Experiment with `ELSER`]()
    * [Experiment with `shape-queries`]()
    * Insert format
    * Text-blocks bulk-insert
    * Input-blocks bulk-insert
    * Queries
    
* [Semantic search](#qdrant) with `Qdrant`
    * Doc-titles bulk insert
    * Sentences bulk-insert
    * Queries

* Indexing images
    * As layout-blocks and text-content if any
    * For `logos` recognition


In [1]:
import re
import os
import json
import numpy as np
import pandas as pd
import pytesseract as ts

from time import time
from pathlib import Path
from PIL import Image, ImageOps
from unidecode import unidecode
from torch.cuda import is_available
from elasticsearch import Elasticsearch
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from sklearn.manifold import TSNE
from collections import Counter
from matplotlib import pyplot as plt
from sentence_transformers import SentenceTransformer, util

## Data

In [3]:
# doc-level lookup table
docs = pd.read_csv('./data/forms.csv.gz')
docs = docs.loc[docs['lang'].isin(['en','fr','sp'])].fillna('')
docs['taxonomy'] = docs.apply(lambda r:f"{r['type']}{r['sub']}".strip().upper(), axis=1)

In [4]:
BOX = ['left','top','right','bottom']

In [9]:
#SCALE = 1000 # for integer ranges version

pages = pd.read_csv('./data/pages.csv.gz')
mean = pages.mean(numeric_only=True)
DW, DH, DS = np.round(mean.loc[['word-width','word-height','space']], 4)
#DW, DH, DS = np.round(mean.loc[['word-width','word-height','space']] * SCALE).astype(int)
DW, DH, DS

(0.0279, 0.0196, 0.0028)

We are going to index document pages textual content, layout and form-input blocks.

In [10]:
def filter_text(data):
    """
    drop `code` (xml etc.)
    """
    data.loc[:,'text'] = data['text'].fillna('').astype(str).str.strip()
    return data.loc[~data['text'].str.startswith('<')]


In [11]:
def normalize_bbox(data):
    """
    remove noise in bbox coordinates:
    for data extracted with 200 dpi -- we only need 4 decimals
    """
    #data = data[data[BOX].applymap(lambda x: isinstance(x, (int, float)))]
    #data = data.loc[(data['right'] > data['left'])&(data['bottom'] > data['top'])]
    data.loc[:,BOX] = np.round(data[BOX], 4)
    #data.loc[:,columns] = np.round(data.loc[:,box] * SCALE).astype(int)
    return data
    

### Text normalization and preprocessing

In [13]:
def normalize_text(text: str):
    """
    normalize whitespace and unicode, keep casing and punctuation
    remove `......` and `. . . . .` from anywhere
    strip some non-word characters ▶ from the start/end
    """
    text = re.sub(r'\s+', ' ', unidecode(text).replace('*', '')).strip()
    text = re.sub(r'\.{2,}', '', text)
    text = re.sub(r'\s[\s.]{3,}\s', ' ', text)
    
    # remove hyphen between digit and letter (for taxonomy pattern matching)
    text = re.sub(r'(?<=[A-Z])-(?=\d)', '', re.sub(r'(?<=\d)-(?=[A-Z])', '', text))
    
    return unidecode(re.sub('^[^a-zA-Z0-9\(\$]*|[^a-zA-Z0-9\)]*$', '', text)).strip(' .')


    # sample outcome
    for d in docs.sample().to_dict('records'):    
        files = pages[pages['doc']==d['file']]['source'].to_list()
        for source in files:
            data = data_filter(pd.read_csv(f'data/info/{source}.csv.gz'))
            print(d['taxonomy'])
            print('--------------------------------------------------------------------------')
            for d in data.to_dict('records'):
                print(str(d['text']).strip())
                print('--------------------------------------------------------------------------')
                print(normalize(str(d['text'])))
                print('==========================================================================')
                

<a name="elastic"></a>
<img src="assets/elasticsearch.png"
     style="display:inline;float:left;vertical-aligh:middle;margin-right:15px"/>

## Phrase/Pattern search with `ElasticSearch`

In this project we need the way to search for exact and fuzzy match in a spacial layout context. We index both `word` and `block` levels along with the relative coordinates on the page. The `word` blocks allow us to run a fuzzy match for the alpha-numeric patterns (taxonomy markers); the `block` may help with search for input labels/instructions.

In [15]:
eclient = Elasticsearch(
    hosts=[os.environ['ELASTIC_URI']],
    basic_auth=('elastic', os.environ['ELASTIC_PASSWORD']),
    verify_certs=False
)

In [16]:
eclient.indices.delete(index='doc-pages')

ObjectApiResponse({'acknowledged': True})

In [17]:
INDEX = 'doc-pages'

SETTINGS = {
    "analysis": {
        "analyzer": {
            "custom_analyzer": {
                "type": "custom",
                "tokenizer": "whitespace",
                "filter": ["lowercase"]
            }
        },
    }
}

MAPPINGS = {
    "properties": {
        # ELSER ml setup
        #"ml.tokens": {
        #    "type": "rank_features" 
        #},
        "content": {
            "type": "text",
            "analyzer": "custom_analyzer"
        },
        "block_type": {
            "type": "keyword"
        },
        "font_size": {
            "type": "byte"
        },
        "display": {
            "type": "keyword"
        },
        "page_id": {
            "type": "keyword"
        },
        "doc_id": {
            "type": "keyword"
        },
        "taxonomy_id": {
            "type": "keyword"
        },
        "orig": {
            "type": "keyword"
        },
        "lang": {
            "type": "keyword"
        },
        # for range-queries experiment: short vs. float?
        "left": {
            "type": "float"
        },
        "top": {
            "type": "float"
        },
        "right": {
            "type": "float"
        },
        "bottom": {
            "type": "float"
        },
        # for shape-queries experiment
        #"box": {
        #    "type": "shape"
        #},
    }
}

In [18]:
if not eclient.indices.exists(index=INDEX):
    eclient.indices.create(index=INDEX, settings=SETTINGS, mappings=MAPPINGS)
print(eclient.indices.exists(index=INDEX))

True


#### [Experiment with `ELSER`](https://www.elastic.co/blog/may-2023-launch-information-retrieval-elasticsearch-ai-model)

Create the default ingestion pipeline to enable [ELSER](https://www.elastic.co/guide/en/machine-learning/8.8/ml-nlp-elser.html#download-deploy-elser) (f have access):
    
    curl -XPUT $ELASTIC_URI/_ingest/pipeline/form_blanks_pipeline \
    -H "Content-Type: application/json" \
    -d '{
        "description": "Indexting text-bloks extracted from pdf-form-blanks for ELSER",
        "processors": [
            {
                "inference": {
                    "model_id": ".elser_model_1",
                    "target_field": "ml",
                    "field_map": {
                        "text": "text_field"
                    },
                    "inference_config": {
                        "text_expansion": { 
                            "results_field": "tokens"
                        }
                    }
                }
            }
        ]
    }'

    curl -XPUT $ELASTIC_URI/documents \
    -H "Content-Type: application/json" \
    -d '{
        "settings": {
            "index.default_pipeline": "form_blanks_pipeline"
        }
    }'


Inference:    
    
    curl -GET $ELASTIC_URI/documents/_search
    -H "Content-Type: application/json" \
    -d '{
       "query": {
            "text_expansion": {
                 "ml.tokens": {
                     "model_id":".elser_model_1",
                     "model_text":" ... "
                 }
            }
        }
    }'
    
#### Experiment with [`shape-queries`](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-shape-query.html)
The `shape` type vs. `geo_shape` type allows arbitrary values for coordinates (instead of `lat,lon` ranges). We have a relative scale (min width and height units) and mostly interested in `distance` and `within` relations.

In [19]:
def bbox_shape(d: dict):
    """
    Convert bounding box (%scale) into a polygon to use geo-spacial queries
    WKT specification expects the following order: minX, maxX, maxY, minY
    """
    l, t, r, b = np.array([d['left'], d['top'], d['right'], d['bottom']]).astype(float) * 100
    return f"BBOX ({l:.2f}, {r:.2f}, {b:.2f}, {t:.2f})"


bbox_shape({'left':0.1, 'top':0.1,'right':0.9,'bottom':0.9})

'BBOX (10.00, 90.00, 90.00, 10.00)'

#### Insert format

In [20]:
def parse_info(data, doc) -> dict:    
    display = { 0:'h', 1:'v' } # horizontal and vertical
    return {
        'content': normalize_text(str(data['text'])),
        'block_type': data['block-type'],
        
        # useful for identifying page title
        'font_size': int(data['font-size']) if data['font-size'] > 0 else None,
        
        # sin and cos define text-line orientataion:
        # could be different from the main content -- set flag -- h[orizontal] v[ertical] d[iagonal]
        'display': '' if np.isnan(data['sin']) else display.get(data['sin'], 'd'),
        
        # layout
        #'box': bbox_shape(data),        
        'left': data['left'],
        'top': data['top'],
        'right': data['right'],
        'bottom': data['bottom'],
        
        # page props
        'page_id': data['page'],
        'doc_id': doc['file'],
        'taxonomy_id': doc['taxonomy'],
        'taxonomy_ext': doc['ext'],
        'lang': doc['lang'],
        'orig': doc['orig'],
    }


In [21]:
# get stats on preprocessing
count, seq, char, skip, mashed = [],[],[],[],[]
for doc in docs.to_dict('records'):    
    files = pages[pages['file']==doc['file']]['source'].to_list()
    for source in files:
        data = filter_text(pd.read_csv(f'data/info/{source}.csv.gz'))
        count.append(len(data))
        if len(data) == 0:
            skip.append(source)
            continue
        for d in data.to_dict('records'):
            record = parse_info(d, doc)
            text = record['content']
            seq.append(len(text))
            # check for too long words unless urls
            longest_word = max([x for x in text.split() if x.find('/') == -1], key=len, default='')
            char.append(len(longest_word))
            if char[-1] > 50:
                #print(source, '===', longest_word)
                mashed.append(source)
        
print(f'empty pages: {len(skip)}  no-whitespace: {len(set(mashed))}')

empty pages: 0  no-whitespace: 230


In [22]:
mashed = [x for x in set(mashed) if x.startswith('que-')]

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(10, 3))
X = sorted(Counter(count).items(), key=lambda x:x[0])
ax[0].scatter([x[0] for x in X],[x[1] + 1 for x in X], s=3)
ax[0].axvline(x=1000, linestyle=':', color='C3')
ax[0].set_yscale('log')
ax[0].set_title('estimate batch-size')
X = sorted(Counter(seq).items(), key=lambda x:x[0])
ax[1].scatter([x[0] for x in X],[x[1] + 1 for x in X], s=3)
ax[1].set_yscale('log')
ax[1].set_title('text length dist')
X = sorted(Counter(char).items(), key=lambda x:x[0])
ax[2].scatter([x[0] for x in X],[x[1] + 1 for x in X], s=3)
ax[2].axvline(x=50, linestyle=':', color='C3')
ax[2].set_yscale('log')
ax[2].set_title('word length dist')
plt.show()

In [24]:
def data_to_ndjson(data, doc, parse) -> list:
    """
    Generator for text-blocks data ingest
    """
    for d in data.to_dict('records'):
        record = parse(d, doc)
        if record['content'] == '': # skip empty
            continue
        taxonomy_ext = re.sub(r'\W+', '', doc['ext']).upper()
        ID = f"{doc['file']}-{d['page']}-{d['block']}".upper()
        yield json.dumps({'index':{'_index':INDEX,'_id':ID}})
        yield json.dumps({ x:record[x] for x in MAPPINGS['properties'] if not x.startswith('ml.') })


#### Text-blocks bulk-insert

In [25]:
start = time()
errors, doc_count, page_count = 0, 0, 0
for doc in docs.to_dict('records'):    
    files = pages[pages['file']==doc['file']]['source'].to_list()
    for source in files:
        if source in skip + mashed:
            continue
            
        data = filter_text(pd.read_csv(f'data/info/{source}.csv.gz'))
        # we will process images separately
        data = data.loc[data['block-type']!='image']
        if len(data) == 0:
            continue
        
        data = normalize_bbox(data)        
        # include trailing space to link tailing word
        data.loc[data['block-type']=='word','right'] += DS
        data.loc[data['block-type']=='word','left'] -= DS
        
        data.index.name = 'block'
        data = data.reset_index()        
        data['block'] = data.apply(lambda r:f"{r['block-type'][0].upper()}{r['block']}", axis=1)
        inserts = data_to_ndjson(data, doc, parse_info)
        result = eclient.bulk(index=INDEX, operations=inserts)
        if result['errors']:
            pass #print(result) # few blocks have invalid bounding box
        errors += result['errors']
        page_count += 1
    doc_count += 1
    print(f'done: {doc_count/len(docs):.2%}', end='\r')
        
print(f'execution time: {(time() - start)/60:.2f}min  errors in {errors}/{errors/page_count:.2%} pages')

execution time: 65.70min  errors in 0/0.00% pages


In [26]:
#eclient.search(index=INDEX, query={'match_all': {}})['hits']['hits']

In [27]:
def get_page_content(doc, page, size=1000):
    query = {'bool': {'must': [{'match': {'doc_id': doc }}, {'match': {'page_id': page }}]}}
    sort = [{'top': {'order': 'asc'}}, {'left': {'order': 'asc'}}]
    hits = eclient.search(index=INDEX, query=query, sort=sort, size=size)
    return [hit['_source'] for hit in hits['hits']['hits']]


#### Input-blocks bulk-insert

In [28]:
# pages with inputs
forms = [str(x) for x in Path(f'./data/inputs').glob('*.csv.gz')]

In [29]:
def filter_inputs(doc, inputs):
    """
    remove hoidden pdf-utils, keep only user-filled
    """
    # filter out pdf-utils
    inputs = inputs.loc[inputs['field_type_string']!='Button']
    hidden = inputs.loc[(inputs['field_type_string']=='Text')&(inputs['right'] - inputs['left'] < DH)]
    inputs = inputs.loc[~inputs.index.isin(hidden.index)]
    if len(inputs) == 0:
        return inputs
    # get content from page (already indexed)
    data = get_page_content(doc['file'], inputs.iloc[0]['page'], size=10000)
    if len(data) == 0:
        return inputs
    
    # auxiliary fields -- value comes in when other fields got filled:
    # build low-res word-presence map to detect overlap easily
    data = pd.DataFrame.from_dict(data)[['content','block_type'] + BOX]
    M = np.round(data[data['block_type']=='word'].loc[:,BOX] * 100).astype(int)
    if len(M) == 0:
        return inputs
    W, H = M[['right','bottom']].max().astype(int)
    matrix = np.zeros((H, W))
    for d in M.to_dict('records'):
        matrix[int(d['top']) + 1:int(d['bottom']), int(d['left']) + 1:int(d['right'])] = 1
    nested = []
    test = np.round(inputs.loc[:,BOX] * 100).astype(int)
    for i in inputs.index: # check if input space is already occupied
        l, t, r, b = test.loc[i,:].values
        if np.any(matrix[int(t) + 1:int(b), int(l) + 1:int(r)]):
            nested.append(i)
    # filter-out nested
    return inputs.loc[~inputs.index.isin(nested)]
    

In [30]:
def parse_inputs(data, doc) -> dict:
    return {
        'content': f"{data['field_type_string']} NAME: {data['field_name']} LABEL: {data['field_label']}",
        'block_type': 'input',
        
        # hints for visual
        'font_size': int(data['text_fontsize']) if data['text_fontsize'] > 0 else None,
        # display=0 seems to be the one we need, others may be conditional or hidden utils
        'display': '' if np.isnan(data['field_display']) else str(data['field_display']),
        
        # layout
        #'box': bbox_shape(data),
        'left': data['left'],
        'top': data['top'],
        'right': data['right'],
        'bottom': data['bottom'],
        
        # page props
        'page_id': data['page'],
        'doc_id': doc['file'],
        'taxonomy_id': doc['taxonomy'],
        'taxonomy_ext': doc['ext'],
        'lang': doc['lang'],
        'orig': doc['orig'],
    }


In [31]:
start = time()
errors, count = 0, 0
for file in forms:
    doc = '-'.join(file.split('/').pop().split('-')[:-1])
    doc = docs.loc[docs['file']==doc]
    if len(doc) == 0:
        continue
    doc = doc.to_dict('records')[0]
    if not doc['lang'] in ['en','fr','sp']:
        continue
        
    inputs = pd.read_csv(file)
    inputs = inputs.loc[(~inputs['field_type_string'].isna())&(~inputs['page'].isna())]
    inputs.loc[:,['field_name','field_label']] = inputs[['field_name','field_label']].fillna('')    
    # remove pdf-doc-tree: form1[0].Page1[0].Name_subform[0].TaxYear_group[0].TaxYearDate[0] -> TaxYearDate
    inputs['field_name'] = inputs['field_name'].apply(lambda x:x.split('].').pop().split('[')[0]).to_list()
    
    inputs = normalize_bbox(inputs)
    inputs = filter_inputs(doc, inputs)
    if len(inputs) == 0:
        continue

    inputs.index = inputs.index.map(lambda x:f'I{x}') # mark as input-block
    inputs.index.name = 'block'
    inputs = inputs.reset_index()
    inserts = data_to_ndjson(inputs, doc, parse_inputs)
    result = eclient.bulk(index=INDEX, operations=inserts)
    if result['errors']:
        pass #print(result)
    errors += result['errors']
    count += 1
    print(f'done: {count/len(forms):.2%}', end='\r')
        
print(f'execution time: {(time() - start)/60:.2f}min  errors in {errors}/{errors/count:.2%} pages')

execution time: 20.93min  errors in 0/0.00% pages


### Queries

In [32]:
start = time()
results = []
for d in docs.to_dict('records'):
    pattern = d['file'][5:] if d['file'][:5] == 'irs-f' else d['file'][4:]
    query = {'bool': {'must': [{'match': {'taxonomy_id': d['taxonomy']}}],
             'filter': [{'fuzzy': {'content': {'value': d['taxonomy'], 'fuzziness': 2 }}}] }}
    result = eclient.search(index=INDEX, query=query, size=10)
    results.append(len(result['hits']['hits']))

print(f'Taxonomy pattern fuzzy search success: {sum(np.array(results) > 0)/len(docs):.2%}  '
      f'avg. execution time: {(time() - start)/len(docs):.4f}sec')

Taxonomy pattern fuzzy search success: 72.50%  avg. execution time: 0.0143sec


In [33]:
start = time()
results = []
for d in docs.to_dict('records'):
    pattern = d['file'][5:] if d['file'][:5] == 'irs-f' else d['file'][4:]
    query = {'bool': {'must': [{'match': {'taxonomy_id': d['taxonomy']}}],
             'filter': [{'match_phrase': {'content': {'query': d['taxonomy'], 'slop': 10 }}}] }}
    result = eclient.search(index=INDEX, query=query, size=10)
    results.append(len(result['hits']['hits']))


print(f'Taxonomy pattern exact match success: {sum(np.array(results) > 0)/len(docs):.2%}  '
      f'avg. execution time: {(time() - start)/len(docs):.3f}sec')

Taxonomy pattern exact match success: 76.31%  avg. execution time: 0.005sec


In [34]:
start = time()
results = []
for d in docs.to_dict('records'):
    pattern = d['file'][5:] if d['file'][:5] == 'irs-f' else d['file'][4:]
    query = {'bool': {'must': [{'match': {'taxonomy_id': d['taxonomy']}}],
             'filter': [
                 {'match_phrase': {'content': {'query': d['taxonomy'], 'slop': 10 }}},
                 {'range': {'bottom': {'lt': 0.2 }}},
             ] }}
    result = eclient.search(index=INDEX, query=query, size=10)
    results.append(len(result['hits']['hits']))

print(f'Taxonomy pattern in the header area exact match success: {sum(np.array(results) > 0)/len(docs):.2%}  '
      f'avg. execution time: {(time() - start)/len(docs):.4f}sec')

Taxonomy pattern in the header area exact match success: 47.41%  avg. execution time: 0.0050sec


In [35]:
start = time()
results = []
for d in docs.to_dict('records'):
    pattern = d['file'][5:] if d['file'][:5] == 'irs-f' else d['file'][4:]
    query = {'bool': {'must': [{'match': {'taxonomy_id': d['taxonomy']}}],
             'filter': [
                 {'match_phrase': {'content': {'query': d['taxonomy'], 'slop': 10 }}},
                 {'range': {'top': {'gt': 0.8 }}},
             ] }}
    result = eclient.search(index=INDEX, query=query, size=10)
    results.append(len(result['hits']['hits']))

print(f'Taxonomy pattern in the footer area exact match success: {sum(np.array(results) > 0)/len(docs):.2%}  '
      f'avg. execution time: {(time() - start)/len(docs):.4f}sec')

Taxonomy pattern in the footer area exact match success: 67.58%  avg. execution time: 0.0049sec


In [36]:
def get_docs(taxonomy):
    """
    retrieve all docs of the type
    """
    query = {'bool': {'must': [{'match': {'taxonomy_id': taxonomy}}, {'match': {'block_type': 'input'}}]}}
    aggs = {'docs': {'terms': {'field': 'doc_id'}}}
    return eclient.search(index=INDEX, query=query, aggs=aggs)['aggregations']['docs']


In [37]:
get_docs('W2')['buckets']

[{'key': 'irs-fw2c', 'doc_count': 512},
 {'key': 'irs-fw2', 'doc_count': 272},
 {'key': 'irs-fw2_21', 'doc_count': 230},
 {'key': 'irs-fw2g', 'doc_count': 159},
 {'key': 'irs-fw2as_21', 'doc_count': 158},
 {'key': 'irs-fw2gu_21', 'doc_count': 158},
 {'key': 'irs-fw2vi_21', 'doc_count': 158},
 {'key': 'irs-fw2as', 'doc_count': 126},
 {'key': 'irs-fw2gu', 'doc_count': 126},
 {'key': 'irs-fw2vi', 'doc_count': 126}]

In [38]:
def find_inputs(doc, input_type=None, size=100):
    """
    find all the inputs (of the type if specified) in the document
    """
    must = [{'match': {'doc_id': doc}}, {'match': {'block_type': 'input'}}]
    if input_type is not None:
        must.append({'match_phrase': {'content': {'query': input_type, 'slop': 5 }}})
    query = {'bool': {'must': must }}
    sort = [{'page_id': {'order': 'asc'}}, {'top': {'order': 'asc'}}, {'left': {'order': 'asc'}}]
    return eclient.search(index=INDEX, query=query, sort=sort, size=size)['hits']['hits']


In [39]:
inputs = find_inputs('irs-fw2', input_type='CheckBox', size=1000)
for hit in inputs:
    print(f"Page: {hit['_source']['page_id']:<3} Inpit-ID: {hit['_id']:<15} {hit['_source']['content']}")

Page: 1   Inpit-ID: IRS-FW2-1-I0    CheckBox NAME: c1_1 LABEL: 
Page: 1   Inpit-ID: IRS-FW2-1-I28   CheckBox NAME: c1_2 LABEL: 
Page: 1   Inpit-ID: IRS-FW2-1-I29   CheckBox NAME: c1_3 LABEL: 
Page: 1   Inpit-ID: IRS-FW2-1-I30   CheckBox NAME: c1_4 LABEL: 
Page: 2   Inpit-ID: IRS-FW2-2-I27   CheckBox NAME: c2_2 LABEL: 
Page: 2   Inpit-ID: IRS-FW2-2-I28   CheckBox NAME: c2_3 LABEL: 
Page: 2   Inpit-ID: IRS-FW2-2-I29   CheckBox NAME: c2_4 LABEL: 
Page: 3   Inpit-ID: IRS-FW2-3-I27   CheckBox NAME: c2_2 LABEL: 
Page: 3   Inpit-ID: IRS-FW2-3-I28   CheckBox NAME: c2_3 LABEL: 
Page: 3   Inpit-ID: IRS-FW2-3-I29   CheckBox NAME: c2_4 LABEL: 
Page: 5   Inpit-ID: IRS-FW2-5-I27   CheckBox NAME: c2_2 LABEL: 
Page: 5   Inpit-ID: IRS-FW2-5-I28   CheckBox NAME: c2_3 LABEL: 
Page: 5   Inpit-ID: IRS-FW2-5-I29   CheckBox NAME: c2_4 LABEL: 
Page: 7   Inpit-ID: IRS-FW2-7-I27   CheckBox NAME: c2_2 LABEL: 
Page: 7   Inpit-ID: IRS-FW2-7-I28   CheckBox NAME: c2_3 LABEL: 
Page: 7   Inpit-ID: IRS-FW2-7-I29   Chec

In [40]:
def find_parent_block(input_data, size=100):
    """
    get a text-block which contains this input
    """
    query = {
        'bool': {
            'must': [{'match': {'doc_id': input_data['_source']['doc_id'] }},
                     {'match': {'page_id': input_data['_source']['page_id'] }},
                     {'match': {'block_type': 'block'}}],
            #'filter': [{'shape': {'box': {'shape': hit['_source']['box'], 'relation': 'contains'}}}]            
            'filter': [{'range': {'top': {'lte': input_data['_source']['top']}}},
                       {'range': {'bottom': {'gte': input_data['_source']['bottom']}}},
                       {'range': {'left': {'lte': input_data['_source']['left']}}},
                       {'range': {'right': {'gte': input_data['_source']['right']}}}]
        }}
    sort = [{'top': {'order': 'asc'}}, {'left': {'order': 'asc'}}]
    return eclient.search(index=INDEX, query=query, sort=sort, size=size)['hits']['hits']


In [41]:
for hit in inputs[:4]:
    print(f"Top: {hit['_source']['top']:.4f} Left: {hit['_source']['left']:.4f}  ID: {hit['_id']}")
    print('----------------------------------------------------')
    for x in find_parent_block(hit):
        print(f"Top: {x['_source']['top']:.4f} Left: {x['_source']['left']:.4f}  Text: {x['_source']['content']}")
    print('====================================================')

Top: 0.0686 Left: 0.2165  ID: IRS-FW2-1-I0
----------------------------------------------------
Top: 0.3534 Left: 0.5696  ID: IRS-FW2-1-I28
----------------------------------------------------
Top: 0.2960 Left: 0.5477  Text: 11 Nonqualified plans 12a See instructions for box 12 C o ed 12b C o ed 12c C o ed 12d C o d e
Top: 0.3534 Left: 0.6284  ID: IRS-FW2-1-I29
----------------------------------------------------
Top: 0.2960 Left: 0.5477  Text: 11 Nonqualified plans 12a See instructions for box 12 C o ed 12b C o ed 12c C o ed 12d C o d e
Top: 0.3534 Left: 0.6873  ID: IRS-FW2-1-I30
----------------------------------------------------
Top: 0.2960 Left: 0.5477  Text: 11 Nonqualified plans 12a See instructions for box 12 C o ed 12b C o ed 12c C o ed 12d C o d e


In [42]:
def find_neighbor_words(input_data, size=3):
    """
    get text-blocks which are at the same level of above:
    inputs usually on the same line or right under
    """
    query = {
        'bool': {
            'must': [{'match': {'doc_id': input_data['_source']['doc_id'] }},
                     {'match': {'page_id': input_data['_source']['page_id'] }},
                     {'match': {'block_type': 'word'}}],
            # define a bigger bounding box: to fit-in as +line up/down and +word left/right
            'filter': [{'range': {'top': {'gte': input_data['_source']['top'] - DH}}},
                       {'range': {'bottom': {'lte': input_data['_source']['bottom'] + DH}}},
                       {'range': {'left': {'gte': input_data['_source']['left'] - DW}}},
                       {'range': {'right': {'lte': input_data['_source']['right'] + DW}}}]
        }}
    sort = [{'top': {'order': 'asc'}}, {'left': {'order': 'asc'}}]
    return eclient.search(index=INDEX, query=query, sort=sort, size=size)['hits']['hits']


for hit in inputs[:4]:
    print(f"Top: {hit['_source']['top']:.4f} Left: {hit['_source']['left']:.4f}  ID: {hit['_id']}")
    print('----------------------------------------------------')
    for x in find_neighbor_words(hit, size=10):
        print(f"Top: {x['_source']['top']:.4f} Left: {x['_source']['left']:.4f}  Text: {x['_source']['content']}")
    print('====================================================')

Top: 0.0686 Left: 0.2165  ID: IRS-FW2-1-I0
----------------------------------------------------
Top: 0.3534 Left: 0.5696  ID: IRS-FW2-1-I28
----------------------------------------------------
Top: 0.3347 Left: 0.5668  Text: Statutory
Top: 0.3353 Left: 0.5449  Text: 13
Top: 0.3428 Left: 0.5668  Text: employee
Top: 0.3745 Left: 0.5449  Text: 14
Top: 0.3747 Left: 0.5640  Text: Other
Top: 0.3534 Left: 0.6284  ID: IRS-FW2-1-I29
----------------------------------------------------
Top: 0.3347 Left: 0.6256  Text: Retirement
Top: 0.3428 Left: 0.6256  Text: plan
Top: 0.3534 Left: 0.6873  ID: IRS-FW2-1-I30
----------------------------------------------------
Top: 0.3347 Left: 0.6845  Text: Third-party
Top: 0.3428 Left: 0.6845  Text: sick
Top: 0.3428 Left: 0.7013  Text: pay


<a name="qdrant"></a>
<img src="assets/qdrant.png"
     style="display:inline;float:left;vertical-aligh:middle;margin-right:15px"/>
     
## Semantic search with `Qdrant`
We split the textual content into sentences wherever possible (titles, instructional pages) and vectorize them.
The sentence length and the sequence length thresholds derived from the stats.

In [43]:
def split_sentences(data):
    return data['text'].apply(normalize_text).apply(lambda x:x.split('. ')).explode()
    

In [None]:
text, length = [],[]
for doc in docs.sample(100).to_dict('records'):    
    files = pages[pages['file']==doc['file']]['source'].to_list()
    for source in files:
        data = filter_text(pd.read_csv(f'data/info/{source}.csv.gz'))
        if len(data) == 0:
            continue
        sentences = split_sentences(data)
        sentences = sentences[sentences.str.len() > 30]
        text += sentences.to_list()
        length += sentences.str.len().to_list()

plt.hist(length, bins=20)
plt.title('sequense length dist')
plt.show()

In [None]:
for x in np.random.choice(text, 10): print(x)

#### Doc-titles bulk insert

In [46]:
ENCODER_MODEL = 'distiluse-base-multilingual-cased-v1'
device = 'cuda' if is_available() else 'cpu'

In [47]:
semantic_encoder = SentenceTransformer(ENCODER_MODEL, device=device)
semantic_encoder.max_seq_length = 200

In [48]:
# document description only
embeddings = semantic_encoder.encode(docs['desc'].to_list())
embeddings.shape

(4436, 512)

In [None]:
xyc = pd.DataFrame(TSNE(n_components=2).fit_transform(embeddings), columns=['x','y'])
xyc['cluster'] = docs['orig']/2

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(xyc['x'], xyc['y'], c=xyc['cluster'], cmap='brg', alpha=0.5, s=5)
ax.set_title('Documents embeddings by origin')
plt.show()

In [50]:
qclient = QdrantClient(host=os.environ['QDRANT_HOST'], port=6333)

In [51]:
qclient.delete_collection(collection_name=INDEX)

True

In [52]:
qclient.create_collection(
    collection_name=INDEX, 
    vectors_config=VectorParams(size=embeddings.shape[1], distance=Distance.COSINE),
)

True

In [53]:
payload = docs.loc[:,['orig','lang','taxonomy','ext','desc']]
payload.columns = ['orig','lang','taxonomy','ext','text']
payload = payload.to_dict('records')
payload[100:105]

[{'orig': 0,
  'lang': 'en',
  'taxonomy': '990',
  'ext': 'EZ (Sch G)',
  'text': 'Supplemental Information Regarding Fundraising or Gaming Activities'},
 {'orig': 0,
  'lang': 'en',
  'taxonomy': '990',
  'ext': 'EZ (Sch L)',
  'text': 'Transactions with Interested Persons'},
 {'orig': 0,
  'lang': 'en',
  'taxonomy': '990',
  'ext': 'EZ (Sch N)',
  'text': 'Liquidation, Termination, Dissolution, or Significant Disposition of Assets'},
 {'orig': 0,
  'lang': 'en',
  'taxonomy': '990',
  'ext': 'PF',
  'text': 'Return of Private Foundation or Section 4947(a)(1) Trust Treated as Private Foundation'},
 {'orig': 0,
  'lang': 'en',
  'taxonomy': '990',
  'ext': '(Sch D)',
  'text': 'Supplemental Financial Statements'}]

In [54]:
qclient.upload_collection(
    collection_name=INDEX,
    vectors=embeddings,
    payload=payload,
    ids=None, # assign automatically
    batch_size=512,
)

#### Sentences bulk-insert

In [55]:
start = time()
counter = 0
for d in docs.to_dict('records'):    
    files = pages[pages['file']==d['file']]['source'].to_list()
    text = []
    for source in files:
        if source in skip + mashed:
            continue
        data = filter_text(pd.read_csv(f'data/info/{source}.csv.gz'))
        if len(data) > 0:
            continue
            
        # split text into sentences
        sentences = split_sentences(data)
        sentences = sentences[sentences.str.len() > 30]
        text += sentences.to_list()
    
    # compute vectors
    embeddings = semantic_encoder.encode(text)
    payload = pd.DataFrame(np.array([d['orig'], d['lang'], d['taxonomy'], d['ext']] * len(text)).reshape((-1,4)),
                           columns=['orig','lang','taxonomy','ext'])
    payload['text'] = text
    payload = payload.to_dict('records')
    
    # db insert
    qclient.upload_collection(
        collection_name=INDEX,
        vectors=embeddings,
        payload=payload,
        ids=None,
        batch_size=512,
    )
    counter += 1
    print(f'done: {counter/len(docs):.0%}', end='\r')
        
print(f'execution time: {(time() - start)/60:.2f}min')

execution time: 2.03min


### Queries

In [56]:
class SemanticSearch:
    def __init__(self, device):
        self.collection = INDEX
        # initialize encoder model
        self.model = SentenceTransformer(ENCODER_MODEL, device=device)
        # initialize Qdrant client
        self.client = QdrantClient(host=os.environ['QDRANT_HOST'], port=6333)

    def find(self, text: str, num: int = 5):
        # convert text query into vector
        vector = self.model.encode(text).tolist()
        # search for closest vectors in the collection
        results = self.client.search(
            collection_name=self.collection,
            query_vector=vector,
            query_filter=None,
            top=num,
        )
        # return payload of closest matches
        return sorted([hit.payload for hit in results], key=lambda x:x['orig'])

search = SemanticSearch(device)

In [57]:
search.find('taxes related to children')
#search.find('taxable retirement')
#search.find('medical credits')

[{'ext': '',
  'lang': 'en',
  'orig': 0,
  'taxonomy': '8615',
  'text': 'Tax for Certain Children Who Have Unearned Income'},
 {'ext': '',
  'lang': 'en',
  'orig': 0,
  'taxonomy': '15110',
  'text': 'Additional Child Tax Credit Worksheet'},
 {'ext': 'EN',
  'lang': 'fr',
  'orig': 2,
  'taxonomy': 'IM30',
  'text': "Entente entre exploitants associés relative à l'impôt minier"},
 {'ext': 'V',
  'lang': 'en',
  'orig': 2,
  'taxonomy': 'IN103',
  'text': 'Refundable Tax Credit for Childcare Expenses'},
 {'ext': '',
  'lang': 'fr',
  'orig': 2,
  'taxonomy': 'IN103',
  'text': "Le crédit d'impôt remboursable pour frais de garde d'enfants"},
 {'ext': '29.E',
  'lang': 'fr',
  'orig': 2,
  'taxonomy': 'CO1175',
  'text': 'Entente relative à la taxe sur les services publics'},
 {'ext': '8.63',
  'lang': 'fr',
  'orig': 2,
  'taxonomy': 'TP1029',
  'text': "Crédit d'impôt pour frais d'adoption"},
 {'ext': '13',
  'lang': 'fr',
  'orig': 2,
  'taxonomy': 'LM93',
  'text': 'Contestation en

<a name="images"></a>
<h3>Indexing images</h3>

For indexing images we use both:
* `ElasticSearch`: enables `layout` search thus we want the image-blocks indexed; we also want the text some images contain
* `Qdrant`: enables `similarity` search -- we want to recognize `logos` for the starter

In [12]:
def image_text(data):
    """
    some images contain text we try to extract
    """
    images = data.loc[data['block-type']=='image']
    if len(images) == 0:
        return data
    
    source, page = data.iloc[0][['source','page']]
    image = np.array(ImageOps.grayscale(Image.open(f'./data/images/{source}-{page}.png')))
    scale = min(image.shape)
    text = []
    for t, l, b, r in images[['top','left','bottom','right']].values:
        if b - t > 0.5 or r - l > 0.5:
            text.append('IMAGE: ')
            continue
        t, l, b, r = int(t * scale), int(l * scale), int(b * scale), int(r * scale)
        try:
            clip = image[max(t - 5, 0):min(b + 5, image.shape[0]), max(l - 5, 0):min(r + 5, image.shape[1])]
            t = ts.image_to_string(clip).strip()
            t = ' '.join(re.split(r'\W+', t)).strip()
            text.append(f'IMAGE: {t}')
        except:
            text.append('IMAGE: ')
    data.loc[data['block-type']=='image','text'] = text
    return data
