In [81]:
import json
import dpath
from typing import List, Dict
import pandas as pd
import math
import matplotlib.pyplot as plt

def open_results_file(filepath: str):
    with open(filepath) as f:
        data = json.loads(f.read())
    return data

def get_words_from_results(data: Dict):
    words = []
    for _, word in dpath.search(
        data,
        'responses/*/fullTextAnnotation/pages/*/blocks/*/paragraphs/*/words',yielded=True):
        words.append(word)
    return words

def get_tokens_from_words(results: List):
    tokens = []
    for words in results:
        for word in words:
            verts = word['boundingBox']['normalizedVertices']
            left, top = verts[0]['x'], verts[0]['y']
            right, bottom = verts[2]['x'], verts[2]['y']
            rect = [left,top,right,bottom]
            tokens.append(Token(rect, word['symbols'], word['confidence']))
    return Tokens(tokens)


class Token:
    def __init__(self, rectangle:List, symbols:List, confidence:float) -> None:
        self.rectangle = rectangle
        self.symbols = symbols
        self.confidence = confidence
        self.left = self.rectangle[0]
        self.top = 1-self.rectangle[1]
        self.right = self.rectangle[2]
        self.bottom = 1-self.rectangle[3]
    
    @property
    def text(self) -> str:
        return ''.join(char.get('text',' ') for char in self.symbols)

    @property
    def height(self) -> float:
        return self.top-self.bottom
    
    @property
    def length(self) -> float:
        return self.right-self.left

    def __repr__(self):
        return f'Y:{self.top:.3f} X:{self.left:.3f} ---- "{self.text}" ({self.confidence:.02f})'

class Tokens:
    
    def __init__(self, tokens: list) -> None:
        self.tokens=tokens

    truncate = 3
    epsilon = 0.007
    

    def __iter__(self):
        self.index = 0
        return self

    def __next__(self):
        if self.index >= len(self.tokens):
            raise StopIteration
        value = self.tokens[self.index]
        self.index += 1
        return value

    def __len__(self):
        return len(self.tokens)
    
    def __getitem__(self, index):
        return self.tokens[index]

    def __repr__(self):
        return '\n'.join(str(token) for token in self.tokens)

    @property
    def sorted_tokens(self) -> list:
        return sorted(
            self.tokens,
            key=lambda token: (-token.top, token.left),
        )

    def truncate_values(self, values: list, truncate: int):
        return [round(value, truncate) for value in values]

    def get_all_values(self, attribute: str):
        assert attribute in ['top', 'left', 'right', 'bottom'], 'Invalid attribute name'
        return [getattr(token, attribute) for token in self.tokens]
    
    @property
    def rows(self):
        new_row, rows = [], []
        prev_y = math.inf
        for token in self.sorted_tokens:
            is_new_row = bool(abs(token.top - prev_y) > self.epsilon)
            if is_new_row:
                rows.append(new_row)
                new_row=[]
                prev_y = token.top
            new_row.append(token)
            new_row.sort(key=lambda token: token.left)
        rows.append(new_row)
        for index, row in enumerate(rows):
            for token in row:
                token.row=index
        return rows
    
    @property
    def gaps(self):
        gaps = []
        temp_interval, intervals = self.intervals[0], self.intervals[1:]
        for index, interval in enumerate(intervals):
            if index and not interval.overlaps(temp_interval):
                gaps.append(pd.Interval(left=temp_interval.right, right=interval.left))
                temp_interval = interval
            if interval.right > temp_interval.right:
                temp_interval=pd.Interval(temp_interval.left, interval.right)
        return gaps

    def _filter_gaps(self):
        return self.gaps[1:]

    @property
    def intervals(self):
        intervals = []
        HEADER_HEIGHT=0.015
        MAX_OFFSET = 0.2
        for row in self.rows:
            for token in row:
                if token.height < HEADER_HEIGHT and row[0].left < MAX_OFFSET:
                    intervals.append(pd.Interval(token.left, token.right))
        intervals.sort(key=lambda interval: (interval.left, interval.right))
        return intervals

    @property
    def columns(self):
        limits = [gap.left for gap in self._filter_gaps()] + [1]
        print(limits)
        for token in self.tokens:
            for index, limit in zip(range(len(limits), 0, -1), limits[::-1]):
                if token.left <= limit:
                    token.column=index
        columns = []
        for index in range(1, len(limits) + 1):
            columns.append([token for token in tokens if token.column==index])
        return columns

    def plot_intervals(self):
        df = pd.DataFrame( 
            {'left':[i.left for i in self.intervals],
            'right':[i.right for i in self.intervals]},
            )
        start, end = df['right'], df['left']
        width = end-start
        fig, ax = plt.subplots()
        ax.barh(width=width, left=start, height=0.005, y=df.index, color='red',edgecolor='red')
        for gap in self.gaps:
            ax.bar(height=len(self), alpha=0.5, x=gap.left, width=gap.right-gap.left, color='blue', align='edge')
        plt.show()

   





tokens = get_tokens_from_words(get_words_from_results(open_results_file('pag9_d.json')))
tokens.columns[2]


[0.6857143, 0.80336136, 1]


[Y:0.929 X:0.908 ---- "." (0.98),
 Y:0.875 X:0.850 ---- "POR" (0.91),
 Y:0.865 X:0.824 ---- "ARRECADAR" (0.94),
 Y:0.865 X:0.897 ---- "." (0.98),
 Y:0.411 X:0.854 ---- "201966" (0.99),
 Y:0.324 X:0.839 ---- "3.963299" (0.97),
 Y:0.314 X:0.837 ---- "4.226367" (0.91),
 Y:0.292 X:0.830 ---- "11.54340159" (0.78),
 Y:0.280 X:0.837 ---- "2.515160" (0.99),
 Y:0.270 X:0.837 ---- "6.998803" (0.99),
 Y:0.259 X:0.852 ---- "317681" (0.87),
 Y:0.236 X:0.852 ---- "7664743" (0.83),
 Y:0.204 X:0.861 ---- "SEDOSA" (0.51),
 Y:0.160 X:0.837 ---- "4.870715" (0.98),
 Y:0.624 X:0.973 ---- "PAS" (0.30)]

In [79]:
tokens.columns[0]

[0.6857143, 0.80336136, 1]
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3
2
1
3

[Y:0.964 X:0.513 ---- "8" (0.96),
 Y:0.929 X:0.133 ---- "Receita" (0.99),
 Y:0.929 X:0.267 ---- "do" (0.99),
 Y:0.929 X:0.313 ---- "Imperio" (0.99),
 Y:0.929 X:0.449 ---- "no" (0.99),
 Y:0.929 X:0.496 ---- "exercicio" (0.94),
 Y:0.929 X:0.659 ---- "de" (0.99),
 Y:0.879 X:0.319 ---- "DENOMINAÇÃO" (0.94),
 Y:0.879 X:0.415 ---- "DAS" (0.96),
 Y:0.879 X:0.444 ---- "RENDAS" (0.94),
 Y:0.879 X:0.492 ---- "." (0.90),
 Y:0.836 X:0.353 ---- "RENDA" (0.97),
 Y:0.836 X:0.410 ---- "GERAL" (0.98),
 Y:0.836 X:0.461 ---- "." (0.94),
 Y:0.812 X:0.371 ---- "ORDINARIA" (0.95),
 Y:0.812 X:0.440 ---- "." (0.78),
 Y:0.792 X:0.370 ---- "Importação" (0.94),
 Y:0.792 X:0.440 ---- "." (0.89),
 Y:0.772 X:0.131 ---- "Direitos" (1.00),
 Y:0.771 X:0.188 ---- "de" (1.00),
 Y:0.771 X:0.212 ---- "consumo" (0.98),
 Y:0.770 X:0.272 ---- ".." (0.99),
 Y:0.760 X:0.129 ---- "Ditos" (0.99),
 Y:0.760 X:0.170 ---- "de" (0.99),
 Y:0.760 X:0.192 ---- "1" (0.93),
 Y:0.760 X:0.205 ---- "por" (0.99),
 Y:0.760 X:0.235 ---- "cento"