<a href="https://colab.research.google.com/github/atgorvi/pytorch/blob/1.6_doc_references/OK_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install google-cloud-vision
#!pip install --upgrade google-cloud-documentai
#!pip install --upgrade google-cloud-storage
#!pip3 install pypdfium2

In [None]:
import os
import re
import itertools
import io

import numpy as np
import pandas as pd
import pypdfium2 as pdfium
from google.cloud import vision
import google.cloud.vision_v1

from scipy import spatial
from shapely.geometry import box, Polygon
from typing import List, Sequence
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud.vision_v1 import types

In [None]:
# Service-account JSON key used by the Google Cloud client libraries.
# NOTE(review): the original comment referred to "step 5 above"; those setup
# steps are not part of this file.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/drive/MyDrive/DevsLab/teak-amphora-363110-eb0508426386.json'

# Sample inputs: a page image and a PDF of the document.
document_jpg_path = '/content/drive/MyDrive/DevsLab/OK-7/Samples/2_5226783261438517705-1.png'
document_pdf_path = '/content/2_5226783261438517705.pdf'  # alternative sample: '/content/photo_2022-12-07_14-43-39.pdf'

### Form parser

In [None]:
class Document:
  """Container for the parsed pages of one document.

  Attributes:
    pages: list of page objects, or None until pages are added.
    form_fields: set by the first page (idx == 0) while it is constructed;
      None until then.
    insurer_fields: set by the first page (idx == 0) while it is constructed;
      None until then.
  """

  def __init__(self, pages=None):
    self.pages = pages
    # Defaults so that reading these attributes never raises AttributeError
    # when no first page has populated them yet.
    self.form_fields = None
    self.insurer_fields = None

  def set_pages(self, pages):
    """Add pages to the document.

    Args:
      pages: a list/tuple of pages, or a single page object.

    The pages are always merged into one flat list. (Appending the incoming
    list as a single element would nest a list inside ``self.pages``.)
    """
    new_pages = list(pages) if isinstance(pages, (list, tuple)) else [pages]
    if self.pages is None:
      self.pages = new_pages
    else:
      self.pages.extend(new_pages)

  def __len__(self):
    """Return the number of pages, or 0 when no pages were set."""
    return len(self.pages) if self.pages is not None else 0

class Page:
  """One page of a scanned report, built from Document AI and Cloud Vision results.

  Construction runs the whole parsing pipeline for the page:
    1. ``refactor_text`` replaces Document AI token texts with the text of the
       nearest Cloud Vision word (non-numeric tokens only).
    2. Year summaries, the start year and the tables are extracted.
    3. For the first page (``idx == 0``) the form fields and the insurers
       table are parsed and stored on the owning ``Document``.

  Args:
    image_path: path of the image this page was parsed from.
    document: the ``Document`` that owns the page.
    idx: index of the page inside ``page_ai_result.pages``.
    page_ai_result: Document AI result (must expose ``text`` and ``pages``).
    cloud_ocr_words: list of ``[word_text, vertices]`` entries; ``vertices``
      must hold four items with ``x``/``y`` attributes.
  """

  # Placeholder stored when a form field cannot be found in the text.
  MISSING_VALUE = "відомості відсутні"

  def __init__(self, image_path: str, document: "Document", idx: int, page_ai_result, cloud_ocr_words):
    self.idx = idx
    self.page_ai_result = page_ai_result
    self.cloud_ocr_words = cloud_ocr_words
    # Must run first: every parser below reads refactored_plain_text.
    self.refactor_text()
    self.years_summaries = self.parse_year_summaries()
    self.image_path = image_path
    self.document = document
    self.start_year = self.set_start_year()
    self.insurers_table = None
    self.year_tables = self.set_year_tables()
    if idx == 0:
      self.document.form_fields = self.parse_form_fields()
      self.document.insurer_fields = self.parse_insurers()
    self.year_fields = self.parse_years()

  def set_start_year(self) -> int:
    """Return the first 4-digit number that follows a colon in the page text.

    Raises:
      IndexError: if the text contains no such number.
    """
    start_year = re.findall(r"[:](\s*\d{4}\s)", self.refactored_plain_text)[0].strip()
    return int(start_year)

  def set_year_tables(self):
    """Wrap the page tables (top to bottom) in ``Table`` objects.

    On the first page the topmost table is stored as ``insurers_table`` and
    excluded from the returned list.
    """
    tables = self.sort_tables(self.page_ai_result.pages[self.idx].tables)
    # Guard against a first page on which no table was detected.
    if self.idx == 0 and tables:
      self.insurers_table = Table(self, 0, tables[0])
      tables = tables[1:]
    return [
        Table(self, table_idx, table_ai_result)
        for table_idx, table_ai_result in enumerate(tables)
    ]

  def parse_year_summaries(self):
    """Return the ``:<number with 2 decimals>`` values of the text as float pairs.

    Spaces are removed before matching. Values are grouped two by two in order
    of appearance; a trailing unpaired value is dropped.
    """
    chunk_size = 2
    summaries = re.findall(r":(\d+\.\d{2})", self.refactored_plain_text.replace(" ", ""))
    summaries = [float(summary) for summary in summaries]
    # zip over the same iterator chunk_size times -> consecutive tuples.
    return list(zip(*[iter(summaries)] * chunk_size))

  def sort_tables(self, tables):
    """Return the tables ordered by the y coordinate of their first vertex."""
    return sorted(tables, key=lambda table: table.layout.bounding_poly.vertices[0].y)

  def parse_form_fields(self):
    """Extract card number, IPN, passport and full name from the page text.

    Returns:
      dict with keys ``card_number``, ``ipn``, ``passport`` and ``full_name``.
      A field that cannot be found holds ``MISSING_VALUE``.
    """
    forms_dict = {}
    text = self.refactored_plain_text

    # First two 10-digit numbers: card number, then IPN. Each is checked on its
    # own so that a single match no longer raises IndexError.
    values = re.findall(r"\d{10}\s+", text)
    forms_dict["card_number"] = int(values[0].strip()) if len(values) > 0 else self.MISSING_VALUE
    forms_dict["ipn"] = int(values[1].strip()) if len(values) > 1 else self.MISSING_VALUE

    passport = re.findall(r"[a-zA-ZА-Яа-яёЁЇїІіЄєҐґ]{2}\s?\d{6}\s+", text)
    forms_dict["passport"] = passport[0].strip() if passport else self.MISSING_VALUE

    # The last run of three capitalised Cyrillic words is taken as the name.
    full_name = re.findall(r"[А-Я][А-Яа-я]+?\s[А-Я][А-Яа-я]+\s[А-Я][А-Яа-я]+", text)
    forms_dict["full_name"] = full_name[-1].strip() if full_name else self.MISSING_VALUE

    return forms_dict

  def parse_insurers(self):
    """Return the insurers table as ``{code: {field: value}}``.

    Only the first page has an insurers table; otherwise (or when no table was
    found) an empty dict is returned. Each row must have at least 7 cells.
    """
    insurers_fields = {}
    if self.idx == 0 and self.insurers_table is not None:
      df = self.insurers_table.get_pandas_df()
      for _, row in df.iterrows():
        values = row.values
        insurers_fields[values[0]] = {
            "branch": values[1],
            "law_inheritage": values[2],
            "PFU": values[3],
            "DFS": values[4],
            "court_decision": values[5],
            "insurer_name": values[6],
        }

    return insurers_fields

  def parse_years(self):
    """Return the ``year_field`` dict of every year table on the page."""
    return [table.year_field for table in self.year_tables]

  def get_iou(self, words_vertices, token_vertices):
    """Return intersection-over-union of two polygons.

    Args:
      words_vertices: flat ``[x0, y0, x1, y1, ...]`` coordinates of polygon 1.
      token_vertices: flat coordinates of polygon 2.

    Returns:
      The IoU as a float; 0.0 when the union has zero area (degenerate
      polygons), instead of dividing by zero.
    """
    # Re-pair the flat coordinate lists into (x, y) tuples.
    pol1_xy = list(zip(*[iter(words_vertices)] * 2))
    pol2_xy = list(zip(*[iter(token_vertices)] * 2))
    polygon1_shape = Polygon(pol1_xy)
    polygon2_shape = Polygon(pol2_xy)

    polygon_intersection = polygon1_shape.intersection(polygon2_shape).area
    polygon_union = polygon1_shape.union(polygon2_shape).area
    if polygon_union == 0:
      return 0.0
    return polygon_intersection / polygon_union

  def order_points(self, pts):
    """Order four points as top-left, top-right, bottom-right, bottom-left.

    Returns:
      A list of four ``[x, y]`` float pairs.
    """
    pts = np.array(pts)
    rect = np.zeros((4, 2), dtype="float32")
    # The top-left point has the smallest x + y, the bottom-right the largest.
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]
    rect[2] = pts[np.argmax(s)]
    # The top-right point has the smallest y - x, the bottom-left the largest.
    diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(diff)]
    rect[3] = pts[np.argmax(diff)]
    return rect.tolist()

  def flatten_list(self, _2d_list):
    """Flatten one nesting level: list elements are expanded, others kept."""
    flat_list = []
    for element in _2d_list:
      if isinstance(element, list):
        flat_list.extend(element)
      else:
        flat_list.append(element)
    return flat_list

  def refactor_text(self):
    """Build the corrected token texts for this page.

    Fills ``refactored_text_dict`` (``(start_index, end_index) -> text``) and
    ``refactored_plain_text`` (all texts joined, newlines replaced by spaces).

    A numeric token keeps its Document AI text. Any other token takes the text
    of the nearest Cloud Vision word (plus a trailing space) when their boxes
    overlap with IoU above ``iou_threshold``; otherwise it keeps its own text.
    When there are no Cloud Vision words, every token keeps its own text.
    """
    self.refactored_text_dict = {}
    full_text = self.page_ai_result.text
    page_ai = self.page_ai_result.pages[self.idx]

    words_vertices = [
        self.flatten_list(self.order_points(
            [[word[1][corner].x, word[1][corner].y] for corner in range(4)]))
        for word in self.cloud_ocr_words
    ]
    words_texts = [word[0] for word in self.cloud_ocr_words]
    # A KD-tree cannot be built from an empty point set.
    tree = spatial.KDTree(words_vertices) if words_vertices else None
    iou_threshold = 0.05

    for old_token in page_ai.tokens:
      segment = old_token.layout.text_anchor.text_segments[0]
      old_token_start = segment.start_index
      old_token_end = segment.end_index
      old_token_text = full_text[old_token_start:old_token_end]
      new_text_key = (old_token_start, old_token_end)

      # Numbers are kept exactly as Document AI read them.
      if tree is None or old_token_text.strip().replace('.', '').isdigit():
        self.refactored_text_dict[new_text_key] = old_token_text
        continue

      old_token_vertices = self.flatten_list(self.order_points(
          [[vertice.x, vertice.y] for vertice in old_token.layout.bounding_poly.vertices]))

      # Nearest word in the 8-dimensional space of ordered corner coordinates.
      results = tree.query([old_token_vertices], k=1, workers=-1)
      result_index = results[1][0]

      word_box_iou = self.get_iou(words_vertices[result_index], old_token_vertices)
      if word_box_iou > iou_threshold:
        new_token_text = words_texts[result_index] + " "
      else:
        new_token_text = old_token_text

      self.refactored_text_dict[new_text_key] = new_token_text

    self.refactored_plain_text = "".join(self.refactored_text_dict.values()).replace("\n", " ")

class Table:
  """One Document AI table of a page, converted to a DataFrame and parsed.

  Args:
    page: the owning page; must expose ``start_year``, ``years_summaries``,
      ``page_ai_result.text`` and ``refactored_text_dict``.
    idx: position of the table among the page's tables; also the offset added
      to ``page.start_year`` to obtain ``year``.
    table_ai_result: Document AI table (``header_rows``, ``body_rows``,
      ``layout.bounding_poly.vertices``).
    year_summaries: stored as-is; not used by the parsing code in this class.
  """

  def __init__(self, page: "Page", idx: int, table_ai_result, year_summaries=None):
    self.page = page
    self.idx = idx
    self.table_ai_result = table_ai_result
    self.year = page.start_year + idx
    self.year_summaries = year_summaries
    self.df = self.get_pandas_df()
    self.bounding_poly = self.table_ai_result.layout.bounding_poly.vertices
    # Unpack in the order get_table_coords returns the corners; unpacking in a
    # different order would swap top_right and bottom_left.
    (self.top_left,
     self.top_right,
     self.bottom_left,
     self.bottom_right) = self.get_table_coords(self.bounding_poly)
    try:
      self.first_summary = page.years_summaries[idx][0]
      self.second_summary = page.years_summaries[idx][1]
    except (IndexError, TypeError):
      # Fewer summary pairs than tables (or no summaries at all).
      self.first_summary = None
      self.second_summary = None
    self.year_field = self.parse_year()

  def get_table_coords(self, bounding_poly):
    """Return ``(top_left, top_right, bottom_left, bottom_right)``.

    Vertices 0, 1, 2, 3 of ``bounding_poly`` are read as top-left, top-right,
    bottom-right and bottom-left respectively.
    """
    top_left = bounding_poly[0]
    top_right = bounding_poly[1]
    bottom_right = bounding_poly[2]
    bottom_left = bounding_poly[3]

    return top_left, top_right, bottom_left, bottom_right

  def get_pandas_df(self):
    """Return the table as a DataFrame with the header rows as MultiIndex columns.

    When the number of body rows is even, the last row is dropped.
    """
    source_text = self.page.page_ai_result.text
    header_row_values = self.get_table_data(self.table_ai_result.header_rows, source_text)
    body_row_values = self.get_table_data(self.table_ai_result.body_rows, source_text)
    df = pd.DataFrame(
        data=body_row_values,
        columns=pd.MultiIndex.from_arrays(header_row_values),
    )
    if len(df) % 2 == 0:
      df = df[:-1]
    return df

  def _pair_insurer_ids(self, df):
    """Return one id per insurer row pair ("" when the pair has no id).

    Rows ``2k+1`` and ``2k+2`` form the pair of insurer ``k``; the id is the
    first non-empty value of column 0 in those two rows.
    """
    pair_count = int((len(df) - 3) / 2)
    pair_ids = []
    for pair_idx in range(pair_count):
      cells = df.iloc[[2 * pair_idx + 1, 2 * pair_idx + 2], 0]
      pair_ids.append(next((cell for cell in cells if cell != ""), ""))
    return pair_ids

  def get_year_insurer_ids(self, df):
    """
    Find all insurer ids in year table

    Returns one id per insurer row pair, so an id repeated on both rows of a
    pair is reported once. Pairs without any id are left out.
    """
    return [insurer_id for insurer_id in self._pair_insurer_ids(df) if insurer_id != ""]

  def parse_year(self):
    """
    Create dict with values for each month of year

    Returns a dict with keys ``year``, ``insurers`` (id -> (sums row, days
    row)), ``total_sum`` (second-to-last row), ``work_experience`` (last row),
    ``first_summary`` and ``second_summary``. The first row and the first two
    columns of the table are skipped.
    """
    sliced_df = self.df.iloc[1:, 2:]
    insurers = {}
    for pair_idx, insurer_id in enumerate(self._pair_insurer_ids(self.df)):
      if insurer_id == "":
        # No id for this pair: skip it, but keep later pairs row-aligned.
        continue
      sums = sliced_df.iloc[2 * pair_idx].values.tolist()
      days = sliced_df.iloc[2 * pair_idx + 1].values.tolist()
      insurers[insurer_id] = sums, days

    year = {}
    year["year"] = self.year
    year["insurers"] = insurers
    year["total_sum"] = sliced_df.iloc[-2].values.tolist()
    year["work_experience"] = sliced_df.iloc[-1].values.tolist()
    year["first_summary"] = self.first_summary
    year["second_summary"] = self.second_summary

    return year

  def get_table_data(self,
      rows: "Sequence[documentai.Document.Page.Table.TableRow]", text: str
  ) -> "List[List[str]]":
    """
    Get Text data from table rows

    Returns one list of cell strings per row.
    """
    return [
        [self.text_anchor_to_text(cell.layout.text_anchor, text) for cell in row.cells]
        for row in rows
    ]

  def text_anchor_to_text(self, text_anchor: "documentai.Document.TextAnchor", text: str) -> str:
    """
    Document AI identifies table data by their offsets in the entirety of the
    document's text. This function converts offsets to a string.

    The corrected token texts of ``page.refactored_text_dict`` lying inside the
    anchor's segments are concatenated (stripped, newlines removed). If none
    falls inside, the raw slice of ``text`` is returned unchanged.
    """
    response = ""
    old_text = ""
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    for segment in text_anchor.text_segments:
      start_index = int(segment.start_index)
      end_index = int(segment.end_index)
      old_text += text[start_index:end_index]

      for key, value in self.page.refactored_text_dict.items():
        if start_index <= key[0] <= end_index and start_index < key[1] <= end_index:
          response += value

    if response != "":
      return response.strip().replace("\n", "")
    return old_text

  def get_csv(self):
    """Write the table to ``table_data_<page idx>_<table idx>.csv``."""
    # Use the page index: formatting the Page object itself would put its
    # repr into the file name.
    self.df.to_csv(f"table_data_{self.page.idx}_{self.idx}.csv", index=False)

In [None]:
def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """
    Send one local file to a Document AI processor and return the parsed document.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: processor location; also used as the API endpoint prefix.
        processor_id: id of a processor that already exists in the project.
        file_path: path of the file to upload.
        mime_type: MIME type declared for the uploaded bytes.

    Returns:
        The ``document`` attribute of the processing response.
    """
    # The client is bound to the regional endpoint of the processor.
    endpoint = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=endpoint)

    # Fully qualified processor name built by the client helper.
    processor_name = client.processor_path(project_id, location, processor_id)

    with open(file_path, "rb") as source:
        payload = source.read()

    process_request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=documentai.RawDocument(content=payload, mime_type=mime_type),
    )

    return client.process_document(request=process_request).document

def cloud_ocr(path):
    """Run Cloud Vision document text detection on an image file.

    Returns a list of ``[word_text, bounding_box_vertices]`` entries, one per
    detected word, in the order pages/blocks/paragraphs/words are reported.
    Raises ``Exception`` when the response carries an error message.
    """
    annotator = vision.ImageAnnotatorClient()

    with io.open(path, 'rb') as image_file:
        raw_bytes = image_file.read()

    response = annotator.document_text_detection(image=vision.Image(content=raw_bytes))

    # A word's text is the concatenation of its symbols.
    words_list = [
        [''.join(symbol.text for symbol in word.symbols), word.bounding_box.vertices]
        for page in response.full_text_annotation.pages
        for block in page.blocks
        for paragraph in block.paragraphs
        for word in paragraph.words
    ]

    error_message = response.error.message
    if error_message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                error_message))
    return words_list

def parse_page(document, image_path, project_id="teak-amphora-363110", location="eu",
               processor_id="97955062537e29fe", mime_type=None):
  """Parse one page image and add the resulting ``Page`` to ``document``.

  The image is sent to Document AI (``online_process``) and to Cloud Vision
  (``cloud_ocr``); both results are combined in a ``Page`` with index 0.

  Args:
    document: the ``Document`` that receives the page.
    image_path: path of the page image.
    project_id: Google Cloud project id.
    location: processor location, 'us' or 'eu'.
    processor_id: id of a processor created in the Cloud Console.
    mime_type: MIME type of the file. When None it is guessed from the file
      extension, falling back to "image/jpeg". (A fixed "image/jpeg" would
      mislabel other inputs such as the ``.png`` sample used in this notebook.)

  Returns:
    ``document``, with the new page added.
  """
  import mimetypes

  if mime_type is None:
    # See https://cloud.google.com/document-ai/docs/processors-list
    # for supported file types.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

  # NOTE: a PDF has to be rasterised to page images (e.g. with pypdfium2)
  # before its pages can be passed to this function.
  document_ai_result = online_process(
      project_id=project_id,
      location=location,
      processor_id=processor_id,
      file_path=image_path,
      mime_type=mime_type,
  )

  cloud_ocr_result = cloud_ocr(image_path)

  page = Page(image_path, document, 0, document_ai_result, cloud_ocr_result)

  document.set_pages([page])

  return document

In [None]:
# Parse the sample page image into a fresh Document.
out_document = parse_page(Document(), document_jpg_path)

In [None]:
# Form fields parsed from the first page.
out_document.form_fields

{'card_number': 1465004840,
 'ipn': 3054920211,
 'passport': 'відомості відсутні',
 'full_name': 'КОНОВАЛОВ ПАВЛО ВОЛОДИМИРОВИЧ'}

In [None]:
# Insurers table parsed from the first page, keyed by insurer code.
out_document.insurer_fields

{'32485166': {'branch': '',
  'law_inheritage': '',
  'PFU': '28000',
  'DFS': '2656',
  'court_decision': '',
  'insurer_name': 'ТОВАРИСТВО 3 ОБМЕЖЕНОЮ ВІДПОВІДАЛЬНІСТЮ " БЕНІШ ДЖІ ПІ ЕС УКРАЇНА "'},
 '34299381': {'branch': '',
  'law_inheritage': '',
  'PFU': '28000',
  'DFS': '701',
  'court_decision': '',
  'insurer_name': 'ТОВАРИСТВО 3 ОБМЕЖЕНОЮ ВІДПОВІДАЛЬНІСТЮ " СУПУТНИК СЕК\'ЮРІТІ "'},
 '3054920211': {'branch': '',
  'law_inheritage': '',
  'PFU': '28000',
  'DFS': '2650',
  'court_decision': '',
  'insurer_name': 'КОНОВАЛОВ ПАВЛО ВОЛОДИМИРОВИЧ'}}

In [None]:
# One dict per year table of the first page.
out_document.pages[0].year_fields

[{'year': 2011,
  'insurers': {'32485166': (['1827.29',
     '1842.35',
     '1842.35',
     '1862.40',
     '1876.80',
     '1890.24',
     '1890.24',
     '1902.72',
     '1952.86',
     '1905.40',
     '1905.40',
     '1907.43'],
    ['31 Так',
     '28 Так',
     '31 Так',
     '30 Так',
     '31 Так',
     '30 Так',
     '31 Так',
     '31 Так',
     '30 Так',
     '31 Так',
     '30 Так',
     '31 Так'])},
  'total_sum': ['1827.29',
   '1842.35',
   '1842.35',
   '1862.40',
   '1876.80',
   '1890.24',
   '1890.24',
   '1902.72',
   '1952.86',
   '1905.40',
   '1905.40',
   '1907.43'],
  'work_experience': ['31',
   '28',
   '31',
   '30',
   '31',
   '30',
   '31',
   '31',
   '30',
   '31',
   '30',
   '31'],
  'first_summary': 22605.48,
  'second_summary': 22605.48},
 {'year': 2012,
  'insurers': {'32485166': (['1914.81',
     '94.88',
     '0.00',
     '0.00',
     '0.00',
     '0.00',
     '0.00',
     '0.00',
     '0.00',
     '0.00',
     '0.00',
     '0.00'],
    ['31 Так'

In [None]:
# Print the year and its first summary, then show the table as a DataFrame.
for table in out_document.pages[0].year_tables:
  print(table.year, table.first_summary)
  display(table.get_pandas_df())

2011 22605.48


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,знаменник,- права,частина,: позначка,про,сплату,страхових,внесків ),Unnamed: 13,Unnamed: 14
0,Код,Філія,1,2,3,4,5,6,7,8,9,10,11,12
1,32485166,,1827.29,1842.35,1842.35,1862.40,1876.80,1890.24,1890.24,1902.72,1952.86,1905.40,1905.40,1907.43
2,,,31 Так,28 Так,31 Так,30 Так,31 Так,30 Так,31 Так,31 Так,30 Так,31 Так,30 Так,31 Так
3,"Усього , грн .",,1827.29,1842.35,1842.35,1862.40,1876.80,1890.24,1890.24,1902.72,1952.86,1905.40,1905.40,1907.43
4,Страховий,стаж,31,28,31,30,31,30,31,31,30,31,30,31


2012 10480.09


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,знаменник,- права,частина,: позначка,про,сплату,страхових,внесків ),Unnamed: 13,Unnamed: 14
0,Код,філія,1,2,3,4,5,6,7,8,9,10,11,12
1,32485166,,1914.81,94.88,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,,,31 Так,1 Так,0 Hi,0 Hi,0 Hi,0 Hi,0 Hi,0 Hi,0 Hi,0 Hi,0 Hi,0 Hi
3,34299381,,0.00,1047.62,1100.00,1100.00,1100.00,1100.00,1200.00,1250.34,572.44,0.00,0.00,0.00
4,,,0 Hi,28 Так,31 Так,30 Так,31 Так,30 Так,31 Так,31 Так,7 Так,0 Hi,0 Hi,0 Hi
5,"Усього , грн .",,1914.81,1142.50,1100.00,1100.00,1100.00,1100.00,1200.00,1250.34,572.44,0.00,0.00,0.00
6,Страховий,стаж,31,29,31,30,31,30,31,31,15,0,0,0


2013 11541.0


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,знаменник,- права,частина,: позначка,про,сплату,страхових,внесків ),Unnamed: 13,Unnamed: 14
0,Код,Філія,1,2,3,4,5,6,7,8,9,10,11,12
1,3054920211,,0.00,0.00,1147.00,1147.00,1147.00,1147.00,1147.00,1147.00,1147.00,1147.00,1147.00,1218.00
2,3054920211,,0 OHi,0 Hi,31 Так,30 Так,31 Так,30 Так,31 Так,31 Так,30 Так,31 Так,30 Так,31 Так
3,"Усього , грн .",,0.00,0.00,1147.00,1147.00,1147.00,1147.00,1147.00,1147.00,1147.00,1147.00,1147.00,1218.00
4,Страховий,стаж,0,0,31,30,31,30,31,31,30,31,30,31


2014 14616.0


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,знаменник,- права,частина,: позначка,про,сплату,страхових,внесків ),Unnamed: 13,Unnamed: 14
0,Код,Фiлiя,1,2,3,4,5,6,7,8,9,10,11,12
1,,,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00
2,3054920211,,31 Так,28 Так,31 Так,30 Так,31 Так,30 Так,31 Так,31 Так,30 Так,31 Так,30 Так,31 Так
3,"Усього , грн .",,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00
4,Страховий,стаж,31,28,31,30,31,30,31,31,30,31,30,31


2015 None


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,знаменник,- права,частина,: позначка,про,сплату,страхових,внесків ),Unnamed: 13,Unnamed: 14
0,Код,Філія,1,2,3,4,5,6,7,8,9,10,11,12
1,3054920211,,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1378.00,1378.00,1378.00,1378.00
2,,,31 Так,28 Так,31 Так,30 Так,31 Так,30 Так,31 Так,31 Так,30 Так,31 Так,30 Так,31 Так
3,"Усього , грн .",,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1218.00,1378.00,1378.00,1378.00,1378.00
4,Страховий,стаж,31,28,31,30,31,30,31,31,30,31,30,31


# Drawing boxes


In [None]:
import argparse
from enum import Enum
import io

from google.cloud import vision
from PIL import Image, ImageDraw

In [None]:
def draw_two_tokens_bounds(old_tokens, closest_tokens, image_path=None):
    """Draw each token box (green) and its matched box (red) on the page image.

    Args:
        old_tokens: tokens exposing ``layout.bounding_poly.vertices`` with at
            least four vertices (``x``/``y`` attributes).
        closest_tokens: for each token, a flat sequence of eight coordinates
            ``[x0, y0, ..., x3, y3]``; must be at least as long as
            ``old_tokens``.
        image_path: image to draw on; defaults to the notebook-level
            ``document_jpg_path``.

    The annotated image is shown with ``display``.
    """
    from PIL import Image, ImageDraw

    if image_path is None:
        image_path = document_jpg_path

    image = Image.open(image_path)
    draw = ImageDraw.Draw(image)
    for idx, token in enumerate(old_tokens):
        vertices = token.layout.bounding_poly.vertices
        token_outline = []
        for corner in range(4):
            token_outline.append(vertices[corner].x)
            token_outline.append(vertices[corner].y)
        draw.polygon(token_outline, None, "green")

        # First eight values: the four corners of the matched box.
        matched_outline = [closest_tokens[idx][coord] for coord in range(8)]
        draw.polygon(matched_outline, None, "red")

    display(image)

def draw_tokens_bounds():
    """Outline every Document AI token of the first parsed page in orange.

    Reads the notebook-level ``document_jpg_path`` and ``out_document`` and
    shows the annotated image with ``display``.

    Tokens whose bounding polygon has fewer than three vertices are skipped
    instead of raising IndexError; all available vertices are used otherwise.
    NOTE(review): the recorded run of this cell ended with an IndexError whose
    exact origin is not visible here - confirm that short polygons were the
    cause.
    """
    image = Image.open(document_jpg_path)
    draw = ImageDraw.Draw(image)

    for token in out_document.pages[0].page_ai_result.pages[0].tokens:
        vertices = token.layout.bounding_poly.vertices
        if len(vertices) < 3:
            continue
        outline = []
        for vertex in vertices:
            outline.append(vertex.x)
            outline.append(vertex.y)
        draw.polygon(outline, None, "orange")

    display(image)

# Show the token boxes of the parsed page.
# NOTE(review): the recorded output of this cell is an IndexError.
draw_tokens_bounds()

IndexError: ignored

In [None]:


def draw_boxes(image, bounds, color):
    """Outline each bounding box on ``image`` in ``color`` and return the image.

    Every element of ``bounds`` must expose ``vertices`` with at least four
    entries carrying ``x`` and ``y``; only the first four are used.
    """
    canvas = ImageDraw.Draw(image)

    for bound in bounds:
        outline_xy = []
        for corner in range(4):
            outline_xy.append(bound.vertices[corner].x)
            outline_xy.append(bound.vertices[corner].y)
        # Second positional argument is the fill (none), third the outline.
        canvas.polygon(outline_xy, None, color)
    return image


class FeatureType(Enum):
    """Granularity selector for ``get_document_bounds``.

    ``get_document_bounds`` collects boxes for BLOCK, PARA, WORD and SYMBOL;
    it has no branch for PAGE, so PAGE yields an empty bounds list there.
    """
    PAGE = 1
    BLOCK = 2
    PARA = 3
    WORD = 4
    SYMBOL = 5

def get_document_bounds(image_file, feature):
    """Collect bounding boxes of one feature level plus all recognised words.

    Args:
        image_file: path of the image to send to Cloud Vision.
        feature: a ``FeatureType`` member choosing which boxes go into the
            first returned list (BLOCK, PARA, WORD or SYMBOL).

    Returns:
        ``(bounds, words_list)``: the selected bounding boxes, and a list of
        ``[word_text, word_vertices]`` for every word regardless of ``feature``.
    """
    client = vision.ImageAnnotatorClient()

    bounds = []
    words_list = []

    with io.open(image_file, "rb") as handle:
        content = handle.read()

    response = client.document_text_detection(image=vision.Image(content=content))
    document = response.full_text_annotation

    # Walk the hierarchy page -> block -> paragraph -> word -> symbol and keep
    # the boxes of the requested level only.
    for page in document.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    text = ''.join(symbol.text for symbol in word.symbols)
                    words_list.append([text, word.bounding_box.vertices])

                    if feature == FeatureType.SYMBOL:
                        bounds.extend(symbol.bounding_box for symbol in word.symbols)
                    if feature == FeatureType.WORD:
                        bounds.append(word.bounding_box)

                if feature == FeatureType.PARA:
                    bounds.append(paragraph.bounding_box)

            if feature == FeatureType.BLOCK:
                bounds.append(block.bounding_box)

    return bounds, words_list


def render_doc_text(filein, fileout):
    """Draw the Cloud Vision word boxes (red) on an image.

    Args:
        filein: path of the image to analyse and annotate.
        fileout: path to save the annotated image to, or ``0`` to show it
            with ``display`` instead.

    Returns:
        The list of ``[word_text, word_vertices]`` entries returned by
        ``get_document_bounds``.
    """
    image = Image.open(filein)
    # One request is enough: only the word-level boxes are drawn. Block- and
    # paragraph-level requests would be extra API calls with unused results.
    word_bounds, words_list = get_document_bounds(filein, FeatureType.WORD)
    draw_boxes(image, word_bounds, "red")

    if fileout != 0:
        image.save(fileout)
    else:
        display(image)
    return words_list

# Show the word boxes for the sample image (fileout=0 -> display, no file
# is written) and print the recognised words as one line.
words = render_doc_text(document_jpg_path, 0)
print(" ".join([word[0] for word in words]))

In [None]:
# Same words, printed as a Python list.
print([word[0] for word in words])

In [None]:
def detect_document(path):
    """Return every word Cloud Vision finds in the image at ``path``.

    Each entry is ``[word_text, bounding_box_vertices]``. An ``Exception`` is
    raised when the response carries an error message.
    """
    from google.cloud import vision
    import io
    client = vision.ImageAnnotatorClient()

    words_list = []

    with io.open(path, 'rb') as image_file:
        image = vision.Image(content=image_file.read())

    response = client.document_text_detection(image=image)

    for page in response.full_text_annotation.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                # One entry per word: text assembled from the word's symbols.
                words_list.extend(
                    [''.join(symbol.text for symbol in word.symbols),
                     word.bounding_box.vertices]
                    for word in paragraph.words
                )

    if response.error.message:
        details = response.error.message
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                details))
    return words_list
        
# Word texts and boxes recognised by Cloud Vision in the sample image.
cloud_ocr_words = detect_document(document_jpg_path)

In [None]:
# Recognised words joined into a single string.
" ".join([word[0] for word in cloud_ocr_words])

### Trash

In [None]:
class Page:
  """
  One page of a Document AI result, re-read with Cloud Vision OCR words.

  On construction the page text is rebuilt token by token (see
  refactor_text), then the report start year, the tables, the form fields,
  the insurer table and the per-year tables are parsed from it.

  `Table`, `display` and `document_jpg_path` are not defined in this class;
  they are looked up at call time from the surrounding notebook namespace.

  NOTE(review): a class named `Page` is also defined earlier in this file;
  running this cell rebinds that name.
  """

  # Placeholder stored when a form field is not found in the page text.
  _MISSING = "відомості відсутні"

  def __init__(self, image_path, document, idx, page_ai_result, cloud_ocr_words):
    """
    Store the inputs and eagerly parse everything the page exposes.

    image_path: stored as-is on the instance.
    document: stored as-is on the instance.
    idx: index of this page inside page_ai_result.pages.
    page_ai_result: object exposing `.text` and `.pages[idx].tokens/.tables`.
    cloud_ocr_words: iterable of [word_text, vertices] entries; each vertex
      has `.x` and `.y`.
    """
    self.idx = idx
    self.page_ai_result = page_ai_result
    self.cloud_ocr_words = cloud_ocr_words
    # refactor_text() reads only the three attributes above and produces
    # refactored_plain_text, which the parsers below depend on.
    self.refactor_text()
    self.image_path = image_path
    self.document = document
    self.start_year = self.set_start_year()
    self.tables = self.set_tables()
    self.form_fields = self.parse_forms()
    self.insurer_fields = self.parse_insurers()
    self.year_fields = self.parse_years()

  def set_start_year(self) -> int:
    """
    Return the four-digit year following "Звітний рік : " in the page text.

    Raises IndexError when the marker is absent (same exception type as a
    failed lookup on the empty match list, but with a readable message).
    """
    matches = re.findall(r"Звітний рік : \d{4}", self.refactored_plain_text)
    if not matches:
      raise IndexError('report year marker "Звітний рік : <YYYY>" not found in page text')
    return int(matches[0].split(" ")[-1])

  def set_tables(self):
    """
    Wrap every table of this page in a Table and return them sorted
    top-to-bottom (see sort_tables).
    """
    page_tables = self.page_ai_result.pages[self.idx].tables
    document_tables = [Table(self, table_idx) for table_idx, _ in enumerate(page_tables)]
    return self.sort_tables(document_tables)

  def sort_tables(self, tables):
    """
    Return the tables ordered by `top_left.y` (ascending, stable) and
    overwrite each table's `idx` with its position in that order.
    """
    ordered = sorted(tables, key=lambda table: table.top_left.y)
    # Reset index after sorting
    for idx, table in enumerate(ordered):
      table.idx = idx
    return ordered

  def parse_forms(self):
    """
    Extract card number, IPN, passport and full name from the page text.

    The first and second 10-digit numbers found are taken as card number
    and IPN respectively; each field independently falls back to the
    placeholder "відомості відсутні" when its own match is missing.
    """
    text = self.refactored_plain_text
    missing = self._MISSING

    ten_digit_numbers = re.findall(r"\d{10}\s+", text)
    passport = re.findall(r"[a-zA-ZА-Яа-яёЁЇїІіЄєҐґ]{2}\s?\d{6}\s+", text)
    full_name = re.findall(r"[А-Я][А-Яа-я]+?\s[А-Я][А-Яа-я]+\s[А-Я][А-Яа-я]+", text)

    return {
      "card_number": int(ten_digit_numbers[0].strip()) if len(ten_digit_numbers) > 0 else missing,
      # Guard on the second match separately: a single 10-digit number used
      # to raise IndexError here.
      "ipn": int(ten_digit_numbers[1].strip()) if len(ten_digit_numbers) > 1 else missing,
      "passport": passport[0].strip() if passport else missing,
      # The last match is used when several name-like sequences are found.
      "full_name": full_name[-1].strip() if full_name else missing,
    }

  def parse_insurers(self):
    """
    Build {first_column_value: {...fields...}} from the first table.

    Only page 0 is parsed; other pages return an empty dict. Columns 1-6 of
    each row are mapped, in order, to the field names listed below.
    """
    insurers_fields = {}
    if self.idx != 0:
      return insurers_fields

    field_names = ("branch", "law_inheritage", "PFU", "DFS", "court_decision", "insurer_name")
    df = self.tables[0].get_pandas_df()
    for _, row in df.iterrows():
      values = row.values
      insurers_fields[values[0]] = {
        name: values[position] for position, name in enumerate(field_names, start=1)
      }
    return insurers_fields

  def parse_years(self):
    """
    Create dict with values for each year table on page

    Keys are start_year, start_year + 1, ... in table order. On page 0 the
    first table is skipped (it is the one read by parse_insurers).
    """
    year_tables = self.tables[1:] if self.idx == 0 else self.tables
    return {
      self.start_year + offset: self.parse_year(table.get_pandas_df())
      for offset, table in enumerate(year_tables)
    }

  def get_table_insurer_ids(self, df):
    """
    Find all insurer ids in year table

    Reads column 0 of rows 1..2n, where n = (len(df) - 3) // 2 (never
    negative), and returns the values that are not the empty string.
    """
    pair_count = max(0, (len(df) - 3) // 2)
    candidates = df.iloc[1:2 * pair_count + 1, 0]
    return [insurer_id for insurer_id in candidates if insurer_id != ""]

  def parse_year(self, df):
    """
    Create dict with values for each month of year

    Every column from index 2 onward is one month: row 0 holds the month
    number, then one (amount, days) row pair per insurer id, and the last
    two rows hold "total" and "work_experience". Only the part of the days
    cell before the first space is kept.
    """
    months = {}
    insurer_ids = self.get_table_insurer_ids(df)
    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    for _, column in df.iloc[:, 2:].items():
      col_values = column.values
      month_dict = {}

      for position, insurer_id in enumerate(insurer_ids):
        month_dict[str(insurer_id) + "_amount"] = col_values[1 + 2 * position]
        month_dict[str(insurer_id) + "_days"] = col_values[2 + 2 * position].split(" ")[0]

      month_dict["total"] = col_values[-2]
      month_dict["work_experience"] = col_values[-1]

      months[int(col_values[0])] = month_dict

    return months

  @staticmethod
  def _as_points(flat_vertices):
    """Pair a flat [x0, y0, x1, y1, ...] sequence into [(x0, y0), ...]."""
    coordinates = iter(flat_vertices)
    return list(zip(coordinates, coordinates))

  def get_iou(self, words_vertices, token_vertices):
    """
    Return intersection-over-union of two polygons.

    Both arguments are flat [x0, y0, x1, y1, ...] coordinate sequences.
    Returns 0.0 when the union has zero area instead of dividing by zero.
    """
    polygon1_shape = Polygon(self._as_points(words_vertices))
    polygon2_shape = Polygon(self._as_points(token_vertices))

    polygon_intersection = polygon1_shape.intersection(polygon2_shape).area
    polygon_union = polygon1_shape.union(polygon2_shape).area
    if polygon_union == 0:
      return 0.0
    return polygon_intersection / polygon_union

  def draw_two_tokens_bounds(self, old_tokens, closest_tokens):
    """
    Draw a border around the image using the hints in the vector list.

    For each position, the first four vertices of the token's
    layout.bounding_poly are outlined in green and the first eight values of
    the matching closest_tokens entry in red. The image is opened from the
    notebook-level `document_jpg_path` and shown with `display`.
    """
    from PIL import Image, ImageDraw

    image = Image.open(document_jpg_path)
    draw = ImageDraw.Draw(image)
    for idx, token in enumerate(old_tokens):
      vertices = token.layout.bounding_poly.vertices
      token_outline = []
      for corner in range(4):
        token_outline.append(vertices[corner].x)
        token_outline.append(vertices[corner].y)
      draw.polygon(token_outline, fill=None, outline="green")

      bound = closest_tokens[idx]
      draw.polygon([bound[position] for position in range(8)], fill=None, outline="red")

    display(image)

  def order_points(self, pts):
    """
    Return the points as [top-left, top-right, bottom-right, bottom-left].

    Ordering rule: top-left has the smallest x + y and bottom-right the
    largest; top-right has the smallest y - x and bottom-left the largest.
    The result is a list of four [x, y] lists of floats (float32 precision).
    """
    pts = np.array(pts)
    rect = np.zeros((4, 2), dtype="float32")

    coordinate_sum = pts.sum(axis=1)
    rect[0] = pts[np.argmin(coordinate_sum)]
    rect[2] = pts[np.argmax(coordinate_sum)]

    coordinate_diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(coordinate_diff)]
    rect[3] = pts[np.argmax(coordinate_diff)]

    return rect.tolist()

  def flatten_list(self, _2d_list):
    """
    Flatten one nesting level: list elements are expanded in place, any
    other element is kept as a single item.
    """
    flat_list = []
    for element in _2d_list:
      if isinstance(element, list):
        flat_list.extend(element)
      else:
        flat_list.append(element)
    return flat_list

  def refactor_text(self):
    """
    Prepare ukr text dictionary

    For every token of this page the text slice given by its first text
    segment is looked up. Tokens made only of digits and dots are kept
    unchanged. For the others, the two OCR words whose ordered box corners
    are nearest to the token's are tried in turn: the first one with
    IoU >= 0.05 replaces the token text (followed by a space); with no such
    word the original token text is kept.

    Sets:
      refactored_text_dict: {(start_index, end_index): text} in token order.
      refactored_plain_text: the dict values concatenated.
    """
    self.refactored_text_dict = {}
    full_text = self.page_ai_result.text
    iou_threshold = 0.05

    words_vertices = [
      self.flatten_list(self.order_points(
        [[el[1][corner].x, el[1][corner].y] for corner in range(4)]
      ))
      for el in self.cloud_ocr_words
    ]
    words_texts = [el[0] for el in self.cloud_ocr_words]
    word_count = len(words_texts)
    # A KD-tree cannot be built from zero points; without OCR words every
    # token simply keeps its original text.
    tree = spatial.KDTree(words_vertices) if word_count else None

    for old_token in self.page_ai_result.pages[self.idx].tokens:
      segment = old_token.layout.text_anchor.text_segments[0]
      old_token_start = segment.start_index
      old_token_end = segment.end_index
      old_token_text = full_text[old_token_start:old_token_end]
      new_text_key = (old_token_start, old_token_end)

      if old_token_text.replace('.','').isdigit():
        print(f"digit value: {old_token_text}")
        self.refactored_text_dict[new_text_key] = old_token_text
        continue

      new_token_text = old_token_text
      if tree is not None:
        old_token_vertices = self.flatten_list(self.order_points(
          [[vertice.x, vertice.y] for vertice in old_token.layout.bounding_poly.vertices]
        ))
        _, neighbour_indexes = tree.query([old_token_vertices], k=2, workers=-1)
        for result_idx in neighbour_indexes[0]:
          # With fewer than k points the tree reports missing neighbours
          # with an index equal to the number of points.
          if result_idx >= word_count:
            continue

          word_box_iou = self.get_iou(words_vertices[result_idx], old_token_vertices)
          matched = word_box_iou >= iou_threshold
          new_token_text = words_texts[result_idx] + " " if matched else old_token_text
          if "0 0" in new_token_text:
            print(f"old_token_text: {old_token_text}, new_token_text: {new_token_text}")
            break

          if matched:
            break

      self.refactored_text_dict[new_text_key] = new_token_text

    self.refactored_plain_text = "".join(self.refactored_text_dict.values())

#test_page = Page("", Document(), 0, out_document.pages[0].page_ai_result, out_document.pages[0].cloud_ocr_words) 