In [1]:
import fitz  # PyMuPDF
import math

In [10]:
def calculate_centroid(bbox):
    """Return the centre point of a bounding box.

    Args:
        bbox: Sequence of four coordinates, unpacked as ``(x0, y0, x1, y1)``.

    Returns:
        Tuple ``(x, y)`` holding the midpoint between ``x0``/``x1`` and
        between ``y0``/``y1``.
    """
    x_start, y_start, x_end, y_end = bbox
    center_x = (x_start + x_end) / 2
    center_y = (y_start + y_end) / 2
    return (center_x, center_y)

def euclidean_distance(point1, point2):
    """Return the straight-line distance between two 2-D points.

    Args:
        point1: Indexable whose items 0 and 1 are the x and y coordinates.
        point2: Indexable whose items 0 and 1 are the x and y coordinates.

    Returns:
        The Euclidean distance between the two points as a float.
    """
    delta_x = point1[0] - point2[0]
    delta_y = point1[1] - point2[1]
    squared_length = delta_x ** 2 + delta_y ** 2
    return math.sqrt(squared_length)

def find_nearest_neighbors(words, k=3):
    """Map every word to the identifiers of its ``k`` closest other words.

    Closeness is the Euclidean distance between the centroids of the words'
    bounding boxes.

    Args:
        words: Mapping of word identifier -> bounding box ``(x0, y0, x1, y1)``.
        k: Maximum number of neighbours to keep per word. Defaults to 3.

    Returns:
        Dict mapping each identifier in ``words`` to a list of at most ``k``
        other identifiers, ordered nearest first. Words at equal distance
        keep the iteration order of ``words`` (the sort is stable). A word
        gets fewer than ``k`` neighbours when ``words`` has fewer than
        ``k + 1`` entries.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        # A negative slice bound would silently drop the *farthest* entries
        # instead of limiting the result, so reject it explicitly.
        raise ValueError(f"k must be non-negative, got {k}")

    # Compute each centroid exactly once; the pairwise loop below would
    # otherwise recompute both centroids for every one of the n*(n-1) pairs.
    centroids = [
        (identifier, calculate_centroid(bbox))
        for identifier, bbox in words.items()
    ]

    neighbors = {}
    for i, (word_i, centroid_i) in enumerate(centroids):
        distances = [
            (euclidean_distance(centroid_i, centroid_j), word_j)
            for j, (word_j, centroid_j) in enumerate(centroids)
            if i != j  # a word is never its own neighbour
        ]
        # Sort on distance only so ties are not broken by comparing identifiers.
        distances.sort(key=lambda pair: pair[0])
        neighbors[word_i] = [word_j for _, word_j in distances[:k]]
    return neighbors

def extract_words_with_bbox(pdf_path):
    """Collect the bounding box of every word in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        Dict mapping a generated identifier ``"{word}_{page_num}_{bbox}"`` to
        the word's bounding box (the first four items of each entry returned
        by ``page.get_text("words")``). ``page_num`` is the zero-based index
        of the page within the document.
    """
    words_with_bbox = {}
    # The context manager closes the document once extraction is done, even
    # if reading a page raises; previously the file handle was never released.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            # Each entry carries the box coordinates in items 0-3 and the
            # word text in item 4; any further items are ignored here.
            for word_info in page.get_text("words"):
                bbox, word = word_info[0:4], word_info[4]
                # Text + page + box forms the key. Two entries with the same
                # text at the same box on the same page would share a key and
                # the later one overwrites the earlier.
                identifier = f"{word}_{page_num}_{bbox}"
                words_with_bbox[identifier] = bbox
    return words_with_bbox

In [11]:
# Location of the PDF to analyse, relative to the notebook's working directory.
pdf_path = './invoice_3_charspace_4.pdf'
# Build the identifier -> bounding-box mapping for every word in the document.
words_with_bbox = extract_words_with_bbox(pdf_path)
# For each word, look up its three closest other words by centroid distance.
word_neighbors = find_nearest_neighbors(words_with_bbox)

In [13]:
# Display the whole neighbour mapping (one entry per extracted word).
# NOTE(review): this prints every entry; consider showing only a small slice.
word_neighbors

{'Invoice_0_(48.18898010253906, 25.711669921875, 96.63565063476562, 39.680419921875)': ['of_0_(77.97698211669922, 50.39581298828125, 88.58154296875, 63.20050048828125)',
  'Date_0_(48.18898010253906, 50.39581298828125, 74.48155975341797, 63.20050048828125)',
  'no:_0_(100.8089828491211, 25.711669921875, 122.39581298828125, 39.680419921875)'],
 'no:_0_(100.8089828491211, 25.711669921875, 122.39581298828125, 39.680419921875)': ['issue:_0_(92.07898712158203, 50.39581298828125, 124.0440444946289, 63.20050048828125)',
  'of_0_(77.97698211669922, 50.39581298828125, 88.58154296875, 63.20050048828125)',
  'Invoice_0_(48.18898010253906, 25.711669921875, 96.63565063476562, 39.680419921875)'],
 '61890427_0_(126.57299041748047, 25.711669921875, 193.38662719726562, 39.680419921875)': ['no:_0_(100.8089828491211, 25.711669921875, 122.39581298828125, 39.680419921875)',
  'issue:_0_(92.07898712158203, 50.39581298828125, 124.0440444946289, 63.20050048828125)',
  'of_0_(77.97698211669922, 50.395812988281

In [5]:
# Open the PDF again for ad-hoc inspection in the following cell.
# NOTE(review): no matching doc.close() is visible in this notebook, and this
# cell's execution count (5) is lower than the cells above it even though it
# needs pdf_path to be defined first — confirm it survives Restart & Run All.
doc = fitz.open(pdf_path)

In [14]:
# Show the text of the first page as a single string.
doc[0].get_text()

'Invoice no: 61890427\nDate of issue:\n04/18/2021\nSeller:\nGrant, Green and Mathews\n030 Miller Lock\nEast Robert, NY 99396\nTax Id: 974-85-6740\nIBAN: GB94LWTW89276702451796\nClient:\nWilliams-Russell\n11834 Elizabeth Orchard\nLoveburgh, OH 67733\nTax Id: 999-98-6887\nITEMS\nNo.\nDescription\nQty\nUM\nNet price\nNet worth\nVAT [%]\nGross\nworth\nHOME ESSENTIALS GRADIENT\nSTEMLESS WINE GLASSES SET\nOF 4 20 FL OZ (591 ml) NEW\n1.\n4,00\neach\n12,99\n51,96\n10%\n57,16\nFabulous at 50 wine glass, Out\nof the Box, 33\n2.\n2,00\neach\n20,00\n40,00\n10%\n44,00\nChampagne Glass Holder\nStorage Bar Accessory with\nShelf Wall Mount Wine Holder\n3.\n1,00\neach\n19,99\n19,99\n10%\n21,99\nSUMMARY\nVAT [%]\nNet worth\nVAT\nGross worth\n10%\n111,95\n11,20\n123,15\nTotal\n$ 111,95\n$ 11,20\n$ 123,15\n'