In [65]:
import pdfplumber
import pandas as pd

filename = "mvfodevarer testpage.pdf"

### Clean up rectangles
1. Ignore rects with height or width < 1
2. Ignore nested rects

In [66]:
selected_rects = []

def extract_coordinates(rect):
    return (rect['x0'], rect['x1'],rect['y0'],rect['y1'])

def convert_float_to_int(t):
    return tuple(int(x) for x in t)    

def is_nested(rect, other_rect):
    # print(f"other_rect['x0']: {other_rect['x0']}")
    # print(f"rect['x0']: {rect['x0']}")
    # print(f"other_rect['x1']: {other_rect['x1']}")
    # print(f"rect['x1']: {rect['x1']}")
    # print(f"other_rect['y0']: {other_rect['y0']}")
    # print(f"rect['y0']: {rect['y0']}")
    # print(f"other_rect['y1']: {other_rect['y1']}")
    # print(f"rect['y1']: {rect['y1']}")
    
    rect_coords = extract_coordinates(rect)
    other_rect_coords = extract_coordinates(other_rect)
    rect_coords = convert_float_to_int(rect_coords) #Convert floating point to integers to avoid issues caused by rounding errors
    other_rect_coords = convert_float_to_int(other_rect_coords) #Convert floating point to integers to avoid issues caused by rounding errors
    (x0,x1,y0,y1) = rect_coords
    (x0_o,x1_o,y0_o,y1_o) = other_rect_coords

    return x0_o <= x0 and x1_o >= x1 and y0_o <= y0 and y1_o >= y1

with pdfplumber.open(filename) as pdf:
    print(pdf.pages)
    page = pdf.pages[0]

    initial_j = 0

    for i in range(0,len(page.rects)):
        rect = page.rects[i]
        if rect['height'] < 1 or rect['width'] < 1: continue #1 Ignore small rectangles

        #Look for rectangles that might encapsulate current rectangle on page
        for j in range(initial_j,len(page.rects)):
            #print(f"(i,j): ({i},{j})")
            other_rect = page.rects[j]
            if j == i: #Assumption: rectangles with index > i will never encapsulate rectangle i.
                #print(f"added rect {i} to selected rects")
                selected_rects.append(rect)
                break
            elif is_nested(rect, other_rect):
                initial_j = j #Displace the starting point to search for rectangles that nest rectangle i.
                break

[<Page:1>]


### Drawing

In [67]:
im = page.to_image(resolution = 400)
im = im.draw_rects(selected_rects)
im.show()

### Extract data based on the selected rectangles (single page only)

In [68]:
def replace_newlines(cell):
    if isinstance(cell, str):
        cell = cell.replace("\n", " ")
        cell = cell.replace("-\n", " ")
    return cell

with pdfplumber.open(filename) as pdf:
    print(pdf.pages)
    page = pdf.pages[0]

    page_tables = page.extract_tables({
        "vertical_strategy": "explicit",
        "horizontal_strategy": "explicit",
        "explicit_vertical_lines": selected_rects,
        "explicit_horizontal_lines": selected_rects,
    })

page_tables = page_tables[0] #Removing outermost nesting level
page_tables = page_tables[1:] #Removing first item since it is irrelevant
page_tables = [[replace_newlines(cell) for cell in row] for row in page_tables] #Replace newline characters with whitespace
headers = page_tables[0] #Extracting headers
page_tables = page_tables[1:] #Removing headers from data
df = pd.DataFrame(page_tables, columns=headers)
# df = df.applymap(replace_newlines) 
print(df.to_markdown())


[<Page:1>]
|    | Madvare                      | Lille   |   Mellem | Stor   |
|---:|:-----------------------------|:--------|---------:|:-------|
|  0 | Tacoskal                     |         |       10 |        |
|  1 | Tartelet                     |         |       10 | 20     |
|  2 | Toastbrød, almindelig, skive |         |       30 |        |
|  3 | Toastbrød, fuldkorns,skive   |         |       35 |        |
|  4 | Tortilla                     | 25      |       40 | 60     |
|  5 | Tærtedej, rå (ubagt)         |         |      250 |        |


### Scale approach to all pages of PDF

In [72]:
def extract_coordinates(rect):
    return (rect['x0'], rect['x1'],rect['y0'],rect['y1'])

def convert_float_to_int(t):
    return tuple(int(x) for x in t)    

def is_nested(rect, other_rect):    
    rect_coords = extract_coordinates(rect)
    other_rect_coords = extract_coordinates(other_rect)
    rect_coords = convert_float_to_int(rect_coords) #Convert floating point to integers to avoid issues caused by rounding errors
    other_rect_coords = convert_float_to_int(other_rect_coords) #Convert floating point to integers to avoid issues caused by rounding errors
    (x0,x1,y0,y1) = rect_coords
    (x0_o,x1_o,y0_o,y1_o) = other_rect_coords

    return x0_o <= x0 and x1_o >= x1 and y0_o <= y0 and y1_o >= y1

def find_cells_on_page(page):
    selected_rects = []

    initial_j = 0

    for i in range(0,len(page.rects)):
        rect = page.rects[i]
        if rect['height'] < 1 or rect['width'] < 1: continue #1 Ignore small rectangles

        #Look for rectangles that might encapsulate current rectangle on page
        for j in range(initial_j,len(page.rects)):
            #print(f"(i,j): ({i},{j})")
            other_rect = page.rects[j]
            if j == i: #Assumption: rectangles with index > i will never encapsulate rectangle i.
                #print(f"added rect {i} to selected rects")
                selected_rects.append(rect)
                break
            elif is_nested(rect, other_rect):
                initial_j = j #Displace the starting point to search for rectangles that nest rectangle i.
                break
    
    return selected_rects

def curate_table(table):
    #page_tables = page_tables[0] #Removing outermost nesting level
    table = table[1:] #Removing first item since it is irrelevant
    table = [[replace_newlines(cell) for cell in row] for row in page_tables] #Replace newline characters with whitespace
    headers = table[0] #Extracting headers
    table = table[1:] #Removing headers from data
    return pd.DataFrame(table, columns=headers)

def extract_pdf_tables(filename : str) -> dict:
    #tables = []

    with pdfplumber.open(filename) as pdf:
        # Iterate through each page
        for page in pdf.pages:

            #Find cells on page
            cells_found = find_cells_on_page(page) 
            
            # Extract tables from the page
            page_tables = page.extract_tables({
                "vertical_strategy": "explicit",
                "horizontal_strategy": "explicit",
                "explicit_vertical_lines": cells_found,
                "explicit_horizontal_lines": cells_found,
            })
    
            if page_tables:
                for table in page_tables:
                    if table:
                        curated_table = curate_table(table)
                        print(curated_table.to_markdown())

extract_pdf_tables(filename)

|    | Tacoskal                     |    |   10 |    |
|---:|:-----------------------------|:---|-----:|:---|
|  0 | Tartelet                     |    |   10 | 20 |
|  1 | Toastbrød, almindelig, skive |    |   30 |    |
|  2 | Toastbrød, fuldkorns,skive   |    |   35 |    |
|  3 | Tortilla                     | 25 |   40 | 60 |
|  4 | Tærtedej, rå (ubagt)         |    |  250 |    |
|    | Tacoskal                     |    |   10 |    |
|---:|:-----------------------------|:---|-----:|:---|
|  0 | Tartelet                     |    |   10 | 20 |
|  1 | Toastbrød, almindelig, skive |    |   30 |    |
|  2 | Toastbrød, fuldkorns,skive   |    |   35 |    |
|  3 | Tortilla                     | 25 |   40 | 60 |
|  4 | Tærtedej, rå (ubagt)         |    |  250 |    |
|    | Tacoskal                     |    |   10 |    |
|---:|:-----------------------------|:---|-----:|:---|
|  0 | Tartelet                     |    |   10 | 20 |
|  1 | Toastbrød, almindelig, skive |    |   30 |    |
|  2 | Toa

In [None]:
#Drawing rects
cells_found = []

with pdfplumber.open("../mvfodevarer_side12.pdf") as pdf:
    print(pdf.pages)
    page = pdf.pages[0]
    #TODO: 
    # 1. Ignore rects with height = 0? (or under certain threshold)
    # 2. Ignore nested rects
    im = page.to_image(resolution = 400)

    # for i in range(0,len(page.rects)):
    for i in range(0,20):
    #Selected rects
        rect = page.rects[i]
        # if rect['height'] < 1 or rect['width'] < 1: continue #1 Ignore small rectangles
        cells_found.append(rect)
        # print(f"Rect index: {i}")
        # print(f"Rect height: {rect['height']}")
        # print(f"Rect width: {rect['width']}")
    im = im.draw_rects(cells_found)
    im.show()

[<Page:1>]


In [25]:
int((24.9,23.6,29.3,25.5))

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'tuple'

2 Remove nested rects