In [138]:
import pdfplumber
from pandas import DataFrame
import re

# PDF read by the pdfplumber.open(filename) calls in the cells below.
filename = "maal_vaegt_portionsstoerrelser_marts_13_cropped.pdf"

### Simple approach

Notes for the first page: the original table has 37 rows
* 34 single-line rows
* 3 double-line rows
* i.e., 40 text lines in total (34 + 2 × 3)

In [193]:
def pprint(matrix):
    """Print a 2-D sequence to stdout, one row per output line.

    Every cell is wrapped in double quotes and followed by a tab character,
    which keeps empty strings and surrounding whitespace visible.

    Args:
        matrix: iterable of rows, each row an iterable of printable cells.
    """
    for cells in matrix:
        # Each cell carries its own trailing tab; print() adds the newline.
        print("".join(f'"{cell}"\t' for cell in cells))

def select_rects(rects, threshold=3):
    """Return the rects that are thin in at least one dimension.

    A rect is kept when its ``height`` or its ``width`` is strictly smaller
    than ``threshold``; rects that are at least ``threshold`` in both
    dimensions are dropped.

    Args:
        rects: iterable of mappings exposing ``'height'`` and ``'width'``
            keys (the notebook passes ``page.rects``).
        threshold: exclusive upper bound, in the same units as the rect
            dimensions. Defaults to 3, the value previously hard-coded.

    Returns:
        A new list with the selected rects in their original order.
    """
    return [
        rect for rect in rects
        if rect['height'] < threshold or rect['width'] < threshold
    ]

def draw_rects(page, rects):
    """Render ``page`` as an image, outline ``rects`` on it and display it.

    The page is rendered with ``resolution=400``; the image returned by
    ``draw_rects`` is the one that gets shown. Nothing is returned.

    Args:
        page: object providing ``to_image(resolution=...)``.
        rects: rectangles forwarded unchanged to the image's ``draw_rects``.
    """
    page.to_image(resolution=400).draw_rects(rects).show()

def is_double_row(table : DataFrame) -> bool:
    """Tell whether the table's header spans two rows.

    The cell at row label 1, column label 1 is used as the indicator: when it
    is a non-empty string containing no digit, row 1 is taken to be a second
    header line; otherwise row 1 is treated as data.

    Args:
        table: raw extracted table whose rows and columns carry the default
            integer labels.

    Returns:
        True for a two-row header, False otherwise. Tables without a row or
        column labelled 1, and indicator cells that are not strings, yield
        False instead of raising.
    """
    # A header-only or single-column table has no cell (1, 1) to inspect.
    if 1 not in table.index or 1 not in table.columns:
        return False

    double_header_indicator = table.loc[1, 1]

    # None/NaN or numeric cells cannot be a textual sub-header.
    if not isinstance(double_header_indicator, str):
        return False

    return re.fullmatch(r'^[^\d]+$', double_header_indicator) is not None

def set_header(table : DataFrame):
    """Promote the leading header row(s) of ``table`` to column names.

    Single-row header: row 0 becomes the column names and the rows from
    label 1 onwards are returned.

    Double-row header (as decided by ``is_double_row``): empty cells in row 0
    are filled from the cell to their left (merged cells), the last column is
    named "Kilde", every other column name becomes
    ``"<row 0>\\n<row 1>"``, and the rows from label 2 onwards are returned.

    The returned frame keeps the original row labels.

    Args:
        table: raw extracted table with empty cells stored as "".

    Returns:
        The data rows with the header applied as column names.
    """
    header = table.loc[0,].to_list()

    if not is_double_row(table):
        # Single header row: everything after row 0 is data.
        body = table.loc[1:,]
    else:
        sub_header = table.loc[1,].to_list()
        last = len(header) - 1

        # Forward-fill merged cells: an empty cell inherits its left
        # neighbour. The last column is skipped because it is renamed below.
        for pos in range(1, last):
            if header[pos] == "":
                header[pos] = header[pos - 1]

        # Stack both header lines for every column except the last one.
        stacked = [top + "\n" + bottom
                   for top, bottom in zip(header[:last], sub_header)]
        header = stacked + ["Kilde"]

        body = table.loc[2:,]

    body.columns = header

    return body

# Offset added to the zero-based page index to compensate for the page
# displacement (the first page of the file is reported as page 12).
PAGE_OFFSET = 12

# One entry per extracted table: {'page': <page number>, 'data': <DataFrame>}.
tables = []

with pdfplumber.open(filename) as pdf:
    # The page index gets its own name so the inner loop cannot shadow it,
    # which removes the need to search pdf.pages for every table.
    for page_index, page in enumerate(pdf.pages):
        selected_rects = select_rects(page.rects)
        # Uncomment to inspect the rects of this page visually:
        # draw_rects(page, page.rects)

        # The same thin rects are supplied as both the vertical and the
        # horizontal explicit lines.
        page_tables = page.extract_tables({
            "vertical_strategy": "explicit",
            "horizontal_strategy": "explicit",
            "explicit_vertical_lines": selected_rects,
            "explicit_horizontal_lines": selected_rects,
        })

        for table in page_tables or []:
            if not table:
                continue

            # Empty cells come back as None; normalise them to "".
            df_table = DataFrame(table).fillna("")
            df_table = set_header(df_table)

            tables.append({
                'page': page_index + PAGE_OFFSET,
                'data': df_table
            })

### Print dataframes

In [194]:
# Dump every extracted table as markdown, preceded by its page number.
for table in tables:
    df = table['data']
    print(f"Page: {table['page']}")
    print(df.to_markdown())

Page: 12
|    | Brød                                             | Vægt pr stk. eller skive, g   | Vægt pr stk. eller skive, g   | Vægt pr stk. eller skive, g   | Kilde   |
|    |                                                  | Lille                         | Mellem                        | Stor                          |         |
|---:|:-------------------------------------------------|:------------------------------|:------------------------------|:------------------------------|:--------|
|  2 | Bagel                                            |                               | 90                            |                               | d       |
|  3 | Bolle, almindelig/fuldkorn - bager/industribolle | 40                            | 60                            | 80                            | d       |
|  4 | Bolle, almindelig/fuldkorn - hjemmebagt          | 50                            | 70                            | 120                           | r       |
|  5 | 

In [12]:
# Debug view: outline every edge found on the first page of the PDF.
cells = []

with pdfplumber.open(filename) as pdf:
    print(pdf.pages)
    page = pdf.pages[0]
    element_type = page.edges
    # No size filter is applied here: all edges are collected and drawn.
    cells.extend(element_type)
    im = page.to_image(resolution=400).draw_rects(cells)
    im.show()

[<Page:1>]
