In [138]:
import re

import pdfplumber
from pandas import DataFrame, concat

# Relative path of the PDF that the exploratory cells below open with pdfplumber.
filename = "maal_vaegt_portionsstoerrelser_marts_13_cropped.pdf"

### Simple approach

Notes for the first page: the original table has 37 rows
* 34 single-line rows
* 3 double-line rows
* i.e., 40 lines in total (34 + 2 × 3)

In [210]:
def pprint(matrix):
    """Print a 2-D iterable one row per line for quick visual inspection.

    Every element is wrapped in double quotes and followed by a tab, so empty
    strings and stray whitespace are easy to spot in the output.
    """
    for cells in matrix:
        # Each cell carries its own trailing tab, including the last one in the row
        line = "".join(f'"{value}"\t' for value in cells)
        print(line)

def select_rects(rects, threshold=3):
    """Return the rectangles that are thin in at least one dimension.

    A rectangle is kept when its ``height`` or its ``width`` is strictly
    below ``threshold``; the result is later passed to pdfplumber as explicit
    table lines (presumably these thin rects are the drawn table rules --
    confirm against the PDF).

    Parameters
    ----------
    rects : iterable of mappings with ``'height'`` and ``'width'`` keys
    threshold : number, default 3
        Exclusive upper bound on height/width for a rectangle to be selected.

    Returns
    -------
    list
        The selected rectangles, in their original order.
    """
    return [
        rect
        for rect in rects
        if rect['height'] < threshold or rect['width'] < threshold
    ]

def draw_rects(page, rects):
    """Render ``page`` at resolution 400, outline ``rects`` on it and show the image."""
    rendered = page.to_image(resolution = 400)
    rendered.draw_rects(rects).show()

def is_double_row(table : DataFrame) -> bool:
    """Tell whether the table's header occupies two rows instead of one.

    The indicator is the cell in the second row, second column: when it is a
    non-empty string without any digit it is taken to be a sub-header rather
    than a data value.

    Tables with fewer than two rows or two columns, or whose indicator cell
    is not a string, are reported as single-row headers instead of raising.
    """
    n_rows, n_cols = table.shape
    if n_rows < 2 or n_cols < 2:
        return False

    # Positional lookup: second row, second column
    double_header_indicator = table.iloc[1, 1]
    if not isinstance(double_header_indicator, str):
        return False

    return bool(re.fullmatch(r'^[^\d]+$', double_header_indicator))

def set_header(table : DataFrame) -> DataFrame:
    """Promote the leading header row(s) of a raw table to column names.

    For a single-row header the first row becomes the column names and is
    dropped from the data. For a double-row header (see ``is_double_row``):

    * empty cells in the first row (merged cells) inherit the value of the
      cell to their left,
    * the last column is named ``"Kilde"``,
    * every other column name is the first-row text joined with the
      second-row text by a newline; an empty part is left out, so no
      leading or trailing newline is produced,
    * the first two rows are dropped from the data.

    Returns a new DataFrame; the row index of the remaining rows is kept.
    """
    first_row = table.iloc[0].to_list()

    if is_double_row(table):
        second_row = table.iloc[1].to_list()
        last = len(first_row) - 1

        # Forward-fill merged cells in one pass. The last cell is excluded:
        # it is always empty in row 0 because "Kilde" sits in the second row.
        for j in range(1, last):
            if first_row[j] == "":
                first_row[j] = first_row[j - 1]
        first_row[last] = "Kilde"

        # Merge second row into first row, skipping empty parts
        for i in range(last):
            first_row[i] = "\n".join(
                part for part in (first_row[i], second_row[i]) if part
            )

        body = table.iloc[2:]
    else: # Single row only
        body = table.iloc[1:]

    # Work on a copy so renaming the columns never touches the caller's frame
    body = body.copy()
    body.columns = first_row

    return body

def handle_source(table):
    """Placeholder for source-column handling; currently does nothing and returns None."""
    return None

def transform_table(table : DataFrame) -> DataFrame:
    """Reshape a wide table into long (Madvare, Enhed, Konverteringsfaktor) form.

    The first column is renamed to ``"Madvare"`` and used as the identifier;
    every other column is melted so that its header ends up in ``"Enhed"``
    and its cell value in ``"Konverteringsfaktor"``.

    The input frame is not modified; a new DataFrame is returned.
    """
    #TODO: handle_source(table)

    headers = table.columns.to_list()
    headers[0] = "Madvare" # Consistent name for the first column across all transformed tables

    # Rename on a copy so the caller's column labels stay untouched
    renamed = table.copy()
    renamed.columns = headers

    return renamed.melt(
        id_vars=["Madvare"],
        value_vars=headers[1:],
        var_name="Enhed",
        value_name="Konverteringsfaktor",
    )
    
def extract_pdf_tables(filename) -> list[DataFrame]:
    """Extract every table of the PDF at ``filename`` as a DataFrame with headers set.

    For each page the rectangles chosen by ``select_rects`` are handed to
    pdfplumber as the explicit vertical and horizontal table lines. Each
    non-empty extracted table is turned into a DataFrame, its missing cells
    are replaced by empty strings, and ``set_header`` promotes the header
    row(s) to column names.
    """
    extracted = []

    with pdfplumber.open(filename) as document:
        for pdf_page in document.pages:
            line_rects = select_rects(pdf_page.rects)
            table_settings = {
                "vertical_strategy": "explicit",
                "horizontal_strategy": "explicit",
                "explicit_vertical_lines": line_rects,
                "explicit_horizontal_lines": line_rects,
            }

            for raw_table in pdf_page.extract_tables(table_settings) or []:
                if not raw_table:
                    continue
                # None cells become "" so later string handling is uniform
                frame = DataFrame(raw_table).fillna("")
                extracted.append(set_header(frame))

    return extracted
    

def create_conversion_factor_table(filename : str) -> DataFrame:
    """Build the long-format conversion-factor table for a PDF.

    Every table extracted by ``extract_pdf_tables`` is reshaped with
    ``transform_table`` and printed as markdown for inspection. The reshaped
    tables are then stacked into one DataFrame with a fresh 0..n-1 index.

    Returns
    -------
    DataFrame
        Columns ``Madvare``, ``Enhed`` and ``Konverteringsfaktor``; an empty
        frame with those columns when the PDF yields no tables.
    """
    transformed_tables = []

    for table in extract_pdf_tables(filename):
        transformed = transform_table(table)
        print(transformed.to_markdown())
        transformed_tables.append(transformed)

    if not transformed_tables:
        # concat() raises on an empty list, so return an empty frame explicitly
        return DataFrame(columns=["Madvare", "Enhed", "Konverteringsfaktor"])

    return concat(transformed_tables, ignore_index=True)




In [199]:
# Debug drawing: outline every edge pdfplumber reports on the first page
cells = []

with pdfplumber.open(filename) as pdf:
    print(pdf.pages)
    page = pdf.pages[0]
    element_type = page.edges
    # Every edge is collected unfiltered; a minimum height/width filter
    # could be applied inside this loop to ignore small elements.
    for i, rect in enumerate(element_type):
        cells.append(rect)
    im = page.to_image(resolution = 400).draw_rects(cells)
    im.show()

[<Page:1>, <Page:2>, <Page:3>, <Page:4>, <Page:5>, <Page:6>, <Page:7>, <Page:8>, <Page:9>, <Page:10>, <Page:11>, <Page:12>, <Page:13>, <Page:14>, <Page:15>, <Page:16>, <Page:17>, <Page:18>, <Page:19>, <Page:20>, <Page:21>, <Page:22>, <Page:23>, <Page:24>, <Page:25>, <Page:26>, <Page:27>, <Page:28>, <Page:29>, <Page:30>, <Page:31>, <Page:32>, <Page:33>, <Page:34>, <Page:35>, <Page:36>, <Page:37>, <Page:38>, <Page:39>, <Page:40>, <Page:41>, <Page:42>, <Page:43>, <Page:44>]


### Main

In [211]:
def main():
    """Run the conversion-factor pipeline on the hard-coded PDF file."""
    pdf_path = "maal_vaegt_portionsstoerrelser_marts_13_cropped.pdf"
    create_conversion_factor_table(pdf_path)

main()

|     | Madvare                                          | Enhed                       | Konverteringsfaktor   |
|----:|:-------------------------------------------------|:----------------------------|:----------------------|
|   0 | Bagel                                            | Vægt pr stk. eller skive, g |                       |
|     |                                                  | Lille                       |                       |
|   1 | Bolle, almindelig/fuldkorn - bager/industribolle | Vægt pr stk. eller skive, g | 40                    |
|     |                                                  | Lille                       |                       |
|   2 | Bolle, almindelig/fuldkorn - hjemmebagt          | Vægt pr stk. eller skive, g | 50                    |
|     |                                                  | Lille                       |                       |
|   3 | Bondebrød, skive                                 | Vægt pr stk. eller skive, g |        