In [25]:
import pdfplumber
import pandas as pd
import re

# File name of the source PDF that this notebook works on.
filename = "maal_vaegt_portionsstoerrelser_marts_13_cropped.pdf"

### Simple approach

Notes for the first page: the original table has 37 rows
* 34 single-line rows
* 3 double-line rows
* i.e., 40 lines in total (34 + 2 × 3)

In [26]:
def pprint(matrix):
    """Print a 2-D sequence with one row per line.

    Every cell is wrapped in double quotes and followed by a tab character,
    so each printed line ends with a trailing tab before the newline.
    """
    for cells in matrix:
        # Each cell carries its own trailing tab; print() adds the line break.
        print("".join(f'"{cell}"\t' for cell in cells))

def select_rects(rects, threshold=3):
    """Return the rects that are thin in at least one dimension.

    Args:
        rects: Iterable of mappings that expose numeric ``'height'`` and
            ``'width'`` entries.
        threshold: A rect is kept when its height OR its width is strictly
            below this value. Defaults to 3, the value that was previously
            hard-coded.

    Returns:
        A new list with the selected rects, in their original order.
    """
    return [
        rect
        for rect in rects
        if rect['height'] < threshold or rect['width'] < threshold
    ]

def draw_rects(page, rects):
    """Debug helper: render ``page``, overlay ``rects`` and show the image.

    The page is rendered via ``page.to_image(resolution=400)``; the rects are
    drawn on that image and ``show()`` is called on the result.
    """
    rendered = page.to_image(resolution=400)
    rendered.draw_rects(rects).show()

def is_double_row(table : pd.DataFrame) -> bool:
    """Tell whether the table's header spans its first two rows.

    The cell at row label 1 / column label 1 is used as the indicator: when it
    is a non-empty string without any digit, row 1 is treated as a second
    header row rather than as data.

    Args:
        table: Raw table whose rows and columns are labelled 0, 1, 2, ...

    Returns:
        True when the indicator cell looks like header text; False otherwise,
        including when the table has no cell at (1, 1) or that cell is not a
        string.
    """
    # A header-only or single-column table has no indicator cell; treat it as
    # a single-row header instead of failing with a KeyError.
    if 1 not in table.index or 1 not in table.columns:
        return False

    double_header_indicator = table.loc[1, 1]
    if not isinstance(double_header_indicator, str):
        return False

    # NOTE(review): a first data row holding a digit-free value such as "-" in
    # column 1 is also classified as a header row - confirm this cannot occur.
    return re.fullmatch(r'\D+', double_header_indicator) is not None

def set_header(table : pd.DataFrame) -> pd.DataFrame:
    """Promote the leading header row(s) of a raw table to column names.

    When ``is_double_row(table)`` is true, rows 0 and 1 are combined into one
    header: empty cells in row 0 inherit the value of the cell to their left
    (empty strings are taken to indicate merged cells), each column name
    becomes ``"<row 0>\\n<row 1>"``, and the last column is named ``"Kilde"``.
    Otherwise row 0 alone supplies the column names.

    Args:
        table: Raw table with string cells and rows labelled 0, 1, 2, ...

    Returns:
        A new DataFrame holding only the data rows (original row labels are
        kept) with the derived column names. The input frame is not modified.
    """
    first_row = table.loc[0].to_list()

    if is_double_row(table):
        second_row = table.loc[1].to_list()
        # The last cell is left out of the fill and the merge: its header
        # ("Kilde") sits in the second row, so it is set explicitly below.
        last = len(first_row) - 1

        # Forward-fill empty cells in row 0 from their left neighbour.
        for j in range(1, last):
            if first_row[j] == "":
                first_row[j] = first_row[j - 1]

        # Merge the second row into the first; skipping empty parts avoids a
        # dangling newline when only one of the two header cells has text.
        for i in range(last):
            parts = (first_row[i], second_row[i])
            first_row[i] = "\n".join(part for part in parts if part != "")

        first_row[last] = "Kilde"
        body = table.loc[2:]
    else:
        # Single header row only.
        body = table.loc[1:]

    # Copy so that renaming the columns works on an independent frame.
    body = body.copy()
    body.columns = first_row

    return body

def handle_source(table):
    """Placeholder: not implemented yet, ignores ``table`` and returns None."""
    return None

def transform_table(table : pd.DataFrame) -> pd.DataFrame:
    """Reshape a wide table into long format.

    The first column is renamed to ``"Madvare"`` and used as the identifier;
    every other column is unpivoted so that its name ends up in ``"Enhed"``
    and its cell value in ``"Konverteringsfaktor"``.

    Args:
        table: Wide DataFrame with at least one column. It is not modified.

    Returns:
        A new DataFrame with the columns ``Madvare``, ``Enhed`` and
        ``Konverteringsfaktor``.
    """
    #TODO: handle_source(table)

    headers = table.columns.to_list()
    # Use a fixed name for the first column so all transformed tables agree.
    headers[0] = "Madvare"

    # Rename on a copy so the caller's DataFrame keeps its own column names.
    renamed = table.copy()
    renamed.columns = headers

    return renamed.melt(
        id_vars=["Madvare"],
        value_vars=headers[1:],
        var_name="Enhed",
        value_name="Konverteringsfaktor",
    )
    
def extract_pdf_tables(filename) -> list[pd.DataFrame]:
    """Extract every table of the PDF as a DataFrame with its header applied.

    For each page, the rects returned by ``select_rects(page.rects)`` are
    passed to pdfplumber as the explicit vertical and horizontal lines. Each
    non-empty extracted table is turned into a DataFrame, ``None`` cells are
    replaced by empty strings, and ``set_header`` assigns the column names.

    Args:
        filename: Path of the PDF to open.

    Returns:
        The resulting DataFrames in page order, then table order.
    """
    extracted = []

    with pdfplumber.open(filename) as pdf:
        for page in pdf.pages:
            line_rects = select_rects(page.rects)
            # draw_rects(page, page.rects)

            table_settings = {
                "vertical_strategy": "explicit",
                "horizontal_strategy": "explicit",
                "explicit_vertical_lines": line_rects,
                "explicit_horizontal_lines": line_rects,
            }

            for raw_table in page.extract_tables(table_settings) or []:
                if not raw_table:
                    continue
                # Replace all None cells with empty strings before the header step.
                frame = pd.DataFrame(raw_table).fillna("")
                extracted.append(set_header(frame))

    return extracted
    

def create_conversion_factor_table(filename : str) -> pd.DataFrame:
    """Build one long-format table from all tables found in the PDF.

    Every table returned by ``extract_pdf_tables`` is reshaped with
    ``transform_table`` and the results are stacked with a fresh 0..n-1 index.

    Args:
        filename: Path of the PDF to read.

    Returns:
        DataFrame with the columns ``Madvare``, ``Enhed`` and
        ``Konverteringsfaktor``; empty (columns only) when the PDF yields no
        tables.
    """
    columns = ["Madvare","Enhed","Konverteringsfaktor"]

    transformed_tables = [
        transform_table(table) for table in extract_pdf_tables(filename)
    ]

    # pd.concat raises on an empty list, so return the empty schema directly.
    if not transformed_tables:
        return pd.DataFrame(columns=columns)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(transformed_tables, ignore_index=True)


# Reuse the `filename` defined in the first cell instead of repeating the path literal.
result = create_conversion_factor_table(filename)

In [27]:
# Work on a copy: cleaning is applied to `result_copy`, while `result` keeps the raw extraction.
result_copy = result.copy()
#print(result_copy.to_markdown())

### Data cleaning

In [28]:
def clean_data(df : pd.DataFrame) -> pd.DataFrame:
    """Drop rows that carry no usable conversion factor.

    A row is removed when its ``Enhed`` value contains one of the exclusion
    terms (case-sensitive substring match) or when its
    ``Konverteringsfaktor`` is ``""`` or ``"-"``.

    Args:
        df: Long-format table with ``Enhed`` and ``Konverteringsfaktor``
            columns. It is not modified.

    Returns:
        The filtered rows as a new DataFrame with a fresh 0..n-1 index.
    """
    # Terms that exclude a row when they occur in its unit.
    # "ortion" presumably covers both "Portion" and "portion" - TODO confirm.
    excl_list = ["Kilde","svind","ortion", "Indhold, ml"]

    # Escape the terms so they are always matched literally, and treat a
    # missing unit as "no match" so the negation below cannot fail on NaN.
    excl_pattern = '|'.join(re.escape(term) for term in excl_list)
    is_excluded = df["Enhed"].str.contains(excl_pattern, regex=True, na=False)
    df = df[~is_excluded]

    # Remove rows where the conversion factor is "" or "-".
    df = df[~df["Konverteringsfaktor"].isin(["","-"])]

    # Reset indices
    return df.reset_index(drop=True)

### Main

In [None]:
# Apply the cleaning rules to the working copy; `result` still holds the uncleaned rows.
result_copy = clean_data(result_copy)
#print(result_copy.head(100).to_markdown())
# def main():
#     result = create_conversion_factor_table("maal_vaegt_portionsstoerrelser_marts_13_cropped.pdf")
#     result = clean_data(result)
#     print(result.to_markdown())

# main()

|    | Madvare                                          | Enhed                       | Konverteringsfaktor   |
|---:|:-------------------------------------------------|:----------------------------|:----------------------|
|  0 | Bolle, almindelig/fuldkorn - bager/industribolle | Vægt pr stk. eller skive, g | 40                    |
|    |                                                  | Lille                       |                       |
|  1 | Bolle, almindelig/fuldkorn - hjemmebagt          | Vægt pr stk. eller skive, g | 50                    |
|    |                                                  | Lille                       |                       |
|  2 | Croissant                                        | Vægt pr stk. eller skive, g | 25                    |
|    |                                                  | Lille                       |                       |
|  3 | Crouton, naturel                                 | Vægt pr stk. eller skive, g | 0,4             

In [30]:
# Show the rows of the uncleaned `result` whose conversion factor is "-" (clean_data drops these).
result.loc[result['Konverteringsfaktor'] == "-"]

Unnamed: 0,Madvare,Enhed,Konverteringsfaktor
639,Filet Royal,"Ikke\nspiselig\ndel,\nben osv,\n%",-
642,Kæbeklump (svinekæber eller\ngrisekind),"Ikke\nspiselig\ndel,\nben osv,\n%",-
643,"Medaljon, 2½ cm tyk","Ikke\nspiselig\ndel,\nben osv,\n%",-
644,Nakkefilet,"Ikke\nspiselig\ndel,\nben osv,\n%",-
645,Nakkekam (er med svær),"Ikke\nspiselig\ndel,\nben osv,\n%",-
...,...,...,...
3402,Saltstænger,g/dl,-
3790,"Gajol, Läkkerol",g/pose eller g/pakke\nLille,-
3818,"Gajol, Läkkerol",g/pose eller g/pakke\nStor,-
3903,Cappuccino,"Portion\nLille, g",-


In [31]:
#Drawing 
# cells = []

# with pdfplumber.open(filename) as pdf:
#     print(pdf.pages)
#     page = pdf.pages[0]
#     element_type = page.edges
#     #TODO: 
#     im = page.to_image(resolution = 400)
#     for i in range(0,len(element_type)):
#     # for i in range(0,20):
#     #Selected rects
#         rect = element_type[i]
#         # if rect['height'] < 1 or rect['width'] < 1: continue #1 Ignore small rectangles
#         cells.append(rect)
#     im = im.draw_rects(cells)
#     im.show()