In [5]:
import json
import time
import pdfplumber
from pdfplumber.pdf import Page as PDFPlumberPage
from pdfplumber.utils.text import WordExtractor, objects_to_bbox
from dataclasses import dataclass, asdict, field
SCALING_FACTOR = 2.77793494

In [3]:
def merge_text_elements(text_elements, max_x_distance=10/SCALING_FACTOR, max_y_distance=5/SCALING_FACTOR):
    """
    Merges text elements that are horizontally close and vertically aligned based on bounding box sides.

    Args:
        text_elements (list): List of text elements with 'description' and 'bounding_box'.
        max_x_distance (int): Maximum horizontal distance in pixels to consider for merging.
        max_y_distance (int): Maximum vertical distance in pixels to consider for merging.

    Returns:
        list: A new list of merged text elements.
    """
    if not text_elements:
        return []

    # Sort text elements by top-left y-coordinate, then by left x-coordinate
    sorted_elements = sorted(text_elements, key=lambda el: (el['bounding_box'][0][1], el['bounding_box'][0][0]))
    elements_to_keep = []
    while True:
        merged = False
        merged_elements = []
        current_element = sorted_elements[0].copy()
        
        for next_element in sorted_elements[1:]:
            #print("Considering merging ", current_element['description'], " and ", next_element['description'])
            #print("  y coordinates: ", [coord[1] for coord in current_element['bounding_box']], " and ", [coord[1] for coord in next_element['bounding_box']])
            # Extract bounding box coordinates
            current_bbox = current_element['bounding_box']
            next_bbox = next_element['bounding_box']

            # Determine right side of current and left side of next
            current_right_x = max([coord[0] for coord in current_bbox])
            current_left_x = min([coord[0] for coord in current_bbox])
            next_left_x = min([coord[0] for coord in next_bbox])
            next_right_x = max([coord[0] for coord in next_bbox])

            # Calculate horizontal distance between right of current and left of next
            horizontal_distance1 = next_left_x - current_right_x
            horizontal_distance2 = current_left_x - next_right_x

            # Extract top y-coordinates for vertical alignment
            current_top_y = current_bbox[0][1]
            next_top_y = next_bbox[0][1]
            vertical_distance = abs(next_top_y - current_top_y)
            # print("  Vertical distance: ", vertical_distance)
            # print("  Horizontal distance1: ", horizontal_distance1)
            # print("  Horizontal distance2: ", horizontal_distance2)
            # Decide whether to merge
            if (abs(horizontal_distance1) < max_x_distance or abs(horizontal_distance2) < max_x_distance) and abs(vertical_distance) <= max_y_distance:
                # Merge descriptions
                # Decide which is left and which is right
                if current_right_x < next_left_x:
                    current_element['description'] += ' ' +  next_element['description']
                else:
                    current_element['description'] = next_element['description'] + ' ' + current_element['description']
                # Merge bounding boxes
                # New left_x is min of current and next
                new_left_x = min([coord[0] for coord in current_bbox] + [coord[0] for coord in next_bbox])
                # New right_x is max of current and next
                new_right_x = max([coord[0] for coord in current_bbox] + [coord[0] for coord in next_bbox])
                # New top_y is min of current and next
                new_top_y = min([coord[1] for coord in current_bbox] + [coord[1] for coord in next_bbox])
                # New bottom_y is max of current and next
                new_bottom_y = max([coord[1] for coord in current_bbox] + [coord[1] for coord in next_bbox])

                # Define new bounding box
                new_bounding_box = [
                    (new_left_x, new_top_y),
                    (new_right_x, new_top_y),
                    (new_right_x, new_bottom_y),
                    (new_left_x, new_bottom_y)
                ]
                # print("  Merged them!")
                current_element['bounding_box'] = new_bounding_box
                merged = True
            else:
                # No merge; add the current element to the list
                merged_elements.append(next_element)

        # Add the last element
        # merged_elements.append(current_element)
        
        # Update sorted_elements for next iteration
        sorted_elements = merged_elements.copy()
        # if merged:
        #     merged_elements.insert(0, current_element)
        elements_to_keep.append(current_element)
        
        # print("Nothing else to merge. Keeping ", current_element)
        # elements_to_keep.append(current_element)
        if len(sorted_elements) == 0:
            # Finished with all elements
            break
        else:
            current_element = merged_elements[0]
                
    return elements_to_keep

def find_text(target_text, text_elements):
    """
    Finds the first occurrence of the target text in the list of text elements.

    Args:
        target_text (str): The text to search for.
        text_elements (list): List of text elements with 'description' and 'bounding_box'.

    Returns:
        dict or None: The text element containing the target text or None if not found.
    """
    target_lower = target_text.strip().lower().replace(',', '').replace(' ', '').replace('.', '')
    for element in text_elements:
        if element['description'].strip().lower().replace(',', '').replace(' ', '').replace('.', '') == target_lower:
            return element
    return None

def find_element_containing_point(x, y, text_elements):
    """
    Finds all text elements whose bounding boxes contain the given (x, y) point.

    Args:
        x (int): The x-coordinate of the point.
        y (int): The y-coordinate of the point.
        text_elements (list): List of text elements with 'description' and 'bounding_box'.

    Returns:
        list: A list of text elements containing the point.
    """
    containing_elements = []
    for element in text_elements:
        bbox = element['bounding_box']
        # Extract x and y coordinates separately
        x_coords = [coord[0] for coord in bbox]
        y_coords = [coord[1] for coord in bbox]
        min_x, max_x = min(x_coords), max(x_coords)
        min_y, max_y = min(y_coords), max(y_coords)

        if min_x <= x <= max_x and min_y <= y <= max_y:
            containing_elements.append(element)
    if len(containing_elements) > 1:
        print(f"WARNING! Found {len(containing_elements)} elements containing point {x}, {y}")
    return containing_elements[0] if len(containing_elements) > 0 else None

def find_text_element_containing_point(x, y, text_elements):
    element = find_element_containing_point(x, y, text_elements)
    return element['description'] if element else None

def extract_text_elements(pdf_page):
    elements = []

    lines = pdf_page.extract_text_lines(layout=True, x_density=3)
    # print("lines", lines)
    annotations_by_lines = {}
    for idx, line in enumerate(lines):
        # print("line", line)
        
        wordPointer = 0  # noqa: N806
        chars = line["chars"]
        annotations = []
        for word in line["text"].split():
            # print("  word", word)
            ordered_chars = chars[wordPointer : wordPointer + len(word)]
            wordPointer += len(word)  # noqa: N806

            x0, y0, x1, y1 = objects_to_bbox(ordered_chars)
            elements.append({
                'description': word,
                'bounding_box': [(x0, y0), (x1, y0), (x1, y1), (x0, y1)]
            })
            # print("  ", ordered_chars, x0, y0, x1, y1)
    return elements
    
def perform_text_extraction(pdf_page):
    texts_with_bounding_box = extract_text_elements(pdf_page)
    
    # Merge text elements based on proximity
    merged_texts = merge_text_elements(texts_with_bounding_box, max_x_distance=10/SCALING_FACTOR, max_y_distance=5/SCALING_FACTOR)
    return merged_texts

In [29]:
def parse_lighthouses(text_elements):
    global total_number_of_lighthouses
    @dataclass
    class Lighthouse:
        fyrnr: str
        bounding_box: list[tuple[int, int]]
        name: str | None = None
        latitude: str | None = None
        longitude: str | None = None
        height_over_sea_level: str | None = None
        lysvidde_r: str | None = None
        lysvidde_g: str | None = None
        lysvidde_w: str | None = None
        sector_colors: list[dict] = field(default_factory=list)
        
    lighthouses_on_page = {}
    fyrnr_with_bounding_boxes = []
    # Find Fyrnr. elements. y coordinates start at 180 and increase by 10 for each line 
    for y in range(int(180/SCALING_FACTOR), int(1550/SCALING_FACTOR), int(10/SCALING_FACTOR)):
        FYRNR_X_COORDINATE = 163/SCALING_FACTOR
        element = find_element_containing_point(FYRNR_X_COORDINATE, y, text_elements)
        if element:
            fyrnr = element['description']
            if not fyrnr in lighthouses_on_page:
                lighthouses_on_page[fyrnr] = True # Found it
                fyrnr_with_bounding_boxes.append({
                    'fyrnr': fyrnr,
                    'bounding_box': element['bounding_box']
                })
    # Compute the full bounding box for each lighthouse
    for index, fyrnr_with_bounding_box in enumerate(fyrnr_with_bounding_boxes):
        DELTA_Y_FROM_TOP_OF_NEXT_BOX_TO_BOTTOM_OF_THIS_LIGHTHOUSE = 11/SCALING_FACTOR
        Y_FOR_LAST_LIGHTHOUSE = 1590/SCALING_FACTOR
        X_MAX_FOR_LIGHTHOUSE_BOUNDING_BOX = 2250/SCALING_FACTOR
        if index < len(fyrnr_with_bounding_boxes) - 1:
            next_fyrnr_with_bounding_box = fyrnr_with_bounding_boxes[index + 1]
            fyrnr = fyrnr_with_bounding_box['fyrnr']
            bounding_box = fyrnr_with_bounding_box['bounding_box']
            next_bounding_box = next_fyrnr_with_bounding_box['bounding_box']
            full_bounding_box = [bounding_box[0], (X_MAX_FOR_LIGHTHOUSE_BOUNDING_BOX, next_bounding_box[0][1] - DELTA_Y_FROM_TOP_OF_NEXT_BOX_TO_BOTTOM_OF_THIS_LIGHTHOUSE)]
        else:
            # This is the last lighthouse
            fyrnr = fyrnr_with_bounding_box['fyrnr']
            bounding_box = fyrnr_with_bounding_box['bounding_box']
            full_bounding_box = [bounding_box[0], (X_MAX_FOR_LIGHTHOUSE_BOUNDING_BOX, Y_FOR_LAST_LIGHTHOUSE)]
        
        lighthouses_on_page[fyrnr] = Lighthouse(fyrnr, full_bounding_box)
    for fyrnr, lighthouse in lighthouses_on_page.items():
        total_number_of_lighthouses += 1
        NAME_X_COORDINATE = 220/SCALING_FACTOR
        NAME_Y_COORDINATE = lighthouse.bounding_box[0][1] + 36/SCALING_FACTOR
        name = find_text_element_containing_point(NAME_X_COORDINATE, NAME_Y_COORDINATE, text_elements)
        lighthouses_on_page[fyrnr].name = name
        
        LATITUDE_X_COORDINATE = 650/SCALING_FACTOR
        LATITUDE_Y_COORDINATE = lighthouse.bounding_box[0][1] + 7/SCALING_FACTOR
        # 78 12.6340
        latitude = find_text_element_containing_point(LATITUDE_X_COORDINATE, LATITUDE_Y_COORDINATE, text_elements)
        # Convert to degrees
        latitude = float(latitude.split()[0]) + float(latitude.split()[1]) / 60
        
        LONGITUDE_X_COORDINATE = 650/SCALING_FACTOR
        LONGITUDE_Y_COORDINATE = lighthouse.bounding_box[0][1] + 32/SCALING_FACTOR
        
        longitude = find_text_element_containing_point(LONGITUDE_X_COORDINATE, LONGITUDE_Y_COORDINATE, text_elements)
        # Convert to degrees
        longitude = float(longitude.split()[0]) + float(longitude.split()[1]) / 60

        lighthouses_on_page[fyrnr].latitude = latitude
        lighthouses_on_page[fyrnr].longitude = longitude

        KARAKTER_X_COORDINATE = 800/SCALING_FACTOR
        KARAKTER_Y_COORDINATE = lighthouse.bounding_box[0][1] + 7/SCALING_FACTOR
        karakter = find_text_element_containing_point(KARAKTER_X_COORDINATE, KARAKTER_Y_COORDINATE, text_elements)
        lighthouses_on_page[fyrnr].karakter = karakter

        HEIGHT_OVER_SEA_LEVEL_X_COORDINATE = 940/SCALING_FACTOR
        HEIGHT_OVER_SEA_LEVEL_Y_COORDINATE = lighthouse.bounding_box[0][1] + 7/SCALING_FACTOR
        height_over_sea_level = find_text_element_containing_point(HEIGHT_OVER_SEA_LEVEL_X_COORDINATE, HEIGHT_OVER_SEA_LEVEL_Y_COORDINATE, text_elements)    
        lighthouses_on_page[fyrnr].height_over_sea_level = float(height_over_sea_level.replace(",", ".")) if height_over_sea_level else None

        LYSVIDDE_X_COORDINATE = 1338/SCALING_FACTOR
        LYSVIDDE_R_Y_COORDINATE = lighthouse.bounding_box[0][1] + 7/SCALING_FACTOR
        LYSVIDDE_G_Y_COORDINATE = lighthouse.bounding_box[0][1] + 32/SCALING_FACTOR
        LYSVIDDE_W_Y_COORDINATE = lighthouse.bounding_box[0][1] + 57/SCALING_FACTOR
        lysvidde_r = find_text_element_containing_point(LYSVIDDE_X_COORDINATE, LYSVIDDE_R_Y_COORDINATE, text_elements)
        lysvidde_g = find_text_element_containing_point(LYSVIDDE_X_COORDINATE, LYSVIDDE_G_Y_COORDINATE, text_elements)
        lysvidde_w = find_text_element_containing_point(LYSVIDDE_X_COORDINATE, LYSVIDDE_W_Y_COORDINATE, text_elements)
        lighthouses_on_page[fyrnr].lysvidde_r = float(lysvidde_r.replace(",", ".")) if lysvidde_r else None
        lighthouses_on_page[fyrnr].lysvidde_g = float(lysvidde_g.replace(",", ".")) if lysvidde_g else None
        lighthouses_on_page[fyrnr].lysvidde_w = float(lysvidde_w.replace(",", ".")) if lysvidde_w else None

        # Find sectors
        SECTOR_COLOR_X_COORDINATE = 1482/SCALING_FACTOR
        SECTOR_FIRST_COLOR_Y_COORDINATE = lighthouse.bounding_box[0][1] + 9/SCALING_FACTOR
        SECTOR_COLOR_LINE_HEIGHT = 5/SCALING_FACTOR
        current_y_coordinate = SECTOR_FIRST_COLOR_Y_COORDINATE
        current_color = None

        SINGLE_LINE_HEIGHT = 28/SCALING_FACTOR
        SECTOR_FIRST_ANGLE_X_COORDINATE = 1536/SCALING_FACTOR
        SECTOR_SECOND_ANGLE_X_COORDINATE = 1614/SCALING_FACTOR
        
        while current_y_coordinate < lighthouse.bounding_box[1][1]:
            sector_color = find_element_containing_point(SECTOR_COLOR_X_COORDINATE, current_y_coordinate, text_elements)
            if sector_color and sector_color['description'] != current_color:
                current_color = sector_color['description']
                if current_color in ['R', 'G', 'W']:
                    mean_y_coordinate = (sector_color['bounding_box'][0][1] + sector_color['bounding_box'][2][1]) / 2
                    sector_from = find_text_element_containing_point(SECTOR_FIRST_ANGLE_X_COORDINATE, mean_y_coordinate, text_elements)
                    sector_to = find_text_element_containing_point(SECTOR_SECOND_ANGLE_X_COORDINATE, mean_y_coordinate, text_elements)
                    sector_from_float = float(sector_from.replace(",", ".")) if sector_from else None
                    sector_to_float = float(sector_to.replace(",", ".").replace("-", "")) if sector_to else None
                    if sector_from_float is None:
                        print(f"WARNING! Sector from is None for {fyrnr} {sector_color['description']} {sector_from}. Choosing 0.0 instead.")
                        sector_from_float = 0.0
                    

                    lighthouses_on_page[fyrnr].sector_colors.append({
                        'color': sector_color['description'],
                        'from': sector_from_float,
                        'to': float(sector_to.replace(",", ".").replace("-", ""))
                    })
                
            current_y_coordinate += SECTOR_COLOR_LINE_HEIGHT
    
    def should_keep_lighthouse(lighthouse):
        if lighthouse.lysvidde_r is None and lighthouse.lysvidde_g is None and lighthouse.lysvidde_w is None:
            return False
        if lighthouse.height_over_sea_level is None:
            return False
        return True

    lighthouses_on_page = {
        fyrnr: lighthouse 
        for fyrnr, lighthouse in lighthouses_on_page.items() 
        if should_keep_lighthouse(lighthouse)
    }
    return lighthouses_on_page.values()




In [30]:
from tqdm import tqdm
pdf_path = "Fyrliste_HeleLandet.pdf"

total_number_of_lighthouses = 0
lighthouses = []
with pdfplumber.open(pdf_path) as pdf:  # type: ignore
    i = 0
    #for pdf_page in tqdm(pdf.pages, desc="Processing pages"):
    for pdf_page in pdf.pages:
        i += 1
        # if i < 700:
        #     continue
        text_on_page = pdf_page.extract_text()
        search_text = ["Lysvidde", "Fyrnr.", "Kartnr."]
        should_parse_page = all(map(lambda needle: needle in text_on_page, search_text))
        if not should_parse_page:
            continue
        print(f"Processing page {i} of {len(pdf.pages)}")
        text_elements = perform_text_extraction(pdf_page)
        # # print("text_elements: ", json.dumps(text_elements, indent=2, ensure_ascii=False))
        lighthouses_on_page = parse_lighthouses(text_elements)
        lighthouses.extend(lighthouses_on_page)
        print(f"Found {len(lighthouses)} lighthouses on page {pdf_page.page_number}")
        # for fyrnr, lighthouse in lighthouses.items():
        #     print(fyrnr, asdict(lighthouse))

print("total_number_of_lighthouses: ", total_number_of_lighthouses)
print("total_real_number_of_lighthouses: ", len(lighthouses))


Processing page 30 of 847
Found 9 lighthouses on page 30
Processing page 31 of 847
Found 23 lighthouses on page 31
Processing page 32 of 847
Found 27 lighthouses on page 32
Processing page 33 of 847
Found 32 lighthouses on page 33
Processing page 34 of 847
Found 40 lighthouses on page 34
Processing page 35 of 847
Found 49 lighthouses on page 35
Processing page 36 of 847
Found 59 lighthouses on page 36
Processing page 37 of 847
Found 69 lighthouses on page 37
Processing page 38 of 847
Found 79 lighthouses on page 38
Processing page 39 of 847
Found 89 lighthouses on page 39
Processing page 40 of 847
Found 100 lighthouses on page 40
Processing page 41 of 847
Found 112 lighthouses on page 41
Processing page 42 of 847
Found 122 lighthouses on page 42
Processing page 43 of 847
Found 130 lighthouses on page 43
Processing page 44 of 847
Found 137 lighthouses on page 44
Processing page 45 of 847
Found 143 lighthouses on page 45
Processing page 46 of 847
Found 150 lighthouses on page 46
Processi