In [2]:
import json
import pymupdf
from PIL import Image
import time

In [24]:
from google.cloud import vision
import io
from PIL import Image, ImageDraw


def ocr_image(image_path):
    """
    Performs OCR on the given image and returns detected text elements with bounding boxes.

    Args:
        image_path (str): Path to the input image.

    Returns:
        list: A list of dictionaries containing 'description' and 'bounding_box'.
    """
    client = vision.ImageAnnotatorClient()

    with io.open(image_path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)

    response = client.text_detection(image=image)
    texts = response.text_annotations

    if response.error.message:
        raise Exception(f'API Error: {response.error.message}')

    if not texts:
        print('No text detected.')
        return []

    # Each text annotation includes description and bounding_poly
    # The first element is the entire detected text, subsequent elements are individual words
    text_elements = []
    for text in texts[1:]:  # Skip the first element
        element = {
            'description': text.description,
            'bounding_box': [(vertex.x, vertex.y) for vertex in text.bounding_poly.vertices]
        }
        text_elements.append(element)

    return text_elements

def merge_text_elements(text_elements, max_x_distance=10, max_y_distance=5):
    """
    Merges text elements that are horizontally close and vertically aligned based on bounding box sides.

    Args:
        text_elements (list): List of text elements with 'description' and 'bounding_box'.
        max_x_distance (int): Maximum horizontal distance in pixels to consider for merging.
        max_y_distance (int): Maximum vertical distance in pixels to consider for merging.

    Returns:
        list: A new list of merged text elements.
    """
    if not text_elements:
        return []

    # Sort text elements by top-left y-coordinate, then by left x-coordinate
    sorted_elements = sorted(text_elements, key=lambda el: (el['bounding_box'][0][1], el['bounding_box'][0][0]))
    elements_to_keep = []
    while True:
        merged = False
        merged_elements = []
        current_element = sorted_elements[0].copy()
        
        for next_element in sorted_elements[1:]:
            #print("Considering merging ", current_element['description'], " and ", next_element['description'])
            #print("  y coordinates: ", [coord[1] for coord in current_element['bounding_box']], " and ", [coord[1] for coord in next_element['bounding_box']])
            # Extract bounding box coordinates
            current_bbox = current_element['bounding_box']
            next_bbox = next_element['bounding_box']

            # Determine right side of current and left side of next
            current_right_x = max([coord[0] for coord in current_bbox])
            current_left_x = min([coord[0] for coord in current_bbox])
            next_left_x = min([coord[0] for coord in next_bbox])
            next_right_x = max([coord[0] for coord in next_bbox])

            # Calculate horizontal distance between right of current and left of next
            horizontal_distance1 = next_left_x - current_right_x
            horizontal_distance2 = current_left_x - next_right_x

            # Extract top y-coordinates for vertical alignment
            current_top_y = current_bbox[0][1]
            next_top_y = next_bbox[0][1]
            vertical_distance = abs(next_top_y - current_top_y)
            # print("  Vertical distance: ", vertical_distance)
            # print("  Horizontal distance1: ", horizontal_distance1)
            # print("  Horizontal distance2: ", horizontal_distance2)
            # Decide whether to merge
            if (abs(horizontal_distance1) < max_x_distance or abs(horizontal_distance2) < max_x_distance) and abs(vertical_distance) <= max_y_distance:
                # Merge descriptions
                # Decide which is left and which is right
                if current_right_x < next_left_x:
                    current_element['description'] += ' ' +  next_element['description']
                else:
                    current_element['description'] = next_element['description'] + ' ' + current_element['description']
                # Merge bounding boxes
                # New left_x is min of current and next
                new_left_x = min([coord[0] for coord in current_bbox] + [coord[0] for coord in next_bbox])
                # New right_x is max of current and next
                new_right_x = max([coord[0] for coord in current_bbox] + [coord[0] for coord in next_bbox])
                # New top_y is min of current and next
                new_top_y = min([coord[1] for coord in current_bbox] + [coord[1] for coord in next_bbox])
                # New bottom_y is max of current and next
                new_bottom_y = max([coord[1] for coord in current_bbox] + [coord[1] for coord in next_bbox])

                # Define new bounding box
                new_bounding_box = [
                    (new_left_x, new_top_y),
                    (new_right_x, new_top_y),
                    (new_right_x, new_bottom_y),
                    (new_left_x, new_bottom_y)
                ]
                # print("  Merged them!")
                current_element['bounding_box'] = new_bounding_box
                merged = True
            else:
                # No merge; add the current element to the list
                merged_elements.append(next_element)

        # Add the last element
        # merged_elements.append(current_element)
        
        # Update sorted_elements for next iteration
        sorted_elements = merged_elements.copy()
        # if merged:
        #     merged_elements.insert(0, current_element)
        elements_to_keep.append(current_element)
        
        # print("Nothing else to merge. Keeping ", current_element)
        # elements_to_keep.append(current_element)
        if len(sorted_elements) == 0:
            # Finished with all elements
            break
        else:
            current_element = merged_elements[0]
                
    return elements_to_keep

def find_text(target_text, text_elements):
    """
    Finds the first occurrence of the target text in the list of text elements.

    Args:
        target_text (str): The text to search for.
        text_elements (list): List of text elements with 'description' and 'bounding_box'.

    Returns:
        dict or None: The text element containing the target text or None if not found.
    """
    target_lower = target_text.strip().lower().replace(',', '').replace(' ', '').replace('.', '')
    for element in text_elements:
        if element['description'].strip().lower().replace(',', '').replace(' ', '').replace('.', '') == target_lower:
            return element
    return None


def compute_coordinates_lysvidde(target_element):
    """
    Computes new coordinates based on the position of the target text element.

    Args:
        target_element (dict): The text element containing the target text.

    Returns:
        dict: A dictionary containing the new coordinates.
    """
    # Assuming the bounding box is a rectangle with vertices ordered clockwise starting from top-left
    # We'll use the top-left vertex as the reference point
    top_left = target_element['bounding_box'][0]
    x_target = top_left[0]
    y_target = top_left[1]

    new_coords = {
        'x_target': x_target,
        'y_target': y_target,
        'x_1': x_target + 1130,
        'y_1': y_target - 20,
        'x_2': x_target + 1130,
        'y_2': y_target + 3,
        'x_3': x_target + 1130,
        'y_3': y_target + 29
    }

    return new_coords


def find_element_containing_point(x, y, text_elements):
    """
    Finds all text elements whose bounding boxes contain the given (x, y) point.

    Args:
        x (int): The x-coordinate of the point.
        y (int): The y-coordinate of the point.
        text_elements (list): List of text elements with 'description' and 'bounding_box'.

    Returns:
        list: A list of text elements containing the point.
    """
    containing_elements = []
    for element in text_elements:
        bbox = element['bounding_box']
        # Extract x and y coordinates separately
        x_coords = [coord[0] for coord in bbox]
        y_coords = [coord[1] for coord in bbox]
        min_x, max_x = min(x_coords), max(x_coords)
        min_y, max_y = min(y_coords), max(y_coords)

        if min_x <= x <= max_x and min_y <= y <= max_y:
            containing_elements.append(element)
    if len(containing_elements) > 1:
        print(f"WARNING! Found {len(containing_elements)} elements containing point {x}, {y}")
    return containing_elements[0] if len(containing_elements) > 0 else None

def find_text_element_containing_point(x, y, text_elements):
    element = find_element_containing_point(x, y, text_elements)
    return element['description'] if element else None

def draw_image_with_bounding_boxes(image_path, text_elements, output_path='annotated_image.jpg'):
    image = Image.open(image_path)
    draw = ImageDraw.Draw(image)
    for element in text_elements:
        draw.line(element['bounding_box'], fill='blue', width=2)
    image.save(output_path)

def perform_ocr(image_filename):
    # Perform OCR on the image
    texts_with_bounding_box = ocr_image(image_filename)
    # texts_with_bounding_box = [{"description":"Fredrikstad","bounding_box":[[207,1084],[303,1083],[303,1101],[207,1102]]},{"description":"bru","bounding_box":[[309,1083],[336,1083],[336,1101],[309,1101]]},{"description":",","bounding_box":[[337,1083],[342,1083],[342,1100],[337,1100]]},{"description":"G","bounding_box":[[348,1083],[362,1083],[362,1100],[348,1100]]}]
    
    draw_image_with_bounding_boxes(image_filename, texts_with_bounding_box, f"annotated_image.jpg")
    
    # Merge text elements based on proximity
    merged_texts = merge_text_elements(texts_with_bounding_box, max_x_distance=10, max_y_distance=5)
    print(len(merged_texts))
    draw_image_with_bounding_boxes(image_filename, merged_texts, f"annotated_image_merged.jpg")
    return merged_texts


def find_lysvidde(ocr_elements, lighthouse_name):
    # Find the target text
    target_text = find_text(lighthouse_name, ocr_elements)

    if not target_text:
        print(f"Text '{lighthouse_name}' not found in the image.")
        return None

    # Compute coordinates for Lysvidde
    coords = compute_coordinates_lysvidde(target_text)

    # Extract the new points
    new_points = [
        (coords['x_1'], coords['y_1']),
        (coords['x_2'], coords['y_2']),
        (coords['x_3'], coords['y_3'])
    ]

    lysvidde = None
    # Find text elements containing these new points
    containing_texts_dict = {}
    for idx, (x, y) in enumerate(new_points, start=1):
        containing_texts = find_elements_containing_point(x, y, ocr_elements)
        containing_texts_dict[(x, y)] = containing_texts
        if containing_texts:
            for text in containing_texts:
                value = float(text['description'].replace(",", "."))
                if lysvidde is None:
                    lysvidde = value
                lysvidde = max(lysvidde, value)
        
    return lysvidde

In [43]:
def convert_pdf_page_to_image(page, image_file_name):
    pix = page.get_pixmap(dpi=200)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    img.save(image_file_name, quality=100)

def parse_lighthouses_for_page(document, page_number):
    image_file_name = f"pages/page{page_number}.png"
    page = document.load_page(page_number)
    lighthouses_on_page = []
    convert_pdf_page_to_image(page, image_file_name)
    ocr_elements = perform_ocr(image_file_name)
    print("ocr_elements: ", json.dumps(ocr_elements, indent=2, ensure_ascii=False))
    return ocr_elements

pdf_path = "Fyrliste_HeleLandet.pdf"
document = pymupdf.open(pdf_path)
ocr_elements = parse_lighthouses_for_page(document, 51)


274
ocr_elements:  [
  {
    "description": "Område",
    "bounding_box": [
      [
        216,
        91
      ],
      [
        288,
        91
      ],
      [
        288,
        104
      ],
      [
        216,
        104
      ]
    ]
  },
  {
    "description": "Posisjon",
    "bounding_box": [
      [
        609,
        91
      ],
      [
        689,
        91
      ],
      [
        689,
        107
      ],
      [
        609,
        107
      ]
    ]
  },
  {
    "description": "Karakter",
    "bounding_box": [
      [
        767,
        91
      ],
      [
        845,
        91
      ],
      [
        845,
        105
      ],
      [
        767,
        105
      ]
    ]
  },
  {
    "description": "Beskrivelse",
    "bounding_box": [
      [
        1020,
        91
      ],
      [
        1126,
        91
      ],
      [
        1126,
        105
      ],
      [
        1020,
        105
      ]
    ]
  },
  {
    "description": "Lysvidde",
    "bo

In [50]:
from dataclasses import dataclass, asdict, field
@dataclass
class Lighthouse:
    fyrnr: str
    bounding_box: list[tuple[int, int]]
    name: str | None = None
    latitude: str | None = None
    longitude: str | None = None
    height_over_sea_level: str | None = None
    lysvidde_r: str | None = None
    lysvidde_g: str | None = None
    lysvidde_w: str | None = None
    sector_colors: list[dict] = field(default_factory=list)
    
lighthouses_on_page = {}
fyrnr_with_bounding_boxes = []
# Find Fyrnr. elements. y coordinates start at 180 and increase by 10 for each line 
for y in range(180, 1550, 10):
    FYRNR_X_COORDINATE = 163
    element = find_element_containing_point(FYRNR_X_COORDINATE, y, ocr_elements)
    if element:
        fyrnr = element['description']
        if not fyrnr in lighthouses_on_page:
            lighthouses_on_page[fyrnr] = True # Found it
            fyrnr_with_bounding_boxes.append({
                'fyrnr': fyrnr,
                'bounding_box': element['bounding_box']
            })
# Compute the full bounding box for each lighthouse
for index, fyrnr_with_bounding_box in enumerate(fyrnr_with_bounding_boxes):
    DELTA_Y_FROM_TOP_OF_NEXT_BOX_TO_BOTTOM_OF_THIS_LIGHTHOUSE = 11
    Y_FOR_LAST_LIGHTHOUSE = 1590
    X_MAX_FOR_LIGHTHOUSE_BOUNDING_BOX = 2250
    if index < len(fyrnr_with_bounding_boxes) - 1:
        next_fyrnr_with_bounding_box = fyrnr_with_bounding_boxes[index + 1]
        fyrnr = fyrnr_with_bounding_box['fyrnr']
        bounding_box = fyrnr_with_bounding_box['bounding_box']
        next_bounding_box = next_fyrnr_with_bounding_box['bounding_box']
        full_bounding_box = [bounding_box[0], (X_MAX_FOR_LIGHTHOUSE_BOUNDING_BOX, next_bounding_box[0][1] - DELTA_Y_FROM_TOP_OF_NEXT_BOX_TO_BOTTOM_OF_THIS_LIGHTHOUSE)]
    else:
        # This is the last lighthouse
        fyrnr = fyrnr_with_bounding_box['fyrnr']
        bounding_box = fyrnr_with_bounding_box['bounding_box']
        full_bounding_box = [bounding_box[0], (X_MAX_FOR_LIGHTHOUSE_BOUNDING_BOX, Y_FOR_LAST_LIGHTHOUSE)]
    
    lighthouses_on_page[fyrnr] = Lighthouse(fyrnr, full_bounding_box)
for fyrnr, lighthouse in lighthouses_on_page.items():

    NAME_X_COORDINATE = 220
    NAME_Y_COORDINATE = lighthouse.bounding_box[0][1] + 36
    name = find_text_element_containing_point(NAME_X_COORDINATE, NAME_Y_COORDINATE, ocr_elements)
    lighthouses_on_page[fyrnr].name = name
    
    LATITUDE_X_COORDINATE = 650
    LATITUDE_Y_COORDINATE = lighthouse.bounding_box[0][1] + 7
    latitude = find_text_element_containing_point(LATITUDE_X_COORDINATE, LATITUDE_Y_COORDINATE, ocr_elements)
    
    LONGITUDE_X_COORDINATE = 650
    LONGITUDE_Y_COORDINATE = lighthouse.bounding_box[0][1] + 32
    
    longitude = find_text_element_containing_point(LONGITUDE_X_COORDINATE, LONGITUDE_Y_COORDINATE, ocr_elements)
    lighthouses_on_page[fyrnr].latitude = latitude
    lighthouses_on_page[fyrnr].longitude = longitude

    KARAKTER_X_COORDINATE = 800
    KARAKTER_Y_COORDINATE = lighthouse.bounding_box[0][1] + 7
    karakter = find_text_element_containing_point(KARAKTER_X_COORDINATE, KARAKTER_Y_COORDINATE, ocr_elements)
    lighthouses_on_page[fyrnr].karakter = karakter

    HEIGHT_OVER_SEA_LEVEL_X_COORDINATE = 940
    HEIGHT_OVER_SEA_LEVEL_Y_COORDINATE = lighthouse.bounding_box[0][1] + 7
    height_over_sea_level = find_text_element_containing_point(HEIGHT_OVER_SEA_LEVEL_X_COORDINATE, HEIGHT_OVER_SEA_LEVEL_Y_COORDINATE, ocr_elements)    
    lighthouses_on_page[fyrnr].height_over_sea_level = height_over_sea_level

    LYSVIDDE_X_COORDINATE = 1338
    LYSVIDDE_R_Y_COORDINATE = lighthouse.bounding_box[0][1] + 7
    LYSVIDDE_G_Y_COORDINATE = lighthouse.bounding_box[0][1] + 32
    LYSVIDDE_W_Y_COORDINATE = lighthouse.bounding_box[0][1] + 57
    lysvidde_r = find_text_element_containing_point(LYSVIDDE_X_COORDINATE, LYSVIDDE_R_Y_COORDINATE, ocr_elements)
    lysvidde_g = find_text_element_containing_point(LYSVIDDE_X_COORDINATE, LYSVIDDE_G_Y_COORDINATE, ocr_elements)
    lysvidde_w = find_text_element_containing_point(LYSVIDDE_X_COORDINATE, LYSVIDDE_W_Y_COORDINATE, ocr_elements)
    lighthouses_on_page[fyrnr].lysvidde_r = lysvidde_r
    lighthouses_on_page[fyrnr].lysvidde_g = lysvidde_g
    lighthouses_on_page[fyrnr].lysvidde_w = lysvidde_w

    # Find sectors
    SECTOR_COLOR_X_COORDINATE = 1482
    SECTOR_FIRST_COLOR_Y_COORDINATE = lighthouse.bounding_box[0][1] + 9
    SECTOR_COLOR_LINE_HEIGHT = 5
    current_y_coordinate = SECTOR_FIRST_COLOR_Y_COORDINATE
    current_color = None

    SINGLE_LINE_HEIGHT = 28
    SECTOR_FIRST_ANGLE_X_COORDINATE = 1536
    SECTOR_SECOND_ANGLE_X_COORDINATE = 1614
    
    while current_y_coordinate < lighthouse.bounding_box[1][1]:
        sector_color = find_element_containing_point(SECTOR_COLOR_X_COORDINATE, current_y_coordinate, ocr_elements)
        if sector_color and sector_color['description'] != current_color:
            current_color = sector_color['description']
            if len(sector_color['description']) == 1:
                mean_y_coordinate = (sector_color['bounding_box'][0][1] + sector_color['bounding_box'][2][1]) / 2
                sector_from = find_text_element_containing_point(SECTOR_FIRST_ANGLE_X_COORDINATE, mean_y_coordinate, ocr_elements)
                sector_to = find_text_element_containing_point(SECTOR_SECOND_ANGLE_X_COORDINATE, mean_y_coordinate, ocr_elements)
                lighthouses_on_page[fyrnr].sector_colors.append({
                    'color': sector_color['description'],
                    'from': sector_from,
                    'to': sector_to
                })
                #print(f"SINGLE '{sector_color['description']}': '{sector_from}' - '{sector_to}'")
            else:
                # Multiple colors merged into same element
                for index, single_color in enumerate(sector_color['description']):
                    mean_y_coordinate = (sector_color['bounding_box'][0][1] +  + index * SINGLE_LINE_HEIGHT)
                    sector_from = find_text_element_containing_point(SECTOR_FIRST_ANGLE_X_COORDINATE, mean_y_coordinate, ocr_elements)
                    sector_to = find_text_element_containing_point(SECTOR_SECOND_ANGLE_X_COORDINATE, mean_y_coordinate, ocr_elements)
                    lighthouses_on_page[fyrnr].sector_colors.append({
                        'color': single_color,
                        'from': sector_from,
                        'to': sector_to
                    })
                    #print(f"MULTIPLE'{sector_color['description']}': '{sector_from}' - '{sector_to}'")
                current_y_coordinate += len(sector_color['description']) * SINGLE_LINE_HEIGHT
                continue

        current_y_coordinate += SECTOR_COLOR_LINE_HEIGHT
    print(fyrnr, name, latitude, longitude, karakter, height_over_sea_level, lysvidde_r, lysvidde_g, lysvidde_w, lighthouses_on_page[fyrnr].sector_colors)
print(len(lighthouses_on_page))


020171 Sydvestgrunnen 59 44.7112 010 33.7109 QW 5,5 None None 1,9 [{'color': 'W', 'from': '0,0', 'to': '- 360,0'}]
020191 Djupgrunnen 59 43.4293 010 32.6103 FI G 3,0 None 3,1 None [{'color': 'G', 'from': '0,0', 'to': '- 360,0'}]
020200 Nordre Sundbyholmen 59 43.5451 010 32.1165 ) Oc ( 3 WRG 10s 11,5 2,8 2,8 3,7 [{'color': 'R', 'from': '144,1', 'to': '182,4'}, {'color': 'G', 'from': '182,4', 'to': '195,9'}, {'color': 'W', 'from': None, 'to': None}, {'color': 'R', 'from': '200,2', 'to': '220,9'}, {'color': 'G', 'from': None, 'to': None}, {'color': 'M', 'from': None, 'to': None}, {'color': 'R', 'from': None, 'to': None}, {'color': 'G', 'from': None, 'to': None}, {'color': 'W', 'from': '342,7', 'to': '-347,5'}, {'color': 'R', 'from': '347,5', 'to': '-15,1'}, {'color': 'G', 'from': '15,1', 'to': '144,1'}]
020274 Torpene 59 46.0594 010 32.9974 Iso R 2s 9,0 1,5 None None [{'color': 'R', 'from': '0,0', 'to': '360,0'}]
020545 Torodden 59 43.8808 010 35.2843 Fast R lys 6,5 0.8 None None [{'color

In [15]:
ocr_elements

[{'description': 'Område',
  'bounding_box': [(216, 90), (288, 91), (288, 105), (216, 104)]},
 {'description': 'Posisjon',
  'bounding_box': [(609, 91), (689, 91), (689, 107), (609, 107)]},
 {'description': 'Karakter',
  'bounding_box': [(767, 91), (845, 91), (845, 105), (767, 105)]},
 {'description': 'Beskrivelse',
  'bounding_box': [(1020, 91), (1126, 91), (1126, 105), (1020, 105)]},
 {'description': 'Lysvidde',
  'bounding_box': [(1318, 91), (1401, 90), (1401, 105), (1318, 106)]},
 {'description': 'Sektor-farge-retning-beskrivelse',
  'bounding_box': [(1435, 91), (1767, 91), (1767, 107), (1435, 107)]},
 {'description': 'Fyrnr.',
  'bounding_box': [(90, 92), (144, 92), (144, 106), (90, 106)]},
 {'description': 'Lys',
  'bounding_box': [(924, 92), (958, 92), (958, 107), (924, 107)]},
 {'description': 'høyde',
  'bounding_box': [(924, 114), (981, 114), (981, 130), (924, 130)]},
 {'description': 'R',
  'bounding_box': [(1349, 114), (1362, 114), (1362, 125), (1349, 125)]},
 {'description

In [None]:
def parse_page(document, page_number):
    # Define the text you are looking for
    search_text = ["Lysvidde", "Fyrnr.", "Kartnr."]
    
    page = document.load_page(page_number)
    
    # Extract text from the page
    text = page.get_text()

    # Check if the page contains the search text
    should_parse_page = all(map(lambda needle: needle in text, search_text))
    if not should_parse_page:
        return [], []
    return parse_lighthouses_for_page(document, page_number)

In [None]:
import concurrent.futures
all_errors = []
all_lighthouses = []

pdf_path = "Fyrliste_HeleLandet.pdf"
document = pymupdf.open(pdf_path)

lighthouses_per_page = [0] * len(document)
errors_per_page = [0] * len(document)

with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
    futures = {executor.submit(parse_page, document, page_number): page_number for page_number in range(len(document))}
    
    for future in concurrent.futures.as_completed(futures):
        page_number = futures[future]
        try:
            lighthouses_on_page, errors = future.result()
            if len(lighthouses_on_page) > 0:
                with open(f"lighthouses/page_{page_number+1}.json", "w", encoding="utf-8") as f:
                    json.dump(lighthouses_on_page, f, indent=2, ensure_ascii=False)
            print(f"Found {len(lighthouses_on_page)} lighthouses and {len(errors)} errors on page {page_number+1}")
            all_errors.extend(errors)
            all_lighthouses.extend(lighthouses_on_page)
            lighthouses_per_page[page_number] = len(lighthouses_on_page)
            errors_per_page[page_number] = len(errors)
            
        except Exception as e:
            print(f"Error parsing page {page_number + 1}: {e}")


In [None]:
pdf_path = "Fyrliste_HeleLandet.pdf"
document = pymupdf.open(pdf_path)
lighthouses_on_page, errors = parse_page(document, 45)
print(json.dumps(lighthouses_on_page, indent=2, ensure_ascii=False))
len(lighthouses_on_page)

In [None]:
with open("parsed_lighthouses.json", "w") as f:
    json.dump(all_lighthouses, f)
with open("parsed_lighthouse_errors.json", "w") as f:
    json.dump(all_errors, f)

In [None]:
all_errors

In [None]:
perform_ocr('pages/page41_masked_rest.png')