In [None]:
import json
import pymupdf
from PIL import Image
import time

In [None]:
def extract_lighthouses_using_gpt(image_file_name):
    """Extract lighthouse records from a page image using the OpenAI chat completions API.

    The image is sent base64-encoded as a data URL together with a strict JSON
    schema, so the model's reply is a JSON object with a single ``items`` list.

    Args:
        image_file_name: Path of the image file to send.

    Returns:
        The list stored under ``items`` in the model's JSON reply. Each entry
        follows the schema below (latitude, longitude, pattern, sectors, ...).

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
        requests.HTTPError: If the API answers with a non-success status code.
    """
    import base64
    import json
    import mimetypes
    import os

    import requests

    # Fail fast when the key is missing instead of sending an empty bearer token.
    openai_api_key = os.environ["OPENAI_API_KEY"]

    # The context manager guarantees the handle is closed even if reading fails.
    with open(image_file_name, "rb") as image_file:
        image_bytes = image_file.read()

    # Advertise the real image type (the page images in this notebook are PNG);
    # fall back to the previous hard-coded JPEG type when it cannot be guessed.
    mime_type = mimetypes.guess_type(image_file_name)[0]
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"
    base64_str = f"data:{mime_type};base64," + base64.b64encode(image_bytes).decode("utf-8")

    headers = {
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + openai_api_key,
    }

    json_data = {
        'model': 'gpt-4o-2024-08-06',
        'messages': [
            {
                'role': 'system',
                'content': [
                    {
                        'type': 'text',
                        'text': 'Generate JSON for these lighthouses. When reading sectors, use the sector color column with a single letter. Ensure that all sectors are included.',
                    },
                ],
            },
            {
                'role': 'user',
                'content': [
                    {
                        'type': 'image_url',
                        'image_url': {
                            'url': base64_str
                        },
                    },
                ],
            },
        ],
        'response_format': {
            'type': 'json_schema',
            'json_schema': {
                'name': 'lighthouse_response',
                'strict': True,
                'schema': {
                    '$schema': 'http://json-schema.org/draft-07/schema#',
                    'type': 'object',
                    'properties': {
                        'items': {
                            'type': 'array',
                            'items': {
                                'type': 'object',
                                'properties': {
                                    'latitude': {
                                        'type': 'object',
                                        'properties': {
                                            'degrees': {
                                                'type': 'integer',
                                                'description': 'Degrees of latitude, ranging from -90 to 90.',
                                            },
                                            'minutes': {
                                                'type': 'number',
                                                'description': 'Minutes of latitude, ranging from 0 to 60.',
                                            },
                                        },
                                        'required': [
                                            'degrees',
                                            'minutes',
                                        ],
                                        'additionalProperties': False,
                                    },
                                    'longitude': {
                                        'type': 'object',
                                        'properties': {
                                            'degrees': {
                                                'type': 'integer',
                                                'description': 'Degrees of longitude, ranging from -180 to 180.',
                                            },
                                            'minutes': {
                                                'type': 'number',
                                                'description': 'Minutes of longitude, ranging from 0 to 60.',
                                            },
                                        },
                                        'required': [
                                            'degrees',
                                            'minutes',
                                        ],
                                        'additionalProperties': False,
                                    },
                                    'pattern': {
                                        'type': 'string',
                                    },
                                    'description': {
                                        'type': 'string',
                                    },
                                    'heightOverGround': {
                                        'type': 'number',
                                    },
                                    'height': {
                                        'type': 'number',
                                    },
                                    'sectors': {
                                        'type': 'array',
                                        'items': {
                                            'type': 'object',
                                            'properties': {
                                                'color': {
                                                    'type': 'string',
                                                },
                                                'start': {
                                                    'type': 'number',
                                                },
                                                'stop': {
                                                    'type': 'number',
                                                },
                                                'description': {
                                                    'type': 'string',
                                                },
                                            },
                                            'required': [
                                                'color',
                                                'start',
                                                'stop',
                                                'description',
                                            ],
                                            'additionalProperties': False,
                                        },
                                    },
                                    'area': {
                                        'type': 'string',
                                    },
                                    'name': {
                                        'type': 'string',
                                    },
                                    'location': {
                                        'type': 'string',
                                    },
                                    'maxRange': {
                                        'type': 'number',
                                    },
                                },
                                'required': [
                                    'latitude',
                                    'longitude',
                                    'pattern',
                                    'description',
                                    'heightOverGround',
                                    'height',
                                    'sectors',
                                    'name',
                                    'location',
                                    'area',
                                    'maxRange',
                                ],
                                'additionalProperties': False,
                            },
                        },
                    },
                    'required': [
                        'items',
                    ],
                    'additionalProperties': False,
                },
            },
        },
        'temperature': 0.0,
        'max_tokens': 4096,
    }

    response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=json_data)
    # Surface API failures (bad key, rate limit, ...) as an HTTP error rather
    # than as a confusing KeyError on the missing 'choices' field below.
    response.raise_for_status()
    data = response.json()

    # The message content is itself a JSON document matching the schema above.
    lighthouses_on_page_str = data['choices'][0]['message']['content']
    lighthouses_on_page = json.loads(lighthouses_on_page_str)
    return lighthouses_on_page['items']


In [16]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import vertexai.generative_models as generative_models
import requests
import os
import json
import base64
import random

# Vertex AI regions a Gemini request may be sent to; the retry loop in
# parse_lighthouses_for_page picks one at random for every attempt.
# The commented-out regions are currently excluded from the rotation.
locations = [
    "us-central1",
    "asia-east1",
    "asia-east2",
    "asia-northeast1",
    "asia-northeast3",
    "asia-south1",
    "asia-southeast1",
    "australia-southeast1",
    # "europe-central2",
    # "europe-north1",
    # "europe-southwest1",
    # "europe-west1",
    # "europe-west2",
    # "europe-west3",
    # "europe-west4",
    # "europe-west6",
    # "europe-west8",
    # "europe-west9",
    # "me-central1",
    # "me-central2",
    # "me-west1",
    "northamerica-northeast1",
    "southamerica-east1",
    "us-east1",
    "us-east4",
    "us-east5",
    "us-south1",
    "us-west1",
    "us-west4",
]
    
def extract_lighthouses_using_gemini(image_file_name, location):
    """Extract lighthouse records from a page image using Gemini on Vertex AI.

    The image is sent together with a text prompt and a response schema, and
    the model is asked to reply with JSON containing an ``items`` list.

    Args:
        image_file_name: Path of the page image; it is sent as ``image/png``.
        location: Vertex AI region passed to ``vertexai.init``.

    Returns:
        The list stored under ``items`` in the model's JSON reply.

    Raises:
        Whatever the Vertex AI SDK raises for a failed or incomplete response
        is propagated unchanged; callers are expected to handle retries.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(image_file_name, "rb") as image_file:
        image_bytes = image_file.read()

    vertexai.init(project="cognitedata-development", location=location)
    model = GenerativeModel(
        "gemini-1.5-flash-001",
    )
    chat = model.start_chat()

    image = Part.from_data(mime_type="image/png", data=image_bytes)

    generation_config = {
        "max_output_tokens": 8192,
        "temperature": 0.0,
        "top_p": 0.95,
        "response_mime_type": "application/json",
        "response_schema": {
            'type_': 'OBJECT',
            'properties': {
                'items': {
                    'type_': 'ARRAY',
                    'items': {
                        'type_': 'OBJECT',
                        'properties': {
                            'latitude': {
                                'type_': 'OBJECT',
                                'properties': {
                                    'degrees': {
                                        'type_': 'INTEGER',
                                        'description': 'Degrees of latitude, ranging from -90 to 90.',
                                    },
                                    'minutes': {
                                        'type_': 'NUMBER',
                                        'description': 'Minutes of latitude, ranging from 0 to 60.',
                                    },
                                },
                                'required': [
                                    'degrees',
                                    'minutes',
                                ]
                            },
                            'longitude': {
                                'type_': 'OBJECT',
                                'properties': {
                                    'degrees': {
                                        'type_': 'INTEGER',
                                        'description': 'Degrees of longitude, ranging from -180 to 180.',
                                    },
                                    'minutes': {
                                        'type_': 'NUMBER',
                                        'description': 'Minutes of longitude, ranging from 0 to 60.',
                                    },
                                },
                                'required': [
                                    'degrees',
                                    'minutes',
                                ]
                            },
                            'pattern': {
                                'type_': 'STRING',
                                'description': 'Flash pattern. Called Karakter in the input.',
                            },
                            'description': {
                                'type_': 'STRING',
                            },
                            'heightOverGround': {
                                'type_': 'NUMBER',
                                'description': 'Height over ground. Must be smaller than or equal height.',
                            },
                            'height': {
                                'type_': 'NUMBER',
                                'description': 'Height over sea. Must be larger than or equal to heightOverGround.',
                            },
                            'sectors': {
                                'type_': 'ARRAY',
                                'items': {
                                    'type_': 'OBJECT',
                                    'properties': {
                                        'color': {
                                            'type_': 'STRING',
                                            'description': 'Color of sector. Typically R, G or W.',
                                        },
                                        'start': {
                                            'type_': 'NUMBER',
                                            'description': 'Start angle of sector [degrees in range 0-360].',
                                        },
                                        'stop': {
                                            'type_': 'NUMBER',
                                            # Previously a copy of the 'start' text, which told the
                                            # model that this field was also the start angle.
                                            'description': 'Stop angle of sector [degrees in range 0-360].',
                                        },
                                        'description': {
                                            'type_': 'STRING',
                                        },
                                    },
                                    'required': [
                                        'color',
                                        'start',
                                        'stop',
                                        'description',
                                    ]
                                },
                            },
                            'area': {
                                'type_': 'STRING',
                            },
                            'name': {
                                'type_': 'STRING',
                            },
                            'location': {
                                'type_': 'STRING',
                            },
                            'range': {
                                'type_': 'NUMBER',
                                'description': 'Range of the light house. Less than 20 nautic miles and rarely equal to any sector start/stop. Can be zero value.',
                            }
                        },
                        'required': [
                            'latitude',
                            'longitude',
                            'height',
                            'sectors',
                            'name',
                            'area',
                            'range'
                        ]
                    },
                },
            },
            'required': [
                'items',
            ]
        },
    }

    # Disable blocking for every harm category that is configured here.
    safety_settings = {
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_NONE,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_NONE,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    }

    # NOTE(review): the prompt mentions "maxRange" while the schema field is
    # named "range" — confirm whether the prompt wording should be aligned.
    response = chat.send_message(
      [image, "Generate JSON for these lighthouses. 'Karakter' is used for flash pattern. When reading sectors, use the sector color column with a single letter. Ensure that all sectors are included. All sector start/stop are positive numbers. If no sectors exist, or no maxRange, ignore the lighthouse."],
      generation_config=generation_config,
      safety_settings=safety_settings
    )

    # The reply text is a JSON document matching the response schema above.
    return json.loads(response.candidates[0].content.text)['items']
#parse_page(document, 839)

In [None]:
def convert_pdf_page_to_image(page, image_file_name):
    """Render a PDF page at 200 DPI and save it as an image file.

    Args:
        page: Page object providing ``get_pixmap(dpi=...)``.
        image_file_name: Destination path; the parent directory is created
            if it does not exist yet.
    """
    import os

    # Saving fails with FileNotFoundError when the target directory is missing
    # (e.g. a fresh checkout without "pages/"), so create it up front.
    # exist_ok keeps this safe when several worker threads get here at once.
    output_dir = os.path.dirname(image_file_name)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    pix = page.get_pixmap(dpi=200)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    # NOTE(review): `quality` is a JPEG-style option; for the .png files the
    # caller writes it presumably has no effect — confirm.
    img.save(image_file_name, quality=100)

def validate_text_existence(page_text, page_number, lighthouse):
    """Check that the values extracted for one lighthouse occur in the page text.

    Every checked value is turned into the string expected on the page
    (height, sector angles and range use a decimal comma). A string shared by
    several fields has to occur on the page at least that many times.

    Args:
        page_text: Text of the page the lighthouse was extracted from.
        page_number: Zero-based page index; reported one-based in messages.
        lighthouse: Extracted lighthouse record (dict).

    Returns:
        A list of error messages, one per field whose value does not occur
        often enough; empty when every value is accounted for.
    """
    def with_decimal_comma(number):
        return str(number).replace(".", ",")

    expected = {
        "latitude_minutes": str(lighthouse['latitude']['minutes']),
        "latitude_degrees": str(lighthouse['latitude']['degrees']),
        "longitude_minutes": str(lighthouse['longitude']['minutes']),
        "longitude_degrees": str(lighthouse['longitude']['degrees']),
        "height": with_decimal_comma(lighthouse['height']),
    }

    for index, sector in enumerate(lighthouse['sectors']):
        expected[f"{index}_start"] = with_decimal_comma(sector['start'])
        expected[f"{index}_stop"] = with_decimal_comma(sector['stop'])

    if "pattern" in lighthouse:
        expected["pattern"] = lighthouse["pattern"]

    if "range" in lighthouse:
        light_range = lighthouse["range"]
        whole_part = int(light_range)
        # A whole-number range is expected on the page without decimals.
        if light_range == whole_part:
            expected["range"] = str(whole_part)
        else:
            expected["range"] = with_decimal_comma(light_range)

    # How many fields share each expected string.
    required_occurrences = {}
    for text in expected.values():
        required_occurrences[text] = required_occurrences.get(text, 0) + 1

    return [
        f"Value {field} missing at least once for {lighthouse['name']} on page {page_number+1} (value is {text} should appear {required_occurrences[text]} times)"
        for field, text in expected.items()
        if page_text.count(text) < required_occurrences[text]
    ]

def validate_extracted_lighthouses(page, page_number, lighthouses):
    """Validate every lighthouse extracted from a page against the page's text.

    Args:
        page: Page object providing ``get_text()``.
        page_number: Zero-based page index, forwarded to the per-record check.
        lighthouses: Iterable of extracted lighthouse records.

    Returns:
        All error messages from ``validate_text_existence``, in record order.
    """
    text_on_page = page.get_text()
    return [
        problem
        for record in lighthouses
        for problem in validate_text_existence(text_on_page, page_number, record)
    ]

import re
import random

def ensure_space_before_rgw(input_string):
    """Insert a space before R, G or W when it directly follows another letter.

    The language model sometimes returns e.g. "QW" where the source has "Q W".
    A letter that follows another R, G or W is left alone, so runs such as
    "WRG" are not split.
    """
    # Zero-width position: previous char is a letter, next char is R/G/W,
    # and the previous char is not itself R/G/W.
    boundary = re.compile(r'(?<=[a-zA-Z])(?=[RGW])(?<![RGW])')
    return boundary.sub(' ', input_string)
    
def _normalize_flash_pattern(pattern):
    """Repair known misreadings in a flash pattern string returned by the model."""
    # Lower case L and upper case I look alike, so the models confuse them.
    # "FI" is not a valid flash pattern; replace it with the valid "Fl".
    pattern = pattern.replace("FI", "Fl")
    # Sometimes "lys" is clipped so it looks like "lvs".
    pattern = pattern.replace("lvs", "lys")
    return ensure_space_before_rgw(pattern)


def parse_lighthouses_for_page(document, page_number, max_attempts=20):
    """Render one PDF page, extract its lighthouses with Gemini and validate them.

    Args:
        document: Open PDF document providing ``load_page``.
        page_number: Zero-based index of the page to parse.
        max_attempts: Maximum number of Gemini calls before giving up. The
            loop used to retry forever, which hung on pages that always fail.

    Returns:
        A tuple ``(lighthouses_on_page, errors)``: the extracted records with
        normalised flash patterns, and the validation error messages.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        Exception: The error of the last Gemini call when all attempts fail.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    image_file_name = f"pages/page{page_number}.png"
    page = document.load_page(page_number)

    convert_pdf_page_to_image(page, image_file_name)
    # Alternative backend:
    # lighthouses_on_page = extract_lighthouses_using_gpt(image_file_name)

    backoff_ms = 1
    maximum_backoff_ms = 32000
    for attempt in range(1, max_attempts + 1):
        # Use a random region for every attempt.
        location = random.choice(locations)
        try:
            lighthouses_on_page = extract_lighthouses_using_gemini(image_file_name, location)
            break
        except Exception as e:
            print(f"Error calling gemini on {location}: {e}", flush=True)
            if attempt == max_attempts:
                # Give up and let the caller see the last error.
                raise
            # Exponential backoff plus up to one second of random jitter,
            # capped at maximum_backoff_ms.
            wait_time = min(maximum_backoff_ms, backoff_ms + random.randint(1, 1000))
            time.sleep(wait_time / 1000)
            backoff_ms *= 2

    for lighthouse in lighthouses_on_page:
        if "pattern" in lighthouse:
            lighthouse["pattern"] = _normalize_flash_pattern(lighthouse["pattern"])

    errors = validate_extracted_lighthouses(page, page_number, lighthouses_on_page)
    return lighthouses_on_page, errors

# pdf_path = "Fyrliste_HeleLandet.pdf"
# document = pymupdf.open(pdf_path)
# lighthouses_on_page, errors = parse_lighthouses_for_page(document, 29)
# errors

In [None]:
def parse_page(document, page_number):
    """Parse a page only if it contains all lighthouse table headings.

    Args:
        document: Open PDF document providing ``load_page``.
        page_number: Zero-based index of the page.

    Returns:
        ``([], [])`` when any of the required headings is absent from the
        page text; otherwise the result of ``parse_lighthouses_for_page``.
    """
    required_headings = ("Lysvidde", "Fyrnr.", "Kartnr.")
    page_text = document.load_page(page_number).get_text()

    # Skip pages that lack any one of the headings.
    if any(heading not in page_text for heading in required_headings):
        return [], []

    return parse_lighthouses_for_page(document, page_number)

In [17]:
import concurrent.futures
# Parse every page of the PDF concurrently and collect the results.
#
# NOTE(review): one pymupdf document is shared by all worker threads. The
# PyMuPDF documentation reportedly does not support multi-threaded use —
# confirm this is safe, or serialize the PDF access.
MAX_WORKERS = 30

all_errors = []
all_lighthouses = []
# Zero-based numbers of pages whose parsing raised. Without this list a failed
# page cannot be told apart from a page with no lighthouses (both leave 0 in
# the per-page lists below), and the saved results would be silently incomplete.
failed_pages = []

pdf_path = "Fyrliste_HeleLandet.pdf"
document = pymupdf.open(pdf_path)

page_count = len(document)
lighthouses_per_page = [0] * page_count
errors_per_page = [0] * page_count

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Map each future back to its page number so results can be attributed.
    futures = {executor.submit(parse_page, document, page_number): page_number for page_number in range(page_count)}

    for future in concurrent.futures.as_completed(futures):
        page_number = futures[future]
        try:
            lighthouses_on_page, errors = future.result()
        except Exception as e:
            print(f"Error parsing page {page_number + 1}: {e}")
            failed_pages.append(page_number)
            continue

        print(f"Found {len(lighthouses_on_page)} lighthouses and {len(errors)} errors on page {page_number+1}")
        all_errors.extend(errors)
        all_lighthouses.extend(lighthouses_on_page)
        lighthouses_per_page[page_number] = len(lighthouses_on_page)
        errors_per_page[page_number] = len(errors)

failed_pages.sort()
if failed_pages:
    # One-based page numbers, matching the other messages.
    print(f"{len(failed_pages)} page(s) failed and are missing from the results: {[p + 1 for p in failed_pages]}")


Found 0 lighthouses and 0 errors on page 3
Found 0 lighthouses and 0 errors on page 28
Found 0 lighthouses and 0 errors on page 15
Found 0 lighthouses and 0 errors on page 16
Found 0 lighthouses and 0 errors on page 29
Found 0 lighthouses and 0 errors on page 7
Found 0 lighthouses and 0 errors on page 19
Found 0 lighthouses and 0 errors on page 5
Found 0 lighthouses and 0 errors on page 8
Found 0 lighthouses and 0 errors on page 18
Found 0 lighthouses and 0 errors on page 17
Found 0 lighthouses and 0 errors on page 24
Found 0 lighthouses and 0 errors on page 21
Found 0 lighthouses and 0 errors on page 25
Found 0 lighthouses and 0 errors on page 4
Found 0 lighthouses and 0 errors on page 26
Found 0 lighthouses and 0 errors on page 23
Found 0 lighthouses and 0 errors on page 11
Found 0 lighthouses and 0 errors on page 9
Found 0 lighthouses and 0 errors on page 6
Found 0 lighthouses and 0 errors on page 14
Found 0 lighthouses and 0 errors on page 10
Found 0 lighthouses and 0 errors on pag

KeyboardInterrupt: 

Error calling gemini on asia-east2: The model response did not complete successfully.
Finish reason: 4.
Finish message: .
Safety ratings: [category: HARM_CATEGORY_HATE_SPEECH
probability: MEDIUM
probability_score: 0.435546875
severity: HARM_SEVERITY_NEGLIGIBLE
severity_score: 0.0927734375
, category: HARM_CATEGORY_DANGEROUS_CONTENT
probability: NEGLIGIBLE
probability_score: 0.21484375
severity: HARM_SEVERITY_LOW
severity_score: 0.30078125
, category: HARM_CATEGORY_HARASSMENT
probability: NEGLIGIBLE
probability_score: 0.455078125
severity: HARM_SEVERITY_LOW
severity_score: 0.3984375
, category: HARM_CATEGORY_SEXUALLY_EXPLICIT
probability: NEGLIGIBLE
probability_score: 0.142578125
severity: HARM_SEVERITY_NEGLIGIBLE
severity_score: 0.135742188
].
To protect the integrity of the chat session, the request and response were not added to chat history.
To skip the response validation, specify `model.start_chat(response_validation=False)`.
Note that letting blocked or otherwise incomplete respo

In [None]:
# Parse a single page (zero-based index 29) and display the extracted lighthouses.
lighthouses_on_page, errors = parse_page(document, 29)
lighthouses_on_page

In [18]:
# Write the collected lighthouses and validation errors to JSON files.
for output_path, payload in (
    ("parsed_lighthouses.json", all_lighthouses),
    ("parsed_lighthouse_errors.json", all_errors),
):
    with open(output_path, "w") as f:
        json.dump(payload, f)

In [None]:
# Display every validation error message collected while parsing the pages.
all_errors

In [19]:
# Total number of lighthouse records collected across all pages.
len(all_lighthouses)

7327

In [None]:
# Show every extracted record whose name is "Slottsfjellet".
[lighthouse for lighthouse in all_lighthouses if lighthouse['name'] == "Slottsfjellet"]

In [None]:
# TODO: Also check each sector's color against its start angle, since the extraction sometimes seemed to get these wrong.
# Such a check could produce a list of potential problems that we can/should fix manually before updating the lighthouses.json file.