In [None]:
import json

In [None]:
import os

import pymupdf
from PIL import Image

# A page is only rendered when ALL of these strings occur in its text.
search_text = ["Lysvidde", "Fyrnr.", "Kartnr."]

pdf_path = "Fyrliste_HeleLandet.pdf"

# Rendered pages are written here. Create the directory up front so that
# img.save() does not fail with FileNotFoundError when it is missing.
output_dir = "pages"
os.makedirs(output_dir, exist_ok=True)

# Only the first `max_pages` pages are scanned. The loop clamps this to the
# real page count so a shorter PDF does not make load_page() fail.
max_pages = 140

# The context manager closes the document even if a page fails to render.
with pymupdf.open(pdf_path) as doc:
    for page_num in range(min(max_pages, len(doc))):
        page = doc.load_page(page_num)

        # Extract text from the page
        text = page.get_text()

        # Skip pages that lack any one of the search strings
        should_parse_page = all(needle in text for needle in search_text)
        if not should_parse_page:
            continue

        print(f"Text found on page {page_num + 1}")

        # Convert the page to an image
        pix = page.get_pixmap(dpi=200)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # Save the image under a 1-based page number
        img_path = f"{output_dir}/page_{page_num + 1}.png"
        img.save(img_path, quality=100)


In [38]:
def convert_pdf_page_to_image(page, image_file_name):
    """Render `page` at 200 dpi and save the result to `image_file_name`.

    The pixmap's raw samples are wrapped in a Pillow "RGB" image, which is
    then written to disk. Nothing is returned.
    """
    pixmap = page.get_pixmap(dpi=200)
    dimensions = [pixmap.width, pixmap.height]
    Image.frombytes("RGB", dimensions, pixmap.samples).save(image_file_name, quality=100)

def _lighthouse_response_schema():
    """Return the JSON schema the model's structured response must follow."""
    # Latitude and longitude share the same degrees/minutes object shape and
    # differ only in the range stated in the description.
    def coordinate(axis, degree_range):
        return {
            'type': 'object',
            'properties': {
                'degrees': {
                    'type': 'integer',
                    'description': f'Degrees of {axis}, ranging from {degree_range}.',
                },
                'minutes': {
                    'type': 'number',
                    'description': f'Minutes of {axis}, ranging from 0 to 60.',
                },
            },
            'required': [
                'degrees',
                'minutes',
            ],
            'additionalProperties': False,
        }

    sector = {
        'type': 'object',
        'properties': {
            'color': {
                'type': 'string',
            },
            'start': {
                'type': 'number',
            },
            'stop': {
                'type': 'number',
            },
            'description': {
                'type': 'string',
            },
        },
        'required': [
            'color',
            'start',
            'stop',
            'description',
        ],
        'additionalProperties': False,
    }

    lighthouse = {
        'type': 'object',
        'properties': {
            'latitude': coordinate('latitude', '-90 to 90'),
            'longitude': coordinate('longitude', '-180 to 180'),
            'pattern': {
                'type': 'string',
            },
            'description': {
                'type': 'string',
            },
            'heightOverGround': {
                'type': 'number',
            },
            'height': {
                'type': 'number',
            },
            'sectors': {
                'type': 'array',
                'items': sector,
            },
            'area': {
                'type': 'string',
            },
            'name': {
                'type': 'string',
            },
            'location': {
                'type': 'string',
            },
            'maxRange': {
                'type': 'number',
            },
        },
        'required': [
            'latitude',
            'longitude',
            'pattern',
            'description',
            'heightOverGround',
            'height',
            'sectors',
            'name',
            'location',
            'area',
            'maxRange',
        ],
        'additionalProperties': False,
    }

    return {
        '$schema': 'http://json-schema.org/draft-07/schema#',
        'type': 'object',
        'properties': {
            'items': {
                'type': 'array',
                'items': lighthouse,
            },
        },
        'required': [
            'items',
        ],
        'additionalProperties': False,
    }


def _image_data_url(image_file_name):
    """Read an image file and return it as a base64 `data:` URL."""
    import base64
    import mimetypes

    # `with` guarantees the handle is closed once the bytes are read.
    with open(image_file_name, "rb") as image_file:
        image_bytes = image_file.read()

    # Label the payload with the type implied by the file extension (the
    # notebook renders pages as .png). Fall back to the previously hard-coded
    # JPEG label when the extension is unknown or not an image type.
    mime_type = mimetypes.guess_type(image_file_name)[0]
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    return f"data:{mime_type};base64," + base64.b64encode(image_bytes).decode("utf-8")


def extract_lighthouses_using_gpt(image_file_name):
    """Send a page image to the OpenAI chat completions API and return the
    lighthouses it extracts.

    Args:
        image_file_name: Path of the rendered page image to upload.

    Returns:
        The value of the `items` key of the model's JSON response; the
        response schema declares it as an array of lighthouse objects.

    Raises:
        KeyError: If the OPENAI_API_KEY environment variable is not set.
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer in time.
        json.JSONDecodeError: If the returned message content is not valid JSON.
    """
    import requests
    import os
    import json

    base64_str = _image_data_url(image_file_name)

    # Read the key once. A missing key raises KeyError here rather than
    # sending a request with an empty bearer token.
    openai_api_key = os.environ["OPENAI_API_KEY"]
    headers = {
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + openai_api_key,
    }

    json_data = {
        'model': 'gpt-4o-2024-08-06',
        'messages': [
            {
                'role': 'system',
                'content': [
                    {
                        'type': 'text',
                        'text': 'Generate JSON for these lighthouses. When reading sectors, use the sector color column with a single letter. Ensure that all sectors are included.',
                    },
                ],
            },
            {
                'role': 'user',
                'content': [
                    {
                        'type': 'image_url',
                        'image_url': {
                            'url': base64_str
                        },
                    },
                ],
            },
        ],
        'response_format': {
            'type': 'json_schema',
            'json_schema': {
                'name': 'lighthouse_response',
                'strict': True,
                'schema': _lighthouse_response_schema(),
            },
        },
        'temperature': 0.0,
        'max_tokens': 4096,
    }

    # requests has no default timeout; without one a stalled connection would
    # block the notebook indefinitely.
    response = requests.post(
        'https://api.openai.com/v1/chat/completions',
        headers=headers,
        json=json_data,
        timeout=300,
    )
    # Surface API failures (bad key, rate limit, ...) as an HTTP error instead
    # of an opaque KeyError on 'choices' below.
    response.raise_for_status()

    data = response.json()
    lighthouses_on_page_str = data['choices'][0]['message']['content']
    lighthouses_on_page = json.loads(lighthouses_on_page_str)
    return lighthouses_on_page['items']

def validate_text_existence(page_text, lighthouse, page_number=None):
    """Check that the numeric values of one extracted lighthouse occur in the page text.

    The latitude/longitude degrees and minutes are searched for as `str(value)`.
    Each sector's start and stop are searched for with "." replaced by ",".
    When the same string is expected several times (e.g. one sector's stop
    equals the next sector's start), it must occur on the page at least that
    many times.

    Args:
        page_text: Text of the page the lighthouse was extracted from.
        lighthouse: Dict with 'name', 'latitude', 'longitude' and 'sectors' keys.
        page_number: Optional zero-based page index. When given, messages
            report it as `page_number + 1`; when omitted, the page is left
            out of the message.

    Returns:
        A list of error messages, one per expected value that occurs too
        rarely in `page_text`. The list is empty when everything was found.
    """
    from collections import Counter

    needles = {
        "latitude_minutes": str(lighthouse['latitude']['minutes']),
        "latitude_degrees": str(lighthouse['latitude']['degrees']),
        "longitude_minutes": str(lighthouse['longitude']['minutes']),
        "longitude_degrees": str(lighthouse['longitude']['degrees'])
    }

    for sector_index, sector in enumerate(lighthouse['sectors']):
        needles[f"{sector_index}_start"] = str(sector['start']).replace(".", ",")
        needles[f"{sector_index}_stop"] = str(sector['stop']).replace(".", ",")

    # How many times each distinct string is expected on the page.
    minimum_value_count = Counter(needles.values())

    # The page is passed in explicitly rather than read from a notebook
    # global, so the function also works outside the cell that defines one.
    page_part = f" on page {page_number+1}" if page_number is not None else ""

    errors = []
    for key, value in needles.items():
        if page_text.count(value) < minimum_value_count[value]:
            errors.append(f"Value {key} missing at least once for {lighthouse['name']}{page_part} (value is {value} should appear {minimum_value_count[value]} times)")
    return errors

def validate_extracted_lighthouses(page, lighthouses):
    """Validate every extracted lighthouse against the text of `page`.

    Args:
        page: Page object providing `get_text()`. Its `number` attribute, if
            present, is used as the zero-based page index in messages.
        lighthouses: Iterable of lighthouse dicts extracted from that page.

    Returns:
        The concatenated error messages of all lighthouses, in input order.
    """
    page_text = page.get_text()
    page_number = getattr(page, "number", None)

    errors = []
    for lighthouse in lighthouses:
        errors.extend(validate_text_existence(page_text, lighthouse, page_number))

    return errors

# Run the full extract-and-validate pipeline on a single page of the PDF.
pdf_path = "Fyrliste_HeleLandet.pdf"
document = pymupdf.open(pdf_path)

image_file_name = "pages/page.png"
# Make sure the target directory exists; saving the rendered page fails
# with FileNotFoundError otherwise.
os.makedirs(os.path.dirname(image_file_name), exist_ok=True)

# Zero-based index of the page to process (error messages report it as
# page_number + 1).
page_number = 45
page = document.load_page(page_number)

convert_pdf_page_to_image(page, image_file_name)
lighthouses_on_page = extract_lighthouses_using_gpt(image_file_name)
errors = validate_extracted_lighthouses(page, lighthouses_on_page)
# Last expression of the cell: display the validation messages.
errors

['Value 5_stop missing at least once for Garnholmen on page 46 (value is 9,0 should appear 2 times)',
 'Value 6_start missing at least once for Garnholmen on page 46 (value is 9,0 should appear 2 times)',
 'Value 1_stop missing at least once for Slevik on page 46 (value is 32,3 should appear 1 times)']

In [None]:
# Also remember to cross-check each sector's color against its starting angle, since the extraction seemed to get these wrong sometimes.
# These checks can be used to generate a list of potential problems that we can/should fix manually before updating the lighthouses.json file.