In [None]:
import os

import pymupdf
from PIL import Image

# Render every PDF page that contains ALL of the strings below to a PNG file.
search_text = ["Lysvidde", "Fyrnr.", "Kartnr."]

# Input PDF and the folder that receives the rendered page images.
pdf_path = "Fyrliste_HeleLandet.pdf"
output_dir = "pages"

# Upper bound on the number of pages scanned; set to None to scan the whole document.
max_pages = 140

# Image.save() does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

doc = pymupdf.open(pdf_path)
try:
    # Never ask for a page index beyond the end of the document.
    page_count = len(doc) if max_pages is None else min(max_pages, len(doc))

    for page_num in range(page_count):
        page = doc.load_page(page_num)

        # Extract text from the page
        text = page.get_text()

        # Only pages containing every search string are rendered.
        should_parse_page = all(needle in text for needle in search_text)
        if not should_parse_page:
            continue

        # Page numbers are reported and saved 1-based.
        print(f"Text found on page {page_num + 1}")

        # Convert the page to an image
        pix = page.get_pixmap(dpi=200)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # PNG is lossless, so no quality setting is passed.
        img_path = f"{output_dir}/page_{page_num + 1}.png"
        img.save(img_path)
finally:
    # Close the PDF document even if rendering a page failed.
    doc.close()


In [None]:
def _coordinate_schema(axis, max_degrees):
    """Return the JSON-schema fragment for one coordinate (integer degrees + numeric minutes)."""
    return {
        'type': 'object',
        'properties': {
            'degrees': {
                'type': 'integer',
                'description': f'Degrees of {axis}, ranging from -{max_degrees} to {max_degrees}.',
            },
            'minutes': {
                'type': 'number',
                'description': f'Minutes of {axis}, ranging from 0 to 60.',
            },
        },
        'required': [
            'degrees',
            'minutes',
        ],
        'additionalProperties': False,
    }


def _lighthouse_response_schema():
    """Return the JSON schema sent as the strict `response_format` of the request."""
    sector_schema = {
        'type': 'object',
        'properties': {
            'color': {'type': 'string'},
            'start': {'type': 'number'},
            'stop': {'type': 'number'},
            'description': {'type': 'string'},
        },
        'required': [
            'color',
            'start',
            'stop',
            'description',
        ],
        'additionalProperties': False,
    }
    lighthouse_schema = {
        'type': 'object',
        'properties': {
            'latitude': _coordinate_schema('latitude', 90),
            'longitude': _coordinate_schema('longitude', 180),
            'pattern': {'type': 'string'},
            'description': {'type': 'string'},
            'heightOverGround': {'type': 'number'},
            'height': {'type': 'number'},
            'sectors': {'type': 'array', 'items': sector_schema},
            'area': {'type': 'string'},
            'name': {'type': 'string'},
            'location': {'type': 'string'},
            'maxRange': {'type': 'number'},
        },
        'required': [
            'latitude',
            'longitude',
            'pattern',
            'description',
            'heightOverGround',
            'height',
            'sectors',
            'name',
            'location',
            'area',
            'maxRange',
        ],
        'additionalProperties': False,
    }
    return {
        '$schema': 'http://json-schema.org/draft-07/schema#',
        'type': 'object',
        'properties': {
            'items': {'type': 'array', 'items': lighthouse_schema},
        },
        'required': [
            'items',
        ],
        'additionalProperties': False,
    }


def extract_lighthouses_using_gpt(image_file_name, timeout=300):
    """Send one image to the OpenAI chat-completions API and return the reply text.

    The image is embedded in the request as a base64 data URL and the model is
    asked to answer with JSON matching the lighthouse schema built by
    `_lighthouse_response_schema`.

    Args:
        image_file_name: Path of the image file to upload.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The `content` string of the first choice in the API response. It is
        returned as text; the caller is expected to parse it.

    Raises:
        OSError: If the image file cannot be opened or read.
        KeyError: If the OPENAI_API_KEY environment variable is not set.
        RuntimeError: If the API answers with a non-success HTTP status.
    """
    import base64
    import mimetypes
    import os
    import warnings

    import requests

    # Read the image through a context manager so the handle is always closed.
    with open(image_file_name, "rb") as image_file:
        image_bytes = image_file.read()

    # Label the data URL with the file's real type (e.g. image/png for .png);
    # fall back to image/jpeg when the extension is unknown or not an image.
    mime_type = mimetypes.guess_type(str(image_file_name))[0]
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"
    base64_str = f"data:{mime_type};base64," + base64.b64encode(image_bytes).decode("utf-8")

    # Fail fast with KeyError when the key is missing rather than sending an
    # empty bearer token.
    openai_api_key = os.environ["OPENAI_API_KEY"]
    headers = {
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + openai_api_key,
    }

    json_data = {
        'model': 'gpt-4o-2024-08-06',
        'messages': [
            {
                'role': 'system',
                'content': [
                    {
                        'type': 'text',
                        'text': 'Generate JSON for these lighthouses. When reading sectors, use the sector color column with a single letter. Ensure that all sectors are included.',
                    },
                ],
            },
            {
                'role': 'user',
                'content': [
                    {
                        'type': 'image_url',
                        'image_url': {
                            'url': base64_str
                        },
                    },
                ],
            },
        ],
        'response_format': {
            'type': 'json_schema',
            'json_schema': {
                'name': 'lighthouse_response',
                'strict': True,
                'schema': _lighthouse_response_schema(),
            },
        },
        'temperature': 0.0,
        'max_tokens': 4096,
    }

    response = requests.post(
        'https://api.openai.com/v1/chat/completions',
        headers=headers,
        json=json_data,
        timeout=timeout,
    )
    # Surface API failures with the server's message instead of a KeyError on
    # the missing 'choices' key further down.
    if not response.ok:
        raise RuntimeError(
            f"OpenAI API request failed with HTTP {response.status_code}: {response.text}"
        )

    data = response.json()
    first_choice = data['choices'][0]
    # A reply cut off at max_tokens is still returned, but flagged, because the
    # JSON text is then most likely incomplete.
    if first_choice.get('finish_reason') == 'length':
        warnings.warn(
            "OpenAI response was truncated at max_tokens; the returned JSON is probably incomplete."
        )

    lighthouses_on_page = first_choice['message']['content']
    return lighthouses_on_page
# Run the extraction on one rendered sample page and show the raw text the model returned.
lighthouses_on_page = extract_lighthouses_using_gpt(
    image_file_name="pages/page_30.png",
)
print(lighthouses_on_page)

In [None]:
# `json` is otherwise only imported inside extract_lighthouses_using_gpt, so it
# has to be imported here for this cell to run on a fresh kernel.
import json

# Parse the JSON text returned by the model into Python objects.
lighthouses = json.loads(lighthouses_on_page)

In [None]:
# Display the 'sectors' list of the first parsed item as the cell output.
lighthouses['items'][0]['sectors']