In [1]:
import base64
import io
import os

import fitz  # PyMuPDF
import requests
from PIL import Image
from PyPDF2 import PdfReader

In [None]:
# Run configuration: input PDF, OpenAI API key and output .tex path.
# Raw string: in a normal literal "\T" and "\p" are invalid escape sequences
# (SyntaxWarning on recent Python versions); the resulting value is unchanged.
uploaded_file = r"D:\Try21\pages_153_to_205.pdf"
# Taken from the environment so the key never has to be saved in the notebook;
# falls back to the previous empty-string placeholder when the variable is unset.
api_key = os.environ.get("OPENAI_API_KEY", "")
output_tex_file = "D:/H5_pages_153_to_205.tex"

In [3]:


def create_message(content: str, role: str) -> dict:
    """Build a chat message mapping.

    Args:
        content: Value stored under the ``"content"`` key.
        role: Value stored under the ``"role"`` key.

    Returns:
        A dict with the keys ``"content"`` and ``"role"``, in that order.
    """
    message = dict(content=content, role=role)
    return message

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to ``str`` as UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def get_response(messages, api_key, model="gpt-4o", max_tokens=4000, timeout=(10, 300)):
    """POST a chat-completions request to the OpenAI API and return the parsed JSON.

    Args:
        messages: List of chat message dicts, sent as the ``"messages"`` field.
        api_key: Key placed in the ``Authorization: Bearer`` header.
        model: Model name sent in the payload.
        max_tokens: Value of the ``"max_tokens"`` payload field.
        timeout: Passed to ``requests.post``; a ``(connect, read)`` tuple in
            seconds, or a single number. Without a timeout a stalled
            connection would block the caller indefinitely.

    Returns:
        The decoded JSON body of the HTTP response. The HTTP status is not
        checked here, so an API error payload is returned as-is for the
        caller to inspect.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout,
            or when the response body is not valid JSON.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    return response.json()

def _image_to_png_data_url(image_bytes):
    """Re-encode raw image bytes as PNG and return them as a base64 ``data:`` URL."""
    image = Image.open(io.BytesIO(image_bytes))
    # The PNG writer cannot store every Pillow mode (e.g. CMYK); convert those to RGB.
    if image.mode not in ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"):
        image = image.convert("RGB")
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{encoded}"


def process_pdf(uploaded_file, api_key, output_tex_file):
    """Translate a PDF page by page via the chat API and write one LaTeX file.

    For every page the extracted text and the page's embedded images are sent
    to ``get_response``; the returned content is appended between
    ``% Page N Content START`` / ``END`` markers. A page whose request fails or
    returns no usable content is replaced by a ``% Page N skipped due to error.``
    comment and processing continues.

    Images are re-encoded to PNG in memory and attached to the user message as
    ``image_url`` content parts (no temporary files are written). An image that
    cannot be extracted or decoded is skipped with a printed notice.

    Args:
        uploaded_file: Path of the PDF, passed to ``fitz.open``.
        api_key: API key forwarded to ``get_response``.
        output_tex_file: Path of the ``.tex`` file to write (UTF-8, overwritten).

    Returns:
        None. The result is written to ``output_tex_file``.

    Raises:
        Whatever ``fitz.open`` raises when the PDF cannot be opened, and
        ``OSError`` if the output file cannot be written.
    """
    user_prompt_template = """
    I have a PDF document in Dutch that includes text, tables, and images across multiple pages. 
    I need assistance with several tasks for each page of the document. 
    First, translate all the text from Dutch to English. 
    For any figures that depict tables, interpret and extract the data, then represent this data as an editable text table in English. 
    Additionally, provide descriptions and relevant interpretations for other images within the document. If not possible to make them as a table, instead of leaving a placeholder for these images/figures. 
    If a figure image was as a table and you replicated it as a table, you should NOT write a placeholder for that image.
    This will ensure that both the textual and visual content are fully accessible and usable in English.
    Also, I want you to just directly go respond, and you don't need, for example, to say that this is translated.
    We need to replicate each page.
    Finally, everything needs to be written in LaTeX format, i.e., if it is a section write as a section if it is a table write as a table.

    PS. Document class article, usepackage graphicx begin document, and end are all already defined and you shouldn't write them
    text: {text}
    """

    # The system message is identical for every page, so build it once.
    # (The former "shouldn't not write them" double negative inverted the instruction.)
    system_message = create_message(
        "I have a PDF document in Dutch that includes text, tables, and images across multiple pages. I need assistance with several tasks for each page of the document. First, translate all the text from Dutch to English. For any images that depict tables, interpret and extract the data, then represent this data as an editable text table in English. Additionally, provide descriptions and relevant interpretations for other images within the document. If a figure image was as a table and you replicated it as a table, you should NOT write a placeholder for that image. This will ensure that both the textual and visual content are fully accessible and usable in English. Also, I want you to just directly respond, and you don't need for example to say that this is translated. We need to replicate each page. Finally, everything needs to be written in latex format, i mean if it is section write as section if it is table write as table. ps. document class article, usepackage graphicx begin document, end are all already defined and you shouldn't write them ",
        "system"
    )

    # Collect all LaTeX fragments; the preamble comes first.
    latex_parts = ["""
    \\documentclass{article}
    \\usepackage{geometry}
    \\usepackage{array}
    \\usepackage{graphicx}
    \\usepackage{color}

    \\begin{document}
    """]

    # The context manager closes the document even if a page raises unexpectedly.
    with fitz.open(uploaded_file) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            text = page.get_text("text")

            # Convert the page's embedded images to data URLs, entirely in memory.
            image_urls = []
            for img_index, img in enumerate(page.get_images(full=True)):
                try:
                    base_image = doc.extract_image(img[0])  # img[0] is the xref
                    image_urls.append(_image_to_png_data_url(base_image["image"]))
                except Exception as e:
                    # One unreadable image must not abort the whole document.
                    print(f"Skipping image {img_index} on page {page_num + 1}: {e}")

            prompt = user_prompt_template.format(text=text)
            if image_urls:
                # Images have to be sent as image_url content parts; a base64
                # string placed in a text message is not treated as an image.
                content = [{"type": "text", "text": prompt}]
                content.extend(
                    {"type": "image_url", "image_url": {"url": url}}
                    for url in image_urls
                )
                user_message = {"content": content, "role": "user"}
            else:
                user_message = create_message(content=prompt, role="user")

            try:
                response = get_response([system_message, user_message], api_key)
                choices = response.get("choices") if isinstance(response, dict) else None
                if not choices:
                    raise RuntimeError(f"unexpected API response: {response}")
                translated_content = choices[0]["message"]["content"]
                if not isinstance(translated_content, str):
                    raise RuntimeError(f"response has no text content: {response}")
            except Exception as e:
                print(f"Error processing page {page_num + 1}: {e}")
                latex_parts.append(f"% Page {page_num + 1} skipped due to error.\n")
            else:
                latex_parts.append(f"% Page {page_num + 1} Content START\n")
                latex_parts.append(translated_content)
                latex_parts.append(f"\n% Page {page_num + 1} Content END\n\n")

    # Add LaTeX ending
    latex_parts.append("\\end{document}")

    # Save the LaTeX content to a file
    with open(output_tex_file, "w", encoding="utf-8") as f:
        f.write("".join(latex_parts))


In [14]:


# Run the conversion with the values set in the configuration cell.
process_pdf(
    uploaded_file=uploaded_file,
    api_key=api_key,
    output_tex_file=output_tex_file,
)


In [None]:
# Split PDF parts -- presumably the candidates for `uploaded_file` (TODO confirm).
# Raw strings: "\T", "\p" and "\H" are invalid escape sequences in normal
# literals (SyntaxWarning on recent Python versions); the values are unchanged.
r"D:\Try21\pages_1_to_9.pdf"
r"D:\Try21\pages_10_to_10.pdf"
r"D:\Try21\pages_11_to_18.pdf"
r"D:\Try21\H1_pages_19_to_49.pdf"
r"D:\Try21\H2_pages_50_to_67.pdf"
r"D:\Try21\H3_pages_68_to_124.pdf"
r"D:\Try21\H4_pages_125_to_152.pdf"
r"D:\Try21\H5_pages_153_to_205.pdf"
r"D:\Try21\H6_pages_206_to_267.pdf"
r"D:\Try21\H7_pages_268_to_349.pdf"
r"D:\Try21\H8_pages_350_to_409.pdf"
r"D:\Try21\H9_pages_410_to_503.pdf"
r"D:\Try21\H10_pages_504_to_508.pdf"
r"D:\Try21\H11_pages_509_to_638.pdf"
r"D:\Try21\H12_pages_639_to_659.pdf"
r"D:\Try21\H13_pages_660_to_660.pdf"
r"D:\Try21\H14_pages_661_to_672.pdf"
r"D:\Try21\H15_pages_673_to_745.pdf"
r"D:\Try21\H16_pages_746_to_750.pdf"
r"D:\Try21\H17_pages_751_to_752.pdf"
r"D:\Try21\pages_753_to_769.pdf"
r"D:\Try21\pages_770_to_776.pdf"
r"D:\Try21\pages_777_to_794.pdf"
r"D:\Try21\pages_795_to_798.pdf"
r"D:\Try21\pages_799_to_800.pdf"
r"D:\Try21\pages_801_to_801.pdf"
r"D:\Try21\pages_802_to_823.pdf"
r"D:\Try21\pages_824_to_826.pdf"
r"D:\Try21\pages_827_to_847.pdf"
r"D:\Try21\pages_848_to_850.pdf"
r"D:\Try21\pages_851_to_860.pdf"
r"D:\Try21\pages_861_to_902.pdf"
r"D:\Try21\pages_903_to_911.pdf"
r"D:\Try21\pages_912_to_1006.pdf"
r"D:\Try21\pages_1007_to_1052.pdf"
r"D:\Try21\pages_1053_to_1054.pdf"
r"D:\Try21\pages_1055_to_1071.pdf"
r"D:\Try21\pages_1072_to_1089.pdf"
r"D:\Try21\pages_1090_to_1093.pdf"
r"D:\Try21\pages_1094_to_1097.pdf"
r"D:\Try21\pages_1098_to_1108.pdf"
r"D:\Try21\pages_1109_to_1110.pdf"
r"D:\Try21\pages_1111_to_1112.pdf"
r"D:\Try21\pages_1113_to_1114.pdf"
r"D:\Try21\pages_1115_to_1126.pdf"
r"D:\Try21\pages_1127_to_1131.pdf"
