In [1]:
# Libraries
import os
from pathlib import Path

from mistralai import Mistral, DocumentURLChunk
from mistralai.models import OCRResponse

In [None]:
# API key
# Read from the MISTRAL_API_KEY environment variable so the secret is not stored
# in the notebook; the "_____" placeholder is kept as the fallback for manual entry.
api_key = os.environ.get("MISTRAL_API_KEY", "_____")

In [None]:
# File path: replace the "_____" placeholder with the path of the PDF to process
pdf_file = Path("_____")

In [4]:
# Mistral client, created with the API key set in the earlier cell
client = Mistral(api_key=api_key)

In [5]:
# Verifies the file exists before anything is uploaded.
# Raises instead of printing so that a full run stops here with a clear message
# rather than failing later when the file is read for upload. An explicit check
# is used because `assert` statements are skipped when Python runs with -O.
if not pdf_file.is_file():
    raise FileNotFoundError(f"File [{pdf_file}] does not exist.")

In [6]:
# Uploads the PDF file to Mistral's servers and stores the upload response
# NOTE(review): "file_name" is sent without the file's suffix — confirm the API does not need the extension
uploaded_file = client.files.upload(
    file={
        "file_name": pdf_file.stem,  # final path component without its suffix
        "content": pdf_file.read_bytes(),  # the whole file is read into memory
    },
    purpose="ocr",
)

In [7]:
# Stores a temporary signed URL that allows access to the uploaded PDF file on Mistral's servers
# NOTE(review): expiry=1 is presumably expressed in hours — confirm against the Mistral files API
signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

In [8]:
# Runs OCR on the uploaded PDF (referenced through its signed URL) and stores the response
# include_image_base64=True requests the image data alongside the text; the pages'
# images are read later through their `image_base64` attribute
pdf_response = client.ocr.process(
    document=DocumentURLChunk(document_url=signed_url.url),
    model="mistral-ocr-latest",
    include_image_base64=True
)

In [9]:
# Replaces the image placeholders in the Markdown string with their corresponding base64-encoded image data
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """Inline image data into a Markdown string.

    Every occurrence of ``![name](name)`` is rewritten to ``![name](data)``,
    where ``name`` is a key of ``images_dict`` and ``data`` is its value.

    Args:
        markdown_str: Markdown text containing ``![name](name)`` placeholders.
        images_dict: Mapping of image name to the string that should become
            the link target (here, base64-encoded image data).

    Returns:
        The Markdown text with the placeholders of all images that have data
        replaced. Placeholders whose image has no data are left untouched.
    """
    for img_name, base64_str in images_dict.items():
        # An image without data (None or empty) would otherwise be written as a
        # broken "![name](None)" link, so its placeholder is kept as is.
        if not base64_str:
            continue

        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )

    # Returns the modified Markdown string
    return markdown_str

In [10]:
# Builds a single Markdown string from the OCR response, with each page's images embedded in its text
def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """Merge all pages of an OCR response into one Markdown string.

    For every page, a mapping of image id to base64 image data is built from
    the page's images and handed to ``replace_images_in_markdown`` together
    with the page's Markdown. The resulting page strings are joined with a
    blank line between them.

    Args:
        ocr_response: OCR response whose ``pages`` provide ``markdown`` and
            ``images`` (each image providing ``id`` and ``image_base64``).

    Returns:
        The Markdown of all pages, in order, separated by blank lines.
    """
    rendered_pages: list[str] = [
        replace_images_in_markdown(
            page.markdown,
            # Image lookup is rebuilt per page: id -> base64 data
            {image.id: image.image_base64 for image in page.images},
        )
        for page in ocr_response.pages
    ]

    # Pages are separated by one empty line
    return "\n\n".join(rendered_pages)

In [11]:
# Get the combined Markdown content (text plus inlined images) from the OCR response
combined_markdown = get_combined_markdown(pdf_response)

In [12]:
# Name of the Markdown file: the PDF's name without its suffix, plus ".md"
# (a bare file name, so it is resolved against the current working directory)
output_file = pdf_file.stem + '.md'

In [13]:
# Save the combined Markdown content to the output file as UTF-8 text
# (mode 'w' replaces the file if it already exists)
with Path(output_file).open('w', encoding='utf-8') as markdown_out:
    markdown_out.write(combined_markdown)

In [14]:
# Print the name of the saved Markdown file (relative to the current working directory)
print(f"Markdown content has been saved to: {output_file}")

Markdown content has been saved to: Grazed Foundation (Profile and Proposal).V1 02192025.md
