# Doc To Graph

In [43]:
import base64
import os
from mistralai import Mistral
from mistralai.extra import response_format_from_pydantic_model
from dotenv import load_dotenv

In [44]:
# Load variables from a .env file into the process environment.
# Presumably this is where MISTRAL_API_KEY (read from os.environ further down) comes from.
load_dotenv()

True

In [45]:
from pydantic import BaseModel, Field

# Response format for bounding-box (BBOX) image annotations.
# This model is handed to response_format_from_pydantic_model() for the OCR
# request, so the Field descriptions below are runtime text, not comments.
# NOTE(review): documented with `#` comments rather than a class docstring,
# since pydantic may include a model docstring in the generated schema.
class Image(BaseModel):
  # Kind of image (free-form string).
  image_type: str = Field(..., description="The type of the image.")
  # Brief English description of the image.
  short_description: str = Field(..., description="A description in english describing the image.")
  # Longer summary of the image content.
  summary: str = Field(..., description="Summarize the image.")

In [46]:
def encode_pdf(pdf_path):
    """Read a PDF from disk and return its bytes as a base64 text string.

    This is a best-effort helper: it never raises. Any failure is reported
    on stdout and signalled to the caller by a ``None`` return value.

    Args:
        pdf_path: Location of the file to read; passed straight to ``open``.

    Returns:
        The base64 encoding of the file contents decoded as UTF-8 text,
        or ``None`` if the file is missing or any other error occurs.
    """
    encoded = None
    try:
        with open(pdf_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes).decode('utf-8')
    except FileNotFoundError:
        # Missing file gets its own, more specific message.
        print(f"Error: The file {pdf_path} was not found.")
    except Exception as err:
        # Anything else (permissions, bad path type, ...) is reported generically.
        print(f"Error: {err}")
    return encoded

In [47]:
# Path to the PDF to process
pdf_path = "../dataset/docs/FSAE_Rules_2024_V1.pdf"

# Getting the base64 string.
# encode_pdf() reports failures by printing and returning None. Stop here in
# that case: otherwise the None would be formatted into the data URL of the
# OCR request as the literal text "None".
base64_pdf = encode_pdf(pdf_path)
if base64_pdf is None:
    raise RuntimeError(f"Could not read and encode the PDF at {pdf_path!r}")

In [48]:
# Read the API key from the environment; a missing variable raises KeyError here.
api_key = os.environ["MISTRAL_API_KEY"]

# Client object used for the OCR request below.
client = Mistral(api_key=api_key)

In [49]:
# Send the whole PDF to the OCR endpoint, embedded inline as a base64 data URL.
ocr_response = client.ocr.process(
    model="mistral-ocr-latest",
    document=dict(
        type="document_url",
        document_url=f"data:application/pdf;base64,{base64_pdf}",
    ),
    # The image-saving cell later reads image_base64 from the response.
    include_image_base64=True,
    # Annotation format derived from the Image model defined above.
    bbox_annotation_format=response_format_from_pydantic_model(Image),
)

In [18]:
import json

# Save the str() form of the OCR response for offline inspection
# (plain text, not a JSON serialization).
# The encoding is explicit so non-ASCII OCR text is written the same way on
# every platform instead of depending on the default locale encoding.
with open('results.txt', 'w', encoding='utf-8') as f:
    f.write(str(ocr_response))

In [22]:
# Number of pages in the OCR response.
len(ocr_response.pages)

140

In [41]:
# Id of the first image on the page at index 18 (zero-based).
ocr_response.pages[18].images[0].id

'img-0.jpeg'

In [51]:
import base64

# Collect the id and base64 payload of the first image on page index 18.
ocr_image = {
    'id': ocr_response.pages[18].images[0].id,
    'image_base64': ocr_response.pages[18].images[0].image_base64
}

# Drop a leading data-URL header such as "data:image/jpeg;base64," if present.
# partition() also copes with a payload that has no header, where
# split(',')[1] would raise IndexError.
header, separator, payload = ocr_image['image_base64'].partition(',')
base64_str = payload if separator else header

# Decode the base64 string to bytes
image_data = base64.b64decode(base64_str)

# Save to a file named after the image id, in the current working directory
with open(ocr_image['id'], 'wb') as f:
    f.write(image_data)

In [55]:
# Show the annotation attached to the first image on page index 18.
# print() displays the text with its line breaks (see the output below).
print(ocr_response.pages[18].images[0].image_annotation)

{
  "image_type": "Technical Drawing",
  "short_description": "A technical drawing showing restricted and unrestricted areas around circular components.",
  "summary": "This technical drawing illustrates specific spatial constraints around two circular components. The drawing indicates areas where no parts are allowed, marked with a specific pattern, and areas where the view of the surface must remain unobstructed, marked with a different pattern. Each circular component has a diameter of 75 mm, and the restricted areas are clearly defined around these components. The drawing also includes a legend explaining the patterns used to denote the restricted and unrestricted areas."
}


In [None]:
# Names of the entity kinds listed for this notebook.
entities = ["figure", "formula", "section", "table"]

In [56]:
# Write the markdown extracted for the page at index 5 to a file for inspection.
# The encoding is explicit so non-ASCII characters in the OCR text do not
# depend on (or fail under) the platform's default locale encoding.
with open('test.md', 'w', encoding='utf-8') as f:
    f.write(ocr_response.pages[5].markdown)