In [1]:
import base64
import os
import re

from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_core.messages import AIMessage
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableLambda
from langchain_openai.chat_models import ChatOpenAI
from pypdfium2 import PdfDocument
from unstructured.documents.elements import CompositeElement, Table
from unstructured.partition.pdf import partition_pdf

In [2]:
# Path of the PDF that the following cells render page by page and partition.
pdf_name = "ref_data/TWINGO/MR413X4416A000.pdf"

In [3]:
# Render every page of the PDF to its own JPEG file (page_0.jpg, page_1.jpg, ...).
page_image_dir = "ref_data/parsed_images"

# Saving an image does not create missing folders, so make sure the target
# directory exists; exist_ok=True keeps the cell safe to re-run.
os.makedirs(page_image_dir, exist_ok=True)

pdf = PdfDocument(pdf_name)
for i in range(len(pdf)):
    page = pdf[i]
    # NOTE(review): scale=4 presumably requests a 4x enlarged render for
    # legibility -- confirm the exact meaning against the pypdfium2 docs.
    image = page.render(scale=4).to_pil()
    image.save(os.path.join(page_image_dir, f"page_{i}.jpg"))

In [3]:
# Directory handed to partition_pdf as the destination for extracted images.
images_path = "ref_data/parsed_images/"

# Chunking settings for the "by_title" strategy, grouped so they can be tuned
# in one place.
chunking_options = {
    "chunking_strategy": "by_title",
    "max_characters": 3000,
    "new_after_n_chars": 2800,
    "combine_text_under_n_chars": 2000,
}

# NOTE: the recorded output of this cell is a TesseractNotFoundError, i.e. the
# call needs the Tesseract executable to be installed and on PATH.
# Optional argument kept from the original for reference (Windows poppler install):
#     poppler_path=r'C:\Program Files (x86)\poppler-24.07.0\Library\bin'
pdf_elements = partition_pdf(
    pdf_name,
    extract_images_in_pdf=True,
    image_output_dir_path=images_path,
    **chunking_options
)

TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.

In [None]:
# Split the partitioned elements into two lists of plain strings:
#   tables -> str() of every Table element
#   texts  -> str() of every CompositeElement (chunked text)
# Elements of any other type are ignored.
#
# isinstance() replaces substring matching on str(type(element)): it does not
# depend on how the class happens to be printed, and it also accepts
# subclasses of Table / CompositeElement.
tables = []
texts = []
for element in pdf_elements:
    if isinstance(element, Table):
        tables.append(str(element))
    elif isinstance(element, CompositeElement):
        texts.append(str(element))

In [None]:
# Instruction shared by the text and table summaries; {element} is the
# placeholder filled with each raw text chunk or table string.
prompt_text = (
    "You are an assistant tasked with summarizing tables and text for retrieval. "
    "These summaries will be embedded and used to retrieve the raw text or table elements. "
    "Give a concise summary of the table or text that is well-optimized for retrieval. Table "
    "or text: {element} "
)
prompt = PromptTemplate.from_template(prompt_text)

# Runnable that answers any input with a fixed error message.
# NOTE(review): it is defined here but not attached to the chain in this cell.
empty_response = RunnableLambda(
    lambda _unused: AIMessage(content="Error processing document")
)

model = ChatOpenAI(model="gpt-4o-mini")

# Pipeline: wrap the raw input as {"element": ...} -> fill the prompt ->
# call the chat model -> reduce the reply to a plain string.
summarize_chain = (
    {"element": lambda x: x}
    | prompt
    | model
    | StrOutputParser()
)

# Bind both result names to empty lists first, so they exist even if a batch
# call below raises.
text_summaries, table_summaries = [], []

# max_concurrency=1 makes the batch issue one model call at a time.
text_summaries = summarize_chain.batch(texts, config={"max_concurrency": 1})
table_summaries = summarize_chain.batch(tables, config={"max_concurrency": 1})

In [None]:
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode, base64-encoded, and the resulting
    bytes are decoded to a ``str`` using UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def image_summarize(img_base64, prompt):
    """Ask the ``gpt-4o-mini`` chat model to describe one image.

    Args:
        img_base64: Base64 text of the image bytes. It is embedded in a
            ``data:image/jpeg;base64,...`` URL, so the image is declared to
            the model as a JPEG.
        prompt: Instruction text sent together with the image.

    Returns:
        The ``content`` attribute of the model's reply message.
    """
    model = ChatOpenAI(model="gpt-4o-mini")

    # A single human message carrying two parts: the instruction and the image.
    message = HumanMessage(
        content=[
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
            },
        ]
    )

    # Use .invoke() rather than calling the model object directly; the direct
    # call form is deprecated in langchain-core.
    reply = model.invoke([message])
    return reply.content

def _natural_sort_key(file_name):
    """Build a sort key that orders digit runs numerically.

    ``re.split`` with a capturing group returns text and digit runs
    alternately (text first, possibly empty), so odd positions are always the
    digit runs. Converting only those to ``int`` makes "page_2.jpg" sort
    before "page_10.jpg" and never compares an int with a str.
    """
    parts = re.split(r"([0-9]+)", file_name)
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]


# Parallel lists: entry k of each list belongs to the same image file.
img_base64_list = []
images_summaries = []

# Prompt
prompt = """You are an assistant tasked with summarizing images for retrieval. \
These summaries will be embedded and used to retrieve the raw image. \
Give a concise summary of the image that is well optimized for retrieval."""

# Apply to images: every .jpg in images_path, in natural (numeric-aware) order.
# Plain sorted() is lexicographic and would visit page_10.jpg before page_2.jpg.
jpg_files = sorted(
    (name for name in os.listdir(images_path) if name.endswith(".jpg")),
    key=_natural_sort_key,
)
for img_file in jpg_files:
    img_path = os.path.join(images_path, img_file)
    base64_image = encode_image(img_path)
    img_base64_list.append(base64_image)
    images_summaries.append(image_summarize(base64_image, prompt))

In [None]:
# Preview the first raw text chunk collected from the partitioned PDF.
texts[0]

In [None]:
# Preview the summary the chain produced for the first text chunk.
text_summaries[0]

In [None]:
# Preview the first raw table string collected from the partitioned PDF.
tables[0]

In [None]:
# Preview the summary the chain produced for the first table.
table_summaries[0]

In [None]:
# Preview the summary of the first image that was processed.
images_summaries[0]