In [24]:
import base64
import os

from pypdfium2 import PdfDocument
from natsort import natsorted
from langchain_core.messages import HumanMessage
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
import uuid
from langchain.schema.output_parser import StrOutputParser
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma
import io
import re
from IPython.display import HTML, display
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from PIL import Image

In [2]:
pdf_name = 'ref_data/TWINGO/MR413X4416A000.pdf'

In [3]:
images_path = "ref_data/parsed_images/TWINGO/"
pdf = PdfDocument(pdf_name)
for i in range(len(pdf)):
    page = pdf[i]
    image = page.render(scale=4).to_pil()
    image.save(f"ref_data/parsed_images/page_{i}.jpg")

In [4]:
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def image_summarize(img_base64, prev_base64_image, prompt):
    model = ChatOpenAI(model="gpt-4o-mini")
    msg = model(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{prev_base64_image}"},
                    },
                ]
            )
        ]
    )
    return msg.content


prompt = """
You are an assistant tasked with describing images for the RAG system.
These summaries will be embedded and used to produce the raw image.
Provide a short description of the image that will be well optimized for search.
Some images are sequential and connected, for example, the APN1 diagram continues
on 4 pages with notes Continuation 1-4, indicate this in your description in the form:
Continuation of the scheme [Scheme name] from page [page number].
Start the description with the page number of the document.
Pay attention that pages numbers could be not in regular format, in that case use page number as it is in the document.
Use previous page image to improve current page description, for example you could add to the
description that at the current page #5 there is some schema, that starts on the page #4.
"""

prev_image_path = ''
images_base64_list = []
images_summaries = []
for img_file in natsorted(os.listdir(images_path)):
    if img_file.endswith(".jpg"):
        image_path = os.path.join(images_path, img_file)
        base64_image = encode_image(image_path)
        images_base64_list.append(base64_image)
        prev_base64_image = encode_image(prev_image_path) if prev_image_path != '' else encode_image(image_path)
        images_summaries.append(image_summarize(base64_image, prev_base64_image, prompt))
        prev_image_path = image_path

  warn_deprecated(


In [5]:
images_summaries

['**Page 2:** This page includes the table of contents for the Twingo II manual, specifically section 16A, which covers the engine start and battery charging diagnostics. It lists various diagnostic topics such as element placement and function, along with their corresponding page numbers. Continuation of the scheme from page 1.',
 '**Page 16A-2 Description**: This page features detailed diagrams related to the engine start and battery charging system of the Twingo II. It includes labeled illustrations of the starter (1) and generator (2) positioned on the engine, along with a close-up of the fuse box (3) located inside the vehicle. The diagrams provide a visual reference for identifying key components. This page is part of the continuation of the diagnostic series for the engine and its systems, following the overview found on page 16A-1.',
 "**Page 16A-3**: This page provides detailed information on the components of the Twingo II's starting and charging system, focusing on the accum

In [6]:
vectorstore = Chroma(collection_name="mm_rag_mistral", embedding_function=OpenAIEmbeddings())
store = InMemoryStore()
id_key = "doc_id"
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)
doc_ids = [str(uuid.uuid4()) for _ in images_base64_list]
summary_docs = [Document(page_content=s, metadata={id_key: doc_ids[i]})
                for i, s in enumerate(images_summaries)]
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, images_base64_list)))

  warn_deprecated(


In [13]:
def looks_like_base64(sb):
    """Check if the string looks like base64"""
    return re.match("^[A-Za-z0-9+/]+[=]{0,2}$", sb) is not None


def is_image_data(b64data):
    """
    Check if the base64 data is an image by looking at the start of the data
    """
    image_signatures = {
        b"\xFF\xD8\xFF": "jpg",
        b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A": "png",
        b"\x47\x49\x46\x38": "gif",
        b"\x52\x49\x46\x46": "webp",
    }
    try:
        header = base64.b64decode(b64data)[:8]  # Decode and get the first 8 bytes
        for sig, format in image_signatures.items():
            if header.startswith(sig):
                return True
        return False
    except Exception:
        return False

def split_image_text_types(docs):
    """
    Split base64-encoded images and texts
    """
    b64_images = []
    texts = []
    for doc in docs:
        if isinstance(doc, Document):
            doc = doc.page_content
        if looks_like_base64(doc) and is_image_data(doc):
            b64_images.append(doc)
        else:
            texts.append(doc)
    if len(b64_images) > 0:
        return {"images": b64_images[:1], "texts": []}
    return {"images": b64_images, "texts": texts}

In [30]:
query = """Приведи блок-схему запуска двигателя"""
docs = retriever.get_relevant_documents(query, limit=10)
# stplitted_docs = split_image_text_types(docs)
# Image.open(io.BytesIO(base64.b64decode(stplitted_docs['images'][0])))
docs

['/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCA0cCUQDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD3+iiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigA