In [1]:
from unstructured.partition.pdf import partition_pdf
import os
from langchain_chroma import Chroma
from langchain_experimental.open_clip import OpenCLIPEmbeddings
from PIL import Image
import base64
import io
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI
from IPython.display import HTML, display
import pytesseract
from langchain_community.llms import LlamaCpp

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def resize_base64_image(base64_string, size=(128, 128)):
    """
    Decode a Base64 image, resize it, and return it re-encoded as Base64.

    Args:
    base64_string (str): Base64 text holding the original image bytes.
    size (tuple): Target (width, height) passed straight to ``Image.resize``.

    Returns:
    str: Base64 text of the resized image, saved in the source image's format.
    """
    # Base64 text -> raw bytes -> PIL image
    source_img = Image.open(io.BytesIO(base64.b64decode(base64_string)))

    # Resize with LANCZOS resampling and serialise into an in-memory buffer,
    # reusing the format detected on the source image.
    out_buffer = io.BytesIO()
    source_img.resize(size, Image.LANCZOS).save(out_buffer, format=source_img.format)

    # Raw bytes -> Base64 text
    encoded = base64.b64encode(out_buffer.getvalue())
    return encoded.decode("utf-8")


def is_base64(s):
    """Check if a value is a non-empty, canonically Base64-encoded string.

    Args:
    s: Candidate value. Anything that is not a ``str`` is rejected.

    Returns:
    bool: True only when ``s`` is a non-empty ASCII string that decodes as
    Base64 and re-encodes to exactly the same text.

    Note:
    This is a syntactic check only. Short plain text that happens to use the
    Base64 alphabet with a length that is a multiple of four (e.g. "Test")
    still passes.
    """
    # An empty string round-trips through Base64 but carries no payload;
    # treating it as encoded data would route empty documents to image decoding.
    if not isinstance(s, str) or not s:
        return False
    try:
        # validate=True rejects characters outside the Base64 alphabet instead
        # of silently discarding them. binascii.Error and the non-ASCII error
        # raised by b64decode are both ValueError subclasses.
        decoded = base64.b64decode(s, validate=True)
    except ValueError:
        return False
    # Round-trip comparison rejects non-canonical encodings.
    return base64.b64encode(decoded) == s.encode("ascii")


def split_image_text_types(docs):
    """Separate retrieved documents into Base64 images and plain texts.

    Each document's ``page_content`` is checked with ``is_base64``: matches are
    resized to 250x250 and collected as images, everything else is collected
    unchanged as text.

    Returns:
    dict: ``{"images": [...], "texts": [...]}``, each list in input order.
    """
    buckets = {"images": [], "texts": []}
    for document in docs:
        content = document.page_content  # Extract Document contents
        if not is_base64(content):
            buckets["texts"].append(content)
            continue
        # Resize image to avoid OAI server error; result is a base64 encoded str
        buckets["images"].append(resize_base64_image(content, size=(250, 250)))
    return buckets

def prompt_func(data_dict):
    """Assemble the single multimodal ``HumanMessage`` sent to the model.

    Args:
    data_dict (dict): Must provide ``data_dict["context"]["texts"]`` (list of
        str), ``data_dict["context"]["images"]`` (list of Base64 str) and
        ``data_dict["question"]``.

    Returns:
    list: A one-element list holding a ``HumanMessage`` whose content is an
    optional image part (only the first image is used) followed by a text part.
    """
    context = data_dict["context"]

    # All retrieved text snippets become one newline-separated block.
    joined_texts = "\n".join(context["texts"])
    parts = []

    # Only the first retrieved image, if any, is attached as a data URL.
    if context["images"]:
        first_image = context["images"][0]
        parts.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{first_image}"},
            }
        )

    # Instruction text, followed by the user's keywords and the retrieved text.
    question = data_dict["question"]
    instructions = (
        "As an expert art critic and historian, your task is to analyze and interpret images, "
        "considering their historical and cultural significance. Alongside the images, you will be "
        "provided with related text to offer context. Both will be retrieved from a vectorstore based "
        "on user-input keywords. Please use your extensive knowledge and analytical skills to provide a "
        "comprehensive summary that includes:\n"
        "- A detailed description of the visual elements in the image.\n"
        "- The historical and cultural context of the image.\n"
        "- An interpretation of the image's symbolism and meaning.\n"
        "- Connections between the image and the related text.\n\n"
        f"User-provided keywords: {question}\n\n"
        "Text and / or tables:\n"
        f"{joined_texts}"
    )
    parts.append({"type": "text", "text": instructions})

    return [HumanMessage(content=parts)]

def plt_img_base64(img_base64):
    """Render a Base64-encoded image in the notebook output.

    The string is embedded in an HTML ``<img>`` tag as a JPEG data URL and
    passed to IPython's ``display``.
    """
    display(HTML(f'<img src="data:image/jpeg;base64,{img_base64}" />'))

In [9]:
if __name__ == "__main__":
    # Extract images, tables, and chunk text.
    # os.path.join keeps the backslash out of the literal ("\d" is an invalid
    # escape sequence and triggers a SyntaxWarning) and makes the path portable.
    pdf_file = os.path.join("sample", "deepseek-r1.pdf")
    raw_pdf_elements = partition_pdf(
        filename=pdf_file,
        extract_images_in_pdf=True,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
        image_output_dir_path="extracted_img",
    )
    print("Done extracting elements...")

    # Categorize text elements by type (matched on the class's qualified name)
    tables = []
    texts = []
    for element in raw_pdf_elements:
        if "unstructured.documents.elements.Table" in str(type(element)):
            tables.append(str(element))
        elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
            texts.append(str(element))
    # Full text dump: every composite chunk followed by a newline.
    t = "".join(f"{text}\n" for text in texts)
    print("Done categorizing elements...")

    # Record the text content of the pdf; create the target directory first so
    # a fresh checkout does not fail with FileNotFoundError.
    os.makedirs("extracted_txt", exist_ok=True)
    with open("extracted_txt/test_output.txt", "w", encoding="utf-8") as txt_file:
        txt_file.write(t)
    print("Done recording texts...")

    # Create chroma
    vectorstore = Chroma(
        collection_name="mm_rag_clip_photos", embedding_function=OpenCLIPEmbeddings()
    )
    print("Done creating vector DB...")

    # Get image URIs with .jpg extension only.
    # NOTE(review): images are listed from "figures" although partition_pdf is
    # given image_output_dir_path="extracted_img" — confirm which directory the
    # installed unstructured version actually writes to.
    image_uris = sorted(
        os.path.join("figures", image_name)
        for image_name in os.listdir("figures")
        if image_name.endswith(".jpg")
    )

    # Add images (skipped when the PDF produced none)
    if image_uris:
        vectorstore.add_images(uris=image_uris)
    # Add documents (skipped when no text chunks were extracted)
    if texts:
        vectorstore.add_texts(texts=texts)
    print("Done adding texts and images in vectorDB...")

    # Make retriever
    retriever = vectorstore.as_retriever()
    print("Done creating retriver...")

  pdf_file = "sample\deepseek-r1.pdf"
  model_path="model\Mistral-7B-Instruct-v0.2-Q4_K_M-GGUF\mistral-7b-instruct-v0.2-q4_k_m.gguf",  # models/Meta-Llama-3-8B.Q4_K_M.gguf


Done extracting elements...
Done categorizing elements...
Done recording texts...
Done creating vector DB...


                context_length was transferred to model_kwargs.
                Please confirm that context_length is what you intended.
  if await self.run_code(code, result, async_=asy):
llama_model_loader: loaded meta data with 33 key-value pairs and 291 tensors from model\Mistral-7B-Instruct-v0.2-Q4_K_M-GGUF\mistral-7b-instruct-v0.2-q4_k_m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Mistral 7B Instruct v0.2
llama_model_loader: - kv   3:                            general.version str              = v0.2
llama_model_loader: - kv   4:                           general.finetune str              = Instruct
llama_model_l

Done adding texts and images in vectorDB...
Done creating retriver...


llama_model_loader: - kv  22:                      tokenizer.ggml.tokens arr[str,32000]   = ["<unk>", "<s>", "</s>", "<0x00>", "<...
llama_model_loader: - kv  23:                      tokenizer.ggml.scores arr[f32,32000]   = [-1000.000000, -1000.000000, -1000.00...
llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,32000]   = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
llama_model_loader: - kv  25:                tokenizer.ggml.bos_token_id u32              = 1
llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 2
llama_model_loader: - kv  27:            tokenizer.ggml.unknown_token_id u32              = 0
llama_model_loader: - kv  28:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  29:               tokenizer.ggml.add_eos_token bool             = false
llama_model_loader: - kv  30:                    tokenizer.chat_template str              = {%- if messages[0]['role'] == 's

Done loading model...


In [27]:
# model = ChatOpenAI(temperature=0, model="gpt-4-vision-preview", max_tokens=100)
# Local GGUF model loaded through LlamaCpp. The path is assembled with
# os.path.join so the literal contains no backslashes ("\M" and "\m" are
# invalid escape sequences that raise a SyntaxWarning) and stays portable.
model = LlamaCpp(
    model_path=os.path.join(
        "model",
        "Mistral-7B-Instruct-v0.2-Q4_K_M-GGUF",
        "mistral-7b-instruct-v0.2-q4_k_m.gguf",
    ),  # models/Meta-Llama-3-8B.Q4_K_M.gguf
    temperature=0.2,
    max_tokens=100,
    top_p=0.9,
    n_gpu_layers=50,
    n_batch=512,
    n_ctx=4096,
)

  model_path="model\Mistral-7B-Instruct-v0.2-Q4_K_M-GGUF\mistral-7b-instruct-v0.2-q4_k_m.gguf",  # models/Meta-Llama-3-8B.Q4_K_M.gguf
llama_model_loader: loaded meta data with 33 key-value pairs and 291 tensors from model\Mistral-7B-Instruct-v0.2-Q4_K_M-GGUF\mistral-7b-instruct-v0.2-q4_k_m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Mistral 7B Instruct v0.2
llama_model_loader: - kv   3:                            general.version str              = v0.2
llama_model_loader: - kv   4:                           general.finetune str              = Instruct
llama_model_loader: - kv   5:                           general.base

In [28]:
# RAG pipeline
# Step 1: retrieve documents and split them into images / texts.
context_step = retriever | RunnableLambda(split_image_text_types)
# Step 2: fan the user input out to the retrieved context and the raw question.
chain_inputs = {"context": context_step, "question": RunnablePassthrough()}
# Step 3: build the prompt, run the model, and parse the result to a string.
chain = chain_inputs | RunnableLambda(prompt_func) | model | StrOutputParser()

In [12]:
# Preview what the retriever returns for the keyword "abstract":
# Base64 payloads are rendered as images, everything else is printed as text.
docs = retriever.invoke("abstract")
for doc in docs:
    content = doc.page_content
    if not is_base64(content):
        print(content)
        continue
    plt_img_base64(content)

Zhenda Xie

Zhengyan Zhang

Zhewen Hao

Zhicheng Ma

Zhigang Yan

Zhiyu Wu

Zihui Gu

21

Zijia Zhu Zhen Huang Zijun Liu* Zhipeng Xu Zilin Li Ziwei Xie Zhen Zhang Ziyang Song

Zhongyu Zhang

Zizheng Pan

Within each role, authors are listed alphabetically by the first name. Names marked with * denote individuals who have departed from our team.

22
1. Introduction

In recent years, Large Language Models (LLMs) have been undergoing rapid iteration and evolution (Anthropic, 2024; Google, 2024; OpenAI, 2024a), progressively diminishing the gap towards Artificial General Intelligence (AGI).

Recently, post-training has emerged as an important component of the full training pipeline. It has been shown to enhance accuracy on reasoning tasks, align with social values, and adapt to user preferences, all while requiring relatively minimal computational resources against pre-training. In the context of reasoning capabilities, OpenAI’s o1 (OpenAI, 2024b) series models were the first to introduce 

In [29]:
# Run the full RAG chain for the keyword "abstract". The chain ends in
# StrOutputParser, so the value shown as this cell's output is a plain string.
chain.invoke("abstract")

llama_perf_context_print:        load time =  453282.10 ms
llama_perf_context_print: prompt eval time =  453267.55 ms /  2800 tokens (  161.88 ms per token,     6.18 tokens per second)
llama_perf_context_print:        eval time =     530.24 ms /     1 runs   (  530.24 ms per token,     1.89 tokens per second)
llama_perf_context_print:       total time =  453821.40 ms /  2801 tokens


'>'