In [None]:
# Reload edited project modules automatically before each cell runs, so
# changes to imported code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

from dotenv import load_dotenv

# Move the working directory two levels up from the notebook's folder
# (presumably the project root, so that `backend.*` imports and relative
# paths resolve — confirm against the repo layout).
# The starting directory is remembered on the first run: re-running this cell
# then always lands in the same place instead of climbing two further levels
# on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, "../.."))
print(os.getcwd())
# override=True lets values from the .env file replace variables that are
# already set in the process environment (python-dotenv behaviour).
load_dotenv(override=True)

In [None]:
from backend.utils.unstructured import (
    select_images,
    select_texts,
    select_tables,
    load_chunking_func,
)
from backend.utils.elements import langchain_doc_to_element
from backend.utils.retriever import get_retriever, add_documents
from pathlib import Path
from unstructured.partition.pdf import partition_pdf

from hydra import initialize, compose

from backend.rag_3.ingest import (
    apply_summarize_text,
    apply_summarize_image,
    apply_summarize_table,
)
from backend.rag_3.chain import get_chain

In [None]:
# Compose the Hydra configuration named "config" from the config directory "."
# and print it for inspection. `config` is read by every later cell.
# NOTE(review): how "." is resolved (caller location vs. working directory)
# is decided by Hydra — confirm it points at the intended config folder.
with initialize(config_path=".", version_base=None):
    config = compose(config_name="config")
    print(config)

In [None]:
# PDF to ingest, located inside the documents folder named by the config
# (config.path.docs).
filename = "LLaVA_small.pdf"
file_path = Path(config.path.docs) / filename
print(file_path)

In [None]:
# Partition the PDF into unstructured elements. Table structure inference is
# switched on, and image blocks of type "image" and "table" are requested
# with extract_image_block_to_payload=True.
# NOTE(review): presumably this embeds the extracted images in the element
# payload rather than writing image files to disk — confirm against the
# installed unstructured version.
raw_pdf_elements = partition_pdf(
    filename=file_path,
    infer_table_structure=True,
    extract_image_block_types=["image", "table"],
    extract_image_block_to_payload=True,
)

In [None]:
# Inspect the raw partition output.
raw_pdf_elements

In [None]:
# Pick the image elements out of the raw partition output (before any
# chunking is applied) and render each one.
images = select_images(raw_pdf_elements)
for image in images:
    display(image)

In [None]:
# Chunk the partitioned elements when chunking is enabled in the config;
# otherwise the raw elements are passed through unchanged.
chunks = raw_pdf_elements
if config.ingest.chunking.enable:
    chunk_func = load_chunking_func(config)
    chunks = chunk_func(chunks)
chunks

In [None]:
# Split the chunks into text elements and table elements. Both selectors
# receive the configured metadata keys; the table selector additionally
# receives the configured table format.
texts = select_texts(chunks, config.ingest.metadata_keys)
tables = select_tables(chunks, config.ingest.table_format, config.ingest.metadata_keys)

In [None]:
# Sample of text pasted from the paper's first page (truncated mid-word).
# NOTE(review): `a` is not referenced by any other cell in this notebook —
# looks like leftover scratch data; confirm and remove, or give it a
# descriptive name if it is still needed.
a = """Haotian Liu1∗, Chunyuan Li2∗, Qingyang Wu3, Yong Jae Lee1

1University of Wisconsin–Madison 2Microsoft Research 3Columbia University https://llava-vl.github.io

Abstract

Instruction tuning large language models (LLMs) using machine-generated instruction-following data has improved zero-shot capabilities on new tasks, but the idea is less explored in the multimodal ﬁeld. In this paper, we present the ﬁrst att"""

In [None]:
# Render every selected text element (before summarization).
for text in texts:
    display(text)

In [None]:
# Render every selected table element (before summarization).
for table in tables:
    display(table)

In [None]:
# Summarize text
# The awaited call's return value is discarded and the same `texts` objects
# are displayed again afterwards — presumably the summaries are attached to
# the elements in place (they are read back later via get_summary());
# confirm in backend.rag_3.ingest.
await apply_summarize_text(texts, config)
for text in texts:
    display(text)

In [None]:
# Summarize tables
# The awaited call's return value is discarded and the same `tables` objects
# are displayed again afterwards — presumably the summaries are attached to
# the elements in place (they are read back later via get_summary());
# confirm in backend.rag_3.ingest.
await apply_summarize_table(tables, config)
for table in tables:
    display(table)

In [None]:
# Summarize images
# The awaited call's return value is discarded and the same `images` objects
# are displayed again afterwards — presumably the summaries are attached to
# the elements in place (they are read back later via get_summary());
# confirm in backend.rag_3.ingest.
await apply_summarize_image(images, config)
for image in images:
    display(image)

In [None]:
# Build the retriever from the config; the following cells add the text,
# table and image documents to it.
retriever = get_retriever(config)

In [None]:
# Index the text elements: gather each element's metadata, content and
# summary into parallel lists, then hand all three to the retriever.
text_metadata = [element.get_metadata() for element in texts]
text_contents = [element.get_content() for element in texts]
text_summaries = [element.get_summary() for element in texts]

add_documents(
    retriever=retriever, doc_summaries=text_summaries,
    doc_contents_str=text_contents, doc_metadata=text_metadata,
)

In [None]:
# Index the table elements: gather each element's metadata, content and
# summary into parallel lists, then hand all three to the retriever.
table_metadata = [element.get_metadata() for element in tables]
table_contents = [element.get_content() for element in tables]
table_summaries = [element.get_summary() for element in tables]

add_documents(
    retriever=retriever, doc_summaries=table_summaries,
    doc_contents_str=table_contents, doc_metadata=table_metadata,
)

In [None]:
# Index the image elements: gather each element's metadata, content and
# summary into parallel lists, then hand all three to the retriever.
image_metadata = [element.get_metadata() for element in images]
image_contents = [element.get_content() for element in images]
image_summaries = [element.get_summary() for element in images]

add_documents(
    retriever=retriever, doc_summaries=image_summaries,
    doc_contents_str=image_contents, doc_metadata=image_metadata,
)

In [None]:
# Sanity-check retrieval: fetch the documents matching a sample query,
# convert them back to project elements and render each one.
# `invoke` is the Runnable entry point for retrievers and replaces the
# deprecated `get_relevant_documents` (assumes `retriever` is a LangChain
# retriever, as the LangChain-document conversion below suggests).
docs = retriever.invoke("instruction tuning large language models")

elements = langchain_doc_to_element(docs)
for element in elements:
    display(element)

In [None]:
# Build the chain defined in backend.rag_3.chain from the config.
chain = get_chain(config)

In [None]:
# Query the chain with a question about GPT-4's relative score.
chain.invoke("What is the relative score of GPT-4 ?")

In [None]:
# Same question as the previous cell, asked a second time.
# NOTE(review): looks like either a leftover duplicate or a repeatability
# check — confirm, and drop this cell if it is not needed.
chain.invoke("What is the relative score of GPT-4 ?")

In [None]:
# Query the chain with an image-oriented question.
chain.invoke("Describe the picture of Mona Lisa")