# RAG 3: PDF ingestion and RAG

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules before
# each cell is executed, so edits to imported code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv

# Switch the working directory to two levels above the directory the kernel
# started in (presumably the project root — confirm against the repo layout).
# The target is computed only once per kernel session: a plain relative
# `chdir("../..")` would climb two more levels every time this cell is re-run.
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = Path.cwd().joinpath("../..").resolve()
os.chdir(_PROJECT_ROOT)
print(Path.cwd())
# override=True lets values from the .env file replace variables that are
# already set in the process environment.
load_dotenv(override=True)

In [None]:
import time
from pathlib import Path

from hydra import compose, initialize
from unstructured.partition.pdf import partition_pdf

from backend.rag_3.chain import get_chain
from backend.rag_3.config import validate_config
from backend.rag_3.ingest import (
    apply_summarize_image,
    apply_summarize_table,
    apply_summarize_text,
)
from backend.utils.elements import langchain_doc_to_element
from backend.utils.retriever import add_documents_multivector, get_retriever
from backend.utils.unstructured import (
    load_chunking_func,
    select_images,
    select_tables,
    select_texts,
)

# Wall-clock start timestamp; the last cell subtracts it from the current
# time to print the total run time of the notebook.
t = time.time()

In [None]:
# Maximum number of elements shown by each preview loop (`[:N_DISPLAY]`).
N_DISPLAY = 5

In [None]:
# Compose the Hydra configuration named "config" from config_path="." and
# print it for inspection.
with initialize(config_path=".", version_base=None):
    config = compose(config_name="config")
    print(config)

    # Validate the config (presumably raises on an invalid one — confirm in
    # backend.rag_3.config). The return value is deliberately not bound to
    # `_`: in IPython `_` holds the last cell output, and assigning to it
    # stops IPython from updating it for the rest of the session.
    validate_config(config)

In [None]:
# PDF to ingest, located in the documents folder configured at `path.docs`
filename = "Attention.pdf"
file_path = Path(config.path.docs) / filename
print(file_path)

In [None]:
%%time
# Partition the PDF into unstructured elements (the cell is timed by %%time,
# which must stay on the first line of the cell).
# Per the unstructured documentation:
# - infer_table_structure=True asks for the structure of detected tables;
# - extract_image_block_types selects the element types whose image crops
#   are extracted;
# - extract_image_block_to_payload=True stores those crops in the element
#   metadata instead of writing them to disk.
# NOTE(review): file_path is a pathlib.Path while `filename` is documented as
# a string — confirm the pinned unstructured version accepts Path objects.
raw_pdf_elements = partition_pdf(
    filename=file_path,
    infer_table_structure=True,
    extract_image_block_types=["image", "table"],
    extract_image_block_to_payload=True,
)

In [None]:
# Show the partitioned elements as the cell output
raw_pdf_elements

In [None]:
# Select the image elements from the raw (un-chunked) partition output and
# preview the first N_DISPLAY of them.
images = select_images(raw_pdf_elements)
for image in images[:N_DISPLAY]:
    display(image)

In [None]:
# Build the chunks: when chunking is disabled in the config the raw elements
# are used as-is, otherwise they go through the configured chunking function.
if not config.ingest.chunking_enable:
    chunks = raw_pdf_elements
else:
    chunk_func = load_chunking_func(config)
    chunks = chunk_func(raw_pdf_elements)
chunks

In [None]:
# Split the chunks into text elements and table elements, using the ingest
# settings (metadata keys to keep, table format) from the config.
ingest_cfg = config.ingest
texts = select_texts(chunks, ingest_cfg.metadata_keys)
tables = select_tables(
    chunks,
    ingest_cfg.table_format,
    ingest_cfg.metadata_keys,
)

In [None]:
# Preview the first N_DISPLAY text elements
for text in texts[:N_DISPLAY]:
    display(text)

In [None]:
# Preview the first N_DISPLAY table elements
for table in tables[:N_DISPLAY]:
    display(table)

In [None]:
# Summarize the text elements (top-level await, valid in IPython cells).
# The return value is not used, so the summaries are presumably stored on the
# elements in place — confirm in backend.rag_3.ingest. The elements are then
# displayed again to inspect the result.
await apply_summarize_text(texts, config)
for text in texts[:N_DISPLAY]:
    display(text)

In [None]:
# Summarize the table elements (top-level await, valid in IPython cells).
# The return value is not used, so the summaries are presumably stored on the
# elements in place — confirm in backend.rag_3.ingest. The elements are then
# displayed again to inspect the result.
await apply_summarize_table(tables, config)
for table in tables[:N_DISPLAY]:
    display(table)

In [None]:
# Summarize the image elements (top-level await, valid in IPython cells).
# The return value is not used, so the summaries are presumably stored on the
# elements in place — confirm in backend.rag_3.ingest. The elements are then
# displayed again to inspect the result.
await apply_summarize_image(images, config)
for image in images[:N_DISPLAY]:
    display(image)

In [None]:
# Build the retriever from the config (helper from backend.utils.retriever)
retriever = get_retriever(config)

In [None]:
# Index the text elements: collect summary, content and metadata for every
# element (three parallel lists, same order) and hand them to the retriever.
text_summaries = [item.get_summary() for item in texts]
text_contents = [item.get_content() for item in texts]
text_metadata = [item.get_metadata() for item in texts]

add_documents_multivector(
    retriever=retriever,
    doc_summaries=text_summaries,
    doc_contents_str=text_contents,
    doc_metadata=text_metadata,
)

In [None]:
# Index the table elements: collect summary, content and metadata for every
# element (three parallel lists, same order) and hand them to the retriever.
table_summaries = [item.get_summary() for item in tables]
table_contents = [item.get_content() for item in tables]
table_metadata = [item.get_metadata() for item in tables]

add_documents_multivector(
    retriever=retriever,
    doc_summaries=table_summaries,
    doc_contents_str=table_contents,
    doc_metadata=table_metadata,
)

In [None]:
# Index the image elements: collect summary, content and metadata for every
# element (three parallel lists, same order) and hand them to the retriever.
image_summaries = [item.get_summary() for item in images]
image_contents = [item.get_content() for item in images]
image_metadata = [item.get_metadata() for item in images]

add_documents_multivector(
    retriever=retriever,
    doc_summaries=image_summaries,
    doc_contents_str=image_contents,
    doc_metadata=image_metadata,
)

In [None]:
# Query the retriever directly to inspect what it returns for a sample query.
# `invoke` is the Runnable entry point for LangChain retrievers and replaces
# the deprecated `get_relevant_documents`.
docs = retriever.invoke("Transformer Architecture")

# Convert the returned LangChain documents to project elements for display
elements = langchain_doc_to_element(docs)
for element in elements:
    display(element)

In [None]:
# Build the question-answering chain from the config (backend.rag_3.chain)
chain = get_chain(config)

In [None]:
# Page 3 — author's note, presumably the PDF page where the answer is expected.
# Question about the overall model architecture.
chain.invoke("Describe the Transformer architecture")

In [None]:
# Page 6 — author's note, presumably the PDF page where the answer is expected.
# Question about the complexity of self-attention.
chain.invoke("What is the complexity of self-attention?")

In [None]:
# Page 6 — author's note, presumably the PDF page where the answer is expected.
# Question about a formula (positional encoding).
chain.invoke("Explain the formula of positional encoding")

In [None]:
# Page 8 — author's note, presumably the PDF page where the answer is expected.
# Question about reported BLEU results.
chain.invoke("What are the BLEU performance of Transformer?")

In [None]:
# No mention — author's note, presumably meaning ROUGE is not reported in the
# PDF; this checks how the chain handles a question the document cannot answer.
chain.invoke("What are the ROUGE performance of Transformer?")

In [None]:
# Page 9 — author's note, presumably the PDF page where the answer is expected.
# Question about the model variations.
chain.invoke("Describe the variations of Transformer")

In [None]:
# Page 9 — author's note, presumably the PDF page where the answer is expected.
# Comparison question between two variations on the PPL metric.
chain.invoke("Between variations A and B, which one is better on PPL?")

In [None]:
# Page 10 — author's note, presumably the PDF page where the answer is expected.
# Question about English constituency parsing results.
chain.invoke("Is Transformer performing well on English constituency parsing?")

In [None]:
# Page 13 — author's note, presumably the PDF page where the answer is expected.
# Question about the weight visualization (words linked to `making`).
chain.invoke("What words are connected to `making` in the weight visualization?")

In [None]:
# Page 13 — author's note, presumably the PDF page where the answer is expected.
# Yes/no question about one specific link in the weight visualization.
chain.invoke("Is `governments` connected to `making` in the weight visualization?")

In [None]:
# Page 13 — author's note, presumably the PDF page where the answer is expected.
# Same kind of yes/no question, without mentioning the visualization.
chain.invoke("Is `2009` connected to `making`?")

In [None]:
# Page 14 — author's note, presumably the PDF page where the answer is expected.
# Question about the words linked to `its` in heads 5 and 6.
chain.invoke("What words are associated with `its` in heads 5 and 6?")

In [None]:
# Page 14 — author's note, presumably the PDF page where the answer is expected.
# Question restricted to a head range of 5 to 6.
chain.invoke(
    "What word has connection with `its` on only one head (heads between 5 and 6)?"
)

In [None]:
# Page 14 — author's note, presumably the PDF page where the answer is expected.
# Same question as the 5-to-6 variant, but with a head range of 1 to 4.
chain.invoke(
    "What word has connection with `its` on only one head (heads between 1 and 4)?"
)

In [None]:
# Page 14 — author's note, presumably the PDF page where the answer is expected.
# Question about the word linked to `its` on two heads.
chain.invoke("What word has connection with `its` on 2 heads?")

In [None]:
# Report the wall-clock time elapsed since `t` was recorded, as HH:MM:SS
elapsed_seconds = time.time() - t
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds))
print(f"Total time: {elapsed_hms}")