# RAG Option 3: PDF ingestion and RAG

**Table of contents**<a id='toc0_'></a>    
- [Load config](#toc1_)    
- [Partition PDF](#toc2_)    
  - [Images](#toc2_1_)    
  - [Chunking text](#toc2_2_)    
  - [Text and tables](#toc2_3_)    
- [Summarization](#toc3_)    
- [Add elements to retriever](#toc4_)    
- [Test retriever](#toc5_)    
- [Test RAG chain](#toc6_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=2
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv

# Remember the directory the kernel started in the first time this cell runs.
# Computing the target from the *current* directory instead would climb two
# more levels on every re-run of the cell and break later relative paths.
if "_NOTEBOOK_DIR" not in globals():
    _NOTEBOOK_DIR = Path.cwd()

# Work from two levels above the notebook's folder
# (presumably the project root -- confirm against the repository layout).
os.chdir(_NOTEBOOK_DIR.joinpath("../.."))
print(Path.cwd())

# Load variables from a .env file; override=True lets them replace values
# that are already set in the process environment.
load_dotenv(override=True)

In [None]:
import logging
import shutil
import time
from pathlib import Path

from hydra import compose, initialize

from backend.rag_3 import prompts
from backend.rag_3.chain import get_chain
from backend.rag_3.config import validate_config
from backend.rag_components.elements import convert_documents_to_elements
from backend.rag_components.ingest import (
    add_elements_to_multivector_retriever,
    apply_summarize_image,
    apply_summarize_table,
    apply_summarize_text,
)
from backend.rag_components.retriever import get_retriever
from backend.rag_components.unstructured import (
    load_chunking_func,
    load_partition_pdf_func,
    select_images,
    select_tables,
    select_texts,
)
from backend.utils.utils import format_time_delta

# Configure root logging with a timestamped format, then set the "backend"
# logger hierarchy to INFO.
log_format = "[%(asctime)s] - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=log_format)
backend_logger = logging.getLogger("backend")
backend_logger.setLevel(logging.INFO)

# Reference timestamp for the total run time printed in the last cell.
t = time.time()

In [None]:
# Maximum number of elements shown when a list is previewed in later cells.
N_DISPLAY = 5

## <a id='toc1_'></a>[Load config](#toc0_)

In [None]:
# Initialize Hydra with config_path="." and compose the configuration named
# "config"; the composed object is printed for inspection.
with initialize(config_path=".", version_base=None):
    config = compose(config_name="config")
    print(config)

    # validate config
    # The return value is discarded; presumably validate_config raises on an
    # invalid configuration -- confirm in backend.rag_3.config.
    _ = validate_config(config)

In [None]:
# PDF to ingest, looked up inside the configured documents folder
filename = "Attention.pdf"
file_path = Path(config.path.docs) / filename
print(file_path)

In [None]:
# Optionally delete the on-disk database folder before ingesting, as
# controlled by config.ingest.clear_database.
if config.ingest.clear_database:
    database_folder = Path(config.path.database)
    print(f"Clearing database: {database_folder}")
    # ignore_errors=True: failures during removal (for example the folder not
    # existing yet) are silently ignored rather than raised.
    shutil.rmtree(database_folder, ignore_errors=True)

## <a id='toc2_'></a>[Partition PDF](#toc0_)

In [None]:
# Start the partition stopwatch. t_partition is also the reference point for
# the total ingestion time printed after the retriever has been filled.
t_partition = time.time()

# Build the partitioning callable from the config and run it on the PDF
partition_pdf = load_partition_pdf_func(config)
raw_pdf_elements = partition_pdf(filename=file_path)

partition_elapsed = time.time() - t_partition
print(f"Partition time: {format_time_delta(partition_elapsed)}")

In [None]:
# Echo the partitioned elements as the cell output for a quick inspection.
raw_pdf_elements

### <a id='toc2_1_'></a>[Images](#toc0_)

In [None]:
# Collect the image elements from the raw partition output.
# NOTE(review): min_size presumably filters out images below a size threshold
# -- confirm in backend.rag_components.unstructured.
ingest_cfg = config.ingest
images = select_images(
    raw_pdf_elements,
    metadata_keys=ingest_cfg.metadata_keys,
    min_size=ingest_cfg.image_min_size,
)

# Preview at most N_DISPLAY images
image_preview = images[:N_DISPLAY]
for preview in image_preview:
    display(preview)

### <a id='toc2_2_'></a>[Chunking text](#toc0_)

In [None]:
# Start from the raw elements and replace them with chunked output only when
# chunking is enabled in the config.
chunks = raw_pdf_elements
if config.ingest.chunking_enable:
    chunk_func = load_chunking_func(config)
    chunks = chunk_func(chunks)

# Echo the result as the cell output
chunks

### <a id='toc2_3_'></a>[Text and tables](#toc0_)

In [None]:
# Split the chunks into text elements and table elements
ingest_cfg = config.ingest
texts = select_texts(chunks, metadata_keys=ingest_cfg.metadata_keys)
tables = select_tables(
    chunks,
    table_format=ingest_cfg.table_format,
    metadata_keys=ingest_cfg.metadata_keys,
    min_size=ingest_cfg.table_min_size,
)

In [None]:
# Preview at most N_DISPLAY text elements
text_preview = texts[:N_DISPLAY]
for text_element in text_preview:
    display(text_element)

In [None]:
# Preview at most N_DISPLAY table elements
table_preview = tables[:N_DISPLAY]
for table_element in table_preview:
    display(table_element)

## <a id='toc3_'></a>[Summarization](#toc0_)

In [None]:
# Start the stopwatch for the summarization cells that follow.
t_summarization = time.time()

In [None]:
# Summarize text
await apply_summarize_text(
    text_list=texts,
    config=config,
    prompt_template=prompts.TEXT_SUMMARIZATION_PROMPT,
)
for text in texts[:N_DISPLAY]:
    display(text)

In [None]:
# Summarize tables
await apply_summarize_table(
    table_list=tables,
    config=config,
    prompt_template=prompts.TABLE_SUMMARIZATION_PROMPT,
)
for table in tables[:N_DISPLAY]:
    display(table)

In [None]:
# Summarize images
await apply_summarize_image(
    image_list=images,
    config=config,
    prompt_template=prompts.IMAGE_SUMMARIZATION_PROMPT,
)
for image in images[:N_DISPLAY]:
    display(image)

In [None]:
# Time elapsed since the summarization stopwatch was started
summarization_elapsed = time.time() - t_summarization
print(f"Summarization time: {format_time_delta(summarization_elapsed)}")

## <a id='toc4_'></a>[Add elements to retriever](#toc0_)

In [None]:
# Build the retriever from the config; the following cells add the text,
# table and image elements to it.
retriever = get_retriever(config)

In [None]:
# Add texts to retriever
# NOTE(review): vectorstore_source / docstore_source presumably select which
# representation of an element goes into each store -- confirm in
# backend.rag_components.ingest.
add_elements_to_multivector_retriever(
    elements=texts,
    retriever=retriever,
    vectorstore_source=config.ingest.vectorstore_source.text,
    docstore_source=config.ingest.docstore_source.text,
)

In [None]:
# Add tables to retriever
# NOTE(review): vectorstore_source / docstore_source presumably select which
# representation of an element goes into each store -- confirm in
# backend.rag_components.ingest.
add_elements_to_multivector_retriever(
    elements=tables,
    retriever=retriever,
    vectorstore_source=config.ingest.vectorstore_source.table,
    docstore_source=config.ingest.docstore_source.table,
)

In [None]:
# Add images to retriever
# NOTE(review): vectorstore_source / docstore_source presumably select which
# representation of an element goes into each store -- confirm in
# backend.rag_components.ingest.
add_elements_to_multivector_retriever(
    elements=images,
    retriever=retriever,
    vectorstore_source=config.ingest.vectorstore_source.image,
    docstore_source=config.ingest.docstore_source.image,
)

In [None]:
# Ingestion is measured from the start of PDF partitioning
ingestion_elapsed = time.time() - t_partition
print(f"Total ingestion time: {format_time_delta(ingestion_elapsed)}")

## <a id='toc5_'></a>[Test retriever](#toc0_)

In [None]:
# Query the retriever on its own (no LLM call) to inspect what it returns.
# `invoke` is the Runnable entry point, matching how the chain is called
# further down; `get_relevant_documents` is deprecated in LangChain.
# Assumes get_retriever returns a LangChain retriever -- confirm in
# backend.rag_components.retriever.
docs = retriever.invoke("Transformer Architecture")

# Convert the retrieved documents back into project elements for display
elements = convert_documents_to_elements(docs)
for element in elements:
    display(element)

## <a id='toc6_'></a>[Test RAG chain](#toc0_)

In [None]:
# Build the RAG chain from the config.
chain = get_chain(config)
# Start the RAG stopwatch only after the chain exists, so the reported RAG
# time excludes chain construction.
t_rag = time.time()

In [None]:
# Page 3
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke("Describe the Transformer architecture")

In [None]:
# Page 6
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke("What is the complexity of self-attention?")

In [None]:
# Page 6
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke("Explain the formula of positional encoding")

In [None]:
# Page 8
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke("What are the BLEU performance of Transformer?")

In [None]:
# No mention
# (presumably a negative control: the ingested PDF does not cover this topic)
chain.invoke("What are the ROUGE performance of Transformer?")

In [None]:
# Page 9
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke("Describe the variations of Transformer")

In [None]:
# Page 9
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke("Between variations A and B, which one is better on PPL?")

In [None]:
# Page 10
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke("Is Transformer performing well on English constituency parsing?")

In [None]:
# Page 13
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke("What words are connected to `making` in the weight visualization?")

In [None]:
# Page 13
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke("Is `governments` connected to `making` in the weight visualization?")

In [None]:
# Page 13
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke("Is `2009` connected to `making`?")

In [None]:
# Page 14
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke("What words are associated with `its` in heads 5 and 6?")

In [None]:
# Page 14
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke(
    "What word has connection with `its` on only one head (heads between 5 and 6)?"
)

In [None]:
# Page 14
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke(
    "What word has connection with `its` on only one head (heads between 1 and 4)?"
)

In [None]:
# Page 14
# (presumably the page of the ingested PDF where the answer is found)
chain.invoke("What word has connection with `its` on 2 heads?")

In [None]:
# Time spent in the RAG queries, measured from the stopwatch started right
# after the chain was built
rag_elapsed = time.time() - t_rag
print(f"RAG time: {format_time_delta(rag_elapsed)}")

# Time since the notebook-wide stopwatch `t` was started
total_elapsed = time.time() - t
print(f"Total time: {format_time_delta(total_elapsed)}")