# **1. Importing Libraries**

In [1]:
%%capture
# Install Dependencies (Linux Only)
!pip install -U langchain openai langchain-chroma langchain-experimental langchain_openai
!pip install "unstructured[pdf]" pillow pydantic lxml pillow matplotlib tiktoken open_clip_torch
!apt-get install poppler-utils tesseract-ocr

In [11]:
import base64
import getpass
import os
import uuid

import chromadb
import nltk
import numpy as np
from IPython.display import HTML, display
from PIL import Image as _PILImage
from unstructured.partition.pdf import partition_pdf

from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma

# Fetch the NLTK tokenizer and POS-tagger resources up front.
# NOTE(review): presumably needed by `unstructured` while partitioning the PDF — confirm.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [12]:
# Set API keys
# Secrets are never hardcoded in the notebook: a key that is already present in the
# environment is kept as-is, otherwise it is requested interactively (input is not echoed
# and is not stored in the saved notebook).
for _key_name in ("OPENAI_API_KEY", "GROQ_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass.getpass(f"Enter {_key_name}: ")

# **2. Extracting Text, Images and Tables from the PDF**

In [13]:
# Specify pdf path and folder to store images
file_path = "paper.pdf"
output_path = "images/"

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(output_path, exist_ok=True)

In [14]:
# Parsing the PDF to extract chunks
# Reference: https://docs.unstructured.io/open-source/core-functionality/chunking

# Options controlling what is pulled out of the PDF.
extraction_options = {
    "infer_table_structure": True,                    # extract tables
    "strategy": "hi_res",                             # mandatory to infer tables
    "extract_image_block_types": ["Image", "Table"],  # 'Table' also yields an image of each table
    "image_output_dir_path": output_path,
    "extract_image_block_to_payload": True,           # keep base64 in the element metadata for API usage
}

# Options controlling how extracted elements are grouped into chunks.
# NOTE(review): check the linked reference for the defaults and exact semantics of these limits.
chunking_options = {
    "chunking_strategy": "by_title",                  # or 'basic'
    "max_characters": 1200,
    "combine_text_under_n_chars": 100,
    "new_after_n_chars": 100,
}

chunks = partition_pdf(filename=file_path, **extraction_options, **chunking_options)

In [15]:
# Extract Text, Tables and Images from Composite Elements
texts = []
tables = []
tables_html = []
images_b64 = []

for chunk in chunks:
    for element in chunk.metadata.orig_elements:
        # Classify each original element by the textual form of its type.
        kind = str(type(element))

        if "Table" in kind:
            # Keep both the plain-text and the HTML rendering of the table.
            tables.append(str(element))
            tables_html.append(element.metadata.text_as_html)
            continue

        if "Image" in kind:
            images_b64.append(element.metadata.image_base64)
            continue

        if "Footer" in kind:
            # Footers are dropped entirely.
            continue

        texts.append(str(element))

In [None]:
# Combine short strings into bigger ones
def combine_short_strings(strings, threshold):
    """Merge consecutive strings until each merged piece has at least `threshold` characters.

    Strings are joined with a single space, in order. Whatever is still buffered
    when the input is exhausted is emitted as a final (possibly shorter) piece.

    Args:
        strings: iterable of strings to merge.
        threshold: minimum length at which the buffered text is emitted.

    Returns:
        List of merged strings.
    """
    merged = []
    buffer = ""

    for piece in strings:
        buffer = f"{buffer} {piece}" if buffer else piece

        if len(buffer) < threshold:
            continue

        merged.append(buffer)
        buffer = ""

    # Flush the remainder that never reached the threshold.
    if buffer:
        merged.append(buffer)

    return merged

# Merge neighbouring text fragments until each merged string reaches `threshold`
# characters (a shorter remainder may be left at the end).
# Note: `texts` is rebound to the merged list, so re-running this cell merges the
# already-merged strings again.
threshold = 50
texts = combine_short_strings(texts, threshold)

In [17]:
# Save the images to a file
for i, image_b64 in enumerate(images_b64):
    # os.path.join keeps the path valid even if output_path has no trailing slash.
    output_file = os.path.join(output_path, f"image_{i}.jpg")
    with open(output_file, "wb") as file:
        file.write(base64.b64decode(image_b64))

# Get image URIs with .jpg extension only
# NOTE: the ordering is lexicographic ("image_10.jpg" sorts before "image_2.jpg"), so
# positions in image_paths do not necessarily match indices in images_b64.
image_paths = sorted(
    os.path.join(output_path, image_name)
    for image_name in os.listdir(output_path)
    if image_name.endswith(".jpg")
)

# **3. Summarizing Data**

In [37]:
# Prompt
# The two templates differ only in the requested summary length:
# "concise" for tables, "comprehensive" for text.
prompt_table = """You are an assistant tasked with summarizing tables and text for retrieval. \
These summaries will be embedded and used to retrieve the raw text or table elements. \
Give a concise summary of the table or text that is well optimized for retrieval in markdown format.
Table or text: {element} """

prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
These summaries will be embedded and used to retrieve the raw text or table elements. \
Give a comprehensive summary of the table or text that is well optimized for retrieval in markdown format.
Table or text: {element} """

# Rebind the raw template strings to prompt-template objects.
prompt_text = ChatPromptTemplate.from_template(prompt_text)
prompt_table = ChatPromptTemplate.from_template(prompt_table)

# Text summary chain
model = ChatOpenAI(temperature=0, model="gpt-4o-mini")


def _make_summary_chain(prompt_template):
    """Chain: raw element -> `prompt_template` -> chat model -> plain string."""
    return {"element": lambda x: x} | prompt_template | model | StrOutputParser()


summarize_chain_text = _make_summary_chain(prompt_text)
summarize_chain_table = _make_summary_chain(prompt_table)

# Summarize
# Tables are summarized from their HTML rendering (tables_html).
batch_config = {"max_concurrency": 3}
text_summaries = summarize_chain_text.batch(texts, batch_config)
table_summaries = summarize_chain_table.batch(tables_html, batch_config)

In [38]:
def encode_image(image_path):
    """Read the file at `image_path` and return its bytes as a base64 text string."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def image_summarize(img_base64, prompt):
    """Send a text prompt plus one base64 image to the chat model and return the reply content.

    Args:
        img_base64: base64 string of the image; embedded as a `data:image/jpeg` URL.
        prompt: instruction text sent alongside the image.

    Returns:
        The `content` of the model's response message.
    """
    vision_model = ChatOpenAI(model="gpt-4o-mini", max_tokens=1024)

    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
    }

    response = vision_model.invoke([HumanMessage(content=[text_part, image_part])])
    return response.content

# Instruction sent with every extracted image.
prompt = """Describe the image in detail. For context,
                  the image is part of a research paper explaining the transformers
                  architecture. Be specific about graphs, such as bar plots."""

# One model call per image, in extraction order.
image_summaries = [image_summarize(image, prompt) for image in images_b64]

In [39]:
# Spot-check: print the text summary at index 5.
print(text_summaries[5])

# Summary of Transformer Sequence Lengths Trend

**Figure Title:** Trend of Transformer Sequence Lengths Over Time

**Key Points:**
- The figure illustrates the trend of sequence lengths used in Transformer models over a specified time period.
- The data is represented in millions of dollars, with a focus on values less than 5 million.
- The trend indicates fluctuations in sequence lengths, which may correlate with advancements in model architecture or training techniques.

**Optimization for Retrieval:**
- **Keywords:** Transformer models, sequence lengths, trend analysis, time series, machine learning, model architecture.
- **Context:** Useful for understanding the evolution of Transformer models and their efficiency in handling varying sequence lengths.

This summary encapsulates the essential information regarding the trend of Transformer sequence lengths, making it easy to retrieve the relevant data or figure.


In [40]:
# Spot-check: print the first table summary.
print(table_summaries[0])

### Summary of Computation Complexity Methods

| Method                | Computation Complexity |
|-----------------------|-----------------------|
| Recurrent             | O(Nd?)                |
| Vanilla Attention     | O(N? d)               |
| Sparse Attention      | O(NVNd)               |
| Dilated Attention     | O(Nd)                 |

This table summarizes various methods and their corresponding computation complexities.


In [41]:
# Spot-check: print the first image summary.
print(image_summaries[0])

The image presents a line graph illustrating the evolution of transformer architectures over time, spanning from the year 2017 to 2023. The x-axis denotes the years, while the y-axis represents the model sizes in terms of parameters, with values ranging from 0 to 1000.

Key features of the graph include:

1. **Data Points**: Each point on the graph corresponds to a specific transformer model, annotated with its name and size:
   - **GPT (512)**, placed at the year 2018.
   - **Sparse Transformer (12K)**, positioned slightly to the right in 2019.
   - **Reformer (64K)**, located in 2020.
   - **Memorizing Transformers (262K)**, appearing in 2022.
   - **RMT (1M)**, marked for the year 2022, indicating a substantial increase in model size.

2. **LongNet (1B)**: The most significant point on the graph, labeled in red, is LongNet (1B). This point is prominently at the far right on the 2023 mark, with a value exceeding 1000, signifying a dramatic increase in model size compared to its prede

# **3. Creating a Multimodal Vectorstore (ChromaDB)**

In [42]:
# The vectorstore to use to index the child chunks
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma(
    collection_name="multi_modal_rag",
    embedding_function=embedding_model,
    persist_directory="chroma_langchain_db",
)

# The storage layer for the parent documents
# NOTE(review): InMemoryStore presumably lives only in this kernel, unlike the
# vectorstore, which is given a persist_directory — confirm before relying on it later.
id_key = "doc_id"
store = InMemoryStore()

# The retriever (empty to start)
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

  embedding_function=OpenAIEmbeddings(),
  vectorstore = Chroma(collection_name="multi_modal_rag",


In [43]:
def _index_summaries(summaries, originals):
    """Index `summaries` in the vectorstore and store `originals` in the docstore.

    Each summary/original pair shares a fresh UUID: the summary is wrapped in a
    Document carrying that id under `id_key`, and the original is stored in the
    docstore under the same id.

    Args:
        summaries: summary strings, one per original, in the same order.
        originals: raw contents (text, table text or base64 image strings).

    Returns:
        Tuple `(ids, summary_docs)`.

    Raises:
        ValueError: if the two inputs differ in length.
    """
    summaries = list(summaries)
    originals = list(originals)
    if len(summaries) != len(originals):
        raise ValueError(
            f"Got {len(summaries)} summaries for {len(originals)} original elements"
        )

    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: element_id})
        for element_id, summary in zip(ids, summaries)
    ]

    # Skip empty batches (e.g. a PDF without tables): there is nothing to index, and
    # an empty add is a no-op at best and may be rejected by the vector store.
    if summary_docs:
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(ids, originals)))

    return ids, summary_docs


# Add texts
doc_ids, summary_texts = _index_summaries(text_summaries, texts)

# Add tables
table_ids, summary_tables = _index_summaries(table_summaries, tables)

# Add image summaries
img_ids, summary_img = _index_summaries(image_summaries, images_b64)

### **Download the vectorstore and images to use locally**

In [44]:
# This cell uses google.colab and absolute /content paths, so it is expected to run
# only on Google Colab.
from google.colab import files
# Archive the Chroma directory and the extracted images.
!zip -r /content/db.zip /content/chroma_langchain_db/
!zip -r /content/images.zip /content/images/

# Download both archives.
files.download("/content/db.zip")
files.download("/content/images.zip")

  adding: content/chroma_langchain_db/ (stored 0%)
  adding: content/chroma_langchain_db/0029ac48-b5c5-4755-9b42-a61f8e18b33c/ (stored 0%)
  adding: content/chroma_langchain_db/0029ac48-b5c5-4755-9b42-a61f8e18b33c/data_level0.bin (deflated 65%)
  adding: content/chroma_langchain_db/0029ac48-b5c5-4755-9b42-a61f8e18b33c/length.bin (deflated 51%)
  adding: content/chroma_langchain_db/0029ac48-b5c5-4755-9b42-a61f8e18b33c/link_lists.bin (stored 0%)
  adding: content/chroma_langchain_db/0029ac48-b5c5-4755-9b42-a61f8e18b33c/header.bin (deflated 61%)
  adding: content/chroma_langchain_db/chroma.sqlite3 (deflated 55%)
  adding: content/images/ (stored 0%)
  adding: content/images/image_1.jpg (deflated 35%)
  adding: content/images/image_5.jpg (deflated 29%)
  adding: content/images/image_4.jpg (deflated 37%)
  adding: content/images/image_7.jpg (deflated 27%)
  adding: content/images/image_6.jpg (deflated 33%)
  adding: content/images/image_0.jpg (deflated 32%)
  adding: content/images/image_3.

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **4. Retrieval Sanity Check**

In [45]:
def is_base64(s):
    """Check if a value is a non-empty, canonically Base64-encoded string.

    Args:
        s: candidate value; `str`, `bytes` and `bytearray` are examined, anything
            else is reported as not Base64.

    Returns:
        True if `s` decodes strictly as Base64 and re-encodes to exactly the same
        characters, otherwise False. The empty string is reported as False so that
        it is never mistaken for image data.
    """
    if isinstance(s, str):
        try:
            candidate = s.encode("ascii")
        except UnicodeEncodeError:
            # Base64 text is pure ASCII.
            return False
    elif isinstance(s, (bytes, bytearray)):
        candidate = bytes(s)
    else:
        return False

    if not candidate:
        return False

    try:
        # validate=True rejects characters outside the Base64 alphabet instead of
        # silently discarding them; the round-trip also rejects non-canonical input.
        decoded = base64.b64decode(candidate, validate=True)
    except ValueError:  # binascii.Error is a ValueError subclass
        return False
    return base64.b64encode(decoded) == candidate

def plt_img_base64(img_base64):
    """Render a base64 image string inline by displaying an HTML <img> tag with a data URI."""
    markup = f'<img src="data:image/jpeg;base64,{img_base64}" />'
    display(HTML(markup))

In [46]:
# NOTE: this rebinds `retriever` (previously the MultiVectorRetriever) to a plain
# vectorstore retriever, so the results below are the indexed summaries rather than
# the original texts/tables/images held in the docstore.
# The number of results is configured on the retriever itself via search_kwargs.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
docs = retriever.invoke("What is Dilated attention?")
for doc in docs:
    if is_base64(doc.page_content):
        # Base64 payloads are rendered as images instead of being printed.
        plt_img_base64(doc.page_content)
        print('---------')
    else:
        print(doc.page_content)
        print('\n---------')

# Summary of Dilated Attention

## Overview
Dilated attention is a mechanism that processes input data by dividing it into segments and applying sparsification. 

## Key Components
- **Input Variables**: The input consists of three components: Query (Q), Key (K), and Value (V).
- **Segmentation**: The input is split into N segments, each of length w, resulting in segments denoted as {( ̃Qi, ̃Ki, ̃Vi)}.
- **Sparsification**: Each segment undergoes a sparsification process along the sequence dimension by selecting rows at a specified interval r.

## Computation
The computation of dilated attention can be expressed mathematically, although the specific formula is not provided in the text.

This summary encapsulates the main aspects of dilated attention, focusing on its structure and processing method, making it suitable for retrieval purposes.

---------
# Summary of Dilated Attention Mechanism

The text discusses the implementation of dilated attention, represented as \( O = [ \hat{O}_0,