In [13]:
import json 
from typing import List 
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

from langchain_community.docstore.document import Document
from langchain_community.document_loaders import PyMuPDFLoader
from glob import glob
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.messages import HumanMessage
import time
from dotenv import load_dotenv
import os 

# Load variables from a local .env file into the process environment before any key is read.
load_dotenv()

# Marker output confirming the import cell ran to completion.
print("[---- ALL LIB IMPORTED -----]")

[---- ALL LIB IMPORTED -----]


In [15]:
# Groq API key read from the environment; os.getenv yields None when GROQ_API_KEY is not set.
api_key = os.getenv("GROQ_API_KEY")

In [5]:
def partition_document(file_path: str):
    """
    Parse a PDF into a list of layout elements using ``unstructured``.

    Args:
        file_path: Path of the PDF file handed to ``partition_pdf``.

    Returns:
        The element collection returned by ``partition_pdf``.
    """
    print(f"Partitioning Document :{file_path}")

    partition_options = {
        "strategy": "hi_res",
        # Keep tables as structured HTML instead of flattened text.
        "infer_table_structure": True,
        # Only blocks classified as "Image" are extracted ...
        "extract_image_block_types": ["Image"],
        # ... and they are stored in the element payload as base64.
        "extract_image_block_to_payload": True,
    }
    parsed_elements = partition_pdf(filename=file_path, **partition_options)

    element_count = len(parsed_elements)
    print(f"----- Elements {element_count} elements")
    return parsed_elements

# NOTE(review): absolute, machine-specific Windows path — the notebook cannot run on another machine without editing it.
file_path = r"C:\Users\bhavi\OneDrive\Desktop\langhcain_learning\MultiModalRAG\docs\attention_paper.pdf"
# Partition the PDF once; later cells reuse `elements`.
elements = partition_document(file_path)

Partitioning Document :C:\Users\bhavi\OneDrive\Desktop\langhcain_learning\MultiModalRAG\docs\attention_paper.pdf
----- Elements 220 elements


In [6]:
# Distinct element classes present in the partitioned document (displayed as the cell result).
set([str(type(el)) for el in elements])

{"<class 'unstructured.documents.elements.FigureCaption'>",
 "<class 'unstructured.documents.elements.Footer'>",
 "<class 'unstructured.documents.elements.Formula'>",
 "<class 'unstructured.documents.elements.Header'>",
 "<class 'unstructured.documents.elements.Image'>",
 "<class 'unstructured.documents.elements.ListItem'>",
 "<class 'unstructured.documents.elements.NarrativeText'>",
 "<class 'unstructured.documents.elements.Table'>",
 "<class 'unstructured.documents.elements.Text'>",
 "<class 'unstructured.documents.elements.Title'>"}

In [8]:
# Collect every element whose category is "Image" and report how many were found.
print("[--- GATHER ALL IMAGES FROM PDF -----] \n")
images = [ element for element in elements if element.category =="Image"]
print(f"[-----FOUND {len(images)}---- IMAGES]")

# Show the first image element as a dict; raises IndexError if no image was found.
images[0].to_dict()

[--- GATHER ALL IMAGES FROM TABLE -----] 

[-----FOUND 7---- IMAGES]


{'type': 'Image',
 'element_id': 'de0a7728c4faf437e94cce87f56ce5d4',
 'text': 'Output Probabilities Add & Norm Feed Forward Add & Norm Multi-Head Attention Add & Norm Masked Multi-Head Attention Add & Norm Feed Forward Add & Norm Multi-Head Attention Nx Positional Encoding O° Positional 4 oe Encoding Input Output Embedding Embedding Inputs Outputs (shifted right)',
 'metadata': {'coordinates': {'points': ((545.9972222222221,
     200.00555555555542),
    (545.9972222222221, 1095.6055555555556),
    (1153.997222222222, 1095.6055555555556),
    (1153.997222222222, 200.00555555555542)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2026-01-09T15:51:01',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 3,
  'image_base64': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIy

In [9]:
# Collect every element whose category is "Table" and report how many were found.
print("[--- GATHER ALL TABLE FROM PDF -----] \n")
tables = [element for element in elements if element.category == "Table"]
# Label fixed: this line counts tables, it previously said "IMAGES".
print(f"[-----FOUND {len(tables)}---- TABLES]")
# Show the first table as a dict; display nothing instead of raising when the PDF has no tables.
tables[0].to_dict() if tables else None

[--- GATHER ALL TABLE FROM PDF -----] 

[-----FOUND 4---- IMAGES]


{'type': 'Table',
 'element_id': '5152df511fda81454867606b78694b44',
 'text': 'Layer Type Complexity per Layer Sequential Maximum Path Length Operations Self-Attention O(n2 · d) O(1) O(1) Recurrent O(n · d2) O(n) O(n) Convolutional O(k · n · d2) O(1) O(logk(n)) Self-Attention (restricted) O(r · n · d) O(1) O(n/r)',
 'metadata': {'detection_class_prob': 0.928255021572113,
  'is_extracted': 'true',
  'coordinates': {'points': ((320.3291931152344, 312.45477294921875),
    (320.3291931152344, 519.1640014648438),
    (1363.98291015625, 519.1640014648438),
    (1363.98291015625, 312.45477294921875)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2026-01-09T15:51:01',
  'text_as_html': '<table><thead><tr><th>Layer Type</th><th>Complexity per Layer</th><th>Sequential Operations</th><th>Maximum Path Length</th></tr></thead><tbody><tr><td>Self-Attention</td><td>O(n? - d)</td><td>O(1)</td><td>o(1)</td></tr><tr><td>Recurrent</td><td>O(n- d?)</td

In [10]:
def create_chunks_by_title(elements):
    """
    Group partitioned elements into chunks with the title-based strategy.

    Args:
        elements: Elements to chunk; passed positionally to ``chunk_by_title``.

    Returns:
        The chunk collection returned by ``chunk_by_title``.
    """
    chunking_limits = {
        "max_characters": 3000,
        # Start a new chunk after 2400 characters.
        "new_after_n_chars": 2400,
        "combine_text_under_n_chars": 500,
    }
    title_chunks = chunk_by_title(elements, **chunking_limits)

    print(f"CREATED {len(title_chunks)} CHUNKS")
    return title_chunks


# Build the title-based chunks from the partitioned elements; later cells read `chunks`.
chunks = create_chunks_by_title(elements)

CREATED 25 CHUNKS


In [11]:
# Inspect the last original element kept inside chunk index 11 (hard-coded index, specific to this PDF's chunking).
chunks[11].metadata.orig_elements[-1].to_dict()

{'type': 'Table',
 'element_id': '5152df511fda81454867606b78694b44',
 'text': 'Layer Type Complexity per Layer Sequential Maximum Path Length Operations Self-Attention O(n2 · d) O(1) O(1) Recurrent O(n · d2) O(n) O(n) Convolutional O(k · n · d2) O(1) O(logk(n)) Self-Attention (restricted) O(r · n · d) O(1) O(n/r)',
 'metadata': {'detection_class_prob': 0.928255021572113,
  'is_extracted': 'true',
  'coordinates': {'points': ((320.3291931152344, 312.45477294921875),
    (320.3291931152344, 519.1640014648438),
    (1363.98291015625, 519.1640014648438),
    (1363.98291015625, 312.45477294921875)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2026-01-09T15:51:01',
  'text_as_html': '<table><thead><tr><th>Layer Type</th><th>Complexity per Layer</th><th>Sequential Operations</th><th>Maximum Path Length</th></tr></thead><tbody><tr><td>Self-Attention</td><td>O(n? - d)</td><td>O(1)</td><td>o(1)</td></tr><tr><td>Recurrent</td><td>O(n- d?)</td

In [27]:

def seperate_content_types(chunk):
    """Split a chunk into its text, table and image payloads.

    Args:
        chunk: Object exposing ``text`` and, optionally,
            ``metadata.orig_elements`` (the elements the chunk was built from).

    Returns:
        dict with the keys:
            'text'   - ``chunk.text`` unchanged,
            'tables' - one entry per Table element: its ``text_as_html`` when
                       that is set, otherwise the element's plain ``text``,
            'images' - base64 payloads of Image elements that actually carry one,
            'types'  - distinct content kinds found ('text', 'table', 'image'),
                       in first-seen order.
    """
    content_data = {
        'text': chunk.text,
        'tables': [],
        'images': [],
        'types': ['text']
    }

    chunk_metadata = getattr(chunk, 'metadata', None)
    # A metadata object may report an unset field as None rather than raising
    # AttributeError, so treat None the same as "no original elements".
    orig_elements = getattr(chunk_metadata, 'orig_elements', None) or ()

    for element in orig_elements:
        element_type = type(element).__name__
        element_metadata = getattr(element, 'metadata', None)

        if element_type == "Table":
            content_data['types'].append('table')
            # Use HTML version if available; LLMs read <table> tags better than text.
            # `or` (not a getattr default) so a None/empty HTML field falls back too.
            table_html = getattr(element_metadata, 'text_as_html', None) or element.text
            content_data['tables'].append(table_html)

        elif element_type == 'Image':
            image_base64 = getattr(element_metadata, 'image_base64', None)
            # Skip images without a payload instead of storing None.
            if image_base64:
                content_data['types'].append('image')
                content_data['images'].append(image_base64)

    # De-duplicate while keeping a deterministic, first-seen order.
    content_data['types'] = list(dict.fromkeys(content_data['types']))
    return content_data

def create_ai_enhanced_summary(text: str, tables: List[str], images: List[str]) -> str:
    """Ask the Llama 4 Scout model for a searchable description of mixed content.

    Args:
        text: Raw text of the chunk; also the fallback return value.
        tables: Table strings (HTML) appended to the prompt when non-empty.
        images: Base64 image payloads; only the first five are attached.

    Returns:
        The model's reply content, or ``text`` unchanged if anything raises.
    """
    try:
        # Client is built on every call with the notebook-level API key.
        summariser = ChatGroq(
            api_key=api_key,
            temperature=0,
            model="meta-llama/llama-4-scout-17b-16e-instruct",
        )

        prompt_sections = [
            "You are an expert document analyzer. Describe the following content for a search index.\n",
            "Summarize all facts, data from tables, and visual details from images into a searchable description.\n\n",
            f"TEXT CONTENT:\n{text}\n\n",
        ]
        if tables:
            prompt_sections.append("TABLE DATA (HTML):\n" + "\n".join(tables) + "\n\n")

        text_part = {"type": "text", "text": "".join(prompt_sections)}
        # Images are sent as data URLs declared as JPEG; at most five are included.
        image_parts = [
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
            }
            for encoded_image in images[:5]
        ]

        reply = summariser.invoke([HumanMessage(content=[text_part] + image_parts)])
        return reply.content

    except Exception as e:
        # Best effort: report the failure and fall back to the raw text.
        print(f"AI summary failed: {e}")
        return text

def summarise_chunks(chunks):
    """Turn chunks into LangChain ``Document`` objects, summarising mixed content.

    Chunks containing at least one table or image get an AI-generated
    description as page content; text-only chunks keep their raw text.

    Args:
        chunks: Sized iterable of chunks accepted by ``seperate_content_types``.

    Returns:
        List of ``Document`` objects, one per chunk. Each carries the original
        text, tables and images as a JSON string under ``original_content``,
        plus ``has_tables`` / ``has_images`` flags.
    """
    total = len(chunks)
    print(f"[--- Processing {total} chunks ---]")
    documents = []

    for position, chunk in enumerate(chunks, start=1):
        print(f"Processing chunk {position}/{total}...")
        parts = seperate_content_types(chunk)
        raw_text = parts['text']
        tables = parts['tables']
        images = parts['images']

        has_tables = len(tables) > 0
        has_images = len(images) > 0

        # Only spend a model call when there is non-text content to describe.
        if has_tables or has_images:
            page_content = create_ai_enhanced_summary(raw_text, tables, images)
        else:
            page_content = raw_text

        # Metadata values are kept flat: the structured originals travel as one JSON string.
        metadata = {
            "original_content": json.dumps({
                "raw_text": raw_text,
                "tables_html": tables,
                "images_base64": images,
            }),
            "has_tables": has_tables,
            "has_images": has_images,
        }
        documents.append(Document(page_content=page_content, metadata=metadata))

    print("[--- Processing Complete ---]")
    return documents



In [28]:
# Summarise every chunk; chunks with tables or images trigger a model call, so this cell can be slow.
processed_docs = summarise_chunks(chunks)

[--- Processing 25 chunks ---]
Processing chunk 1/25...
Processing chunk 2/25...
Processing chunk 3/25...
Processing chunk 4/25...
Processing chunk 5/25...
Processing chunk 6/25...
Processing chunk 7/25...
Processing chunk 8/25...
Processing chunk 9/25...
Processing chunk 10/25...
Processing chunk 11/25...
Processing chunk 12/25...
Processing chunk 13/25...
Processing chunk 14/25...
Processing chunk 15/25...
Processing chunk 16/25...
Processing chunk 17/25...
Processing chunk 18/25...
Processing chunk 19/25...
Processing chunk 20/25...
Processing chunk 21/25...
Processing chunk 22/25...
Processing chunk 23/25...
Processing chunk 24/25...
Processing chunk 25/25...
[--- Processing Complete ---]


In [30]:
# Dump the stored original content of every processed document.
# NOTE(review): `original_content` embeds the base64 images, so this output can be very large.
for i, doc in enumerate(processed_docs):
    print(f"--- DOCUMENT CHUNK {i+1} ---")
    print(f"METADATA:")
    print(f"  - DATA --: {doc.metadata['original_content']}")
    print("-" * 30)

--- DOCUMENT CHUNK 1 ---
METADATA:
  - DATA --: {"raw_text": "3\n\n2023\n\n2\n\n0\n\n2\n\ng u A 2 ] L C . s c [ 7 v 2 6 7 3 0 . 6 0\n\n7\n\n1\n\n:\n\nv\n\narXiv\n\ni\n\nX\n\nr\n\na\n\nProvided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\nAttention Is All You Need\n\nAshish Vaswani\u2217\n\nGoogle Brain\n\navaswani@google.com\n\nNoam Shazeer\u2217 Google Brain noam@google.com\n\nNiki Parmar\u2217 Google Research nikip@google.com\n\nJakob Uszkoreit\u2217\n\nGoogle Research usz@google.com\n\nLlion Jones\u2217\n\nGoogle Research llion@google.com\n\nAidan N. Gomez\u2217 \u2020 University of Toronto aidan@cs.toronto.edu\n\n\u0141ukasz Kaiser\u2217 Google Brain lukaszkaiser@google.com", "tables_html": [], "images_base64": []}
------------------------------
--- DOCUMENT CHUNK 2 ---
METADATA:
  - DATA --: {"raw_text": "Illia Polosukhin\u2217 \u2021\n\nillia.polosukhin@gmail

In [31]:
def export_chunks_to_json(chunks, filename="chunks_export.json"):
    """Write documents to a JSON file and return the exported records.

    Args:
        chunks: Iterable of documents exposing ``page_content`` and a
            ``metadata`` mapping. ``metadata["original_content"]`` is read as a
            JSON string and defaults to an empty object when missing.
        filename: Destination path; the file is overwritten.

    Returns:
        List of dicts with ``chunk_id`` (1-based), ``enhanced_content`` and
        ``metadata.original_content`` (decoded from its JSON string).
    """
    export_data = [
        {
            "chunk_id": chunk_number,
            "enhanced_content": document.page_content,
            "metadata": {
                "original_content": json.loads(
                    document.metadata.get("original_content", "{}")
                )
            },
        }
        for chunk_number, document in enumerate(chunks, start=1)
    ]

    # ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file.
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(export_data, out_file, indent=2, ensure_ascii=False)

    print(f"Exported {len(export_data)} chunks to {filename}")
    return export_data

# Export the processed documents to the default file (chunks_export.json) and keep the records in memory.
json_data = export_chunks_to_json(processed_docs)

Exported 25 chunks to chunks_export.json


In [32]:
def create_vector_store(documents, persist_directory="dbv1/mlti_rag"):
    """Embed documents and store them in a persisted Chroma collection.

    Args:
        documents: Documents handed to ``Chroma.from_documents``.
        persist_directory: Directory passed to Chroma for persistence.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    print(" Creating embeddings and storing in ChromaDB...")

    embedder = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en')

    print("[--- Creating vector store ---]")
    store_options = {
        "documents": documents,
        "embedding": embedder,
        "persist_directory": persist_directory,
        # Collection is configured for cosine distance.
        "collection_metadata": {"hnsw:space": "cosine"},
    }
    store = Chroma.from_documents(**store_options)
    print("[--- Finished creating vector store ---]")

    print(f"Vector store created and saved to {persist_directory}")
    return store

# Build the vector store from the processed documents using the function's default persist directory.
db = create_vector_store(processed_docs)

 Creating embeddings and storing in ChromaDB...




[--- Creating vector store ---]
[--- Finished creating vector store ---]
Vector store created and saved to dbv1/mlti_rag


In [33]:
# Retrieve the top 3 documents for a sample question.
# NOTE(review): this rebinds the notebook-level name `chunks` (previously the title-based chunks)
# to the retrieved documents, so re-running earlier cells that read `chunks` will see different data.
query = "What are the two main components of the Transformer architecture? "
retriever = db.as_retriever(search_kwargs={"k": 3})
chunks = retriever.invoke(query)

# Export the retrieved documents to JSON; the returned records are also displayed as the cell result.
export_chunks_to_json(chunks, "rag_results.json")

Exported 3 chunks to rag_results.json


[{'chunk_id': 1,
  'enhanced_content': 'Here is a searchable description of the content:\n\n**Model Variations and Performance Evaluation**\n\nThe Transformer model was evaluated on English-to-German translation on the newstest2013 development set. The base model was varied in different ways to measure the change in performance.\n\n**Table 3: Variations on the Transformer Architecture**\n\n| Model | N | dmodel | dff | h | dk | dv | Pdrop | ϵls | train steps | PPL (dev) | BLEU (dev) | params (dev) ×10^6 |\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n| base | 6 | 512 | 2048 | 8 | 64 | 64 | 0.1 | 0.1 | 100K | 4.92 | 25.8 | 65 |\n| (A) | 1, 4, 16, 32 | 512, 128, 32, 16 | 512, 128, 32, 16 |  |  |  |  |  |  | 5.29, 5.00, 4.91, 5.01 | 24.9, 25.5, 25.8, 25.4 |  |\n| (B) |  | 16, 32 |  |  |  |  |  |  |  | 5.16, 5.01 | 25.1, 25.4 | 58, 60 |\n| (C) | 2, 4, 8, 256, 1024 |  |  |  |  |  |  |  |  | 6.11, 5.19, 4.88, 5.75, 4.66 | 23.7, 25.3, 25.5, 24.5, 26.0 | 36, 

In [35]:
def run_complete_ingestion_pipeline(pdf_path: str, persist_directory: str = "dbv2/chroma_db"):
    """Run the complete RAG ingestion pipeline for one PDF.

    Steps: partition the PDF, chunk by title, summarise the chunks into
    documents, then embed and persist them in a vector store.

    Args:
        pdf_path: Path of the PDF to ingest.
        persist_directory: Directory for the vector store. Defaults to the
            previously hard-coded "dbv2/chroma_db", so existing calls behave
            the same; pass another directory to keep separate ingestions apart.

    Returns:
        The vector store returned by ``create_vector_store``.
    """
    print("Starting RAG Ingestion Pipeline")
    print("=" * 50)

    # Step 1: Partition
    elements = partition_document(pdf_path)

    # Step 2: Chunk
    chunks = create_chunks_by_title(elements)

    # Step 3: AI Summarisation
    summarised_chunks = summarise_chunks(chunks)

    # Step 4: Vector Store
    db = create_vector_store(summarised_chunks, persist_directory=persist_directory)

    print(" Pipeline completed successfully!")
    return db

In [36]:
# Re-run the whole ingestion end to end; this rebinds `db` to the store built by the pipeline.
db = run_complete_ingestion_pipeline(file_path)

Starting RAG Ingestion Pipeline
Partitioning Document :C:\Users\bhavi\OneDrive\Desktop\langhcain_learning\MultiModalRAG\docs\attention_paper.pdf
----- Elements 220 elements
CREATED 25 CHUNKS
[--- Processing 25 chunks ---]
Processing chunk 1/25...
Processing chunk 2/25...
Processing chunk 3/25...
Processing chunk 4/25...
Processing chunk 5/25...
Processing chunk 6/25...
Processing chunk 7/25...
Processing chunk 8/25...
Processing chunk 9/25...
Processing chunk 10/25...
Processing chunk 11/25...
Processing chunk 12/25...
Processing chunk 13/25...
Processing chunk 14/25...
Processing chunk 15/25...
Processing chunk 16/25...
Processing chunk 17/25...
Processing chunk 18/25...
Processing chunk 19/25...
Processing chunk 20/25...
Processing chunk 21/25...
Processing chunk 22/25...
Processing chunk 23/25...
Processing chunk 24/25...
Processing chunk 25/25...
[--- Processing Complete ---]
 Creating embeddings and storing in ChromaDB...




[--- Creating vector store ---]
[--- Finished creating vector store ---]
Vector store created and saved to dbv2/chroma_db
 Pipeline completed successfully!


In [37]:
# Query the vector store for the 3 most similar documents.
# NOTE(review): `chunks` is rebound here to the retrieved documents, not the title-based chunks.
query = "How many attention heads does the Transformer use, and what is the dimension of each head? "

retriever = db.as_retriever(search_kwargs={"k": 3})
chunks = retriever.invoke(query)

def generate_final_answer(chunks, query, max_images: int = 5):
    """Generate the final answer from the retrieved documents' original content.

    The prompt is built from each document's stored raw text and HTML tables;
    stored base64 images are attached to the same message as data URLs.

    Args:
        chunks: Retrieved documents. Each must expose a ``metadata`` mapping;
            when it has an ``original_content`` entry, that entry is decoded as
            JSON with optional ``raw_text``, ``tables_html`` and
            ``images_base64`` keys.
        query: The user's question, embedded in the prompt.
        max_images: Upper bound on images attached across all documents.
            Defaults to 5, the same cap ``create_ai_enhanced_summary`` applies
            for this model; previously every image was sent, which can push a
            request over the model's image limit. ``None`` disables the cap.

    Returns:
        The model's reply content, or a fixed apology string if any step raises.
    """
    try:
        llm = ChatGroq(
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            temperature=0,
            api_key=api_key
        )

        prompt_text = f"""Based on the following documents, please answer this question: {query}

CONTENT TO ANALYZE:
"""

        image_parts = []
        skipped_images = 0

        for i, chunk in enumerate(chunks):
            prompt_text += f"--- Document {i+1} ---\n"

            if "original_content" in chunk.metadata:
                # Decode the stored payload once and reuse it for text, tables and images.
                original_data = json.loads(chunk.metadata["original_content"])

                raw_text = original_data.get("raw_text", "")
                if raw_text:
                    prompt_text += f"TEXT:\n{raw_text}\n\n"

                tables_html = original_data.get("tables_html", [])
                if tables_html:
                    prompt_text += "TABLES:\n"
                    for j, table in enumerate(tables_html):
                        prompt_text += f"Table {j+1}:\n{table}\n\n"

                # Images keep document order; anything beyond the cap is dropped.
                for image_base64 in original_data.get("images_base64", []):
                    if max_images is None or len(image_parts) < max_images:
                        image_parts.append({
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
                        })
                    else:
                        skipped_images += 1

            prompt_text += "\n"

        prompt_text += """
Please provide a clear, comprehensive answer using the text, tables, and images above. If the documents don't contain sufficient information to answer the question, say "I don't have enough information to answer that question based on the provided documents."

ANSWER:"""

        if skipped_images:
            print(f"Note: {skipped_images} image(s) not sent (limit of {max_images} per request).")

        # The text part comes first, followed by the (capped) image parts.
        message_content = [{"type": "text", "text": prompt_text}] + image_parts

        message = HumanMessage(content=message_content)
        response = llm.invoke([message])

        return response.content

    except Exception as e:
        # Best effort: report the failure and return a user-facing fallback.
        print(f"Answer generation failed: {e}")
        return "Sorry, I encountered an error while generating the answer."

# Generate and print the answer for the documents retrieved above.
final_answer = generate_final_answer(chunks, query)
print(final_answer)

The Transformer uses 8 attention heads, and the dimension of each head is 64.

This information can be found in Table 1 of Document 3, which describes the base model as having 8 attention heads (h) with a key and value dimension (dk and dv) of 64. 

To be more specific, the table lists the following parameters for the base model:
- N: 6
- dmodel: 512
- dff: 2048
- h: 8
- dk: 64
- dv: 64

This indicates that the Transformer uses multi-head attention with 8 heads, and each head has a dimension of 64.
