In [1]:
import json

from typing import List

from unstructured.chunking.title import chunk_by_title 
from unstructured.partition.pdf import partition_pdf 

from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace
from langchain_huggingface.llms import HuggingFaceEndpoint
from langchain_chroma import Chroma 
from langchain_core.messages import HumanMessage, SystemMessage 
from dotenv import load_dotenv 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this provides the Hugging Face API token needed by
# the endpoint calls later in the notebook — confirm.
load_dotenv()

True

In [3]:
def partition_document(file_path: str):
    """Partition a PDF into elements with unstructured's ``partition_pdf``.

    Args:
        file_path: Path of the PDF file to partition.

    Returns:
        The collection of elements returned by ``partition_pdf``; its length
        is printed before returning.
    """
    print(f"Partitioning document: {file_path}")

    partition_options = {
        "filename": file_path,                    # PDF to read
        "strategy": "hi_res",                     # most accurate (but slower) extraction
        "infer_table_structure": True,            # keep tables as structured HTML
        "extract_image_block_types": ["Image"],   # pull out image blocks
        "extract_image_block_to_payload": True,   # keep images as base64 payloads
    }
    extracted = partition_pdf(**partition_options)

    print(f"Extracted {len(extracted)} elements")
    return extracted


def create_chunks_by_title(elements):
    """Group partitioned elements into chunks using ``chunk_by_title``.

    Args:
        elements: Elements to chunk (the output of the partitioning step).

    Returns:
        The chunks returned by ``chunk_by_title``; their count is printed.
    """
    print("Creating smart chunks.....")

    size_limits = dict(
        max_characters=3000,             # hard ceiling per chunk
        new_after_n_chars=2400,          # soft target: start a new chunk after this
        combine_text_under_n_chars=500,  # fold tiny chunks into their neighbours
    )
    title_chunks = chunk_by_title(elements=elements, **size_limits)

    print(f"Created {len(title_chunks)} chunks")
    return title_chunks


def separate_content_types(chunk):
    """Split one chunk into its text, table and image payloads.

    The chunk's original elements (``chunk.metadata.orig_elements``) are
    inspected by class name: ``Table`` elements contribute their
    ``metadata.text_as_html`` and ``Image`` elements contribute their
    ``metadata.image_base64``.

    Args:
        chunk: Object exposing ``text`` and, optionally,
            ``metadata.orig_elements``.

    Returns:
        A dict with keys ``'text'`` (the chunk text), ``'images'`` (list of
        base64 strings), ``'tables'`` (list of table strings) and ``'types'``
        (``'text'`` followed by ``'table'`` / ``'image'`` when at least one
        such payload was collected, always in that order).
    """
    tables = []
    images = []

    chunk_metadata = getattr(chunk, 'metadata', None)
    # ``orig_elements`` can be missing or None; both mean "nothing to inspect".
    orig_elements = getattr(chunk_metadata, 'orig_elements', None) or []

    for element in orig_elements:
        element_type = type(element).__name__
        element_metadata = getattr(element, 'metadata', None)

        if element_type == "Table":
            # Prefer the HTML rendering; fall back to the element's plain text
            # so a table without HTML is still represented.
            table_html = (
                getattr(element_metadata, "text_as_html", None)
                or getattr(element, "text", None)
            )
            if table_html:
                tables.append(table_html)
        elif element_type == "Image":
            image_base64 = getattr(element_metadata, "image_base64", None)
            # Skip images without a payload instead of storing None.
            if image_base64:
                images.append(image_base64)

    # Fixed ordering keeps the result deterministic across runs.
    types = ['text']
    if tables:
        types.append('table')
    if images:
        types.append('image')

    return {
        'text': chunk.text,
        'images': images,
        'tables': tables,
        'types': types,
    }

def create_ai_enhanced_summary(text: str, images: List[str] | None, tables: List[str] | None):
    print("Inside the Create AI Enhanced Summary function")
    """Create an AI enhanced summary for multimodal content"""
    try:


        hf_endpoint = HuggingFaceEndpoint(
            model="CohereLabs/aya-vision-32b", # you have to make sure that this model has an InferenceProvider on the HuggingFace Website.
            task="conversational",
            temperature=0,
            provider="auto"
        )

        llm = ChatHuggingFace(llm=hf_endpoint)

        prompt_text = f"""You are creating a searchable description for document content retrieval.
        CONTENT TO ANALYZE:
        TEXT CONTENT:
        {text}
        """

        if tables:
            prompt_text += "TABLES:\n"
            for i, table in enumerate(tables):
                prompt_text += f"Table {i+1}:\n{table}\n\n"
            prompt_text += """
            YOUR TASK:
            Generate a comprehensive, searchable description that covers:

            1. Key facts, numbers, and data points from text and tables
            2. Main topics and concepts discussed  
            3. Questions this content could answer
            4. Visual content analysis (charts, diagrams, patterns in images)
            5. Alternative search terms users might use

            Make it detailed and searchable - prioritize findability over brevity.

            SEARCHABLE DESCRIPTION:
            """
        
        message_content = [{"type": "text", "text": prompt_text}]

        if images:
            for image in images:
                message_content.append({
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image}"}
                })

        message = HumanMessage(content=message_content)
        response = llm.invoke([message])
        
        return response.content

    except Exception as e:
        print(f"AI Summary Failed BECAUSE {e}") 
        summary = f"{text[:300]}..."
        return summary



def summarise_chunks(chunks):
    """Turn every chunk into a LangChain ``Document``.

    Chunks that carry tables or images get an AI-generated description as
    their page content; all other chunks keep their raw text. The raw text,
    images and tables are always stored as a JSON string under the
    ``"original_content"`` metadata key.

    Args:
        chunks: Sized iterable of chunks accepted by ``separate_content_types``.

    Returns:
        A list with one ``Document`` per chunk, in input order.
    """
    print(f"Processing chunks with AI summaries....")

    chunk_count = len(chunks)
    documents = []

    for position, chunk in enumerate(chunks, start=1):
        print(f"Procesing chunk {position} / {chunk_count}")
        parts = separate_content_types(chunk)
        raw_text = parts['text']
        images = parts['images']
        tables = parts['tables']

        if not (images or tables):
            # Plain text needs no model call.
            print(f"Using only raw text (no tables / images)")
            page_content = raw_text
        else:
            print(f"Creating AI summary for chunk {position}.")
            try:
                page_content = create_ai_enhanced_summary(raw_text, images, tables)
            except Exception as e:
                print(f"AI Summary Failed : {e}")
                page_content = raw_text

        original_payload = json.dumps({
            "raw_text": raw_text,
            "images": images,
            "tables": tables,
        })
        documents.append(
            Document(
                page_content=page_content,
                metadata={"original_content": original_payload},
            )
        )

    print(f"Processed {len(documents)} chunks!")
    return documents
        
def create_vector_store(documents, persist_directory="db/multimodal"):
    """Create and persist ChromaDB vector store.

    Args:
        documents: LangChain documents to embed and store.
        persist_directory: Directory passed to Chroma for persistence.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # The status messages previously contained mojibake (UTF-8 emoji decoded
    # as cp1252); they now carry the intended emoji.
    print("🔮 Creating embeddings and storing in ChromaDB...")

    # NOTE(review): E5 models are documented as expecting "query: " /
    # "passage: " prefixes on their inputs — confirm whether that matters here.
    embedding_model = HuggingFaceEmbeddings(model="intfloat/e5-large-v2")

    # Create ChromaDB vector store
    # NOTE(review): re-running this against an existing persist_directory
    # presumably adds the documents again — confirm, and clear the directory
    # first if duplicates are not wanted.
    print("--- Creating vector store ---")
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        persist_directory=persist_directory, 
        collection_metadata={"hnsw:space": "cosine"}  # cosine space for the HNSW index
    )
    print("--- Finished creating vector store ---")

    print(f"✅ Vector store created and saved to {persist_directory}")
    return vectorstore

In [4]:
# PDF to index, given as a path relative to the notebook's working directory.
file_path = "./docs/1706.03762v7.pdf" 
# Partition the PDF into elements (text, tables, images).
elements = partition_document(file_path)

Partitioning document: ./docs/1706.03762v7.pdf


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Extracted 215 elements


In [5]:
# Group the partitioned elements into title-based chunks.
chunks = create_chunks_by_title(elements)

Creating smart chunks.....
Created 25 chunks


In [6]:
# Build one Document per chunk; chunks containing tables or images are
# summarised by the chat model, the rest keep their raw text.
processed_chunks = summarise_chunks(chunks)

Processing chunks with AI summaries....
Procesing chunk 1 / 25
Using only raw text (no tables / images)
Procesing chunk 2 / 25
Using only raw text (no tables / images)
Procesing chunk 3 / 25
Using only raw text (no tables / images)
Procesing chunk 4 / 25
Using only raw text (no tables / images)
Procesing chunk 5 / 25
Creating AI summary for chunk 5.
Inside the Create AI Enhanced Summary function
Procesing chunk 6 / 25
Using only raw text (no tables / images)
Procesing chunk 7 / 25
Creating AI summary for chunk 7.
Inside the Create AI Enhanced Summary function
Procesing chunk 8 / 25
Using only raw text (no tables / images)
Procesing chunk 9 / 25
Using only raw text (no tables / images)
Procesing chunk 10 / 25
Using only raw text (no tables / images)
Procesing chunk 11 / 25
Using only raw text (no tables / images)
Procesing chunk 12 / 25
Creating AI summary for chunk 12.
Inside the Create AI Enhanced Summary function
Procesing chunk 13 / 25
Using only raw text (no tables / images)
Proces

In [7]:
# Embed the documents and persist them in Chroma (default directory: db/multimodal).
db = create_vector_store(processed_chunks)

🔮 Creating embeddings and storing in ChromaDB...
--- Creating vector store ---
--- Finished creating vector store ---
✅ Vector store created and saved to db/multimodal


In [46]:
# Retrieve the 3 stored chunks closest to the question.
# NOTE(review): this cell's execution count (46) is higher than those of the
# answer cells that follow it (43-45), so the saved answer output may belong
# to an earlier query — restart the kernel and run all cells to refresh it.
query = "According to table 1, what is the complexity per layer of self attention?"
retriever = db.as_retriever(search_kwargs={"k": 3})
relevant_docs = retriever.invoke(query)
# NOTE(review): printing the Document list also prints the full
# "original_content" metadata, which includes any base64 image payloads.
print(relevant_docs)

[Document(id='b7466f2f-0821-4685-ac43-1f17649ccaf7', metadata={'original_content': '{"raw_text": "4 Why Self-Attention\\n\\nIn this section we compare various aspects of self-attention layers to the recurrent and convolu- tional layers commonly used for mapping one variable-length sequence of symbol representations (x1,...,xn) to another sequence of equal length (z1,...,zn), with xi,zi \\u2208 Rd, such as a hidden layer in a typical sequence transduction encoder or decoder. Motivating our use of self-attention we consider three desiderata.\\n\\nOne is the total computational complexity per layer. Another is the amount of computation that can be parallelized, as measured by the minimum number of sequential operations required.\\n\\nThe third is the path length between long-range dependencies in the network. Learning long-range dependencies is a key challenge in many sequence transduction tasks. One key factor affecting the ability to learn such dependencies is the length of the paths fo

In [43]:
def generate_final_answer(relevant_docs, query):
    """Answer ``query`` with the chat model, grounded in the retrieved documents.

    For every document that has an ``"original_content"`` metadata entry, the
    JSON payload is decoded and its raw text and tables are added to the
    prompt. Images from *all* such documents are attached to the message as
    base64 data URLs.

    Args:
        relevant_docs: Retrieved documents; each must expose a ``metadata``
            mapping.
        query: The user's question.

    Returns:
        The model's response content, or the string
        ``"Sorry, encountered a problem when answering"`` if anything raises
        (the error is printed).
    """
    try:
        hf_endpoint = HuggingFaceEndpoint(
            model="CohereLabs/aya-vision-32b", # you have to make sure that this model has an InferenceProvider on the HuggingFace Website.
            task="conversational",
            temperature=0,
            provider="auto"
        )

        llm = ChatHuggingFace(llm=hf_endpoint)

        prompt_text = f"""Based on the following documents, please answer this question : {query}.

        CONTENT TO ANALYZE:
        """

        # Collected inside the loop. Previously only the last document's
        # images were attached, and an empty result list (or a last document
        # without "original_content") hit an unbound or stale `og_data`.
        images = []

        for i, doc in enumerate(relevant_docs):
            prompt_text += f"----- DOCUMENT {i+1} -----\n"
            if "original_content" not in doc.metadata:
                continue

            og_data = json.loads(doc.metadata["original_content"])

            raw_text = og_data.get("raw_text", "")
            if raw_text:
                prompt_text += f"TEXT : \n{raw_text}\n\n"

            raw_tables = og_data.get("tables", [])
            if raw_tables:
                prompt_text += "TABLES: \n"
                for j, table in enumerate(raw_tables):
                    prompt_text += f"Table {j+1}:\n{table}\n"

            # `or []` also covers a payload that stores null for "images".
            images.extend(og_data.get("images") or [])

        prompt_text += """
        Please provide a clear, comprehensive answer using the text, tables, and images above. If the documents don't contain sufficient information to answer the question, say "I don't have enough information to answer that question based on the provided documents."

        ANSWER:"""

        message_content = [{"type": "text", "text": prompt_text}]

        for image in images:
            message_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}"}
            })

        message = HumanMessage(content=message_content)
        response = llm.invoke([message])

        return response.content

    except Exception as e:
        # Best-effort: report the failure and return a fixed apology string.
        print(f"Problem answering the question : {e}")
        return "Sorry, encountered a problem when answering"

In [44]:
# Ask the chat model to answer the query from the retrieved documents.
final_answer = generate_final_answer(relevant_docs, query)

In [45]:
# Show the generated answer.
print(final_answer)

The dimensionality of the embeddings in the model described in the documents is dmodel = 512. This is evident from Document 2, which states that "the dimensionality of input and output is dmodel = 512." This dimensionality is consistent across the input embeddings, output embeddings, and the positional encodings added to the input embeddings. The use of a consistent dimensionality allows for the efficient summation of the input embeddings and positional encodings, as well as the application of the feed-forward network and attention mechanisms within the model.
