In [158]:
%pip install -Uq "unstructured[all-docs]"
%pip install -Uq langchain_chroma
%pip install -Uq langchain langchain-community langchain-openai
%pip install -Uq python_dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [159]:
import json
from typing import List
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

from langchain_core.documents import Document # has raw data + internal content
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma 
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv 

load_dotenv()

False

In [None]:
import os
from getpass import getpass

os.environ["OPENAI_API_KEY"] = api_key


In [161]:
def partition_document(file_path):
    """" Extract Documents from pdf using unstructured """
    elements = partition_pdf(
        filename= file_path,
        strategy= "hi_res",
        infer_table_structure= True,
        extract_image_block_types=["Image"],
        extract_image_block_to_payload=True
    )
    print(f"Extracted {len(elements)} elements")
    return elements


In [162]:
file_path = "imagenet.pdf"
elements=partition_document(file_path)

Extracted 137 elements


In [163]:
def create_chunks_by_title(elements):
    """" Create chunking by using title as the main differentiator""" 

    chunks = chunk_by_title(
        elements,
        combine_text_under_n_chars=500,
        max_characters=3000,
        new_after_n_chars=2400

    )
    print(f"Created {len(chunks)} chunks")
    return chunks


In [164]:
chunks=create_chunks_by_title(elements)

Created 19 chunks


In [165]:
def separate_content_types(chunk):
    ''' Analyze what types of content are there in a chunk''' 
    content_data = {
        'text': chunk.text,
        'tables': [],
        'images': [],
        'types' : ['text']
    }
    # check for tables and images in original elements
    
    for element in chunk.metadata.orig_elements:
        element_type = type(element).__name__

        if element_type == 'Table':
            content_data['types'].append('table')
            table_html = getattr(element.metadata,'text_as_html', "table not found")
            #print(table_html)
            content_data['tables'].append(table_html)
            #print(content_data['tables'])
        elif element_type == 'Image':
            content_data['types'].append('image')
            image_base64 = getattr(element.metadata,'image_base64', "image not found" )
            #print(image_base64)
            content_data['images'].append(image_base64)
            #print(content_data['images'])
    content_data['types']= list(set(content_data['types']))
    return content_data

def created_ai_summary(text:str, tables: List[str], images: List[str]) -> str:
    """ Create AI enhanced summary """ 
    try:
        llm=ChatOpenAI(model="gpt-4o", temperature = 0)

        prompt_text = f"""
You are an expert content summarizer. Your job is to analyze the provided text, tables, and images 
and create a clear, concise, and structured summary.

CONTENT TO ANALYZE:
-------------------
TEXT CONTENT:
{text}

"""
        if tables:
            prompt_text += "TABLES:\n"
            for i, table in enumerate(tables):
                prompt_text += f"Table {i+1}:\n{table}\n\n"

        if images:
            prompt_text += "IMAGES are also provided (as base64). Analyze them if relevant.\n\n"

        prompt_text += """
        YOUR TASK:
        1. Provide a well-structured summary (max ~500 words).
        2. Capture the main ideas, trends, and insights across text, tables, and images.
        3. If data is numeric (from tables), highlight key figures and patterns.
        4. If images are included, mention what they add to the context.
        5. End with a "SEARCHABLE DESCRIPTION" – 3-5 keywords or short phrases someone might use to find this content.

        OUTPUT FORMAT:
        ---------------
        Summary:
        [write summary here]

        Searchable Description:
        [keywords/phrases here]
        """

        message_content= [{"type":"text", "text": prompt_text}]

        for image_base64 in images:
            message_content.append({
                "type":"image_url",
                "image_url": {"url":f"data:image/jpeg;base64,{image_base64}"}
            })
        message = HumanMessage(content=message_content)
        response = llm.invoke([message])
        print(response.content)
        return response.content
    except Exception as e:
        print(f" AI summary failed : {e}")
        summary = f"{text[:300]}..."


In [166]:
def summarize_chunks(chunks):
    langchain_documents = []
    total_chunks=len(chunks)

    for i, chunk in enumerate(chunks):
        current_chunk=i+1
        print(f"Processing chunk {current_chunk}/{total_chunks}")

        content_data = separate_content_types(chunk)

        print(f"Types found: {content_data['types']}")
        
        if content_data["tables"] or content_data['images']:
            print("Creating AI summary for this")
            try: 
                enhanced_content = created_ai_summary(
                    content_data['text'],
                    content_data['tables'],
                    content_data['images']
                )
                print(f" AI summary created successfully balle balle")
                print(f" Enhanced content preview {enhanced_content[:200]}...")
            except Exception as e:
                print("AI summary failed sed sed") 
                enhanced_content = content_data['text']
        else:
            print("Using raw text") 
            enhanced_content = content_data['text']
        
        doc = Document(
            page_content= enhanced_content,
            metadata= {
                "original content": json.dumps({
                    "raw_text":content_data['text'],
                    "tables_html": content_data["tables"],
                    "images_base64": content_data['images']

                })
            }
        )
            
        langchain_documents.append(doc)
    print(f"Processed {len(langchain_documents)} chunks")
    return langchain_documents



In [167]:
chunks[4].metadata.orig_elements[7].metadata.image_base64

'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAF+AdUDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD36iiigApGYKpYnAAyaWmuiyRsjDKsCCPUUAczpXiTULu60uS7soIrHV0ZrQxuTJGQpdRICMfMgJ46EY561N4n1PWNHtnvLJbB7dVVVjmDmWSVjhUULxySoH1qtpWgarbz6Nb30lqbLRkKwPE7F522GNSykYXCk9CefStK/wB

In [168]:
chunks[13].metadata.orig_elements[4].metadata.text_as_html

'<table><tbody><tr><td>publicly available, we cannot report test error rates for all the models that</td><td>+ [24] CNN</td><td>37.5%</td><td>17.0%</td></tr><tr><td>we tried. In the remainder of this paragraph, we use validation and test error rates interchangeably because</td><td>Table 1: Comparison</td><td>of results on</td><td>ILSVRC-</td></tr><tr><td>in our experience they do not differ by more than 0.1%</td><td>2010 test set. In</td><td>italics are</td><td>best results</td></tr></tbody></table>'

In [169]:
docs1 =summarize_chunks(chunks)

Processing chunk 1/19
Types found: ['text']
Using raw text
Processing chunk 2/19
Types found: ['text']
Using raw text
Processing chunk 3/19
Types found: ['text']
Using raw text
Processing chunk 4/19
Types found: ['text']
Using raw text
Processing chunk 5/19
Types found: ['text', 'image']
Creating AI summary for this
Summary:

The architecture of the discussed neural network consists of eight learned layers, including five convolutional and three fully-connected layers. A significant feature of this architecture is the use of Rectified Linear Units (ReLUs) as the nonlinearity function for neurons. Unlike traditional saturating nonlinearities such as tanh or sigmoid functions, ReLUs are non-saturating and defined as \( f(x) = \max(0, x) \). This characteristic allows for much faster training times when using gradient descent, as demonstrated in the provided graph.

The graph illustrates the training error rate over epochs for a four-layer convolutional neural network using ReLUs compared

In [170]:
print(json.loads(docs1[0].metadata["original content"])["tables_html"])

[]


In [171]:
docs1[0]

Document(metadata={'original content': '{"raw_text": "ImageNet Classi\\ufb01cation with Deep Convolutional Neural Networks\\n\\nAlex Krizhevsky\\n\\nUniversity of Toronto kriz@cs.utoronto.ca\\n\\nIlya Sutskever University of Toronto ilya@cs.utoronto.ca\\n\\nGeoffrey E. Hinton\\n\\nUniversity of Toronto hinton@cs.utoronto.ca\\n\\nAbstract\\n\\nWe trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 dif- ferent classes. On the test data, we achieved top-1 and top-5 error rates of 37.5% and 17.0% which is considerably better than the previous state-of-the-art. The neural network, which has 60 million parameters and 650,000 neurons, consists of \\ufb01ve convolutional layers, some of which are followed by max-pooling layers, and three fully-connected layers with a \\ufb01nal 1000-way softmax. To make train- ing faster, we used non-saturating neurons and a very ef\\ufb01cient GPU implemen- tati

In [172]:
docs1[14].page_content

'Summary:\nThe content provided focuses on the performance comparison of different models on the ILSVRC-2012 validation and test sets, specifically highlighting error rates. The models compared include SIFT + FVs, 1 CNN, 5 CNNs, 1 CNN*, and 7 CNNs*. The asterisk (*) denotes models that were pre-trained to classify the entire ImageNet 2011 Fall release, which is a significant detail as it implies these models had additional training data.\n\nThe table provided (Table 1) outlines the performance of these models in terms of Top-1 and Top-5 error rates on both validation and test sets. The Top-1 error rate refers to the percentage of test images for which the correct label is not the top predicted label, while the Top-5 error rate refers to the percentage of test images for which the correct label is not within the top five predicted labels.\n\nKey figures from the table include:\n- The SIFT + FVs model achieved a Top-5 test error rate of 26.2%.\n- The 1 CNN model achieved a Top-1 validati

In [173]:
def create_vector_store(docs, persist_directory="db2/chroma_db"):
    """ Create and persist ChromaDB vector store""" 
    print("Creating embeddings and storing in ChromaDB...")
    embedding_model = OpenAIEmbeddings(model= "text-embedding-3-small")

    print("--Creating vector store ---")
    vectorstore = Chroma.from_documents (
        documents= docs,
        embedding= embedding_model,
        persist_directory=persist_directory,
        collection_metadata = {"hnsw:space" : "cosine"}
    )

    print("Finished Creating vector store")
    print(f"vector store creted and stored to {persist_directory}")
    return vectorstore


In [174]:
db = create_vector_store(docs1)

Creating embeddings and storing in ChromaDB...
--Creating vector store ---
Finished Creating vector store
vector store creted and stored to db2/chroma_db


In [175]:
def generate_final_answer(chunks,query):
    """ generate final answer to the query """
    try: 
        llm = ChatOpenAI(model="gpt-4o", temperature=0)
        prompt_text = f"""You are an expert assistant. Your job is to read the provided documents 
        (text, tables, and images if available) and give a clear, accurate, and well-structured 
        answer to the user query.

        USER QUERY:
        {query}

        CONTENT TO ANALYZE:""" 
        for i,chunk in enumerate(chunks):
            
            prompt_text += f"---Document{i+1} --- \n"
            raw_text=json.loads(chunks[i].metadata["original content"])["raw_text"]
            #print(f"CHUNK NO:{i}\n RAW TEXT:{raw_text}")
            prompt_text+= f"TEXT:\n{raw_text}\n\n"
            
            #print(json.loads(chunks[i].metadata["original_content"])["tables_html"])
            #original_data = json.loads(chunks[i].metadata["original_content"])["tables_html"]
            
            if json.loads(chunks[i].metadata["original content"])["tables_html"]:
                prompt_text += "TABLES: \n"
                for j,table in enumerate(json.loads(chunks[i].metadata["original content"])["tables_html"]):
                    prompt_text +=f"Tables {j+1}: \n {table} \n \n"
            prompt_text += "\n"
            prompt_text += """INSTRUCTIONS:
            1. Use the documents as your main source of truth.
            2. If images are provided, describe what they show and connect it with the text.
            3. Summarize and explain clearly, avoid copying raw text.
            4. Be concise but thorough.
            5. If data is not provided in the retrieved documents the just say that I dont have enough information
            
            FINAL ANSWER:""" 
            
        message_content = [{"type":"text","text": prompt_text}]
        for chunk in chunks:
            if "original content"in chunk.metadata:
                original_data= json.loads(chunk.metadata["original content"])
                images_base64 = original_data.get("images_base64", [])
                for images_base64 in images_base64:
                    message_content.append({
                        "type": "image_url",
                        "image_url" : {"url":  f"data:image/jpeg;base64,{images_base64}"}
                    })
        print(message_content)
        message = HumanMessage(content=message_content)
        response = llm.invoke([message])
        return response.content

    except Exception as e:
        print(f"answer generation failed ")
        return "Problem occured, retry" 


In [176]:
chunks[0]

<unstructured.documents.elements.CompositeElement at 0x7fc81b406210>

In [177]:
print("------------------STARTING THE PIPELINE------------------")
file_path = "imagenet.pdf"
elements=partition_document(file_path)
print("------------------CHUNKING BY TITLE------------------")
chunks=create_chunks_by_title(elements)
print("------------------CREATING SUMMARY FOR MULTIMODAL DATA------------------")
docs1=summarize_chunks(chunks)
print("------------------CREATING VECTOR STORE------------------")
db = create_vector_store(docs1)

------------------STARTING THE PIPELINE------------------
Extracted 137 elements
------------------CHUNKING BY TITLE------------------
Created 19 chunks
------------------CREATING SUMMARY FOR MULTIMODAL DATA------------------
Processing chunk 1/19
Types found: ['text']
Using raw text
Processing chunk 2/19
Types found: ['text']
Using raw text
Processing chunk 3/19
Types found: ['text']
Using raw text
Processing chunk 4/19
Types found: ['text']
Using raw text
Processing chunk 5/19
Types found: ['text', 'image']
Creating AI summary for this
Summary:

The architecture of the discussed neural network consists of eight learned layers, including five convolutional and three fully-connected layers. A significant feature of this architecture is the use of Rectified Linear Units (ReLUs) as the nonlinearity function for neurons. Unlike traditional saturating nonlinearities such as tanh or sigmoid functions, ReLUs are non-saturating and allow for much faster training times when using gradient desc

In [180]:
query = "what does this model give results so much better than any other approaches" 
retriever = db.as_retriever(search_kwargs={"k":3})
chunks = retriever.invoke(query)
print("---------------------RETRIEVAL DONE-------------------")
generate_final_answer(chunks,query)

---------------------RETRIEVAL DONE-------------------
[{'type': 'text', 'text': 'You are an expert assistant. Your job is to read the provided documents \n        (text, tables, and images if available) and give a clear, accurate, and well-structured \n        answer to the user query.\n\n        USER QUERY:\n        what does this model give results so much better than any other approaches\n\n        CONTENT TO ANALYZE:---Document1 --- \nTEXT:\nModel SIFT + FVs [7] 1 CNN 5 CNNs 1 CNN* 7 CNNs* Top-1 (val) Top-5 (val) Top-5 (test) — — 26.2% 40.7% 18.2% — 38.1% 16.4% 16.4% 39.0% 16.6% — 36.7% 15.4% 15.3%\n\nTable 2: Comparison of error rates on ILSVRC-2012 validation and test sets. In italics are best results achieved by others. Models with an asterisk* were “pre-trained” to classify the entire ImageNet 2011 Fall release. See Section 6 for details.\n\n40.9%, attained by the net described above but with an additional, sixth convolutional layer over the last pooling layer. The best publis

"The model described in the documents achieves superior results compared to other approaches primarily due to the use of advanced techniques such as dropout and the architecture of the neural network.\n\n1. **Dropout Technique**: Dropout is a regularization method that helps prevent overfitting in neural networks. By randomly setting the output of each hidden neuron to zero with a probability of 0.5 during training, the network effectively samples different architectures. This forces the network to learn more robust features that are not reliant on specific neurons, thus improving generalization. At test time, all neurons are used, but their outputs are scaled down, which approximates the geometric mean of the predictions from the various dropout networks. This technique allows the model to reduce test errors efficiently without the computational cost of training multiple separate models.\n\n2. **Network Architecture**: The model's architecture, which includes multiple convolutional la