# Importing Packages

In [None]:
import fitz 
from langchain_core.documents import Document
from transformers import CLIPProcessor , CLIPModel #This is used to for summarizing both images & texts ( Contrast Languaeg Image Pre Training )
from PIL import Image
import torch 
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64 #helps to store the image into this format
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage


# Loading Clip Model 

In [15]:
# Initialize CLIP model for unified embedings

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") #train both image & text data from PDF
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") #ensures the format & tokensied remains same from the above model's format
clip_model.eval()


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

# Embedding Functions

In [None]:
# Create an embedding function for Images 

def embedd_image(image_data) :

    """Embedd Image using CLIP """
    
    if isinstance(image_data, str) : #if given as path
        image = Image.open(image_data).convert("RGB")
    else : # if given as PIL Image only
        image = image_data

    inputs = clip_processor(images=image , return_tensors="pt" ) # coverts entire format to tensors ( DL concpet )

    with torch.no_grad() : # disbaled to save memory
        features = clip_model.get_image_features(**inputs)

        #Normalizing the embeddings to unit vector ( covert all different dimensions to a unit vector )
        features = features / features.norm(dim = 1, keepdim = True)
        return features.squeeze().cpu().numpy()

In [17]:
# Create an embedding function for Text 

def embedd_text(text) :

    """Embedd Text using CLIP"""

    inputs = clip_processor(
        text = text ,
        return_tensors="pt",
        padding = True , 
        truncation = True ,
        max_length = 77 # CLIP max token length
    )

    with torch.no_grad() :
        features = clip_model.get_text_features(**inputs)

        #Normalizing the embeddings to unit vector ( covert all different dimensions to a unit vector )
        features = features / features.norm(dim = 1, keepdim = True)
        return features.squeeze().cpu().numpy()

# Process PDF

In [None]:
pdf_path = r"C:\Users\Sunay\Desktop\agentic-ai-the-new-frontier-in-genai-an-executive-playbook"
doc = fitz.open(pdf_path)

FileNotFoundError: no such file: 'C:\Users\Sunay\Desktop\agentic-ai-the-new-frontier-in-genai-an-executive-playbook'

In [19]:
# Sotring for all documents and embeddings 

all_docs = []
all_embeddings = []
image_data_store = {}

In [20]:
# Text Splitter 

splitter = RecursiveCharacterTextSplitter(chunk_size = 1000 , chunk_overlap = 500 )

In [21]:
doc

Document('C:\Users\Sunay\Desktop\Ebook-Agentic-AI.pdf')

In [22]:
for i , page in enumerate(doc) :
    # process text 
    text = page.get_text()

    if text.strip():
        # creat a temp doc for splitting 
        temp_doc = Document(page_content = text , metadata={"page":i , "type":text})
        text_chunks = splitter.split_documents([temp_doc])

        # Embed each text converted chunk using CLIP 
        for chunk in text_chunks:
            embedding = embedd_text(chunk.page_content) #created function
            all_embeddings.append(embedding)
            all_docs.append(chunk)

    # Process Images

    # 3 Important Actions - 
    # i. Convert PDF Image to PIL format
    # ii. Store as base64 for Gemini Multimodal LLM ( should be in base 64 )
    # iii. Create CLIP embedding for retirval

    for img_index , img in enumerate(page.get_images(full = True )): # get all image info
        try :
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Converting to PIL Image 
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Creating a unique Identifier
            image_id = f"page_{i}_img_{img_index}"

            # Store Image as base64 for later use while using LLM
            buffered = io.BytesIO()
            pil_image.save(buffered , format = "PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64

            # Embed image using CLIP 
            embedding = embedd_image(pil_image)
            all_embeddings.append(embedding)

            #Creating document of image 
            image_doc = Document(
                page_content=f"Image:{image_id}",
                metadata ={"page" : i , "type":"image","image_id" : image_id}
            ) 
            all_docs.append(image_doc)

        except Exception as e :
            print(f"Error processing image {img_index} on page {i} : {e}")

doc.close()

In [23]:
# Check the stored embeddings 

all_embeddings

[array([ 1.18476572e-02,  4.59373463e-03, -4.22904901e-02, -1.94893274e-02,
         4.47038114e-02,  6.25370350e-03,  2.55214144e-02,  1.00744632e-03,
        -2.42595375e-03, -5.78762731e-03,  4.14185859e-02,  2.68254392e-02,
         5.69887981e-02,  3.13153937e-02, -8.93720612e-03,  1.19905896e-03,
         2.97153629e-02,  2.87660770e-02, -1.60202887e-02,  1.59778073e-03,
         2.11583171e-02, -4.59142542e-03, -7.98687898e-03,  3.35340053e-02,
        -3.41327935e-02, -2.38303750e-04,  2.65449751e-02, -1.54065592e-02,
        -9.63775255e-03, -2.16407496e-02,  4.70372848e-02, -5.32204844e-03,
        -1.75524596e-02, -3.42158899e-02, -3.18505010e-03, -8.50727875e-03,
         4.21676189e-02, -2.79438496e-02, -1.26879718e-02, -1.68718338e-01,
        -2.88882554e-02, -1.04462234e-02,  2.55879164e-02, -1.60045139e-02,
        -8.48841667e-03,  2.61841994e-02, -3.21562178e-02, -1.86241530e-02,
        -2.08509266e-02,  7.02025145e-02,  3.81805487e-02, -9.42917541e-03,
        -1.2

In [24]:
all_docs

[Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='Image:page_0_img_0'),
 Document(metadata={'page': 1, 'type': 'Konverge AI is a decision science firm empowering \nbusinesses with the transformative power of AI. \nOperating at the intersection of data, machine \nlearning (ML) models, and business insights, we \nhelp \norganizations \ndevelop \ncutting-edge \nAI \nproducts and solutions. \nThis book provides actionable insights into Agentic \nAI, combining Konverge AI’s expertise with \ncontributions from Emergence AI. Emergence AI \nshared its deep knowledge in autonomous \nmulti-agent orchestration, addressing challenges \nlike outdated systems, complex processes, and \nregulatory compliance. Their solutions enhance \nadaptability and optimize operations.\nBy merging these perspectives, this book is a \npractical guide to leveraging Agentic AI, helping \nbusinesses navigate and excel in a dynamic world.\n'}, page_content='Konverge AI is a decis

# Create a Vector Store ( FAISS )

In [25]:
# Create a unified FAISS vector sotre with CLIP embedings 
embeddings_array = np.array(all_embeddings)

embeddings_array

array([[ 0.01184766,  0.00459373, -0.04229049, ..., -0.0114922 ,
         0.00675914, -0.01442797],
       [ 0.03029755,  0.0358094 , -0.0191733 , ...,  0.02621511,
        -0.00536314, -0.00361416],
       [ 0.01432921, -0.01166013, -0.00558024, ...,  0.01908205,
         0.00054203, -0.00718972],
       ...,
       [ 0.00185617,  0.01055579, -0.04615432, ...,  0.06195967,
         0.0046137 , -0.06365831],
       [ 0.04012128, -0.01114862,  0.01027755, ...,  0.00445096,
        -0.00258538,  0.01261639],
       [ 0.00611752, -0.00572225,  0.00584129, ...,  0.0925255 ,
        -0.04443743, -0.00094634]], shape=(194, 512), dtype=float32)

In [26]:
(all_docs,embeddings_array)

([Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='Image:page_0_img_0'),
  Document(metadata={'page': 1, 'type': 'Konverge AI is a decision science firm empowering \nbusinesses with the transformative power of AI. \nOperating at the intersection of data, machine \nlearning (ML) models, and business insights, we \nhelp \norganizations \ndevelop \ncutting-edge \nAI \nproducts and solutions. \nThis book provides actionable insights into Agentic \nAI, combining Konverge AI’s expertise with \ncontributions from Emergence AI. Emergence AI \nshared its deep knowledge in autonomous \nmulti-agent orchestration, addressing challenges \nlike outdated systems, complex processes, and \nregulatory compliance. Their solutions enhance \nadaptability and optimize operations.\nBy merging these perspectives, this book is a \npractical guide to leveraging Agentic AI, helping \nbusinesses navigate and excel in a dynamic world.\n'}, page_content='Konverge AI is a dec

In [28]:
# Create custom FAISS Index as we have pre computed embeddings 
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content,emb) for doc , emb in zip(all_docs , embeddings_array)],
    embedding=None,
    metadatas=[doc.metadata for doc in all_docs]
)

vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x15d369f17f0>

# Initalizing LLM

In [None]:
# Using Gemini API 

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash" ,api_key="AIzaSyAC3y6fywJd97DhsWAjngCwwj7VUxHyKHM")

In [38]:
llm

ChatGoogleGenerativeAI(model='models/gemini-2.5-flash', google_api_key=SecretStr('**********'), client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x0000015D0F3ABB10>, default_metadata=(), model_kwargs={})

# Simple Retrival Pipline

In [None]:
# Use a method to retrieve the data from vector 

def retrieve_multimodal ( query , k = 5 ):

    """Unified retreival using CLIP embeddings for both text & images"""

    # Embed query using CLIP 
    query_embedding = embedd_text(query)

    # Search the answer similar to the query and then return 
    results = vector_store.similarity_search_by_vector( # using by vector although their are other ways for this
        embedding=query_embedding,
        k=k
    )

    return results

In [43]:
def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text & images for LLM"""

    content = []

    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })

    # Separate Text & Image Docs
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]

    # Add the text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text Excerpts:\n{text_context}\n"
        })

    # Add the image context
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })

    # Add instruction
    content.append({
        "type": "text",
        "text": (
            "\n\nYou must answer based only on the provided text and images. "
            "If the question is outside this context, politely inform the user "
            "that you can only answer based on the supplied materials."
        )
    })

    return HumanMessage(content=content)


In [49]:
def multimodal_pdf_rag_pipeline(query):
    
    """Main pipeline for Multimodal RAG"""

    # Retrieve relevant docs
    context_docs = retrieve_multimodal(query,k=5)

    # Create a multimodal message
    message = create_multimodal_message(query,context_docs)

    # Recieve response from LLM
    response = llm.invoke([message])

    #Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents ")
    for doc in context_docs:
        doc_type = doc.metadata.get("type","unknown")
        page = doc.metadata.get("page","?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}:{preview}")
        
        else:
            print(f"  - Image from page {page}")
    print('\n')

    return response.content


In [51]:
if __name__ == "__main__":

    queries = [
        "What is Agentic AI ?"
    ]

    for query in queries :
        print(f"\n Query : {query}")
        print("-" * 150)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer:{answer}")
        print("=" * 170)


 Query : What is Agentic AI ?
------------------------------------------------------------------------------------------------------------------------------------------------------

Retrieved 5 documents 
  - Image from page 2
  - Image from page 6
  - Image from page 16
  - Image from page 15
  - Image from page 8


Answer:Based on the provided text and images, the definition of "Agentic AI" is not included in the supplied materials. Therefore, I cannot answer what Agentic AI is using only the information given.
