# Importing Packages

# Loading Clip Model

In [20]:
!pip install PyMuPDF
!pip install langchain-community
!pip install faiss-cpu
!pip install langchain-google-genai



In [21]:

import fitz
from langchain_core.documents import Document
from transformers import CLIPProcessor , CLIPModel #This is used to for summarizing both images & texts ( Contrast Languaeg Image Pre Training )
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64 #helps to store the image into this format
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

import requests


In [22]:
# Initialize CLIP model for unified embedings

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") #train both image & text data from PDF
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") #ensures the format & tokensied remains same from the above model's format
clip_model.eval()


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

# Embedding Functions

In [23]:
# Create an embedding function for Images

def embedd_image(image_data) :

    """Embedd Image using CLIP """

    if isinstance(image_data, str) : #if given as path
        image = Image.open(image_data).convert("RGB")
    else : # if given as PIL Image only
        image = image_data

    inputs = clip_processor(images=image , return_tensors="pt" ) # coverts entire format to tensors ( DL concpet )

    with torch.no_grad() : # disbaled to save memory
        features = clip_model.get_image_features(**inputs)

        #Normalizing the embeddings to unit vector ( covert all different dimensions to a unit vector )
        features = features / features.norm(dim = 1, keepdim = True)
        return features.squeeze().cpu().numpy()

In [24]:
# Create an embedding function for Text

def embedd_text(text) :

    """Embedd Text using CLIP"""

    inputs = clip_processor(
        text = text ,
        return_tensors="pt",
        padding = True ,
        truncation = True ,
        max_length = 77 # CLIP max token length
    )

    with torch.no_grad() :
        features = clip_model.get_text_features(**inputs)

        #Normalizing the embeddings to unit vector ( covert all different dimensions to a unit vector )
        features = features / features.norm(dim = 1, keepdim = True)
        return features.squeeze().cpu().numpy()

# Process PDF

In [25]:


pdf_url = "https://github.com/aasiyasan/Training/blob/d19f63c59d1589f2e002826181d5a705cf7cabc4/content/Test.pdf"
response = requests.get(pdf_url)
response.raise_for_status()  # Raise an exception for bad status codes
pdf_bytes = response.content
doc = fitz.open(stream=pdf_bytes, filetype="pdf")

In [26]:
# Sotring for all documents and embeddings

all_docs = []
all_embeddings = []
image_data_store = {}

In [27]:
# Text Splitter

splitter = RecursiveCharacterTextSplitter(chunk_size = 1000 , chunk_overlap = 500 )

In [28]:
doc

Document('None', <memory, doc# 2>)

In [29]:
for i , page in enumerate(doc) :
    # process text
    text = page.get_text()

    if text.strip():
        # creat a temp doc for splitting
        temp_doc = Document(page_content = text , metadata={"page":i , "type":text})
        text_chunks = splitter.split_documents([temp_doc])

        # Embed each text converted chunk using CLIP
        for chunk in text_chunks:
            embedding = embedd_text(chunk.page_content) #created function
            all_embeddings.append(embedding)
            all_docs.append(chunk)

    # Process Images

    # 3 Important Actions -
    # i. Convert PDF Image to PIL format
    # ii. Store as base64 for Gemini Multimodal LLM ( should be in base 64 )
    # iii. Create CLIP embedding for retirval

    for img_index , img in enumerate(page.get_images(full = True )): # get all image info
        try :
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Converting to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Creating a unique Identifier
            image_id = f"page_{i}_img_{img_index}"

            # Store Image as base64 for later use while using LLM
            buffered = io.BytesIO()
            pil_image.save(buffered , format = "PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64

            # Embed image using CLIP
            embedding = embedd_image(pil_image)
            all_embeddings.append(embedding)

            #Creating document of image
            image_doc = Document(
                page_content=f"Image:{image_id}",
                metadata ={"page" : i , "type":"image","image_id" : image_id}
            )
            all_docs.append(image_doc)

        except Exception as e :
            print(f"Error processing image {img_index} on page {i} : {e}")

doc.close()

In [30]:
# Check the stored embeddings

all_embeddings

[array([-5.63157238e-02, -2.18117665e-02, -3.70099698e-03,  9.33307409e-03,
        -1.05965734e-02,  3.64540592e-02, -3.25424783e-02,  7.30444863e-02,
         7.95402601e-02, -3.47262919e-02, -1.60291381e-02,  8.19601957e-03,
         4.39712293e-02, -5.93298301e-03,  3.14675197e-02, -1.82229280e-02,
         4.19200025e-03,  3.95009555e-02, -2.79736388e-02,  2.36383802e-03,
         3.10228448e-02,  2.20747925e-02, -3.22238691e-02,  2.23418064e-02,
         1.69801638e-02,  4.14809883e-02,  7.03572761e-03, -1.88205828e-04,
         2.55748164e-02, -1.72168929e-02,  1.48795061e-02,  3.27449925e-02,
         5.18383086e-02,  3.52716036e-02,  1.79413264e-03, -5.52727729e-02,
        -6.82717487e-02,  1.61367860e-02,  5.57496771e-02, -5.82980774e-02,
        -4.03989814e-02, -2.68714298e-02, -1.04568368e-02, -2.54366491e-02,
        -1.53195346e-02,  1.42854750e-02,  1.27739608e-02,  2.47442219e-02,
        -4.30880301e-02,  3.13887000e-03, -6.38356507e-02,  1.03787920e-02,
        -1.6

In [31]:
all_docs

[Document(metadata={'page': 0, 'type': 'Skip to content\n Toggle navigation\nNavigation Menu\nSign in\n Appearance settings\n•  Platform \n○  \nGitHub Copilot\nWrite better code with AI\n○  \nGitHub Spark New\nBuild and deploy intelligent apps\n○  \nGitHub Models New\nManage and compare prompts\n○  \nGitHub Advanced Security\nFind and fix vulnerabilities\n○  \nActions\nAutomate any workflow\n○  \nCodespaces\nInstant dev environments\n○  \nIssues\n'}, page_content='Skip to content\n Toggle navigation\nNavigation Menu\nSign in\n Appearance settings\n•  Platform \n○  \nGitHub Copilot\nWrite better code with AI\n○  \nGitHub Spark New\nBuild and deploy intelligent apps\n○  \nGitHub Models New\nManage and compare prompts\n○  \nGitHub Advanced Security\nFind and fix vulnerabilities\n○  \nActions\nAutomate any workflow\n○  \nCodespaces\nInstant dev environments\n○  \nIssues'),
 Document(metadata={'page': 1, 'type': 'Plan and track work\n○  \nCode Review\nManage code changes\n○  \nDiscussions\n

# Create a Vector Store ( FAISS )

In [32]:
# Create a unified FAISS vector sotre with CLIP embedings
embeddings_array = np.array(all_embeddings)

embeddings_array

array([[-0.05631572, -0.02181177, -0.003701  , ...,  0.01361988,
         0.05326267,  0.01115071],
       [-0.04103215, -0.01421319, -0.02995664, ...,  0.03456077,
         0.01062735, -0.00336789],
       [-0.00323676, -0.0149697 ,  0.01419608, ..., -0.02760404,
        -0.00142296, -0.02829592],
       ...,
       [-0.01657336,  0.03052791, -0.01159491, ...,  0.04026908,
        -0.01045963,  0.01382174],
       [-0.0308239 , -0.01514191,  0.00127925, ..., -0.08419829,
        -0.02160759,  0.01432139],
       [-0.00019071, -0.04849665, -0.03532244, ..., -0.05563572,
        -0.01818162,  0.02976229]], dtype=float32)

In [33]:
(all_docs,embeddings_array)

([Document(metadata={'page': 0, 'type': 'Skip to content\n Toggle navigation\nNavigation Menu\nSign in\n Appearance settings\n•  Platform \n○  \nGitHub Copilot\nWrite better code with AI\n○  \nGitHub Spark New\nBuild and deploy intelligent apps\n○  \nGitHub Models New\nManage and compare prompts\n○  \nGitHub Advanced Security\nFind and fix vulnerabilities\n○  \nActions\nAutomate any workflow\n○  \nCodespaces\nInstant dev environments\n○  \nIssues\n'}, page_content='Skip to content\n Toggle navigation\nNavigation Menu\nSign in\n Appearance settings\n•  Platform \n○  \nGitHub Copilot\nWrite better code with AI\n○  \nGitHub Spark New\nBuild and deploy intelligent apps\n○  \nGitHub Models New\nManage and compare prompts\n○  \nGitHub Advanced Security\nFind and fix vulnerabilities\n○  \nActions\nAutomate any workflow\n○  \nCodespaces\nInstant dev environments\n○  \nIssues'),
  Document(metadata={'page': 1, 'type': 'Plan and track work\n○  \nCode Review\nManage code changes\n○  \nDiscussions

In [34]:
# Create custom FAISS Index as we have pre computed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content,emb) for doc , emb in zip(all_docs , embeddings_array)],
    embedding=None,
    metadatas=[doc.metadata for doc in all_docs]
)

vector_store



<langchain_community.vectorstores.faiss.FAISS at 0x7b4ab87ace30>

# Initalizing LLM

In [35]:
# Using Gemini API

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash" ,api_key="AIzaSyAC3y6fywJd97DhsWAjngCwwj7VUxHyKHM")

In [36]:
llm

ChatGoogleGenerativeAI(model='models/gemini-2.5-flash', google_api_key=SecretStr('**********'), client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x7b4a2338acc0>, default_metadata=(), model_kwargs={})

# Simple Retrival Pipline

In [37]:
# Use a method to retrieve the data from vector

def retrieve_multimodal ( query , k = 5 ):

    """Unified retreival using CLIP embeddings for both text & images"""

    # Embed query using CLIP
    query_embedding = embedd_text(query)

    # Search the answer similar to the query and then return
    results = vector_store.similarity_search_by_vector( # using by vector although their are other ways for this
        embedding=query_embedding,
        k=k
    )

    return results

In [38]:
def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text & images for LLM"""

    content = []

    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })

    # Separate Text & Image Docs
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]

    # Add the text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text Excerpts:\n{text_context}\n"
        })

    # Add the image context
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })

    # Add instruction
    content.append({
        "type": "text",
        "text": (
            "\n\nYou must answer based only on the provided text and images. "
            "If the question is outside this context, politely inform the user "
            "that you can only answer based on the supplied materials."
        )
    })

    return HumanMessage(content=content)


In [39]:
def multimodal_pdf_rag_pipeline(query):

    """Main pipeline for Multimodal RAG"""

    # Retrieve relevant docs
    context_docs = retrieve_multimodal(query,k=5)

    # Create a multimodal message
    message = create_multimodal_message(query,context_docs)

    # Recieve response from LLM
    response = llm.invoke([message])

    #Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents ")
    for doc in context_docs:
        doc_type = doc.metadata.get("type","unknown")
        page = doc.metadata.get("page","?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}:{preview}")

        else:
            print(f"  - Image from page {page}")
    print('\n')

    return response.content


In [40]:
if __name__ == "__main__":

    queries = [
        "What is Agentic AI ?"
    ]

    for query in queries :
        print(f"\n Query : {query}")
        print("-" * 150)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer:{answer}")
        print("=" * 170)


 Query : What is Agentic AI ?
------------------------------------------------------------------------------------------------------------------------------------------------------


ChatGoogleGenerativeAIError: Invalid argument provided to Gemini: 400 API key expired. Please renew the API key. [reason: "API_KEY_INVALID"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "API key expired. Please renew the API key."
]