In [None]:
!pip install faiss-cpu

In [None]:
!pip install langchain_community

In [3]:
import torch
from transformers import AutoProcessor, AutoModel
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from PIL import Image
import os

In [6]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Downloading pypdf-5.4.0-py3-none-any.whl (302 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/302.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.3/302.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.4.0


In [11]:
class MultimodalRAG:
    def __init__(self, pdf_path):
        self.processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.vision_model = AutoModel.from_pretrained("openai/clip-vit-base-patch32")
        self.text_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

        # Load PDF and split into chunks
        self.pdf_path = pdf_path
        self.documents = self.load_pdf(pdf_path)

        # Create vector store from documents
        self.vector_store = FAISS.from_documents(self.documents, self.text_embeddings)

        try:
            # Initialize LLM for generation
            self.llm = HuggingFacePipeline.from_model_id(
                model_id="google/flan-t5-large",
                task="text2text-generation",
                model_kwargs={"temperature": 0.7, "max_length": 512}
            )
        except:
            # Fallback to a different model if flan-t5 fails
            from langchain.llms import OpenAI
            self.llm = OpenAI(temperature=0.7)

        # Create retriever
        self.retriever = self.vector_store.as_retriever(search_kwargs={"k": 2})

        # Initialize QA chain
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.retriever,
            return_source_documents=True
        )

    def load_pdf(self, pdf_path):
        """Load PDF file and split into chunks."""
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Load PDF
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()

        # Split into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        return text_splitter.split_documents(documents)

    def process_image(self, image_path):
        """Extract features from an image."""
        if not os.path.exists(image_path):
            print(f"Warning: Image path {image_path} does not exist")
            return None

        image = Image.open(image_path)
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            image_features = self.vision_model.get_image_features(**inputs)
        return image_features

    def retrieve_related_documents(self, query_text, image_path=None):
        """Retrieve documents based on text query and optional image."""
        if image_path:
            image_features = self.process_image(image_path)

            if image_features is not None:
                # Get image description and key objects
                image_query = self.generate_image_description(image_features)

                # Combine text query with image-derived information
                enhanced_query = f"{query_text} {image_query}"
            else:
                enhanced_query = query_text
        else:
            enhanced_query = query_text

        # Retrieve documents
        docs = self.retriever.get_relevant_documents(enhanced_query)
        return docs

    def generate_image_description(self, image_features):
        """Generate text description from image features."""
        # This would typically use a vision-language model
        # Simplified implementation for example
        return "a product photo showing a smartphone with technical specifications visible"

    def answer_query(self, query_text, image_path=None):
        """Answer a query using RAG, incorporating image context if provided."""
        if image_path:
            print(f"Processing query with text and image from {image_path}")
            docs = self.retrieve_related_documents(query_text, image_path)
        else:
            print("Processing text-only query")
            docs = self.retrieve_related_documents(query_text)

        # Generate answer using retrieved context
        result = self.qa_chain({"query": query_text})
        return {
            "query": query_text,
            "answer": result["result"],
            "source_documents": result["source_documents"]
        }

In [12]:
pdf_path = "example.pdf"
rag_system = MultimodalRAG(pdf_path)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


In [13]:
text_result = rag_system.answer_query("What are the battery specifications of the latest iPhone?")
print(f"Answer: {text_result['answer']}")

Processing text-only query


  docs = self.retriever.get_relevant_documents(enhanced_query)
  result = self.qa_chain({"query": query_text})


Answer: 4,000 mAh


In [14]:
sample_image_path = "samsung.jpg"
if os.path.exists(sample_image_path):
    multimodal_result = rag_system.answer_query(
        "Can you tell me more about this device?",
        image_path=sample_image_path
    )
    print(f"Answer: {multimodal_result['answer']}")
else:
    print(f"Skipping multimodal query as {sample_image_path} does not exist")

Processing query with text and image from samsung.jpg
Answer: The Samsung Galaxy S24 features a 6.2-inch Dynamic AMOLED display with 120
