<a href="https://colab.research.google.com/github/ayyucedemirbas/RAG_techniques/blob/main/multimodal_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faiss-cpu

In [None]:
!pip install langchain_community

In [3]:
import torch
from transformers import AutoProcessor, AutoModel
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from PIL import Image
import os

In [4]:
class MultimodalRAG:
    def __init__(self):
        self.processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.vision_model = AutoModel.from_pretrained("openai/clip-vit-base-patch32")

        self.text_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

        documents = [
            "The latest iPhone features a 4,000 mAh battery that lasts up to 24 hours on normal usage.",
            "The Samsung Galaxy S24 Ultra comes with a 5,000 mAh battery and 45W fast charging support.",
            "The Google Pixel 8 Pro has a 5,050 mAh battery with wireless charging capabilities.",
            "The iPhone 15 Pro Max dimensions are 159.9 x 76.7 x 8.25 mm and weighs 221 grams.",
            "The Samsung Galaxy S24 features a 6.2-inch Dynamic AMOLED display with 120Hz refresh rate."
        ]

        # Create vector store from documents
        self.vector_store = FAISS.from_texts(documents, self.text_embeddings)

        try:
            # Initialize LLM for generation
            self.llm = HuggingFacePipeline.from_model_id(
                model_id="google/flan-t5-large",
                task="text2text-generation",
                model_kwargs={"temperature": 0.7, "max_length": 512}
            )
        except:
            # Fallback to a different model if flan-t5 fails
            from langchain.llms import OpenAI
            self.llm = OpenAI(temperature=0.7)

        # Create retriever
        self.retriever = self.vector_store.as_retriever(search_kwargs={"k": 2})

        # Initialize QA chain
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.retriever,
            return_source_documents=True
        )

    def process_image(self, image_path):
        """Extract features from an image."""
        if not os.path.exists(image_path):
            print(f"Warning: Image path {image_path} does not exist")
            return None

        image = Image.open(image_path)
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            image_features = self.vision_model.get_image_features(**inputs)
        return image_features

    def retrieve_related_documents(self, query_text, image_path=None):
        """Retrieve documents based on text query and optional image."""
        if image_path:
            image_features = self.process_image(image_path)

            if image_features is not None:
                # Get image description and key objects
                image_query = self.generate_image_description(image_features)

                # Combine text query with image-derived information
                enhanced_query = f"{query_text} {image_query}"
            else:
                enhanced_query = query_text
        else:
            enhanced_query = query_text

        # Retrieve documents
        docs = self.retriever.get_relevant_documents(enhanced_query)
        return docs

    def generate_image_description(self, image_features):
        """Generate text description from image features."""
        # This would typically use a vision-language model
        # Simplified implementation for example
        return "a product photo showing a smartphone with technical specifications visible"

    def answer_query(self, query_text, image_path=None):
        """Answer a query using RAG, incorporating image context if provided."""
        if image_path:
            print(f"Processing query with text and image from {image_path}")
            docs = self.retrieve_related_documents(query_text, image_path)
        else:
            print("Processing text-only query")
            docs = self.retrieve_related_documents(query_text)

        # Generate answer using retrieved context
        result = self.qa_chain({"query": query_text})
        return {
            "query": query_text,
            "answer": result["result"],
            "source_documents": result["source_documents"]
        }

In [5]:
rag_system = MultimodalRAG()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

  self.text_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


In [6]:
text_result = rag_system.answer_query("What are the battery specifications of the latest iPhone?")
print(f"Answer: {text_result['answer']}")

Processing text-only query


  docs = self.retriever.get_relevant_documents(enhanced_query)
  result = self.qa_chain({"query": query_text})


Answer: 4,000 mAh


In [7]:
sample_image_path = "samsung.jpg"
if os.path.exists(sample_image_path):
    multimodal_result = rag_system.answer_query(
        "Can you tell me more about this device?",
        image_path=sample_image_path
    )
    print(f"Answer: {multimodal_result['answer']}")
else:
    print(f"Skipping multimodal query as {sample_image_path} does not exist")

Processing query with text and image from samsung.jpg
Answer: 6.2-inch Dynamic AMOLED display with 120Hz refresh rate
