# Multimodal RAG with Multimodal LangChain

## Setup

In [2]:
# %% Cell 1: Imports & Utilities

import os
import json
from PIL import Image

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

def load_json_file(path: str):
    """Parse the UTF-8 encoded JSON file at `path` and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)


In [3]:
# %% Cell 2: API Key
# Keep an already-exported OPENAI_API_KEY; otherwise install the "your-key" placeholder.
os.environ.setdefault("OPENAI_API_KEY", "your-key")


In [4]:
# %% Cell 3: Embeddings + VectorStore
# Embedding client, keyed from the OPENAI_API_KEY environment variable.
embeddings = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)
# Chroma store whose persist_directory points at ./chroma_db.
vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)


  embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
  vectorstore = Chroma(


In [5]:
# %% Cell 4: Retriever
# Retriever view over the vector store; k=3 is forwarded as a search keyword argument.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))


In [6]:
# %% Cell 5: LLM Inference Client
# Chat client for gpt-4 at temperature 0.0, keyed from the OPENAI_API_KEY environment variable.
chat = ChatOpenAI(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model_name="gpt-4",
    temperature=0.0,
)

def run_lvml_inference(prompt: str, image_path: str = None) -> str:
    """
    Send `prompt` to the module-level `chat` client and return the reply text.

    Args:
        prompt: Text sent to the model as a single HumanMessage.
        image_path: Optional path to an image. It is accepted so callers can
            pass the retrieved frame, but the body does not read it yet; the
            commented hook below sketches how a caption could be prepended.

    Returns:
        The `content` attribute of the chat model's response.
    """
    # Example caption hook (not wired in):
    # if image_path:
    #     caption = your_image_captioner(image_path)
    #     prompt = f"{caption}\n\n{prompt}"

    msgs = [HumanMessage(content=prompt)]
    # Use `invoke` rather than calling the model object directly: the direct
    # call (`chat(msgs)`) is the deprecated path in recent LangChain releases.
    response = chat.invoke(msgs)
    return response.content


  chat = ChatOpenAI(


In [7]:
# %% Cell 6: Retrieval Function
def retrieve_video_segment(query: str) -> dict:
    """
    Look up `query` through `retriever_module` and describe the top-ranked hit.

    Returns an empty dict when nothing is retrieved. Otherwise returns a dict
    with the keys video_path, extracted_frame, transcript and mid_time_ms,
    filled from the first document's metadata (absent entries come back as None).
    """
    # NOTE(review): `retriever_module` is a notebook-level global resolved when
    # this function is called; it has to be created before the first call.
    hits = retriever_module.get_relevant_documents(query)
    if hits:
        top_meta = hits[0].metadata
        # Output key -> metadata key it is read from.
        # NOTE(review): mid_time_ms is read from "start_time" - confirm intended.
        source_keys = {
            "video_path": "video_path",
            "extracted_frame": "frame_path",
            "transcript": "transcript",
            "mid_time_ms": "start_time",
        }
        return {out_key: top_meta.get(meta_key) for out_key, meta_key in source_keys.items()}
    return {}


In [8]:
import os
import json
from PIL import Image

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

# Utility loader

def load_json_file(path: str):
    """Return the object decoded from the UTF-8 JSON file located at `path`."""
    with open(path, mode="r", encoding="utf-8") as source:
        text = source.read()
    return json.loads(text)

# 1) Set your API key
# Only install the "your-key" placeholder when no key is exported already, so a
# real OPENAI_API_KEY from the environment is never overwritten by this cell.
os.environ.setdefault("OPENAI_API_KEY", "your-key")

# 2) Create embeddings and vector store
embeddings = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)
vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)

# 3) Retriever module (k=1 is forwarded as a search keyword argument)
retriever_module = vectorstore.as_retriever(search_kwargs=dict(k=1))

# 4) Inference client
chat = ChatOpenAI(temperature=0.0, model_name="gpt-4")

def run_lvml_inference(prompt: str, image_path: str = None) -> str:
    """
    Run the module-level `chat` model on `prompt` and return the reply text.

    Args:
        prompt: Text sent to the model as a single HumanMessage.
        image_path: Optional; defaults to None. It is accepted so callers can
            pass the retrieved frame path, but the body does not read it yet.

    Returns:
        The `content` attribute of the model's response.
    """
    msgs = [HumanMessage(content=prompt)]
    # Use `invoke` rather than calling the model object directly: the direct
    # call (`chat(msgs)`) is the deprecated path in recent LangChain releases.
    resp = chat.invoke(msgs)
    return resp.content

# 5) Retrieval wrapper

def retrieve_video_segment(query: str):
    """
    Retrieve the top document for `query` and return selected metadata fields.

    Args:
        query: Text passed to `retriever_module.get_relevant_documents`.

    Returns:
        An empty dict when nothing is retrieved. Otherwise a dict with the keys
        video_path, extracted_frame, transcript and mid_time_ms, read from the
        first document's metadata (absent entries come back as None).
    """
    docs = retriever_module.get_relevant_documents(query)
    # Guard against an empty result instead of failing on docs[0]; this matches
    # the earlier definition of this function in the notebook.
    if not docs:
        return {}
    meta = docs[0].metadata
    return {
        "video_path":       meta.get("video_path"),
        "extracted_frame":  meta.get("frame_path"),
        "transcript":       meta.get("transcript"),
        # NOTE(review): mid_time_ms is read from "start_time" - confirm intended.
        "mid_time_ms":      meta.get("start_time")
    }



### Preprocessing

#### Setup vectorstore

In [9]:
# %% Cell X: Configure the Chroma DB (replaces the LanceDB host + table)
# Directory passed to Chroma as its persist_directory
PERSIST_DIR = "./shared_data/chroma_db"

# Name of the collection inside Chroma (formerly the LanceDB table)
COLLECTION_NAME = "test_tbl"
# To fall back to a demo collection instead:
# COLLECTION_NAME = "demo_tbl"

vectorstore = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=PERSIST_DIR,
)


### Retrieval Module
#### Initialize Embedding Model

In [10]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client keyed from the OPENAI_API_KEY environment variable.
embedder = OpenAIEmbeddings(openai_api_key=os.environ.get("OPENAI_API_KEY"))


#### Create Retrieval

In [11]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# 1) "uri" -> directory passed to Chroma as its persist_directory
PERSIST_DIR = "./shared_data/chroma_db"
# 2) "table_name" -> Chroma's collection_name
COLLECTION_NAME = "test_tbl"

# 3) Create the embedder
embedder = OpenAIEmbeddings(openai_api_key=os.environ.get("OPENAI_API_KEY"))

# 4) Initialize Chroma (replaces MultimodalLanceDB)
vectorstore = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embedder,
    persist_directory=PERSIST_DIR,
)

# 5) Create a retriever; k=1 is forwarded as a search keyword argument
retriever_module = vectorstore.as_retriever(search_kwargs=dict(k=1))


#### Invoke Retrieval with User Query

In [14]:
# Bind the user question to `query`: the augmentation cell further down reads
# that name when it builds the prompt, and no other cell assigns it.
query = "What do the astronauts feel about their work?"

retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
# NOTE(review): `retrieve_video_segment_and_metadata` is not defined in any cell
# shown in this notebook - confirm it is defined or imported before running.
data = retrieve_video_segment_and_metadata(query)
print("Transcript:\n", data["transcript"])
print("Frame path:", data["frame_path"])


Transcript:
 The view is always amazing I didn't think I would do another spacewalk and to now have the chance to have done four more was just icing on the cake for a a wonderful mission. Does the 10th one feel like the first one? No, a little more comfortable on the tenth one.
Frame path: ./shared_data/videos/video1/extracted_frame/frame_7.jpg


### LVLM Inference Module

#### Initialize Client and LVLM for Inference

In [15]:
# Option A: alias the wrapper as your LVLM module
lvlm_inference_module = run_lvml_inference

# Option B: alias the raw ChatOpenAI client (less convenient, since it expects HumanMessage lists)
# NOTE: both assignments execute, so after this cell `lvlm_inference_module` is
# bound to `chat`; comment out one of the two options to choose explicitly.
lvlm_inference_module = chat


#### Invoke LVLM Inference with User Query

In [17]:
# %% Cell X: Augment the user query with the retrieved transcript
# Inputs: `data` (result of the retrieve_video_segment_and_metadata call) and
# `query` (the original user question).

augmented_query = f"The transcript associated with the image is '{data['transcript']}'. {query}"
print("Augmented query is:\n", augmented_query)

# Feed the augmented prompt to the ChatOpenAI wrapper, passing the frame path along.
answer = run_lvml_inference(image_path=data["frame_path"], prompt=augmented_query)
print("\nAnswer:\n", answer)


Augmented query is:
 The transcript associated with the image is 'The view is always amazing I didn't think I would do another spacewalk and to now have the chance to have done four more was just icing on the cake for a a wonderful mission. Does the 10th one feel like the first one? No, a little more comfortable on the tenth one.'. What do the astronauts feel about their work?


  resp = chat(msgs)



Answer:
 The astronauts feel amazed by their work and view from space. They also feel comfortable and experienced, especially after multiple spacewalks. They consider additional spacewalks as a bonus to their already wonderful mission.


In [18]:

# Second call through the ChatOpenAI wrapper, reusing the augmented prompt and frame path.
response = run_lvml_inference(image_path=data["frame_path"], prompt=augmented_query)

print('LVLM Response:', response, sep='\n')


LVLM Response:
The astronauts feel amazed by their work and consider it a wonderful mission. They also feel more comfortable with their tasks over time, as indicated by the comparison between their first and tenth spacewalks.


### Prompt Processing Module

In [19]:
def prompt_processing_module(retrieved_results: list, user_query: str) -> dict:
    """
    Build the LVLM prompt from the top retrieved document and the user query.

    Args:
        retrieved_results: Retrieved documents; only the first one is used and
            it must expose a `metadata` mapping.
        user_query: The original question, appended after the transcript sentence.

    Returns:
        A dict with "prompt" (the prompt text) and "image" (the frame path, or
        None when the metadata carries no frame entry).

    Raises:
        IndexError: If `retrieved_results` is empty.
    """
    if len(retrieved_results) == 0:
        raise IndexError("retrieved_results is empty; there is no document to build a prompt from")

    # Only the first result is used.
    meta = retrieved_results[0].metadata
    transcript = meta.get("transcript")

    # The retrieval helpers in this notebook read the frame location from the
    # "frame_path" metadata key, so fall back to it when "extracted_frame" is absent.
    frame_path = meta.get("extracted_frame")
    if frame_path is None:
        frame_path = meta.get("frame_path")

    prompt = (
        f"The transcript associated with the image is '{transcript}'. "
        f"{user_query}"
    )
    return {"prompt": prompt, "image": frame_path}

#### Invoke Prompt Processing Module with Retrieved Results and User Query

### Multimodal RAG

#### Define Multimodal RAG System as a Chain in LangChain

In [20]:
def mm_rag_chain(user_query: str) -> str:
    """
    Answer `user_query` by chaining retrieval, prompt construction and inference.

    Returns the string produced by `run_lvml_inference`, or the fixed message
    "No relevant document found." when retrieval yields nothing.
    """
    retrieved = retriever_module.get_relevant_documents(user_query)
    if retrieved:
        # Turn the retrieved documents into a prompt plus image path, then run the model.
        prompt_bundle = prompt_processing_module(retrieved, user_query)
        return run_lvml_inference(
            prompt=prompt_bundle['prompt'],
            image_path=prompt_bundle['image'],
        )
    return "No relevant document found."

***