# Installation

In [None]:
!sudo apt-get install poppler-utils tesseract-ocr libmagic-dev

In [None]:
!pip install -Uq "unstructured[all-docs]" pillow lxml
!pip install -Uq chromadb tiktoken
!pip install -Uq langchain langchain-community langchain-openai langchain-groq

# Data Extraction from research papers.


*   Uses the "unstructured" library to extract text and images from PDFs
*   Reference: https://docs.unstructured.io/



In [None]:
from unstructured.partition.pdf import partition_pdf
from google.colab import files

# Upload one or more PDF files; the keys of the result are used below as the filenames to process
uploaded_files = files.upload()

In [3]:
# Initialize final containers (reset on every run of this cell)
# texts:  CompositeElement chunks collected from all uploaded PDFs
# images: base64 image payloads collected from all uploaded PDFs
texts = []
images = []

# Helper to extract base64 images from chunks
def get_images_base64(chunks):
    """Collect the base64 payload of every image nested inside composite chunks.

    Args:
        chunks: Iterable of partitioned elements. Only elements whose type name
            contains "CompositeElement" are inspected.

    Returns:
        list: ``metadata.image_base64`` of each original element whose type name
        contains "Image", in encounter order.
    """
    def _type_name_has(obj, fragment):
        # Type matching is done on the textual type name, not via isinstance
        return fragment in str(type(obj))

    return [
        element.metadata.image_base64
        for composite in chunks
        if _type_name_has(composite, "CompositeElement")
        for element in composite.metadata.orig_elements
        if _type_name_has(element, "Image")
    ]

# Partition settings shared by every uploaded PDF
partition_options = dict(
    infer_table_structure=True,
    strategy="hi_res",
    extract_image_block_types=["Image"],
    extract_image_block_to_payload=True,
    chunking_strategy="by_title",
    max_characters=4000,
    combine_text_under_n_chars=1000,
    new_after_n_chars=2000,
)

# Partition each uploaded file, then collect its text chunks and image payloads
for filename in uploaded_files:
    print(f"Processing {filename}...")

    chunks = partition_pdf(filename=filename, **partition_options)

    # Keep only the composite (text) chunks
    texts.extend(
        element for element in chunks
        if "CompositeElement" in str(type(element))
    )

    # Keep the base64 payloads of the images embedded in those chunks
    images.extend(get_images_base64(chunks))

Processing 2023-ConceptGraphs.pdf...


yolox_l0.05.onnx:   0%|          | 0.00/217M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/115M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

In [None]:
import base64
from IPython.display import Image, display

def display_base64_image(base64_code):
    """Render a base64-encoded image in the notebook output.

    Args:
        base64_code: Base64 string (or bytes) holding the encoded image data.
    """
    raw_bytes = base64.b64decode(base64_code)
    picture = Image(data=raw_bytes)
    display(picture)

# Display the first extracted image as a sanity check; skip when nothing was extracted
if images:
    display_base64_image(images[0])
else:
    print("No images were extracted from the uploaded PDFs.")

# Build the ChromaDB Database

This section initializes and populates a ChromaDB vector database for retrieval-augmented generation (RAG).

Text and image data are first summarized to reduce token usage during embedding and retrieval.

Summarization methods:


*   Text Summarization:        LLaMA-3.1-8B-Instant
*   Image Summarization:       Meta-LLaMA/LLaMA-4-Scout-17B-16E-Instruct

Inference was performed using the Groq cloud.
Groq's free tier provides significantly higher usage limits than the
Hugging Face free tier. You can generate your free API key here: https://console.groq.com/keys

The generated summaries are then embedded using Alibaba-NLP/gte-Qwen2-1.5B-instruct and stored in the vector database. The original full documents (text and images) are retained separately for constructing detailed responses during query handling.



In [5]:
# Provide your GROQ API key at runtime instead of hardcoding it in the notebook
import os
from getpass import getpass

# Reuse a key already present in the environment; otherwise prompt for it without echoing
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your GROQ API key: ")

## Summarize text

In [7]:
from groq import Groq
from tqdm import tqdm
import os

# Initialize the Groq client used by get_text_summary; the key is read from the GROQ_API_KEY environment variable
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# System-level instruction
system_prompt = """
You are an assistant tasked with summarizing tables and text.
Give a concise summary of the table or text.

Respond only with the summary, no additional comment.
Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is.
"""

def get_text_summary(elements, system_prompt=system_prompt):
    """Summarize each text/table element with the Groq chat completions API.

    Args:
        elements: Iterable of elements to summarize; each one is converted with
            ``str()`` and stripped before being sent.
        system_prompt: System instruction sent with every request. The default
            is captured when this function is defined, so a later cell that
            rebinds the global ``system_prompt`` (the image-summary cell does)
            cannot silently change the instruction used for text.

    Returns:
        list[str]: One stripped summary per element, in input order.
    """
    system_message = {"role": "system", "content": system_prompt.strip()}
    summaries = []

    for element in tqdm(elements):
        user_prompt = f"Table or text chunk:\n{str(element).strip()}"

        stream = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[
                system_message,
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.5,
            max_completion_tokens=2048,
            top_p=1,
            stream=True,
        )

        # Concatenate the streamed deltas; skip chunks that carry no choices
        # and treat a missing delta content as an empty string
        full_response = "".join(
            chunk.choices[0].delta.content or ""
            for chunk in stream
            if chunk.choices
        )

        summaries.append(full_response.strip())

    return summaries


In [8]:
# Summarize the text data: one summary per entry of `texts`, in the same order
summary_texts = get_text_summary(texts)

100%|██████████| 32/32 [02:36<00:00,  4.91s/it]


In [None]:
# ===============================
# Preview a Sample Text Summary
# ===============================

# Set the index of the sample to inspect
# idx = 7
# print("Original Text:\n", texts[idx])
# print("\n" + "="*60 + "\n")
# print("Summarized Text:\n", summary_texts[idx])

## Summarize Image

In [19]:
from groq import Groq
from tqdm import tqdm
import os

# Initialize the Groq client used by get_image_summaries (rebinds `client`); the key is read from GROQ_API_KEY
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

system_prompt = """You are a scientific assistant specialized in interpreting images from research papers.
Describe figures, plots, and charts concisely in a paragraph, focusing on key elements like axes, labels, legends, and the main takeaway from the data.
Respond only with the summary in paragraph form, no additional comment. Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is."""


def get_image_summaries(images_b64, system_prompt=system_prompt):
    """Describe each base64-encoded image with the Groq vision chat model.

    Args:
        images_b64: Iterable of base64 image strings; each is sent as a
            ``data:image/jpeg;base64,...`` URL.
        system_prompt: System instruction sent with every request. The default
            is captured when this function is defined, so re-running another
            cell that rebinds the global ``system_prompt`` (the text-summary
            cell does) cannot silently change the instruction used for images.

    Returns:
        list[str]: One entry per image, in input order. A failed request yields
        the placeholder ``"Error: <exception>"`` instead of raising, so the
        result stays index-aligned with ``images_b64``.
    """
    system_message = {"role": "system", "content": system_prompt.strip()}
    summaries = []

    for img_b64 in tqdm(images_b64):
        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": "Please describe this image in detail."},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{img_b64}",
                    },
                },
            ],
        }
        try:
            response = client.chat.completions.create(
                messages=[system_message, user_message],
                model="meta-llama/llama-4-scout-17b-16e-instruct",
                max_completion_tokens=2048,
            )

            summary = response.choices[0].message.content
            summaries.append(summary.strip())

        except Exception as e:
            # Best effort: record the failure in place of the summary so that
            # positions still line up with the input images.
            # NOTE(review): callers that index these summaries will also index
            # the error placeholders — consider filtering them downstream.
            summaries.append(f"Error: {e}")

    return summaries


In [20]:
summary_images = get_image_summaries(images)  # `images` is the list of base64 strings; yields one entry per image (a summary or an "Error: ..." placeholder)

100%|██████████| 4/4 [00:03<00:00,  1.27it/s]


In [None]:
# ===============================
# Preview a Sample Image Summary
# ===============================

##  Set the index of the sample to inspect
# idx = 3
# print("Original Image:\n")
# display_base64_image(images[idx])
# print("\n" + "="*60 + "\n")
# print("Summarized Text:\n", summary_images[idx])

## Set up the database

In [None]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import torch

# Use the GPU if available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the embedding model once and move it to the selected device; the same
# instance is shared by the embedding function and the embedding class below.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run — confirm the repository is trusted.
shared_model = SentenceTransformer("Alibaba-NLP/gte-Qwen2-1.5B-instruct", trust_remote_code=True)
shared_model = shared_model.to(device)

# Function-style embedding
def sentence_transformer_embedding_function(input_texts):
    """Embed texts with the shared model and return plain Python lists.

    Args:
        input_texts: Texts passed straight to ``shared_model.encode``.

    Returns:
        The encoded result converted with ``.tolist()``.
    """
    vectors = shared_model.encode(
        input_texts,
        device=device,
        convert_to_numpy=True,
    )
    return vectors.tolist()

# Class wrapper for ChromaDB
class SentenceTransformerEmbeddings:
    """Callable adapter that embeds inputs with a sentence-transformer model.

    Attributes:
        model: The model after being moved to ``device``.
        device: Device identifier forwarded to every ``encode`` call.
    """

    def __init__(self, model, device):
        self.device = device
        self.model = model.to(device)

    def __call__(self, input):
        """Encode ``input`` and return the result converted with ``.tolist()``."""
        vectors = self.model.encode(
            input,
            device=self.device,
            convert_to_numpy=True,
        )
        return vectors.tolist()


# Initialize a Chroma client that persists its data under ./chroma_persistent_storage
chroma_client = chromadb.PersistentClient(path="chroma_persistent_storage")

# Create the collection, or reuse it if it already exists, and attach an
# embedding function backed by the same shared model used for the upserts
collection_name = "mm_research_rag"
collection = chroma_client.get_or_create_collection(
    name=collection_name,
    embedding_function=SentenceTransformerEmbeddings(shared_model, device)
)

In [25]:
import uuid

# Every original item needs exactly one summary; zip() would otherwise silently drop the extras
if len(summary_texts) != len(texts):
    raise ValueError(
        f"Expected {len(texts)} text summaries, got {len(summary_texts)}"
    )
if len(summary_images) != len(images):
    raise ValueError(
        f"Expected {len(images)} image summaries, got {len(summary_images)}"
    )

# Derive deterministic ids from position + content so that re-running this cell
# upserts the same records instead of inserting duplicates under new random ids
doc_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_OID, f"text:{position}:{text}"))
    for position, text in enumerate(texts)
]
image_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_OID, f"image:{position}:{image}"))
    for position, image in enumerate(images)
]

# In-memory lookup from id back to the original (unsummarized) content
doc_id_to_text = dict(zip(doc_ids, texts))
image_id_to_image = dict(zip(image_ids, images))

# Upsert text summaries (skipped when there is nothing to store)
if doc_ids:
    collection.upsert(
        ids=doc_ids,
        documents=summary_texts,
        embeddings=sentence_transformer_embedding_function(summary_texts),
    )

# Upsert image summaries (skipped when no images were extracted)
if image_ids:
    collection.upsert(
        ids=image_ids,
        documents=summary_images,
        embeddings=sentence_transformer_embedding_function(summary_images),
    )

## RAG pipeline

In [66]:
# Function to query and retrieve relevant documents
def query_and_retrieve(question, n_results=6):
    """Query the collection and map the hits back to their original content.

    Args:
        question: Query text sent to the collection.
        n_results: Number of results requested from the collection.

    Returns:
        dict: ``{"texts": [...], "images": [...]}`` holding the original text
        chunks and base64 images whose ids were returned, in result order.
        Ids found in neither in-memory lookup are skipped.
    """
    results = collection.query(query_texts=[question], n_results=n_results)
    retrieved = {"texts": [], "images": []}  # Separate lists for texts and images

    # results["ids"] holds one list of ids per query text
    for ids in results["ids"]:
        for doc_id in ids:
            if doc_id in doc_id_to_text:
                retrieved["texts"].append(doc_id_to_text[doc_id])
            elif doc_id in image_id_to_image:
                retrieved["images"].append(image_id_to_image[doc_id])

    return retrieved


In [71]:
import textwrap

def build_prompt(kwargs):
    """Build the chat ``messages`` payload for a multimodal RAG request.

    Args:
        kwargs: Dict with two keys:
            ``"context"``: dict that may contain ``"texts"`` (items converted
            with ``str()``) and ``"images"`` (base64 strings); a missing key is
            treated as an empty list.
            ``"question"``: the user question string.

    Returns:
        list[dict]: A single user message whose content blocks are, in order:
        the instructions, the question, the text context (only when non-empty),
        and one ``image_url`` block per image.

    Raises:
        KeyError: If ``"context"`` or ``"question"`` is missing from ``kwargs``.
    """
    docs_by_type = kwargs["context"]
    user_question = kwargs["question"]

    # Read both lists the same tolerant way so a missing key cannot raise
    retrieved_texts = docs_by_type.get("texts", [])
    retrieved_images = docs_by_type.get("images", [])

    # Join the stripped text chunks with blank lines between them
    context_text = "\n\n".join(
        str(text_element).strip() for text_element in retrieved_texts
    ).strip()

    # Instructions as part of user content
    system_instructions = (
        "Answer the following question as accurately and thoroughly as possible "
        "based only on the following context, which can include text, tables, "
        "and the below image."
    )

    # Construct content blocks: instructions first, then the question
    content_blocks = [
        {"type": "text", "text": system_instructions},
        {"type": "text", "text": f"Question:\n{user_question.strip()}"},
    ]

    # Add context only when there is any text to show
    if context_text:
        content_blocks.append({
            "type": "text",
            "text": f"Context:\n{context_text}"
        })

    # Add image(s) if present
    content_blocks.extend(
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image}"
            }
        }
        for image in retrieved_images
    )

    # Final messages format for Groq
    return [
        {
            "role": "user",
            "content": content_blocks
        }
    ]


In [72]:
# Define the RAG client (key read from the GROQ_API_KEY environment variable)
rag_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Model id passed to the final answer request below
model="meta-llama/llama-4-scout-17b-16e-instruct"


In [74]:
# RAG query
question = "How are open-vocabulary 3D scene graph built in ConceptGraphs"

# Look up the original texts/images whose summaries best match the question
retrieved_documents = query_and_retrieve(question)

# Build messages
prompt = build_prompt({
    "context": retrieved_documents,
    "question": question
})

# Send request (same token-limit parameter as the summarization calls above)
response = rag_client.chat.completions.create(
    model=model,
    messages=prompt,
    max_completion_tokens=2048,
)

print(response.choices[0].message.content.strip())


ConceptGraphs builds open-vocabulary 3D scene graphs through the following steps:

1. **Input RGB-D sequence**: The process starts with a sequence of posed RGB-D images.
2. **Open-vocab Detection/Segmentation**: A class-agnostic segmentation model is applied to the RGB-D sequence to obtain candidate objects.
3. **Object-based 3D Mapping**: The system constructs a 3D map by fusing the segmented regions from multiple views, resulting in a set of 3D objects and associated vision (and language) descriptors.
4. **Object Fusion/Initialization**: The 3D objects are fused and initialized to create a set of object nodes.
5. **Large Vision-Language Model (LVLM) Captioning**: An LVLM is used to caption each object node, providing a language description of each object.
6. **Large Language Model (LLM) Relationship Inference**: An LLM is used to infer relationships between adjacent object nodes, resulting in edges in the scene graph.
7. **3D Scene Graph Construction**: The object nodes and edges are