In [1]:
import json
from typing import Optional
from llama_index.core.query_engine import CustomQueryEngine
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.response_synthesizers import BaseSynthesizer
from llama_index.core import get_response_synthesizer
from llama_index.core.llms import LLM


class NewQueryEngine(CustomQueryEngine):
    """Custom Query Engine that uses the original query."""

    glossary: Optional[dict]
    llm: LLM
    retriever: BaseRetriever
    response_synthesizer: BaseSynthesizer

    def __init__(self, glossary_path=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.glossary = {}
        if glossary_path:
            with open(glossary_path) as f:
                self.glossary = json.load(f)

    def custom_query(self, query_str: str):
        # Retrieve documents based on the original query

        glossary_explanation = "\n".join([f'{item.get("term")}, Explanation: {item.get("explanation")}' for item in self.glossary.get("terms", [])])
        
        # Create a prompt for the LLM to rewrite the query, including the glossary explanations
        prompt = f"Given the glossary terms and their explanations:\n{glossary_explanation}\nPlease rewrite the following search query to be more effective: '{query_str}'\n\nKeep the question format while enriching the query with important terms while preserving the original intent. Do not exceed 100 words for the query and do only output the query itself."
        
        # Assuming the method to generate a new query string based on the prompt
        rewritten_query = self.llm.complete(prompt).text
        
        # Print the rewritten query string
        print("Rewritten Query String:", rewritten_query)


        nodes = self.retriever.retrieve(rewritten_query)
        # Synthesize a response based on the retrieved documents
        new_prompt = f"Question: {query_str}\n\nGlossary to incorporate (not part of question):\n{glossary_explanation}"
        response_obj = self.response_synthesizer.synthesize(new_prompt, nodes)
        return response_obj


# Your existing setup for documents and index
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

# Initialize the embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Load documents
documents = SimpleDirectoryReader("./data", recursive=True).load_data()

# Create an index from the documents using the service context
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Assuming the retriever and response synthesizer are set up as follows:
retriever = index.as_retriever(similarity_top_k=3,  chunk_size=256, chunk_overlap=64)

# Initialize the LLM (e.g., LlamaCPP with your model)
llm = LlamaCPP(
    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    temperature=0.0,
    max_new_tokens=2048,
    context_window=4096,
    generate_kwargs={},
    model_kwargs={"n_gpu_layers": -1},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=False,
)

response_synthesizer = get_response_synthesizer(llm=llm)

# Instantiate the OriginalQueryEngine with the retriever and response synthesizer
original_query_engine = NewQueryEngine(
    llm=llm,
    glossary_path="./glossary.json",
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

# Now you can use this custom query engine to perform queries
query_str = "What is the so called inspector view in spotlight?"
response = original_query_engine.custom_query(query_str)
print(response)

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes


Rewritten Query String:  "What is the function of the Inspector Widget in Spotlight's UI for examining and editing individual data points?"
 In the context provided, the Inspector Widget in Spotlight is a crucial component that allows users to examine and edit the features of individual data points in their dataset in detail. It offers multiple views for different modalities such as text, audio, image, video, and 3D geometry data, enabling users to explore and edit data points' underlying structures. This Inspector Widget is particularly valuable for working with multimodal datasets and provides greater control over the structure and quality of the data. Users can configure it to choose how each data point should be represented with various visualization and interaction components available for different data types. Additionally, the Inspector Widget includes an Issues widget that lists potential data issues linking to certain data points within the Spotlight dataset. These issues can 

In [2]:
# Evaluating our current pipeline and identifying shortcomings.
EXAMPLE_QUESTIONS_WITH_GT = [
    {"question": "What is the similarity map?",
     "answer": "The similarity map is a dimensionality reduction plot that displays high dimensional data like embeddings in 2D space. It can be very useful for getting a first overview on dataset structure and possible problems, before diving into more detailed analysis."
    },
    {
        "question": "How can I install Renumics Spotlight?",
        "answer": "Just run pip install renumics-spotlight. After that, start it by running spotlight from the CLI or use the Python interface via spotlight.show."
    },
    {
        "question": "I want to find duplicates in my data. How can I do this?",
        "answer": "You can either just do that exploratively, e.g. by placing data on the similarity map and checking if nearby datapoints are identical via the inspector. You can also detect duplicates with specialized tooling like annoy and display the results in spotlight via spotlight.show."
    },
    {
        "question": "Which possibilities regarding cluster analysis does Spotlight offer and how can it help me?",
        "answer": "Spotlight offers the similarity map, which can display embeddings or feature arrays in 2D. You can then identify clusters visually and browse through them while looking at the underlying data such as images or audio files in the inspector."
    },
    {
        "question": "I want to identify label issues for my audio use case? Which features and visualizations can help me and how could the process look?",
        "answer": "Similarity Map, Inspector and Filtering can help you with that. You could simply place your data points on the similarity map using an general purpose audio embedding. You could then color the data points by label. If there are clusters with mixed labels, that's a sign for label issues. You could also train a model on the existing labels and then color by model error, which is a great way of pinpointing the issues even faster."
    },
    {
        "question": "What are the main UI elements that help me with my use case in Spotlight?",
        "answer": "The main elements are the table for exploring data points by their metadata, the inspector for displaying unstructured data, the filter bar for filtering the data, The similarity map as an overview plot on the dataset structure, and potentially multiple additional visualizations such as histograms or scatterplots, as well as metric display elements such as a confusion matrix."
     
    },
    {
        "question": "What is the Inspector and how does it play together with other components?",
        "answer": "It is Spotlight's UI component that is able to display unstructured data such as images, audio files, text, and more. It can be used to easily explore unstructured, multimodal datasets.",
    },
    {
        "question": "What is sliceguard and how does it play together with Spotlight?",
        "answer": "It is a library that can automatically detect data clusters or slices where a machine learning model does not perform well. It uses Spoglight to display its detection results. It is also built by Renumics."
    },
    {
        "question": "I want to look at my audio data. Which possibilities does Spotlight offer?",
        "answer": "You can display audio data in the inspector. Spotlight offers the visualizations audio player and spetrogram. It can also display windows in which a certain event in the audio data occurs. To visualize audio dataset structure you could use audio embeddings that are displayed on the Similarity Map."
    },
    {
        "question": "How can I customize the data visualization in Renumics Spotlight?",
        "answer": "You can configure the UI flexibly via the UI. This involves configuring a layout with different widgets and plots, as well as adding lenses to the inspector widget. You can also save and load configured layouts either via json files or via the python API."
    },
    {
        "question": "Which possibilities for data loading and export does Renumics Spotlight offer?",
        "answer": "Spotlight can load pandas dataframes and huggingface datasets via spotlight.show. It can also load hdf5 files (legacy) and csv files. The preferred way is directly loading pandas dataframes via spotlight.show."
    },
]

In [3]:
import pandas as pd

# Initialize a list to hold your data
data = []

# Iterate over each question
for i, item in enumerate(EXAMPLE_QUESTIONS_WITH_GT):
    print(f"Processing question {i+1}/{len(EXAMPLE_QUESTIONS_WITH_GT)}")
    question = item["question"]

    ground_truth = item["answer"] # Those could be multiple
    
    # Query your RAG system
    response = original_query_engine.query(question)
    
    # Assuming the response object has the answer
    answer = response.response  # Modify as per your response object structure
    
    # Extract context from the source_nodes attribute
    # Concatenate the text of each source node to form the full context
    contexts = [node.text for node in response.source_nodes]
    
    # Append the details to your data list
    data.append({"question": question, "ground_truth": ground_truth, "answer": answer, "contexts": contexts})

# Convert the list to a pandas DataFrame
df = pd.DataFrame(data)

Processing question 1/11
Rewritten Query String:  "What is the function and purpose of a Similarity Map in the context of Renumics Spotlight, including its capabilities for displaying high-dimensional data and customization options?"
Processing question 2/11
Rewritten Query String:  "What is the process for installing and setting up Renumics Spotlight software, including any necessary dependencies and widget integrations (Similarity Map, Data Point Selection, Map Customization, etc.) using the Python API for Layout?"
Processing question 3/11
Rewritten Query String:  "How do I identify and locate duplicates in my dataset using the features of Renumics Spotlight?"
Processing question 4/11
Rewritten Query String:  "What cluster analysis features does Spotlight provide, and how can I utilize these capabilities for effective data analysis?"
Processing question 5/11
Rewritten Query String:  "How can I identify data issues related to audio labels in my use case? Which features and visualizati

In [4]:
df

Unnamed: 0,question,ground_truth,answer,contexts
0,What is the similarity map?,The similarity map is a dimensionality reducti...,The Similarity Map is a widget in Renumics Sp...,"[`similaritymap(name=None, columns=None, reduc..."
1,How can I install Renumics Spotlight?,Just run pip install renumics-spotlight. After...,"To install Renumics Spotlight, you can use pi...","[`inspector(name=None, lenses=None, num_column..."
2,I want to find duplicates in my data. How can ...,"You can either just do that exploratively, e.g...",To find duplicates in your data using Renumic...,[---\ntags: []\nid: duplicates-annoy\nsidebar_...
3,Which possibilities regarding cluster analysis...,"Spotlight offers the similarity map, which can...",Spotlight offers various possibilities for cl...,[---\nsidebar_position: 30\nslug: /docs/config...
4,I want to identify label issues for my audio u...,"Similarity Map, Inspector and Filtering can he...",To identify label issues for your audio use c...,[---\ntags: []\nid: audio-classification\nslug...
5,What are the main UI elements that help me wit...,The main elements are the table for exploring ...,The main UI elements that can help you with y...,[---\nsidebar_position: 30\nslug: /docs/config...
6,What is the Inspector and how does it play tog...,It is Spotlight's UI component that is able to...,The Inspector Widget in Renumics Spotlight is...,[---\ntags: []\nid: inspector\nsidebar_positio...
7,What is sliceguard and how does it play togeth...,It is a library that can automatically detect ...,Sliceguard is an open-source library for data...,[---\ntags: []\nid: sliceguard\nsidebar_positi...
8,I want to look at my audio data. Which possibi...,You can display audio data in the inspector. S...,Spotlight offers two possibilities for lookin...,"[`audio(column, window_column=None, name=None,..."
9,How can I customize the data visualization in ...,You can configure the UI flexibly via the UI. ...,To customize the data visualization in Renumi...,[---\ntags: []\nid: inspector\nsidebar_positio...


In [5]:
df.to_json("improved_rag_results.json")