<a href="https://colab.research.google.com/github/akashmathur-2212/LLMs-playground/blob/main/LlamaIndex-applications/Advanced-RAG/advanced_query_transformations/Advanced_Query_Transformations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

1. In this notebook, we will work with the PDF data. Here, we are using QLoRA paper and create an initial set of nodes (chunk size 256).
2. We will use Open Source LLM [`zephyr-7b-alpha`](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) and embedding [`bge-large-en-v1.5`](https://huggingface.co/BAAI/bge-large-en-v1.5)

Let's begin!

In [None]:
!pip install -qqq llama-index llama-hub langchain accelerate==0.21.0 bitsandbytes==0.40.2 transformers sentence_transformers InstructorEmbedding

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.2/100.2 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.4/802.4 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.9/147.9 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m9.

In [None]:
# import nest_asyncio
# nest_asyncio.apply()

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import json
import torch
from pathlib import Path
import pandas as pd
pd.set_option("display.max_colwidth", -1)

from copy import deepcopy

# transformers
from transformers import BitsAndBytesConfig

# llama_index
from llama_index.prompts import PromptTemplate
from llama_index.llms import HuggingFaceLLM
from llama_index import download_loader, Document, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SentenceSplitter
from langchain.embeddings import HuggingFaceEmbeddings

from llama_index.indices.query.query_transform import HyDEQueryTransform
from llama_index.query_engine.transform_query_engine import TransformQueryEngine

from IPython.display import Markdown, display
from llama_index.response.notebook_utils import display_source_node

from llama_index.query_engine import RetrieverQueryEngine
from IPython.display import Markdown, display, HTML
from llama_index.retrievers import VectorIndexRetriever

from sentence_transformers import SentenceTransformer

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

  pd.set_option("display.max_colwidth", -1)


# Load Data

In [None]:
PDFReader = download_loader("PDFReader")
loader = PDFReader()
docs = loader.load_data(file=Path("QLoRa.pdf"))

In [None]:
node_parser = SentenceSplitter(chunk_size=256)
nodes = node_parser.get_nodes_from_documents(docs)

In [None]:
len(nodes)

383

# Models

## LLM (`zephyr-7b-alpha`)

In [None]:
from google.colab import userdata

# huggingface api token
hf_token = userdata.get('hf_token')

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)


def messages_to_prompt(messages):
  prompt = ""
  for message in messages:
    if message.role == 'system':
      prompt += f"<|system|>\n{message.content}</s>\n"
    elif message.role == 'user':
      prompt += f"<|user|>\n{message.content}</s>\n"
    elif message.role == 'assistant':
      prompt += f"<|assistant|>\n{message.content}</s>\n"

  # ensure we start with a system prompt, insert blank if needed
  if not prompt.startswith("<|system|>\n"):
    prompt = "<|system|>\n</s>\n" + prompt

  # add final assistant prompt
  prompt = prompt + "<|assistant|>\n"

  return prompt


llm = HuggingFaceLLM(
    model_name="HuggingFaceH4/zephyr-7b-alpha",
    tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
    query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
    context_window=3900,
    max_new_tokens=256,
    model_kwargs={"quantization_config": quantization_config},
    # tokenizer_kwargs={},
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95, "do_sample":True},
    messages_to_prompt=messages_to_prompt,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/628 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

## Embedding (`bge-large-en-v1.5`)

We will use **BGE embedding**. It is a general Embedding Model.

In [None]:
embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## Configure Index and Retriever

In [None]:
# ServiceContext
service_context = ServiceContext.from_defaults(llm=llm,
                                               embed_model=embed_model
                                               )

# index
vector_index = VectorStoreIndex(
    nodes, service_context=service_context
)

# 1. HyDE Query Transformation

First, we query without transformation. Then, the same query string is used for embedding lookup and also summarization.

## Example 1

In [None]:
query_str = "What are the different approaches to reduce memory usage without sacrificing performance?"

In [None]:
query_engine = vector_index.as_query_engine()
response = query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))



<b>QLORA introduces three approaches to reduce memory usage without sacrificing performance: 
1. 4-bit NormalFloat (NF4), an information theoretically optimal quantization data type for normally distributed data that yields better empirical results than 4-bit Integers and 4-bit Floats.
2. Double Quantization, a method that quantizes the quantization constants, saving an average of about 0.37 bits per parameter.
3. Paged Optimizers, using NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length.</b>

In [None]:
hyde = HyDEQueryTransform(include_original=True, llm=llm)
hyde_query_engine = TransformQueryEngine(query_engine, hyde)
response = hyde_query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

<b>QLORA introduces multiple innovations to reduce memory usage without sacrificing performance:

1. 4-bit NormalFloat (NF4), an information theoretically optimal quantization data type for normally distributed data that yields better empirical results than 4-bit Integers and 4-bit Floats.

2. Double Quantization, a method that quantizes the quantization constants, saving an average of about 0.37 bits per parameter (approximately 3 GB for a 65B model).

3. Paged Optimizers, using NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length.

These contributions are combined into a better tuned LoRA approach that includes adapters at every network layer, thereby avoiding almost all of the accuracy tradeoffs seen in prior work.

In summary, QLoRA's innovations are:

- 4-bit NormalFloat (NF4)
- Double Quantization
- Paged Optimizers

These approaches help to reduce the memory footprint and improve performance, allowing for an in-depth study of instruction finetuning and</b>

## Example 2

In [None]:
query_str = "How QLORA differentiate itself from earlier finetuning approachs and how it is better?"

In [None]:
query_engine = vector_index.as_query_engine()
response = query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

<b>QLORA differentiates itself from earlier finetuning approaches by enabling the finetuning of 33B parameter models on a single consumer GPU and 65B parameter models on a single professional GPU without degrading performance relative to a full finetuning baseline. This is a significant improvement over earlier approaches, which were limited in their ability to finetune these large models due to resource constraints. QLORA also allows for privacy-preserving usage of LLMs and makes LLMs easier to deploy, which can enable novel applications and help close the resource gap between large corporations and small teams with consumer GPUs. Overall, QLORA provides a more accessible and widely available finetuning method for state-of-the-art NLP technology.</b>

In [None]:
hyde = HyDEQueryTransform(include_original=True, llm=llm)
hyde_query_engine = TransformQueryEngine(query_engine, hyde)
response = hyde_query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

<b>QLORA differentiates itself from earlier finetuning approachs by being the first method that enables the finetuning of 33B parameter models on a single consumer GPU and 65B parameter models on a single professional GPU, without degrading performance relative to a full finetuning baseline. This allows for the training of large-scale models on consumer hardware, making finetuning more accessible and common for small teams with limited resources. QLORA also demonstrates that its best 33B model trained on the Open Assistant dataset can rival ChatGPT on the Vicuna benchmark, further improving the accessibility of state-of-the-art NLP technology.</b>

## Example 3

In [None]:
query_str = "Describe the trade-offs between using BFloat16 as the computation data type and other possible choices. When would you choose one over the other?"

In [None]:
query_engine = vector_index.as_query_engine()
response = query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

<b>The given context information provides insights into the use of BFloat16 as the computation data type in QLORA. While QLORA has one low-precision storage data type, which is usually 4-bit, and one computation data type that is usually BFloat16, there are trade-offs between using BFloat16 and other possible choices.

BFloat16 is a computation data type that offers a high level of precision and has been shown to provide better performance compared to other low-precision data types, such as 8-bit or 4-bit. However, BFloat16 requires more memory and computational resources than other low-precision data types.

In QLORA, BFloat16 is used for computation, which means whenever a QLORA weight tensor is used, it is dequantized to BFloat16, and then a matrix multiplication is performed in 16-bit. This approach provides high-fidelity 4-bit finetuning and ensures a good trade-off between precision and memory usage.

The choice between using BFloat16 and other low-precision data types would depend on the specific requirements of</b>

In [None]:
hyde = HyDEQueryTransform(include_original=True, llm=llm)
hyde_query_engine = TransformQueryEngine(query_engine, hyde)
response = hyde_query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

<b>While the text provided in the given context does not directly address the query, I can provide an answer based on the information provided.

The text mentions that in QLORA, the computation data type is usually BFloat16, and this is used for matrix multiplication in 16-bit. The use of BFloat16 has the advantage of providing better accuracy and lower errors compared to other low-precision data types, such as 4-bit or 8-bit floating-point. However, using BFloat16 as the computation data type can also lead to higher memory usage and slower computations compared to 4-bit data types.

The choice between using BFloat16 or 4-bit data types as the computation data type depends on the specific use case and the trade-offs between accuracy and computational efficiency. For applications where higher accuracy is required, such as scientific simulations or medical imaging, BFloat16 may be the preferred choice. However, for applications where lower accuracy is acceptable, such as video encoding or image recognition, 4-bit data types may be more suitable due to their lower memory usage and faster computations. Ultimately, the choice between BFloat16 and</b>

In [None]:
query_bundle = hyde(query_str)
hyde_doc = query_bundle.embedding_strs[0]

Let's look at the hypothetical document.
We use HyDEQueryTransform to generate a hypothetical document and use it for embedding lookup.

In [None]:
hyde_doc

'BFloat16, a data type developed by Google, has gained significant attention in the world of deep learning due to its ability to provide higher precision while using less memory and computational resources compared to traditional floating-point data types. However, like all data types, there are trade-offs to consider when deciding between BFloat16 and other possible choices.\n\nIn terms of precision, BFloat16 provides 15 bits for the mantissa and 1 bit for the exponent, resulting in a maximum relative error of approximately 0.0015% for values between 1 and 65,536. While this may seem low, it is sufficient for many applications in machine learning and other fields where high precision is not always necessary.\n\nOn the other hand, traditional floating-point data types like single-precision (32 bits) and double-precision (64 bits) provide significantly greater precision, with a maximum relative error of approximately 1.5% and 0.000015%, respectively. However, this precision comes at a c

**Conclusion** - In Example 1, 2 and 3, HyDE improves output quality significantly, by hallucinating accurately, thus improving the embedding quality, and final output.

# 2. Sub-Question Query Engine

Now, we will see how to use a sub question query engine to tackle the problem of answering a complex query.

It first breaks down the complex query into sub questions for each relevant data source, then gather all the intermediate reponses and synthesizes a final response.

In [None]:
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.callbacks import CallbackManager, LlamaDebugHandler

import nest_asyncio
nest_asyncio.apply()

In [None]:
# Using the LlamaDebugHandler to print the trace of the sub questions
# captured by the SUB_QUESTION callback event type
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

# ServiceContext
service_context = ServiceContext.from_defaults(llm=llm,
                                               embed_model=embed_model,
                                               callback_manager=callback_manager
                                               )

# index
vector_query_engine = VectorStoreIndex.from_documents(
    docs, service_context=service_context, use_async=True
).as_query_engine()

**********
Trace: index_construction
    |_node_parsing ->  0.17341 seconds
      |_chunking ->  0.007449 seconds
      |_chunking ->  0.008527 seconds
      |_chunking ->  0.002825 seconds
      |_chunking ->  0.004094 seconds
      |_chunking ->  0.007798 seconds
      |_chunking ->  0.007452 seconds
      |_chunking ->  0.007922 seconds
      |_chunking ->  0.007857 seconds
      |_chunking ->  0.008703 seconds
      |_chunking ->  0.008669 seconds
      |_chunking ->  0.003388 seconds
      |_chunking ->  0.002382 seconds
      |_chunking ->  0.002277 seconds
      |_chunking ->  0.004022 seconds
      |_chunking ->  0.007548 seconds
      |_chunking ->  0.001716 seconds
      |_chunking ->  0.012861 seconds
      |_chunking ->  0.013388 seconds
      |_chunking ->  0.013243 seconds
      |_chunking ->  0.014589 seconds
      |_chunking ->  0.000617 seconds
      |_chunking ->  0.003057 seconds
      |_chunking ->  0.003955 seconds
      |_chunking ->  0.003862 seconds
      |_chun

In [None]:
# setup base query engine as tool
query_engine_tools = [
    QueryEngineTool(
        query_engine=vector_query_engine,
        metadata=ToolMetadata(
            name="qlora_paper",
            description="Efficient Finetuning of Quantized LLMs",
        ),
    ),
]

query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    service_context=service_context,
    use_async=True,
)

In [None]:
response = query_engine.query("Describe the trade-offs between using BFloat16 as the computation data type and other possible choices. When would you choose one over the other?")

**********
Trace: query
    |_query ->  21.319966 seconds
      |_templating ->  3.6e-05 seconds
      |_llm ->  21.316943 seconds
**********


OutputParserException: Got invalid JSON object. Error: Expecting ',' delimiter: line 22 column 10 (char 841) while parsing a flow sequence
  in "<unicode string>", line 2, column 14:
        "items": [
                 ^
expected ',' or ']', but got '<stream end>'
  in "<unicode string>", line 22, column 10:
            }
             ^. Got JSON string: {
    "items": [
        {
            "sub_question": "What are the benefits of using BFloat16 as the computation data type?",
            "tool_name": "qlora_paper"
        },
        {
            "sub_question": "What are the potential drawbacks of using BFloat16 as the computation data type?",
            "tool_name": "qlora_paper"
        },
        {
            "sub_question": "What are the alternative computation data types to BFloat16?",
            "tool_name": "qlora_paper"
        },
        {
            "sub_question": "What are the benefits of using alternative computation data types to BFloat16?",
            "tool_name": "qlora_paper"
        },
        {
            "sub_question": "What are the drawbacks of using alternative computation data types to BFloat16?",
            "tool_name": "qlora_paper"
        }

In [None]:
print(response)

QLORA introduces multiple innovations to reduce memory usage without sacrificing performance:

1. 4-bit NormalFloat (NF4): an information theoretically optimal quantization data type for normally distributed data that yields better empirical results than 4-bit Integers and 4-bit Floats.

2. Double Quantization: a method that quantizes the quantization constants, saving an average of about 0.37 bits per parameter (approximately 3 GB for a 65B model).

3. Paged Optimizers: using NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length.

These innovations enable QLoRA to finetune more than 1,000 models, providing a detailed analysis of instruction following and chatbot performance across 8 instruction datasets, multiple model types (LLaMA, T5), and model scales that would be infeasible to run with regular finetuning.


# 3. Router Query Engine

Now, we will define a custom router query engine that selects one out of several candidate query engines to execute a query.

In [None]:
from llama_index import VectorStoreIndex, SummaryIndex, SimpleKeywordTableIndex

In [None]:
service_context = ServiceContext.from_defaults(llm=llm,
                                               embed_model=embed_model
)

In [None]:
## Define all the different indexes over same data

# vector index
vector_index = VectorStoreIndex(
    nodes, service_context=service_context
)

# summary index
summary_index = SummaryIndex(
    nodes, service_context=service_context
    )

# keyword index
keyword_index = SimpleKeywordTableIndex(nodes, service_context=service_context)

## Define Query Engines and Set Metadata

In [None]:
summary_query_engine = summary_index.as_query_engine(
    response_mode="tree_summarize",
    service_context=service_context
)

vector_query_engine = vector_index.as_query_engine(service_context=service_context)

keyword_query_engine = keyword_index.as_query_engine(service_context=service_context)

## Define Query Engine and Tool for these Indices
We define a Query Engine for each Index. We then wrap these with our QueryEngineTool.

In [None]:
from llama_index.tools.query_engine import QueryEngineTool

summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description=(
        "Useful for summarization questions related to Efficient Finetuning QLORA reserach paper"
    ),
)

vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=(
        "Useful for retrieving specific context from QLORA reserach paper related to Efficient Finetuning "
    ),
)

keyword_tool = QueryEngineTool.from_defaults(
    query_engine=keyword_query_engine,
    description=(
        "Useful for retrieving specific context from QLORA reserach paper related to Efficient Finetuning "
        "using entities mentioned in query"
    ),
)

## Define Router Query Engine
There are several selectors available, each with some distinct attributes.

1. The `LLM selectors` use the LLM to output a JSON that is parsed, and the corresponding indexes are queried.

2. The `Pydantic selectors` (currently only supported by gpt-4-0613 and gpt-3.5-turbo-0613 (the default)) use the OpenAI Function Call API to produce pydantic selection objects, rather than parsing raw JSON.

3. For each type of selector, there is also the option to select `1 index to route to, or multiple`.

4. Then, define the `RouterQueryEngine` with a desired selector module. Here, we use the `LLMSingleSelector`, which uses LLM to choose a underlying query engine to route the query to.

## LLMSingleSelector
We can use OpenAI or any other LLM to parse generated JSON under the hood to select a sub-index for routing.

In [None]:
from llama_index.query_engine.router_query_engine import RouterQueryEngine
from llama_index.selectors.llm_selectors import LLMSingleSelector, LLMMultiSelector

In [None]:
router_query_engine  = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(service_context=service_context),
    query_engine_tools=[
        summary_tool,
        vector_tool,
        keyword_tool,
    ],
    service_context=service_context,
)

In [1]:
response = router_query_engine.query("What is Double Quantization?")

In [None]:
print(str(response))

Double Quantization is a technique introduced in QLoRa to further reduce the memory footprint of quantization constants. It involves quantizing the quantization constants themselves, yielding quantized quantization constants (cFP8) with a second level of quantization constants (cFP32)1. This results in an average of 0.37 bits per parameter being saved compared to using 32-bit constants with a blocksize of 64 for W. 4-bit NormalFloat, on the other hand, is an information theoretically optimal quantization data type for normally distributed data that yields better empirical results than 4-bit Integers and 4-bit Floats, as discussed in the context information.


In [None]:
# [optional] look at selected results
print(str(response.metadata["selector_result"]))

## LLMMultiSelector
If we want to route our query to multiple indexes, we can use a multi selector. The multi selector sends to query to multiple sub-indexes, and then aggregates all responses using a summary index to form a complete answer.

In [None]:
router_query_engine  = RouterQueryEngine(
    selector=LLMMultiSelector.from_defaults(service_context=service_context),
    query_engine_tools=[
        summary_tool,
        vector_tool,
        keyword_tool,
    ],
    service_context=service_context,
)

In [None]:
print(str(response))

In [None]:
# [optional] look at selected results
print(str(response.metadata["selector_result"]))

# 4. Multi-Step Query Engine

Multi-step query engine is able to decompose a complex query into sequential subquestions.

In [None]:
from llama_index.indices.query.query_transform.base import StepDecomposeQueryTransform
from llama_index.query_engine.multistep_query_engine import MultiStepQueryEngine

# set Logging to DEBUG for more detailed outputs
from llama_index.query_engine.multistep_query_engine import (
    MultiStepQueryEngine,
)

step_decompose_transform = StepDecomposeQueryTransform(llm=llm, verbose=True)
query_engine = vector_index.as_query_engine(service_context=service_context)

In [None]:
query_engine = MultiStepQueryEngine(
    query_engine=query_engine,
    query_transform=step_decompose_transform
)

ValueError:
******
Could not load OpenAI model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

**Note**: I was getting the above error. It looks like MultiStepQueryEngine supports ***only*** OpenAI `GPT-4` and `GPT-3.5` model.

Please correct me if there is a way to run MultiStepQueryEngine using any other open source model.

# END