In [1]:
!pip install -Uq llama-index sentence-transformers cohere protobuf pypdf datasets evaluate arize-phoenix --upgrade  -q


In [2]:
import os
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import Document

# LLM
from llama_index.llms import OpenAI

# Embeddings
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding, CohereEmbedding

# Retrievers
from llama_index.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KGTableRetriever
)

# Rerankers
from llama_index.indices.query.schema import QueryBundle, QueryType
from llama_index.schema import NodeWithScore
# from llama_index.indices.postprocessor.cohere_rerank import CohereRerank
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.finetuning.embeddings.common import EmbeddingQAFinetuneDataset

# Evaluator
from llama_index.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.evaluation import RetrieverEvaluator

from typing import List
import pandas as pd

import nest_asyncio

# Allow nested asyncio event loops. Presumably needed because the notebook kernel
# already runs a loop while the evaluation cell awaits coroutines — TODO confirm
# it is still required.
nest_asyncio.apply()

# Set up the OpenAI API key.
# Never paste a real key into the notebook: export OPENAI_API_KEY in the
# environment instead. `setdefault` keeps a key that is already exported; the
# empty-string fallback preserves the previous behaviour when none is configured.
os.environ.setdefault("OPENAI_API_KEY", "")

In [3]:
from datasets import load_dataset

# Load the "sciq" dataset; the progress output below shows train, validation and test splits being generated.
dataset = load_dataset("sciq")


Downloading readme:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.99M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/339k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/343k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11679 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [4]:
# Inspect the available splits with their columns and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
})

In [5]:
# Drop the multiple-choice distractors, keeping question, correct_answer and support.
# Filtering against the columns that are still present keeps this cell safe to
# re-run: remove_columns raises when asked to drop a column that is already gone.
distractor_columns = ['distractor3', 'distractor1', 'distractor2']
columns_to_drop = [name for name in distractor_columns if name in dataset["train"].column_names]
if columns_to_drop:
    dataset = dataset.remove_columns(columns_to_drop)

In [6]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render a random sample of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` exceeds the number of rows.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in a single call, replacing the
    # retry-until-unique loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    # Replace integer class ids with their label names so the table is readable.
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [7]:
# Preview random rows of the training split (the helper defaults to 10 examples)
show_random_elements(dataset["train"])

Unnamed: 0,question,correct_answer,support
0,The carrying angle is larger in females to accommodate their what?,wider pelvis,"the ulna and radius bones. The small, rounded area that forms the distal end is the head of the ulna. Projecting from the posterior side of the ulnar head is the styloid process of the ulna, a short bony projection. This serves as an attachment point for a connective tissue structure that unites the distal ends of the ulna and radius. In the anatomical position, with the elbow fully extended and the palms facing forward, the arm and forearm do not form a straight line. Instead, the forearm deviates laterally by 5–15 degrees from the line of the arm. This deviation is called the carrying angle. It allows the forearm and hand to swing freely or to carry an object without hitting the hip. The carrying angle is larger in females to accommodate their wider pelvis."
1,"Which law relates the pressure, volume, and temperature of a gas?",combines gas law,"The combined gas law relates pressure, volume, and temperature of a gas."
2,Light reactions occur during which stage of photosynthesis?,first stage,"The light reactions occur in the first stage of photosynthesis. This stage takes place in the thylakoid membranes of the chloroplast. In the light reactions, energy from sunlight is absorbed by chlorophyll. This energy is temporarily transferred to two molecules: ATP and NADPH. These molecules are used to store the energy for the second stage of photosynthesis. The light reactions use water and produce oxygen."
3,What is required to break down and build up molecules and to transport molecules across plasma membranes?,energy,"Inside every cell of all living things, energy is needed to carry out life processes. Energy is required to break down and build up molecules and to transport molecules across plasma membranes. All life’s work needs energy. A lot of energy is also simply lost to the environment as heat. The story of life is a story of energy flow—its capture, its change of form, its use for work, and its loss as heat. Energy, unlike matter, cannot be recycled, so organisms require a constant input of energy. Life runs on chemical energy. Where do living organisms get this chemical energy?."
4,"Structural rearrangements of what include partial duplications, deletions, inversions, and translocations; duplications and deletions often produce offspring that survive but exhibit physical and mental abnormalities?",chromosomes,"Chromosome Structural Rearrangements Cytologists have characterized numerous structural rearrangements in chromosomes, including partial duplications, deletions, inversions, and translocations. Duplications and deletions often produce offspring that survive but exhibit physical and mental abnormalities. Cri-du-chat (from the French for “cry of the cat”) is a syndrome associated with nervous system abnormalities and identifiable physical features that results from a deletion of most of the small arm of chromosome 5 (Figure 7.11). Infants with this genotype emit a characteristic high-pitched cry upon which the disorder’s name is based."
5,"Which organ protects the body from injury, water loss, and microorganisms?",skin,"Skin protects the body from injury, water loss, and microorganisms. It also plays a major role in maintaining a stable body temperature."
6,Why do effusive eruptions rarely kill anyone?,they move slowly,"Effusive eruptions rarely kill anyone because they move slowly. People can usually be evacuated before an effusive eruption. Still, effusive eruptions can be destructive. There is not much anyone can do to stop a lava flow from destroying a building or road ( Figure below )."
7,Cellular respiration brings hydrogen and oxygen together to form what?,water,
8,What is the name of pluto's moon?,charon,"Pluto and its moon, Charon, are actually two objects."
9,The epidermis of the leaf consists of a single layer of which cells?,dermal cells,"The epidermis of the leaf consists of a single layer of tightly-packed dermal cells. They secrete waxy cuticle to prevent evaporation of water from the leaf. The epidermis has tiny pores called stomata (singular, stoma) that control transpiration and gas exchange with the air. Figure below explains how stomata carry out this vital function."


The dataset has more than 13K rows across its splits. For simplicity, we take only 500 rows from the training split for our evaluation; feel free to use all the data for yours. We also keep only those documents that fit in a single chunk (chunk size 512) for ease of use. However, this can be adapted for bigger documents too.

In [8]:
# Node parser configured with a chunk size of 512
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)

corpus, filtered_queries = [], []
counter = 0
for row in dataset["train"]:
    support_text = row["support"]
    # Skip rows whose supporting passage is empty or whitespace only
    if not support_text.strip():
        continue
    parsed_nodes = node_parser.get_nodes_from_documents([Document(text=support_text)])
    # Keep the row only when its passage fits in exactly one node
    if len(parsed_nodes) == 1:
        corpus.append(support_text)
        filtered_queries.append(row["question"])
        counter += 1
    # Stop once 500 passages have been collected
    if counter == 500:
        break

In [9]:
# Create the nodes from documents
documents = [Document(text=c) for c in corpus]
nodes = node_parser.get_nodes_from_documents(documents)
# The evaluation dataset pairs query_<i> with corpus_<i>, so ids assigned by
# position are only valid if every document produced exactly one node. Fail
# loudly instead of silently computing metrics against shifted ids.
assert len(nodes) == len(corpus), (
    f"Expected one node per document, got {len(nodes)} nodes for {len(corpus)} documents"
)
# Manually assign node id for retrieval and evaluation
for idx, node in enumerate(nodes):
    node.id_ = f"corpus_{idx}"

In [10]:
# from llama_index.evaluation import generate_question_context_pairs
# # Prompt to generate questions
# qa_generate_prompt_tmpl = """\
# Context information is below.

# ---------------------
# {context_str}
# ---------------------

# Given the context information and not prior knowledge.
# generate only questions based on the below query.

# You are a Professor. Your task is to setup \
# {num_questions_per_chunk} questions for an upcoming \
# quiz/examination. The questions should be diverse in nature \
# across the document. The questions should not contain options, not start with Q1/ Q2. \
# Restrict the questions to the context information provided.\
# """
# llm = OpenAI(model="gpt-4")
# qa_dataset = generate_question_context_pairs(
#     nodes, llm=llm, num_questions_per_chunk=2
# )

In [11]:
# # function to clean the dataset
# def filter_qa_dataset(qa_dataset):
#     """
#     Filters out queries from the qa_dataset that contain certain phrases and the corresponding
#     entries in the relevant_docs, and creates a new EmbeddingQAFinetuneDataset object with
#     the filtered data.

#     :param qa_dataset: An object that has 'queries', 'corpus', and 'relevant_docs' attributes.
#     :return: An EmbeddingQAFinetuneDataset object with the filtered queries, corpus and relevant_docs.
#     """

#     # Extract keys from queries and relevant_docs that need to be removed
#     queries_relevant_docs_keys_to_remove = {
#         k for k, v in qa_dataset.queries.items()
#         if 'Here are 2' in v or 'Here are two' in v
#     }

#     # Filter queries and relevant_docs using dictionary comprehensions
#     filtered_queries = {
#         k: v for k, v in qa_dataset.queries.items()
#         if k not in queries_relevant_docs_keys_to_remove
#     }
#     filtered_relevant_docs = {
#         k: v for k, v in qa_dataset.relevant_docs.items()
#         if k not in queries_relevant_docs_keys_to_remove
#     }

#     # Create a new instance of EmbeddingQAFinetuneDataset with the filtered data
#     return EmbeddingQAFinetuneDataset(
#         queries=filtered_queries,
#         corpus=qa_dataset.corpus,
#         relevant_docs=filtered_relevant_docs
#     )

# # filter out pairs with phrases `Here are 2 questions based on provided context`
# qa_dataset = filter_qa_dataset(qa_dataset)

We will use `EmbeddingQAFinetuneDataset` from LlamaIndex for our evaluation. It needs three inputs: a queries dictionary, a corpus dictionary, and a dictionary mapping each query to its relevant documents. So, let us create them.

In [12]:
from huggingface_hub import notebook_login
# Interactive Hugging Face login widget. Presumably needed to download gated
# models used below — TODO confirm, since it blocks an unattended "Run All".
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
# Build the three id-keyed mappings EmbeddingQAFinetuneDataset takes:
# query id -> question, corpus id -> passage, query id -> relevant corpus ids.
queries_dict, corpus_dict, relevant_docs_dict = {}, {}, {}
for position in range(counter):
    query_id = f"query_{position}"
    corpus_id = f"corpus_{position}"
    queries_dict[query_id] = filtered_queries[position]
    corpus_dict[corpus_id] = corpus[position]
    # Each question has exactly one relevant passage: the one it was paired with.
    relevant_docs_dict[query_id] = [corpus_id]

# Assemble the QA dataset used by the retriever evaluator
qa_dataset = EmbeddingQAFinetuneDataset(
    queries=queries_dict,
    corpus=corpus_dict,
    relevant_docs=relevant_docs_dict,
)

In [15]:
# Embedding models under evaluation, keyed by the label shown in the results table.
EMBEDDINGS = {}
# You can use mean pooling for bge-large by adding the pooling='mean' parameter.
EMBEDDINGS["bge-large"] = HuggingFaceEmbedding(model_name="BAAI/bge-large-en")
EMBEDDINGS["JinaAI-Small"] = HuggingFaceEmbedding(
    model_name="jinaai/jina-embeddings-v2-small-en",
    pooling="mean",
    trust_remote_code=True,
)
EMBEDDINGS["JinaAI-Base"] = HuggingFaceEmbedding(
    model_name="jinaai/jina-embeddings-v2-base-en",
    pooling="mean",
    trust_remote_code=True,
)

# Rerankers under evaluation, keyed by the label shown in the results table.
RERANKERS = {}
# The string "None" is the sentinel the evaluation loop checks for "no reranking".
RERANKERS["WithoutReranker"] = "None"
RERANKERS["bge-reranker-base"] = SentenceTransformerRerank(
    model="BAAI/bge-reranker-base", top_n=5
)
RERANKERS["bge-reranker-large"] = SentenceTransformerRerank(
    model="BAAI/bge-reranker-large", top_n=5
)

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

configuration_bert.py:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py:   0%|          | 0.00/97.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/65.4M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/275M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [18]:
# embed_model = OpenAIEmbedding()
# service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)
# vector_index = VectorStoreIndex(nodes, service_context=service_context)
# vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5, service_context=service_context)

LLM is explicitly disabled. Using MockLLM.


In [16]:
def display_results(embedding_name, reranker_name, eval_results):
    """Summarise one evaluation run as a single-row DataFrame.

    Args:
        embedding_name: Label stored in the "Embedding" column.
        reranker_name: Label stored in the "Reranker" column.
        eval_results: Iterable of results, each exposing ``metric_vals_dict``
            with "hit_rate" and "mrr" entries.

    Returns:
        A one-row DataFrame with the two labels and the mean "hit_rate" and
        "mrr" over all results. Nothing is printed or displayed.
    """
    per_query = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    summary = {
        "Embedding": [embedding_name],
        "Reranker": [reranker_name],
        "hit_rate": [per_query["hit_rate"].mean()],
        "mrr": [per_query["mrr"].mean()],
    }
    return pd.DataFrame(summary)

In [17]:
results_df = pd.DataFrame()

# Loop over embeddings
for embed_name, embed_model in EMBEDDINGS.items():

    print(f"Running Evaluation for Embedding Model: {embed_name}")

    service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)
    vector_index = VectorStoreIndex(nodes, service_context=service_context)

    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5, service_context=service_context)

    # Loop over rerankers
    for rerank_name, reranker in RERANKERS.items():

        print(f"Running Evaluation for Embedding Model: {embed_name} and Reranker: {rerank_name}")

        # Define Retriever
        class CustomRetriever(BaseRetriever):
            """Custom retriever that performs both Vector search and Knowledge Graph search"""

            def __init__(
                self,
                vector_retriever: VectorIndexRetriever,
            ) -> None:
                """Init params."""

                self._vector_retriever = vector_retriever

            def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
                """Retrieve nodes given query."""

                retrieved_nodes = self._vector_retriever.retrieve(query_bundle)

                if reranker != 'None':
                    retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
                else:
                    retrieved_nodes = retrieved_nodes[:5]

                return retrieved_nodes

            async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
                """Asynchronously retrieve nodes given query.

                Implemented by the user.

                """
                return self._retrieve(query_bundle)

            async def aretrieve(self, str_or_query_bundle: QueryType) -> List[NodeWithScore]:
                if isinstance(str_or_query_bundle, str):
                    str_or_query_bundle = QueryBundle(str_or_query_bundle)
                return await self._aretrieve(str_or_query_bundle)

        custom_retriever = CustomRetriever(vector_retriever)

        retriever_evaluator = RetrieverEvaluator.from_metric_names(
            ["mrr", "hit_rate"], retriever=custom_retriever
        )
        eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

        current_df = display_results(embed_name, rerank_name, eval_results)
        results_df = pd.concat([results_df, current_df], ignore_index=True)

Running Evaluation for Embedding Model: bge-large
LLM is explicitly disabled. Using MockLLM.
Running Evaluation for Embedding Model: bge-large and Reranker: WithoutReranker
Running Evaluation for Embedding Model: bge-large and Reranker: bge-reranker-base
Running Evaluation for Embedding Model: bge-large and Reranker: bge-reranker-large
Running Evaluation for Embedding Model: JinaAI-Small
LLM is explicitly disabled. Using MockLLM.
Running Evaluation for Embedding Model: JinaAI-Small and Reranker: WithoutReranker
Running Evaluation for Embedding Model: JinaAI-Small and Reranker: bge-reranker-base
Running Evaluation for Embedding Model: JinaAI-Small and Reranker: bge-reranker-large
Running Evaluation for Embedding Model: JinaAI-Base
LLM is explicitly disabled. Using MockLLM.
Running Evaluation for Embedding Model: JinaAI-Base and Reranker: WithoutReranker
Running Evaluation for Embedding Model: JinaAI-Base and Reranker: bge-reranker-base
Running Evaluation for Embedding Model: JinaAI-Base

In [18]:
# Display final results
# A bare trailing expression gets the notebook's rich table rendering instead of plain text.
results_df

      Embedding            Reranker  hit_rate       mrr
0     bge-large     WithoutReranker     0.970  0.935400
1     bge-large   bge-reranker-base     0.970  0.938900
2     bge-large  bge-reranker-large     0.970  0.935333
3  JinaAI-Small     WithoutReranker     0.968  0.915100
4  JinaAI-Small   bge-reranker-base     0.968  0.938067
5  JinaAI-Small  bge-reranker-large     0.968  0.937833
6   JinaAI-Base     WithoutReranker     0.972  0.926133
7   JinaAI-Base   bge-reranker-base     0.972  0.941667
8   JinaAI-Base  bge-reranker-large     0.972  0.948000
