In [1]:
import nest_asyncio
nest_asyncio.apply()
import chromadb
import pandas as pd

from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import  SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.ollama import Ollama

from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
    RetrieverEvaluator
)

  from .autonotebook import tqdm as notebook_tqdm


# Setup

In [2]:

# Embedding model used to build and query the vector index.
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-mpnet-base-v2",
    trust_remote_code=True,
)
# Alternative embedding model that was tried:
# embed_model = HuggingFaceEmbedding(model_name="jinaai/jina-embeddings-v3", trust_remote_code=True)

# LLM served through Ollama, used for question generation and the query engine.
llm = Ollama(
    model="llama3.2:latest",
    request_timeout=60,
    temperature=0,
)
# Alternative LLM that was tried:
# qwen2 = Ollama(model="qwen2.5:latest", request_timeout=60)

# Register both models on the global llama_index Settings object.
Settings.embed_model, Settings.llm = embed_model, llm

In [3]:
# Prompt template used when generating synthetic evaluation questions.
# It has two placeholders, {context_str} and {num_questions_per_chunk}, and is
# passed as `qa_generate_prompt_tmpl` to generate_question_context_pairs.
QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided. Your response should include \
the questions separated by a newline and nothing else.
"""

In [37]:
def get_nodes_from_index(index):
    """Return the nodes stored in ``index`` as a plain list.

    The index is queried with a placeholder string and a ``similarity_top_k``
    far larger than the index size, so one retrieval call hands back every
    stored entry; the ``.node`` of each result is collected.

    Args:
        index: Object exposing ``as_retriever(similarity_top_k=...)``.

    Returns:
        list: The ``.node`` attribute of every retrieved result, in the
        order the retriever returned them.
    """
    # The query text is irrelevant: only the (huge) top-k matters here.
    exhaustive_retriever = index.as_retriever(similarity_top_k=99999999999)
    return [hit.node for hit in exhaustive_retriever.retrieve("dummy")]


def build_index(documents, embed_model=embed_model or Settings.embed_model, db_path="../chromadb", collection_name="rust_book", rebuild=False):
    """Build, or re-open, a Chroma-backed ``VectorStoreIndex``.

    If the collection already holds entries and ``rebuild`` is false, the
    index is re-opened on top of the stored vectors. Otherwise ``documents``
    are embedded and written into the collection.

    Args:
        documents: Documents to index. Only used when the index is (re)built.
        embed_model: Embedding model handed to the index. Note that the
            default expression is evaluated once, when this ``def`` runs, not
            on every call.
        db_path: Path given to ``chromadb.PersistentClient``.
        collection_name: Name of the Chroma collection to use.
        rebuild: When true, the existing contents of the collection are
            discarded and the index is rebuilt from ``documents``.

    Returns:
        tuple: ``(db, collection, vector_store, index)``.
    """
    db = chromadb.PersistentClient(db_path)
    collection = db.get_or_create_collection(collection_name)

    if rebuild and collection.count() > 0:
        # from_documents() only adds entries to the collection. Without
        # dropping it first, a rebuild would pile new vectors on top of the
        # old ones (duplicates, or a dimension clash when the embedding model
        # changed).
        db.delete_collection(collection_name)
        collection = db.get_or_create_collection(collection_name)

    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # After the optional reset above, a non-empty collection always means
    # "re-use what is stored".
    if collection.count() > 0:
        index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context, embed_model=embed_model)
    else:
        index = VectorStoreIndex.from_documents(documents, storage_context=storage_context, embed_model=embed_model)

    return db, collection, vector_store, index


def display_results(name, eval_results, metrics):
    """Summarise evaluation results as a one-row table of metric means.

    Args:
        name: Label stored in the ``retrievers`` column.
        eval_results: Iterable of objects exposing ``metric_vals_dict``
            (a mapping of metric name to value).
        metrics: Metric names to average; each becomes one column.

    Returns:
        pandas.DataFrame: A single row holding ``name`` and, for every
        requested metric, its mean over all results.
    """
    per_query = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    summary = {"retrievers": [name]}
    for metric in metrics:
        summary[metric] = [per_query[metric].mean()]

    return pd.DataFrame(summary)


# Baseline Retriever

Note: If we want to compare different storage / embedding methods, we need to rebuild the index and qa-dataset

In [20]:
# Load the source documents from disk.
documents = SimpleDirectoryReader(input_dir="../txt").load_data()

# Open (or create) the persistent index with the default settings, then pull
# its nodes out for later use.
db, collection, vector_store, index = build_index(documents)
nodes = get_nodes_from_index(index)

# Baseline: plain vector retrieval returning the two most similar nodes.
retriever = index.as_retriever(similarity_top_k=2)
query_engine = index.as_query_engine()

Number of requested results 99999999999 is greater than number of elements in index 490, updating n_results = 490


### Generate qa dataset

In [None]:
# Number of index nodes to generate questions for, and where the resulting
# dataset is cached on disk.
n_nodes = 100
qa_dataset_path = "../data/qa_dataset.json"

try:
    # Generation calls the LLM for every node, so re-use the cached dataset
    # whenever it is available.
    qa_dataset = EmbeddingQAFinetuneDataset.from_json(qa_dataset_path)
except FileNotFoundError:
    # No cached dataset yet: generate it once and persist it for later runs.
    qa_dataset = generate_question_context_pairs(
        nodes=nodes[:n_nodes],
        num_questions_per_chunk=2,
        qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL,
    )
    qa_dataset.save_json(qa_dataset_path)

100%|██████████| 100/100 [01:23<00:00,  1.19it/s]


In [39]:

async def evaluate_retriever(retriever, qa_dataset=qa_dataset, metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"], name="baseline top-2 eval"):
    """Evaluate ``retriever`` on a QA dataset and return the mean metrics.

    Args:
        retriever: Retriever to evaluate.
        qa_dataset: Dataset passed to ``aevaluate_dataset``. The default is
            bound once, when this ``def`` runs, to the ``qa_dataset`` that
            exists at that moment.
        metrics: Metric names to compute and report. The default list is only
            read, never mutated.
        name: Label written to the ``retrievers`` column of the result, so
            tables from different retrievers can be told apart. Defaults to
            the previous hard-coded label.

    Returns:
        The one-row summary table produced by ``display_results``.
    """
    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        metrics, retriever=retriever
    )
    eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)
    return display_results(name, eval_results, metrics=metrics)


In [35]:
# Score the baseline top-2 vector retriever on evaluate_retriever's default dataset.
await evaluate_retriever(retriever)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,baseline top-2 eval,0.16,0.1375,0.08,0.16,0.1375,0.08792


# Query Fusion Retriever

In [42]:
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.retrievers.bm25 import BM25Retriever


bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=2)
query_fusion_retriever = QueryFusionRetriever(
    [index.as_retriever(), bm25_retriever],
    similarity_top_k=2,
    num_queries=2,
    mode="reciprocal_rerank",
    verbose=False,
)


await evaluate_retriever(query_fusion_retriever)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,baseline top-2 eval,0.175,0.1525,0.0875,0.175,0.1525,0.097118


Slight improvement, but nothing noteworthy

# Testing a different embedding model

### All-MiniLM-L12-v2

In [None]:
# Switch the embedding model to all-MiniLM-L12-v2.
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L12-v2",
    trust_remote_code=True,
)

# Build a separate collection for this model; rebuild=True asks build_index
# to index the documents again rather than re-open stored vectors.
db, collection, vector_store, index = build_index(
    documents,
    embed_model=embed_model,
    collection_name="rust-rag-all-miniLM-L12-v2",
    rebuild=True,
)
nodes = get_nodes_from_index(index)

# Same retriever configuration as the baseline (top 2), now on the new index.
retriever = index.as_retriever(similarity_top_k=2)
query_engine = index.as_query_engine()


Number of requested results 99999999999 is greater than number of elements in index 490, updating n_results = 490


Evaluation on the same dataset

In [None]:
# Score the MiniLM-backed retriever on evaluate_retriever's default dataset,
# i.e. the one that was in scope as `qa_dataset` when the helper was defined.
await evaluate_retriever(retriever)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,baseline top-2 eval,0.0,0.0,0.0,0.0,0.0,0.0


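The scores are all zero because the QA dataset refers to the nodes of the previously built index: after rebuilding, the retrieved nodes no longer match the expected ones. Evaluating a rebuilt index on the old dataset therefore doesn't make sense; each index needs its own QA dataset.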

In [48]:
# One-off generation of a QA dataset from the nodes of the MiniLM index, left
# commented out so that this cell only reloads the saved JSON.
# NOTE: `n_nodes` must be defined before un-commenting the next two lines.
# NOTE(review): the MiniLM index is built with rebuild=True; presumably the
# saved dataset goes stale whenever that index is rebuilt - confirm, and
# regenerate the dataset if the scores below drop to zero.
# qa_dataset_all_mini = generate_question_context_pairs(nodes=nodes[:n_nodes], num_questions_per_chunk=2, qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL)
# qa_dataset_all_mini.save_json("../data/qa_dataset_all_mini.json")

qa_dataset_all_mini = EmbeddingQAFinetuneDataset.from_json("../data/qa_dataset_all_mini.json")

In [49]:
# Score the MiniLM-backed retriever on the dataset loaded for the MiniLM index.
await evaluate_retriever(retriever, qa_dataset=qa_dataset_all_mini)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,baseline top-2 eval,0.17,0.145,0.085,0.17,0.145,0.09292


### Stella

In [None]:
# Number of index nodes to generate questions for. Defined here because the
# only other assignment in the notebook is commented out, which would make
# this cell fail with a NameError on a fresh kernel.
n_nodes = 100

# The model repository ships its own configuration.py / modeling.py, hence
# trust_remote_code=True. Consider pinning a revision so the downloaded code
# cannot change between runs.
embed_model = HuggingFaceEmbedding(model_name="GameScribes/stella_en_400M_v5", trust_remote_code=True)

# Use a collection dedicated to this model. Re-using the MiniLM collection
# name would write Stella vectors into the MiniLM collection.
db, collection, vector_store, index = build_index(
    documents,
    embed_model=embed_model,
    collection_name="rust-rag-stella-en-400M-v5",
    rebuild=True,
)
nodes = get_nodes_from_index(index)

retriever = index.as_retriever(similarity_top_k=2)
query_engine = index.as_query_engine()

# The index was just rebuilt, so generate a fresh QA dataset from its nodes
# and persist it.
qa_dataset_stella = generate_question_context_pairs(
    nodes=nodes[:n_nodes],
    num_questions_per_chunk=2,
    qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL,
)
qa_dataset_stella.save_json("../data/qa_dataset_stella.json")

# Reload from disk so that evaluation uses exactly what was saved.
qa_dataset_stella = EmbeddingQAFinetuneDataset.from_json("../data/qa_dataset_stella.json")

A new version of the following files was downloaded from https://huggingface.co/GameScribes/stella_en_400M_v5:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/GameScribes/stella_en_400M_v5:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [None]:
# Score the Stella-backed retriever on the dataset generated from the Stella index.
await evaluate_retriever(retriever, qa_dataset=qa_dataset_stella)
