In [27]:
import mlflow.experiments
import nest_asyncio
nest_asyncio.apply()
import chromadb
import pandas as pd

from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import  SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.ollama import Ollama

from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
    RetrieverEvaluator
)

import mlflow

# Turn on MLflow's LlamaIndex autologging before any index or retriever is created.
mlflow.llama_index.autolog()

# Select the "rust-book-rag" experiment; the returned Experiment object is the cell's output.
mlflow.set_experiment("rust-book-rag")
# experiment_id = mlflow.get_experiment_by_name("rust-book-rag").experiment_id

<Experiment: artifact_location='file:///home/carlos/Documents/repos/rust-programming/rust-rag/notebooks/mlruns/923516027696088727', creation_time=1731166672733, experiment_id='923516027696088727', last_update_time=1731166672733, lifecycle_stage='active', name='rust-book-rag', tags={}>

# Setup

In [2]:
# Embedding model for the baseline index (the Jina alternative is kept commented out for reference).
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2", trust_remote_code=True)
# embed_model = HuggingFaceEmbedding(model_name="jinaai/jina-embeddings-v3", trust_remote_code=True)

# LLM served through Ollama; temperature=0 is presumably chosen to keep generated questions repeatable.
llm = Ollama(model="llama3.2:latest", request_timeout=60, temperature=0)
# qwen2 = Ollama(model="qwen2.5:latest", request_timeout=60)

# Register both models on the global LlamaIndex Settings object.
Settings.embed_model = embed_model
Settings.llm = llm

In [3]:
# Prompt template passed as `qa_generate_prompt_tmpl` to generate_question_context_pairs.
# It has two placeholders: {context_str} and {num_questions_per_chunk}.
QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided. Your response should include \
the questions separated by a newline and nothing else.
"""

In [4]:
def get_nodes_from_index(index):
    """Return the nodes an index's retriever hands back for a placeholder query.

    A retriever is requested with an oversized ``similarity_top_k`` and queried
    with the literal string "dummy"; the ``node`` attribute of every returned
    item is collected, in order, into a list.
    """
    fetch_all = index.as_retriever(similarity_top_k=99999999999)
    return [hit.node for hit in fetch_all.retrieve("dummy")]


def build_index(documents, embed_model=embed_model or Settings.embed_model, db_path="../chromadb", collection_name="rust_book", rebuild=False):
    """Open (or create) a persistent Chroma collection and wrap it in a VectorStoreIndex.

    Args:
        documents: Documents to index; only used when the index is built from scratch.
        embed_model: Embedding model handed to the index. The default expression is
            evaluated once, when this ``def`` statement runs, so later reassignments
            of the global ``embed_model`` do not change it.
        db_path: Directory used by the persistent Chroma client.
        collection_name: Name of the Chroma collection backing the index.
        rebuild: When True, discard whatever the collection holds and re-index
            ``documents``. When False, a non-empty collection is reused as is.

    Returns:
        Tuple ``(db, collection, vector_store, index)``.
    """
    db = chromadb.PersistentClient(db_path)
    collection = db.get_or_create_collection(collection_name)

    if rebuild and collection.count() > 0:
        # Start from an empty collection. Without this, from_documents() below
        # inserts on top of the existing entries and every rebuild duplicates them.
        db.delete_collection(collection_name)
        collection = db.get_or_create_collection(collection_name)

    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    if collection.count() > 0 and not rebuild:
        # Reuse the embeddings that are already stored.
        index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context, embed_model=embed_model)
    else:
        # Empty (or freshly cleared) collection: embed and insert the documents.
        index = VectorStoreIndex.from_documents(documents, storage_context=storage_context, embed_model=embed_model)

    return db, collection, vector_store, index


def display_results(name, eval_results, metrics):
    """Summarise per-query evaluation results as a one-row DataFrame.

    Each item of ``eval_results`` contributes its ``metric_vals_dict``; the
    result has a "retrievers" column holding ``name`` followed by one column
    per entry of ``metrics`` holding that metric's mean over all items.
    """
    per_query = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    summary = {"retrievers": [name]}
    for metric in metrics:
        summary[metric] = [per_query[metric].mean()]

    return pd.DataFrame(summary)


# Baseline Retriever

Note: To compare different storage or embedding methods, we need to rebuild both the index and the QA dataset.

In [7]:
# Load the source documents from ../txt.
documents = SimpleDirectoryReader('../txt').load_data()

# Baseline index with build_index's defaults (collection "rust_book", rebuild=False),
# then pull the nodes back out of it for dataset generation and BM25.
db, collection, vector_store, index = build_index(documents)
nodes = get_nodes_from_index(index)

# Top-2 vector retriever that the baseline evaluation below scores.
retriever = index.as_retriever(similarity_top_k=2)
query_engine = index.as_query_engine()

Number of requested results 99999999999 is greater than number of elements in index 1470, updating n_results = 1470


### Generate qa dataset

In [None]:
# Generating the questions is slow (the recorded run took over two minutes for
# 150 nodes), so the dataset is cached on disk and only regenerated when the
# file is missing.
QA_DATASET_PATH = "../data/qa_dataset.json"
n_nodes = 150

try:
    qa_dataset = EmbeddingQAFinetuneDataset.from_json(QA_DATASET_PATH)
except FileNotFoundError:
    qa_dataset = generate_question_context_pairs(nodes=nodes[:n_nodes], num_questions_per_chunk=2, qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL)
    qa_dataset.save_json(QA_DATASET_PATH)

100%|██████████| 150/150 [02:13<00:00,  1.12it/s]


In [48]:
async def evaluate_retriever(retriever, qa_dataset=qa_dataset, metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"], name="baseline top-2 eval"):
    """Score a retriever on a QA dataset and return the averaged metrics.

    Args:
        retriever: Retriever to evaluate.
        qa_dataset: Dataset handed to ``aevaluate_dataset``. The default is
            whatever the global ``qa_dataset`` was bound to when this ``def`` ran.
        metrics: Metric names for ``RetrieverEvaluator.from_metric_names``; the
            same names select the columns that get averaged. The list is only
            read, never modified.
        name: Label written to the "retrievers" column of the result. The
            default keeps the label this function previously hard-coded.

    Returns:
        One-row DataFrame as produced by ``display_results``.
    """
    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        metrics, retriever=retriever
    )
    eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)
    return display_results(name, eval_results, metrics=metrics)

async def log_retriever_eval(retriever, retriever_name, **kwargs):
    """Evaluate ``retriever`` inside a new MLflow run and record the outcome.

    ``kwargs`` are forwarded unchanged to ``evaluate_retriever``. The run gets a
    "retriever" parameter set to ``retriever_name`` and the evaluation's metric
    columns (everything except "retrievers") logged as metrics.
    """
    with mlflow.start_run():
        summary = await evaluate_retriever(retriever, **kwargs)
        mlflow.log_param("retriever", retriever_name)
        # One dict per summary row; unpacked so it is passed positionally to log_metrics.
        metric_records = summary.drop(columns=["retrievers"]).to_dict(orient="records")
        mlflow.log_metrics(*metric_records)

In [44]:
# Baseline: evaluate the top-2 vector retriever on the default qa_dataset and log it to MLflow.
await log_retriever_eval(retriever, "baseline top-2 eval")

# Query Fusion Retriever

In [46]:
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.retrievers.bm25 import BM25Retriever


bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=2)
query_fusion_retriever = QueryFusionRetriever(
    [index.as_retriever(), bm25_retriever],
    similarity_top_k=2,
    num_queries=2,
    mode="reciprocal_rerank",
    verbose=False,
)

with mlflow.start_run():
    results = await evaluate_retriever(retriever=query_fusion_retriever)
    mlflow.log_param("retriever", "baseline query fusion")
    mlflow.log_metrics(*results.drop(columns=["retrievers"]).to_dict(orient="records"))

# await log_retriever_eval(query_fusion_retriever, retriever_name="baseline query fusion")

Slight improvement, but nothing noteworthy

# Testing a different embedding model

### All-MiniLM-L12-v2

In [47]:
# NOTE: this cell rebinds embed_model, db, collection, vector_store, index, nodes,
# retriever and query_engine, so every cell below works against the MiniLM index.
# Settings.embed_model is not changed here; the model is passed to build_index explicitly.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L12-v2", trust_remote_code=True)

# rebuild=True makes build_index take its from_documents branch, into a separate collection.
db, collection, vector_store, index = build_index(documents, embed_model=embed_model, collection_name="rust-rag-all-miniLM-L12-v2", rebuild=True)
nodes = get_nodes_from_index(index)

retriever = index.as_retriever(similarity_top_k=2)
query_engine = index.as_query_engine()


Number of requested results 99999999999 is greater than number of elements in index 980, updating n_results = 980


Evaluation on the same dataset

In [49]:
# Score the MiniLM retriever on evaluate_retriever's default qa_dataset
# (the one loaded from ../data/qa_dataset.json for the baseline).
await log_retriever_eval(retriever, retriever_name="all-mini-lm-l12-v2 retriever")
# await evaluate_retriever(retriever)

Because the two indexes were built separately, their nodes differ, so the QA dataset generated from the first index's nodes is not a meaningful benchmark for the second index.

In [50]:
n_nodes = 150
# Generate a QA dataset from the first n_nodes nodes of the MiniLM index and save it.
qa_dataset_all_mini = generate_question_context_pairs(nodes=nodes[:n_nodes], num_questions_per_chunk=2, qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL)
qa_dataset_all_mini.save_json("../data/qa_dataset_all_mini.json")

# Reload the file that was just written.
qa_dataset_all_mini = EmbeddingQAFinetuneDataset.from_json("../data/qa_dataset_all_mini.json")

100%|██████████| 150/150 [02:22<00:00,  1.05it/s]


In [51]:
# Second dataset from the next 50 nodes (n_nodes .. n_nodes+50), i.e. a slice that
# does not overlap the nodes[:n_nodes] slice used for qa_dataset_all_mini.
qa_dataset_all_mini_eval = generate_question_context_pairs(nodes=nodes[n_nodes: n_nodes+50], num_questions_per_chunk=2, qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL)
qa_dataset_all_mini_eval.save_json("../data/qa_dataset_all_mini_eval.json")

# Reload the file that was just written.
qa_dataset_all_mini_eval = EmbeddingQAFinetuneDataset.from_json("../data/qa_dataset_all_mini_eval.json")

100%|██████████| 50/50 [00:45<00:00,  1.09it/s]


In [53]:
# Same MiniLM retriever, now scored on the dataset generated from the MiniLM index's own nodes.
await log_retriever_eval(retriever, retriever_name="all-mini-lm-l12-v2 retriever reindexed", qa_dataset=qa_dataset_all_mini)


In [25]:
# Needs the separate `llama-index-finetuning` package; the ModuleNotFoundError
# recorded for this cell means it was missing from the kernel's environment.
from llama_index.finetuning import SentenceTransformersFinetuneEngine

ft_engine = SentenceTransformersFinetuneEngine(
    qa_dataset_all_mini,  # training dataset: the engine's required first argument
    model_id="sentence-transformers/all-MiniLM-L12-v2",  # same model id used for the MiniLM index
    model_output_path="test_mini_lm_ft",
    val_dataset=qa_dataset_all_mini_eval,
    epochs=1,  # the constructor's keyword is `epochs`, not `num_epochs`
)

ModuleNotFoundError: No module named 'llama_index.finetuning'

### Stella

In [9]:
# embed_model = HuggingFaceEmbedding(model_name="dunzhang/stella_en_400M_v5", trust_remote_code=True, )

# db, collection, vector_store, index = build_index(documents, embed_model=embed_model, collection_name="rust-rag-stella", rebuild=True)
# nodes = get_nodes_from_index(index)

# retriever = index.as_retriever(similarity_top_k=2)
# query_engine = index.as_query_engine()

# qa_dataset_stella = generate_question_context_pairs(nodes=nodes[:n_nodes], num_questions_per_chunk=2, qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL)
# qa_dataset_stella.save_json("../data/qa_dataset_stella.json")

# qa_dataset_stella = EmbeddingQAFinetuneDataset.from_json("../data/qa_dataset_stella.json")

In [None]:
await evaluate_retriever(retriever, qa_dataset=qa_dataset_stella)