In [14]:
import mlflow.experiments
import nest_asyncio
nest_asyncio.apply()
import chromadb
import pandas as pd

import llama_index.core
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import  SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.ollama import Ollama
from llama_index.core.schema import TextNode
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.retrievers.bm25 import BM25Retriever

from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
    RetrieverEvaluator
)

import mlflow

mlflow.llama_index.autolog()

mlflow.set_experiment("rust-book-rag")
# experiment_id = .get_experiment_by_name("rust-book-rag").experiment_id

<Experiment: artifact_location='file:///home/carlos/Documents/repos/rust-programming/rust-rag/notebooks/mlruns/923516027696088727', creation_time=1731166672733, experiment_id='923516027696088727', last_update_time=1731166672733, lifecycle_stage='active', name='rust-book-rag', tags={}>

In [3]:
import phoenix as px

# Look for a URL in the output to open the App in a browser.
px.launch_app()

llama_index.core.set_global_handler("arize_phoenix")

Existing running Phoenix instance detected! Shutting it down and starting a new instance...
Attempting to instrument while already instrumented


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


# Setup

In [4]:
mpnet_embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2", trust_remote_code=True)
# embed_model = HuggingFaceEmbedding(model_name="jinaai/jina-embeddings-v3", trust_remote_code=True)

llm = Ollama(model="llama3.2:latest", request_timeout=60, temperature=0)
# qwen2 = Ollama(model="qwen2.5:latest", request_timeout=60)

Settings.embed_model = mpnet_embed_model
Settings.llm = llm

In [5]:
QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided. Your response should include \
the questions separated by a newline and nothing else.
"""

# Supporting functions

In [6]:
def get_nodes_from_index(index):
    """Gets the nodes from the index"""
    retriever = index.as_retriever(similarity_top_k=99999999999)
    all_nodes = retriever.retrieve("dummy")
    all_nodes = [item.node for item in all_nodes]
    return all_nodes


def build_index(documents, embed_model=Settings.embed_model, db_path="../chromadb", collection_name="rust_book", rebuild=False, distance_fn="l2"):
    """Builds the index"""

    db = chromadb.PersistentClient(db_path)

    if rebuild:
        db.delete_collection(name=collection_name)

    collection = db.get_or_create_collection(collection_name, metadata={"hnsw:space": distance_fn})
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    if collection.count() > 0 and not rebuild:
        index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context, embed_model=embed_model)
    else:
        index = VectorStoreIndex.from_documents(documents, storage_context=storage_context, embed_model=embed_model)

    return db, collection, vector_store, index


def display_results(name, eval_results, metrics, return_agg=True):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    columns = {
        "retrievers": [name],
        **{k: [full_df[k].mean()] for k in metrics},
    }

    metric_df = pd.DataFrame(columns)

    if return_agg:
        return metric_df
    else:
        return full_df, metric_df

async def evaluate_retriever(retriever, qa_dataset, metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]):
    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        metrics, retriever=retriever
    )
    eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)
    return  display_results("baseline top-2 eval", eval_results, metrics=metrics)


async def log_retriever_eval(retriever, retriever_name, **kwargs):
    with mlflow.start_run():
        results = await evaluate_retriever(retriever, **kwargs)
        mlflow.log_param("retriever", retriever_name)
        mlflow.log_metrics(*results.drop(columns=["retrievers"]).to_dict(orient="records"))
    return results

async def evaluate_embed_model(
    dataset,
    embed_model,
    retriever_name=None,
    top_k=2,
):
    """Evaluates the embedding model on a given dataset."""
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(
        nodes, embed_model=embed_model, show_progress=False
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    retriever_name = retriever_name or embed_model.model_name
    results = await log_retriever_eval(retriever,  qa_dataset=dataset, retriever_name=retriever_name)
    return results

# Baseline Retriever

Note: If we want to compare different storage / embedding methods, we need to rebuild the index and qa-dataset

In [17]:
documents = SimpleDirectoryReader('../txt').load_data()

db, collection, vector_store, index = build_index(documents)
nodes = get_nodes_from_index(index)

retriever = index.as_retriever(similarity_top_k=2)
query_engine = index.as_query_engine()

Number of requested results 99999999999 is greater than number of elements in index 384, updating n_results = 384


### Generate qa dataset

In [8]:
# n_nodes = 150
# qa_dataset = generate_question_context_pairs(nodes=nodes[:n_nodes], num_questions_per_chunk=2, qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL)
# qa_dataset.save_json("../data/qa_dataset.json"),

qa_dataset = EmbeddingQAFinetuneDataset.from_json("../data/qa_dataset.json")

In [60]:
await evaluate_embed_model(qa_dataset, embed_model=mpnet_embed_model, retriever_name="mpnet top-2 eval", top_k=2)

Exception ignored in: <function tqdm.__del__ at 0x7dee28b58c20>
Traceback (most recent call last):
  File "/home/carlos/.cache/pypoetry/virtualenvs/rust-rag-l9LsLaFj-py3.11/lib/python3.11/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/home/carlos/.cache/pypoetry/virtualenvs/rust-rag-l9LsLaFj-py3.11/lib/python3.11/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,baseline top-2 eval,0.646667,0.576667,0.323333,0.646667,0.576667,0.364821


# Query Fusion Retriever

In [None]:
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=2)
query_fusion_retriever = QueryFusionRetriever(
    [index.as_retriever(), bm25_retriever],
    similarity_top_k=2,
    num_queries=1,
    mode="reciprocal_rerank",
    verbose=False,
)

In [None]:
await log_retriever_eval(query_fusion_retriever, retriever_name="baseline query fusion", qa_dataset=qa_dataset)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,baseline top-2 eval,0.716667,0.603333,0.358333,0.716667,0.603333,0.388129


Results are better! Note that the precision will be at most 0.5 because we're always retrieving 2 documents while the qa-dataset has only 1 expected results per question.

# Testing a different embedding model

### All-MiniLM-L12-v2

In [29]:
mini_lm_embed = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L12-v2", trust_remote_code=True)

db, collection, vector_store, index = build_index(documents, embed_model=mini_lm_embed, collection_name="rust-rag-all-miniLM-L12-v2", rebuild=False)
nodes = get_nodes_from_index(index)

retriever = index.as_retriever(similarity_top_k=2)
query_engine = index.as_query_engine()

Number of requested results 99999999999 is greater than number of elements in index 384, updating n_results = 384


Evaluation on the same dataset

In [None]:
mini_lm_results = await evaluate_embed_model(qa_dataset, embed_model=mini_lm_embed, retriever_name="miniLM top-2 eval", top_k=2)
mini_lm_results

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,baseline top-2 eval,0.53,0.473333,0.265,0.53,0.473333,0.299321


Changing the embedding model makes a big difference! Let's try a couple more:

In [None]:
stella_small_embed = HuggingFaceEmbedding(model_name="dunzhang/stella_en_400M_v5", trust_remote_code=True)

stella_results = await evaluate_embed_model(qa_dataset, embed_model=stella_small_embed, retriever_name="stella top-2 eval", top_k=2)
stella_results

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,baseline top-2 eval,0.75,0.686667,0.375,0.75,0.686667,0.431196


Out of curiosity, let's see how stella performs with a fusion query retriever:

In [None]:
corpus = qa_dataset.corpus

qa_nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
index = VectorStoreIndex(
    qa_nodes, embed_model=stella_small_embed, show_progress=False
)

bm25_retriever = BM25Retriever.from_defaults(nodes=qa_nodes, similarity_top_k=2)
stella_query_fusion_retriever = QueryFusionRetriever(
    [index.as_retriever(), bm25_retriever],
    similarity_top_k=2,
    num_queries=1,
    mode="reciprocal_rerank",
    verbose=False,
)

results = await log_retriever_eval(stella_query_fusion_retriever,  qa_dataset=qa_dataset, retriever_name="stella query-fusion top-2")
results

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,baseline top-2 eval,0.85,0.74,0.425,0.85,0.74,0.47139


And let's test one more embedding model

In [10]:
bge_large_embed = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5", trust_remote_code=False)

bge_large_results = await evaluate_embed_model(qa_dataset, embed_model=bge_large_embed, retriever_name="bge-large top-2 eval", top_k=2)
bge_large_results

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,baseline top-2 eval,0.756667,0.691667,0.378333,0.756667,0.691667,0.43453


# Embedding fine tuning

Let's fine tune these models and see which one gives better performance. Let's try first with MiniLM as it is a small model and should be 'quick enough'

In [None]:
from sklearn.model_selection import train_test_split

train_nodes, test_nodes = train_test_split(nodes, test_size=0.2)

307 77


In [None]:
from llama_index.finetuning import generate_qa_embedding_pairs

test_dataset = generate_qa_embedding_pairs(
    llm=Settings.llm,
    nodes=test_nodes,
    output_path="ft_test_dataset.json",
    qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk=2,
    verbose=False
)

  0%|          | 0/77 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


  1%|▏         | 1/77 [00:02<02:44,  2.17s/it]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


  3%|▎         | 2/77 [00:03<02:13,  1.78s/it]

In [None]:
train_dataset = generate_qa_embedding_pairs(
    llm=OpenAI(model="gpt-3.5-turbo"),
    nodes=train_nodes,
    output_path="train_dataset.json",
)


384

In [None]:
# qa_dataset_all_mini_eval = generate_question_context_pairs(nodes=nodes[n_nodes: n_nodes+50], num_questions_per_chunk=2, qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL)
# qa_dataset_all_mini_eval.save_json("../data/qa_dataset_all_mini_eval.json")

# qa_dataset_all_mini_eval = EmbeddingQAFinetuneDataset.from_json("../data/qa_dataset_all_mini_eval.json")

100%|██████████| 50/50 [00:45<00:00,  1.09it/s]


In [25]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

ft_engine = SentenceTransformersFinetuneEngine(
    model_id="all-mini-lm-l12-v2",
    model_output_path="test_mini_lm_ft",
    val_dataset=qa_dataset_all_mini_eval,
    num_epochs=1
)

ModuleNotFoundError: No module named 'llama_index.finetuning'