In [1]:
from bubls.utils.rag_design.rag_building_blocks import RAGBuildingBlocks
import logging
import sys
from metadata import lyft_10k, wiki_public_companies, papers_ridesharing

<jemalloc>: Unsupported system page size


In [2]:
# Disabled: override of llama_index's global Settings (LLM and embedding model).
# NOTE(review): OpenAI and OpenAIEmbedding are not imported in the visible
# cells; they would need importing before these lines are re-enabled.
# from llama_index.core import Settings
# Settings.llm = OpenAI(temperature=0.2, model="gpt-3.5-turbo")
# Settings.embed_model = OpenAIEmbedding(name="text-embedding-ada-002")

In [2]:
# Send log records to stdout and keep only ERROR and above; every record is
# rendered as "<timestamp> - <level name> - <message>".
logging.basicConfig(
    level=logging.ERROR,
    format="%(asctime)s - %(levelname)s - %(message)s",
    stream=sys.stdout,
)
# Extra stdout handler on the root logger, left disabled:
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [3]:
# One entry per data source: the key is the component name used for lookups on
# the RAG object in later cells (e.g. "lyft_10k"), the value is the metadata
# object imported for that source.
components_cfg = dict(
    lyft_10k=lyft_10k,
    wiki_public_companies=wiki_public_companies,
    papers_ridesharing=papers_ridesharing,
)

In [4]:
# Build the RAG helper from the component configuration.
rag = RAGBuildingBlocks(components_cfg)
# Run it. Judging by the printed output, this loads or generates the
# per-component artifacts (LLM eval data, nodes, QA pairs, index, query
# engine, eval data) for the train/val/test splits of every component.
rag.execute()

Loading data artifacts for lyft_10k
Loading LLM eval data for lyft_10k, train
Loading LLM eval data for lyft_10k, val
Loading LLM eval data for lyft_10k, test
Generating Nodes for lyft_10k, train
Generating Nodes for lyft_10k, val
Generating Nodes for lyft_10k, test
Loading QA pairs for lyft_10k, train
Loading QA pairs for lyft_10k, val
Loading QA pairs for lyft_10k, test
Loading engines for lyft_10k
Loading Index for lyft_10k, baseline
Generating Query Engine for lyft_10k
Generating Query Engine Tool for lyft_10k
Loading eval data for lyft_10k, train
Loading eval data for lyft_10k, val
Loading eval data for lyft_10k, test
Loading data artifacts for wiki_public_companies
Loading LLM eval data for wiki_public_companies, train
Loading LLM eval data for wiki_public_companies, val
Loading LLM eval data for wiki_public_companies, test
Generating Nodes for wiki_public_companies, train
Generating Nodes for wiki_public_companies, val
Generating Nodes for wiki_public_companies, test
Loading QA 

## Fine-tune embeddings

In [5]:
from sentence_transformers import (
    SentenceTransformer,
    losses,
    InputExample
)
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from torch.utils.data import DataLoader

In [12]:
# Training and validation QA pairs of the lyft_10k component.
qa_pairs_training, qa_pairs_val = (
    rag.qa_pairs["lyft_10k"][split] for split in ("train", "val")
)
# Corpus derived from the lyft_10k index; handed to rag.gen_eval_data in the
# evaluation cells further down.
corpus = rag.get_corpus_from_index(rag.index["lyft_10k"])

In [7]:
# Build one (query, reference) training pair per query of the training split.
# Only the FIRST entry of each query's relevant_docs is used as the positive
# reference (assumes every query has at least one relevant doc).
# A comprehension is used so that no loop temporaries are left behind in the
# notebook namespace.
# NOTE(review): MultipleNegativesRankingLoss (set up in a later cell) treats
# the other references in a batch as negatives; if several queries point at
# the same node, a shuffled batch may contain that positive twice -- TODO
# confirm whether this matters for this dataset.
training_examples = [
    InputExample(
        texts=[
            query,
            qa_pairs_training.corpus[qa_pairs_training.relevant_docs[query_id][0]],
        ]
    )
    for query_id, query in qa_pairs_training.queries.items()
]

# Shuffled mini-batches of 8 pairs for fine-tuning.
loader = DataLoader(training_examples, batch_size=8, shuffle=True)

In [8]:
# Retrieval evaluator built from the validation split: its queries, the corpus
# they are matched against, and the relevant document ids for each query.
evaluator = InformationRetrievalEvaluator(
    queries=qa_pairs_val.queries,
    corpus=qa_pairs_val.corpus,
    relevant_docs=qa_pairs_val.relevant_docs,
)

In [10]:
# Base checkpoint that gets fine-tuned.
model_id = "all-MiniLM-L6-v2"
embed_model = SentenceTransformer(model_name_or_path=model_id)
# Ranking loss bound to the model being trained.
loss = losses.MultipleNegativesRankingLoss(model=embed_model)

In [13]:
# Fine-tune for one epoch on the (loader, loss) objective. The validation
# evaluator is run every int(0.3 * len(loader)) steps and the output path is
# "fine-tune-embeddings1".
embed_model.fit(
    train_objectives=[(loader, loss)],
    evaluator=evaluator,
    epochs=1,
    evaluation_steps=int(len(loader) * 0.3),
    # Warm-up fixed at a single step. Alternative left by the author:
    # int(len(loader) * epochs * 0.1). The library default is 10000.
    warmup_steps=1,
    output_path="fine-tune-embeddings1",
    show_progress_bar=True,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/16 [00:00<?, ?it/s]

In [13]:
from bubls.utils.evaluation.from_dataframe import is_hit, reciprocal_ranking

In [18]:
# Index every lyft_10k node (train + val + test) with the embedding model
# referenced as "local:fine-tune-embeddings1", then score retrieval on the
# validation QA pairs.
# NOTE(review): the val split is also what the evaluator passed to
# embed_model.fit() scores against, so for the tuned model these numbers are
# presumably not from fully held-out data -- consider the test split.
index_tuned = rag.gen_index(
    "lyft_10k",
    "mini_tuned",
    (
        rag.nodes["lyft_10k"]["train"]
        + rag.nodes["lyft_10k"]["val"]
        + rag.nodes["lyft_10k"]["test"]
    ),
    {"embed_model": "local:fine-tune-embeddings1"},
)
tuned_qe = index_tuned.as_query_engine()
tuned_eval_data = rag.gen_eval_data(tuned_qe, rag.qa_pairs["lyft_10k"]["val"], corpus)

# Row-wise metrics, added as columns in this order (is_hit first).
metrics = {"is_hit": is_hit, "reciprocal_ranking": reciprocal_ranking}
for column, scorer in metrics.items():
    tuned_eval_data[column] = tuned_eval_data.apply(scorer, axis=1)

# Mean of each metric column over all evaluated queries.
print(tuned_eval_data[list(metrics)].mean())

  0%|          | 0/126 [00:00<?, ?it/s]

is_hit                0.753968
reciprocal_ranking    0.718254
dtype: float64


In [21]:
# Baseline: index the same lyft_10k nodes (train + val + test) with the
# un-tuned base checkpoint and score retrieval on the validation QA pairs.
# The checkpoint name is taken from model_id (the model that was fine-tuned
# above) instead of being repeated as a literal, so the comparison always uses
# the same base model.
index_original = rag.gen_index(
    "lyft_10k",
    "mini_original",
    (
        rag.nodes["lyft_10k"]["train"]
        + rag.nodes["lyft_10k"]["val"]
        + rag.nodes["lyft_10k"]["test"]
    ),
    {"embed_model": f"local:{model_id}"},
)
original_qe = index_original.as_query_engine()
original_eval_data = rag.gen_eval_data(original_qe, rag.qa_pairs["lyft_10k"]["val"], corpus)

# Row-wise metrics, added as columns in this order (is_hit first).
original_eval_data["is_hit"] = original_eval_data.apply(is_hit, axis=1)
original_eval_data["reciprocal_ranking"] = original_eval_data.apply(reciprocal_ranking, axis=1)

# Mean of each metric column over all evaluated queries.
print(original_eval_data[["is_hit", "reciprocal_ranking"]].mean())

is_hit                0.730159
reciprocal_ranking    0.666667
dtype: float64
