This notebook was inpired by [this LlamaIndex example](https://github.com/run-llama/finetune-embedding/blob/main/finetune.ipynb)

Making some changes to it with the only intention of trying ideas and learning.

In [6]:
%pip install llama-index-finetuning
%pip install llama-index-embeddings-huggingface

In [7]:
from bubls.utils.data.download import download_file_from_url
from bubls.utils.data.load import load_corpus
from bubls.utils.evaluation.evaluate_embeddings import get_query_hit_pairs
from sentence_transformers import (
    SentenceTransformer,
    losses
)
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import InputExample
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.llms.openai import OpenAI
from torch.utils.data import DataLoader
import pandas as pd
import os

## Defining Global Variables

In [4]:
METADATA = {
    "train": {
        "lyft_10k": {
            "source_url": "https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/lyft_2021.pdf",
            "file_name": "lyft_10k_2021.pdf",
            "save_data_to": os.path.join(os.environ["DATA_DIR"], "lyft_10k"),
        }
    },
    "val": {
        "uber_10k": {
            "source_url": "https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf",
            "file_name": "uber_10k_2021.pdf",
            "save_data_to": os.path.join(os.environ["DATA_DIR"], "uber_10k"),
        }
    }
}

PERSIST_FINETUNE_DATA_TO = os.path.join(os.environ["PERSIST_DIR"], "naive_finetune")

## Ingest Data
- Download Information
- Split train and validation data
- Load corpus
- Generate QA embeddings

This is a very naive example and we are loading a very small sample of a file (pct_sample). Only intention of this was to run it fast and understand how the code works without the intention of training an actual good model.

In [20]:
data = {}
for split in METADATA:
    files = []
    for k, md in METADATA[split].items():
        files.append(
            download_file_from_url(md["source_url"], md["file_name"], md["save_data_to"])
        )
    data_path = os.path.join(PERSIST_FINETUNE_DATA_TO, f"{split}_data.json")
    if not os.path.exists(data_path):
        if not os.path.exists(PERSIST_FINETUNE_DATA_TO):
            os.mkdir(PERSIST_FINETUNE_DATA_TO)
        print("Generating data with QA embedding pairs")
        # For every node we have id, embedding placeholder, metadata, text, relationships, etc.
        nodes = load_corpus(files, pct_sample = 0.2)
        data[split] = generate_qa_embedding_pairs(
            llm=OpenAI(model="gpt-3.5-turbo"), nodes=nodes
        )
        data[split].save_json(data_path)
    else:
        print("Loading data with QA embedding pairs")
        data[split] = EmbeddingQAFinetuneDataset.from_json(data_path)
        


Loading data with QA embedding pairs
Loading data with QA embedding pairs


## Defining Loader and Evaluator

In [6]:
training_examples = []
for query_id, query in data["train"].queries.items():
    node_id = data["train"].relevant_docs[query_id][0]
    text = data["train"].corpus[node_id]
    example = InputExample(texts=[query, text])
    training_examples.append(example)

loader = DataLoader(training_examples, batch_size=8)


In [7]:
evaluator = InformationRetrievalEvaluator(
    data["val"].queries,
    data["val"].corpus,
    data["val"].relevant_docs
)

## Define Model and Loss

In [8]:
model_id = "BAAI/bge-small-en"
embedding_model = SentenceTransformer(model_id)
loss = losses.MultipleNegativesRankingLoss(embedding_model)

## Training Loop

In [9]:
epochs = 3
warmup_steps = int(len(loader) * epochs * 0.1)

embedding_model.fit(
    train_objectives=[(loader, loss)],
    epochs=1,
    warmup_steps=warmup_steps,
    output_path='exp_finetune',
    show_progress_bar=True,
    evaluator=evaluator, 
    evaluation_steps=50,
)

## Evaluate

In [19]:
eval_df = get_query_hit_pairs(
    data["val"],
    embedding_model,
    top_k = 5,
    verbose = False,
)
eval_df["is_hit"].mean()

0.6567901234567901