This notebook was inspired by [this LlamaIndex example](https://docs.llamaindex.ai/en/stable/examples/finetuning/embeddings/finetune_embedding_adapter/).

I made some changes to it with the sole intention of trying out ideas and learning.

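Note that I am assuming you have the relevant API keys (e.g. `OPENAI_API_KEY`) set as environment variables, along with `DATA_DIR` and `PERSIST_DIR`, which the configuration cell reads.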

In [12]:
%pip install llama-index-finetuning
%pip install llama-index-embeddings-openai
%pip install llama-index-embeddings-adapter
%pip install llama-index-embeddings-huggingface

In [1]:
from bubls.utils.data.download import download_file_from_url
from bubls.utils.data.load import load_corpus
from bubls.utils.evaluation.evaluate_embeddings import (
    get_query_hit_pairs, sentence_transformer_ir_evaluator
)
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.llms.openai import OpenAI
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.core.embeddings import resolve_embed_model
import os

# from llama_index.finetuning import SentenceTransformersFinetuneEngine
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.adapter.utils import TwoLayerNN
from llama_index.embeddings.adapter import LinearAdapterEmbeddingModel

<jemalloc>: Unsupported system page size


## Defining Global Variables

In [2]:
# Source documents for each dataset split. Every entry records the URL the
# PDF is fetched from, the local file name to give it, and the directory
# (under the DATA_DIR environment variable) it is saved to.
METADATA = dict(
    train=dict(
        lyft_10k=dict(
            source_url="https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/lyft_2021.pdf",
            file_name="lyft_10k_2021.pdf",
            save_data_to=os.path.join(os.environ["DATA_DIR"], "lyft_10k"),
        ),
    ),
    val=dict(
        uber_10k=dict(
            source_url="https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf",
            file_name="uber_10k_2021.pdf",
            save_data_to=os.path.join(os.environ["DATA_DIR"], "uber_10k"),
        ),
    ),
)

# Directory (under the PERSIST_DIR environment variable) that holds the
# generated QA fine-tuning datasets.
PERSIST_FINETUNE_DATA_TO = os.path.join(
    os.environ["PERSIST_DIR"], "eg1_finetune_data"
)

## Ingest Data
- Download Information
- Split train and validation data
- Load corpus
- Generate QA embeddings

In [3]:
# Build (or reload) one EmbeddingQAFinetuneDataset per split in METADATA.
# Each generated dataset is saved as JSON under PERSIST_FINETUNE_DATA_TO, so
# the LLM-backed generation only runs when no saved file exists for the split.
data = {}
for split, sources in METADATA.items():
    data_path = os.path.join(PERSIST_FINETUNE_DATA_TO, f"{split}_data.json")

    if os.path.exists(data_path):
        print("Loading data with QA embedding pairs")
        data[split] = EmbeddingQAFinetuneDataset.from_json(data_path)
        continue

    # exist_ok=True replaces the separate "does it exist?" check.
    os.makedirs(PERSIST_FINETUNE_DATA_TO, exist_ok=True)
    print("Generating data with QA embedding pairs")

    # The source documents are only consumed by load_corpus, so they are
    # fetched here rather than on every run (including cache hits).
    files = [
        download_file_from_url(md["source_url"], md["file_name"], md["save_data_to"])
        for md in sources.values()
    ]

    # For every node we have id, embedding placeholder, metadata, text, relationships, etc.
    # NOTE(review): pct_sample=0.2 presumably keeps a 20% sample of the corpus -- confirm in load_corpus.
    nodes = load_corpus(files, pct_sample=0.2)
    data[split] = generate_qa_embedding_pairs(
        llm=OpenAI(model="gpt-3.5-turbo"), nodes=nodes
    )
    data[split].save_json(data_path)
        


Loading data with QA embedding pairs
Loading data with QA embedding pairs


## Define Embedding model

In [12]:
# Base embedding model; it is handed to the fine-tuning engine further down,
# which trains the adapter on top of it.
embedding_model = OpenAIEmbedding()
# Disabled alternative: a local base model. If it is enabled, the adapter's
# input/output dimensions must be changed to match it.
# embedding_model = resolve_embed_model("local:BAAI/bge-small-en")

## Define two-layer Adapter

In [13]:
# Adapter network dimensions. EMBED_DIM has to match the vector size produced
# by the base embedding model, so change it when switching embedding models.
EMBED_DIM = 1536
HIDDEN_DIM = 1024

# Input and output share EMBED_DIM so the two values cannot drift apart.
# NOTE(review): equal input/output sizes are presumably required by
# add_residual=True -- confirm against TwoLayerNN.
adapter_model = TwoLayerNN(
    EMBED_DIM,  # input dimension
    HIDDEN_DIM,  # hidden dimension
    EMBED_DIM,  # output dimension
    bias=True,
    add_residual=True,
)

## Run Fine-tuning

In [15]:
# Configure adapter fine-tuning. The train-split QA dataset and the base
# embedding model are passed positionally, followed by the output/checkpoint
# locations, the adapter network to train, and the training settings.
finetune_engine = EmbeddingAdapterFinetuneEngine(
    data["train"],
    embedding_model,
    # Relative paths -- presumably resolved against the kernel's working
    # directory rather than PERSIST_DIR; confirm before relying on location.
    model_output_path="adapter_test3",
    model_checkpoint_path="adapter_ck3",
    adapter_model=adapter_model,
    batch_size=4,
    epochs=4,
    verbose=False,
)

# Run training, then retrieve the fine-tuned embedding model. adapter_cls is
# given as TwoLayerNN, the same class used to build adapter_model.
finetune_engine.finetune()
embed_model = finetune_engine.get_finetuned_model(adapter_cls=TwoLayerNN)

## Evaluate and Compare

### OpenAI, no fine-tuning

In [32]:
# Baseline: evaluate the validation split with a freshly constructed,
# un-adapted OpenAIEmbedding.
openai_embedding = OpenAIEmbedding()
ada_val_results = get_query_hit_pairs(data["val"], openai_embedding)
# Displayed cell value: the mean of the "is_hit" column.
# NOTE(review): presumably the fraction of queries whose expected node was
# retrieved -- confirm in get_query_hit_pairs.
ada_val_results["is_hit"].mean()

0.9142857142857143

### Fine-tuned model

In [17]:
# Same evaluation as the baseline cell, but using the fine-tuned model
# (embed_model) returned by the fine-tuning engine.
fine_tuned_val_results = get_query_hit_pairs(data["val"], embed_model)
# Displayed cell value: the mean of the "is_hit" column, for comparison with
# the baseline figure.
fine_tuned_val_results["is_hit"].mean()

0.9316374755857143