In [5]:
%pip install llama-index-llms-openai
%pip install llama-index-embeddings-openai
%pip install llama-index-finetuning
%pip install llama-index-readers-file
%pip install llama-index-embeddings-huggingface

Collecting llama-index-llms-openai
  Downloading llama_index_llms_openai-0.1.16-py3-none-any.whl (10 kB)
Collecting llama-index-core<0.11.0,>=0.10.24 (from llama-index-llms-openai)
  Downloading llama_index_core-0.10.30-py3-none-any.whl (15.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json (from llama-index-core<0.11.0,>=0.10.24->llama-index-llms-openai)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting deprecated>=1.2.9.3 (from llama-index-core<0.11.0,>=0.10.24->llama-index-llms-openai)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting dirtyjson<2.0.0,>=1.0.8 (from llama-index-core<0.11.0,>=0.10.24->llama-index-llms-openai)
  Downloading dirtyjson-1.0.8-py3-none-any.whl (25 kB)
Collecting httpx (from llama-index-core<0.11.0,>=0.10.24->llama-index-llms-openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━

In [6]:
import json

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

In [7]:
def load_corpus(files, verbose=False):
    """Read the given files and split them into sentence-based nodes.

    Args:
        files: Paths passed to ``SimpleDirectoryReader`` as ``input_files``.
        verbose: When true, print a status line after each stage and ask the
            splitter to show its progress bar.

    Returns:
        Whatever ``SentenceSplitter.get_nodes_from_documents`` returns for the
        loaded documents.
    """
    if verbose:
        print(f"Loading files {files}")

    # Stage 1: load the raw documents.
    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    # Stage 2: chunk the documents with the splitter's default settings.
    splitter = SentenceSplitter()
    chunks = splitter.get_nodes_from_documents(documents, show_progress=verbose)
    if verbose:
        print(f"Parsed {len(chunks)} nodes")

    return chunks

In [9]:
from sklearn.model_selection import train_test_split

# Parse the source PDF into sentence-split nodes.
derma_nodes = load_corpus(["/content/Derma.pdf"], verbose=True)

# Hold out 20% of the nodes for validation; the fixed seed keeps the split
# reproducible across runs.
train_nodes, val_nodes = train_test_split(
    derma_nodes,
    test_size=0.2,
    random_state=42,
)

# Report the size of each split.
print(f"Number of training nodes: {len(train_nodes)}")
print(f"Number of validation nodes: {len(val_nodes)}")


Loading files ['/content/Derma.pdf']
Loaded 452 docs


Parsing nodes:   0%|          | 0/452 [00:00<?, ?it/s]

Parsed 738 nodes
Number of training nodes: 590
Number of validation nodes: 148


In [10]:

# TRAIN_FILES = ["/content/lyft_annual_report.pdf"]
# VAL_FILES = ["/content/uber_annual_report.pdf"]

# TRAIN_CORPUS_FPATH = "./data/train_corpus.json"
# VAL_CORPUS_FPATH = "./data/val_corpus.json"



# TRAIN_FILES = ["/content/derma.pdf"]
# VAL_FILES = ["/content/derma.pdf"]

# # Loading training and validation data separately
# train_nodes = load_corpus(TRAIN_FILES, verbose=True)
# val_nodes = load_corpus(VAL_FILES, verbose=True)



# Generate synthetic queries
Now, we use an LLM (gpt-3.5-turbo) to generate questions using each text chunk in the corpus as context.

Each pair of (generated question, text chunk used as context) becomes a datapoint in the finetuning dataset (either for training or evaluation).

In [11]:
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

In [None]:
import os
from getpass import getpass

# Do not paste the key into this cell: a saved or shared copy of the notebook
# would leak it. Reuse OPENAI_API_KEY when the environment already provides
# one (so a valid key is never overwritten); otherwise prompt for it without
# echoing the input.
OPENAI_API_TOKEN = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_TOKEN

In [13]:
from llama_index.llms.openai import OpenAI


# A single client serves both calls; the model settings are identical.
qa_llm = OpenAI(model="gpt-3.5-turbo")

# Generate (question, chunk) pairs for each split.
# NOTE(review): some llama-index-finetuning releases checkpoint to a default
# output file shared by every call — confirm against the installed version
# that the second call does not pick up the first call's results.
train_dataset = generate_qa_embedding_pairs(nodes=train_nodes, llm=qa_llm)
val_dataset = generate_qa_embedding_pairs(nodes=val_nodes, llm=qa_llm)

100%|██████████| 590/590 [11:05<00:00,  1.13s/it]
100%|██████████| 148/148 [02:49<00:00,  1.15s/it]


In [14]:
# Persist both generated QA datasets as JSON in the working directory; the
# optional cell that follows reloads them with
# EmbeddingQAFinetuneDataset.from_json.
train_dataset.save_json("train_dataset.json")
val_dataset.save_json("val_dataset.json")

In [None]:
# [Optional] Load
# train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
# val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

# Finetuning

In [15]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

In [16]:
# Set up fine-tuning of the BAAI/bge-small-en base model on the generated
# training pairs. The output path "test_model" is the same name the later
# evaluation cells load ("local:test_model" / "test_model").
# NOTE(review): val_dataset is presumably used for evaluation during
# training — confirm against the SentenceTransformersFinetuneEngine docs.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en",
    model_output_path="test_model",
    val_dataset=val_dataset,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/118 [00:00<?, ?it/s]

Iteration:   0%|          | 0/118 [00:00<?, ?it/s]

In [18]:
embed_model = finetune_engine.get_finetuned_model()

In [19]:
embed_model

HuggingFaceEmbedding(model_name='test_model', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7b5b6e9f0df0>, max_length=512, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None)

# Evaluate Finetuned Model
In this section, we evaluate 3 different embedding models:

1. proprietary OpenAI embedding,

2. open source BAAI/bge-small-en, and

3. our finetuned embedding model.

We consider 2 evaluation approaches:

1. a simple custom hit rate metric

2. using InformationRetrievalEvaluator from sentence_transformers

We show that finetuning on a synthetic (LLM-generated) dataset improves upon an open-source embedding model (in this run, the top-5 hit rate rises from 0.946 to 0.956).

In [20]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

# Define eval function
Option 1: We use a simple hit rate metric for evaluation:

1. for each (query, relevant_doc) pair,
2. we retrieve top-k documents with the query, and
3. it's a hit if the results contain the relevant_doc.

This approach is very simple and intuitive, and we can apply it to both the proprietary OpenAI embedding as well as our open source and fine-tuned embedding models.

In [21]:
def evaluate(dataset, embed_model, top_k=5, verbose=False):
    """Compute per-query top-k retrieval hits for an embedding model.

    The dataset's corpus is indexed with ``embed_model``; every query is then
    run through a top-k retriever and marked as a hit when at least one of its
    relevant documents appears among the retrieved ids.

    Args:
        dataset: Object exposing ``corpus`` (doc id -> text), ``queries``
            (query id -> query text) and ``relevant_docs`` (query id -> list
            of relevant doc ids).
        embed_model: Passed unchanged to ``VectorStoreIndex`` as
            ``embed_model``.
        top_k: Number of documents retrieved per query.
        verbose: Currently unused; kept so existing callers keep working.

    Returns:
        A list with one dict per query, holding the keys ``is_hit``,
        ``retrieved`` (list of retrieved doc ids), ``expected`` (the first
        relevant doc id, or ``None`` when the query has none) and ``query``
        (the query id).
    """
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(
        nodes, embed_model=embed_model, show_progress=True
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]

        expected_ids = relevant_docs[query_id]
        # The first relevant id is still what gets reported, but the hit test
        # considers every relevant id so multi-answer queries are not
        # under-counted. A query with no relevant docs is scored as a miss
        # instead of raising IndexError.
        expected_id = expected_ids[0] if expected_ids else None
        is_hit = any(doc_id in retrieved_ids for doc_id in expected_ids)

        eval_results.append(
            {
                "is_hit": is_hit,
                "retrieved": retrieved_ids,
                "expected": expected_id,
                "query": query_id,
            }
        )
    return eval_results

Option 2: We use the InformationRetrievalEvaluator from sentence_transformers.

This provides a more comprehensive suite of metrics, but we can only run it against sentence-transformers-compatible models (the open-source model and our finetuned model, not the OpenAI embedding model).

In [22]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path


def evaluate_st(dataset, model_id, name):
    """Run ``InformationRetrievalEvaluator`` on a sentence-transformers model.

    Args:
        dataset: Object exposing ``queries``, ``corpus`` and ``relevant_docs``,
            which are handed to the evaluator in that order.
        model_id: Identifier passed to ``SentenceTransformer`` to load the
            model.
        name: Name given to the evaluator.

    Returns:
        The value returned by calling the evaluator on the loaded model.
    """
    ir_evaluator = InformationRetrievalEvaluator(
        dataset.queries, dataset.corpus, dataset.relevant_docs, name=name
    )
    st_model = SentenceTransformer(model_id)

    # Make sure the output directory exists before the evaluator is run with
    # it as its output path.
    Path("results/").mkdir(parents=True, exist_ok=True)
    return ir_evaluator(st_model, output_path="results/")

# OpenAI

In [23]:
ada = OpenAIEmbedding()
ada_val_results = evaluate(val_dataset, ada)
df_ada = pd.DataFrame(ada_val_results)

Generating embeddings:   0%|          | 0/148 [00:00<?, ?it/s]

  0%|          | 0/296 [00:00<?, ?it/s]

In [24]:
hit_rate_ada = df_ada["is_hit"].mean()
hit_rate_ada

0.9831081081081081

# BAAI/bge-small-en

In [25]:
bge = "local:BAAI/bge-small-en"
bge_val_results = evaluate(val_dataset, bge)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings:   0%|          | 0/148 [00:00<?, ?it/s]

  0%|          | 0/296 [00:00<?, ?it/s]

In [26]:
df_bge = pd.DataFrame(bge_val_results)

In [27]:
hit_rate_bge = df_bge["is_hit"].mean()
hit_rate_bge

0.9459459459459459

In [28]:
evaluate_st(val_dataset, "BAAI/bge-small-en", name="bge")

0.8665795023653617

# Finetuned

In [29]:
finetuned = "local:test_model"
val_results_finetuned = evaluate(val_dataset, finetuned)
df_finetuned = pd.DataFrame(val_results_finetuned)

Generating embeddings:   0%|          | 0/148 [00:00<?, ?it/s]

  0%|          | 0/296 [00:00<?, ?it/s]

In [30]:
hit_rate_finetuned = df_finetuned["is_hit"].mean()
hit_rate_finetuned

0.956081081081081

In [31]:
evaluate_st(val_dataset, "test_model", name="finetuned")

0.8748073057884999

In [32]:
df_ada

Unnamed: 0,is_hit,retrieved,expected,query
0,False,"[6c2bb8c5-713e-49b2-87ef-3a3efa584c2e, 15386c4...",305da6ce-1d28-41d8-81b0-fad25649d1da,a9af513a-af39-4413-9580-715ece29dbba
1,True,"[305da6ce-1d28-41d8-81b0-fad25649d1da, b3db955...",305da6ce-1d28-41d8-81b0-fad25649d1da,fa89f3a8-de8f-4c9f-9b91-6f3dce0f48c5
2,True,"[305da6ce-1d28-41d8-81b0-fad25649d1da, 6c2bb8c...",6c2bb8c5-713e-49b2-87ef-3a3efa584c2e,8df62eb8-2533-414a-88c0-f299d6bbd7b9
3,True,"[6c2bb8c5-713e-49b2-87ef-3a3efa584c2e, b3db955...",6c2bb8c5-713e-49b2-87ef-3a3efa584c2e,3996da3a-0619-4d5d-83f8-3383b0fc0c89
4,True,"[93962f81-8f6f-4a5b-9bf1-546ade429e5d, 65116fd...",93962f81-8f6f-4a5b-9bf1-546ade429e5d,7df6d3fb-6fb7-4a49-a3a6-97f9647ddc5e
...,...,...,...,...
291,True,"[402a0eb1-e79f-42d2-bb97-c81cf5e4e8de, f5ae053...",402a0eb1-e79f-42d2-bb97-c81cf5e4e8de,176b2a51-bb5b-4a70-ab83-40d75549a8cb
292,True,"[6955cc23-1af5-440f-a451-b91c09b4cd98, 3ca2ef3...",6955cc23-1af5-440f-a451-b91c09b4cd98,b61e9749-e563-4b9f-99a0-e8d1d843db68
293,True,"[6955cc23-1af5-440f-a451-b91c09b4cd98, e39dd85...",6955cc23-1af5-440f-a451-b91c09b4cd98,321b23d3-41b9-48cb-a013-5d4dbb6995c6
294,True,"[200fc9c6-1d09-4ccd-87f7-08877c684c86, 7d1d1d3...",200fc9c6-1d09-4ccd-87f7-08877c684c86,e20dbccd-77ec-4f0f-8653-98160db51803


In [33]:
df_bge

Unnamed: 0,is_hit,retrieved,expected,query
0,False,"[8aa072d0-42cf-4a7e-9dea-9a386f158435, d7c4219...",305da6ce-1d28-41d8-81b0-fad25649d1da,a9af513a-af39-4413-9580-715ece29dbba
1,True,"[305da6ce-1d28-41d8-81b0-fad25649d1da, b3db955...",305da6ce-1d28-41d8-81b0-fad25649d1da,fa89f3a8-de8f-4c9f-9b91-6f3dce0f48c5
2,True,"[6c2bb8c5-713e-49b2-87ef-3a3efa584c2e, 305da6c...",6c2bb8c5-713e-49b2-87ef-3a3efa584c2e,8df62eb8-2533-414a-88c0-f299d6bbd7b9
3,True,"[6c2bb8c5-713e-49b2-87ef-3a3efa584c2e, 1e58a9d...",6c2bb8c5-713e-49b2-87ef-3a3efa584c2e,3996da3a-0619-4d5d-83f8-3383b0fc0c89
4,True,"[93962f81-8f6f-4a5b-9bf1-546ade429e5d, 65116fd...",93962f81-8f6f-4a5b-9bf1-546ade429e5d,7df6d3fb-6fb7-4a49-a3a6-97f9647ddc5e
...,...,...,...,...
291,True,"[65116fdf-d3d9-443c-b52e-5e656b3c47cb, 402a0eb...",402a0eb1-e79f-42d2-bb97-c81cf5e4e8de,176b2a51-bb5b-4a70-ab83-40d75549a8cb
292,True,"[6955cc23-1af5-440f-a451-b91c09b4cd98, 3ca2ef3...",6955cc23-1af5-440f-a451-b91c09b4cd98,b61e9749-e563-4b9f-99a0-e8d1d843db68
293,True,"[3ca2ef33-7035-471b-90b5-0085f8abf96e, 6955cc2...",6955cc23-1af5-440f-a451-b91c09b4cd98,321b23d3-41b9-48cb-a013-5d4dbb6995c6
294,True,"[200fc9c6-1d09-4ccd-87f7-08877c684c86, 7d1d1d3...",200fc9c6-1d09-4ccd-87f7-08877c684c86,e20dbccd-77ec-4f0f-8653-98160db51803


In [34]:
df_finetuned

Unnamed: 0,is_hit,retrieved,expected,query
0,False,"[8aa072d0-42cf-4a7e-9dea-9a386f158435, 67f020a...",305da6ce-1d28-41d8-81b0-fad25649d1da,a9af513a-af39-4413-9580-715ece29dbba
1,True,"[305da6ce-1d28-41d8-81b0-fad25649d1da, b3db955...",305da6ce-1d28-41d8-81b0-fad25649d1da,fa89f3a8-de8f-4c9f-9b91-6f3dce0f48c5
2,True,"[305da6ce-1d28-41d8-81b0-fad25649d1da, 6c2bb8c...",6c2bb8c5-713e-49b2-87ef-3a3efa584c2e,8df62eb8-2533-414a-88c0-f299d6bbd7b9
3,True,"[6c2bb8c5-713e-49b2-87ef-3a3efa584c2e, 305da6c...",6c2bb8c5-713e-49b2-87ef-3a3efa584c2e,3996da3a-0619-4d5d-83f8-3383b0fc0c89
4,True,"[93962f81-8f6f-4a5b-9bf1-546ade429e5d, 1ecb735...",93962f81-8f6f-4a5b-9bf1-546ade429e5d,7df6d3fb-6fb7-4a49-a3a6-97f9647ddc5e
...,...,...,...,...
291,True,"[402a0eb1-e79f-42d2-bb97-c81cf5e4e8de, 65116fd...",402a0eb1-e79f-42d2-bb97-c81cf5e4e8de,176b2a51-bb5b-4a70-ab83-40d75549a8cb
292,True,"[6955cc23-1af5-440f-a451-b91c09b4cd98, 3ca2ef3...",6955cc23-1af5-440f-a451-b91c09b4cd98,b61e9749-e563-4b9f-99a0-e8d1d843db68
293,True,"[c798581c-ae91-4a1f-afb4-bc8fdd3e242b, 3ca2ef3...",6955cc23-1af5-440f-a451-b91c09b4cd98,321b23d3-41b9-48cb-a013-5d4dbb6995c6
294,True,"[200fc9c6-1d09-4ccd-87f7-08877c684c86, 992624b...",200fc9c6-1d09-4ccd-87f7-08877c684c86,e20dbccd-77ec-4f0f-8653-98160db51803


In [35]:
df_ada['model'] = 'ada'
df_bge['model'] = 'bge'
df_finetuned['model'] = 'fine_tuned'

In [36]:
# Stack the per-model results and report the hit rate of each model.
df_all = pd.concat([df_ada, df_bge, df_finetuned])
# Select the column before aggregating: the first positional parameter of
# GroupBy.mean is `numeric_only`, so `.mean('is_hit')` only produced this
# table because the string is truthy, not because it names a column.
df_all.groupby('model')[['is_hit']].mean()

Unnamed: 0_level_0,is_hit
model,Unnamed: 1_level_1
ada,0.983108
bge,0.945946
fine_tuned,0.956081


In [37]:
df_st_bge = pd.read_csv('results/Information-Retrieval_evaluation_bge_results.csv')
df_st_finetuned = pd.read_csv('results/Information-Retrieval_evaluation_finetuned_results.csv')

In [38]:
# Tag each sentence-transformers result frame with its model label.
df_st_bge['model'] = 'bge'
df_st_finetuned['model'] = 'fine_tuned'

# Stack both frames and index the combined table by model label.
df_st_all = pd.concat([df_st_bge, df_st_finetuned]).set_index('model')
df_st_all

Unnamed: 0_level_0,epoch,steps,cos_sim-Accuracy@1,cos_sim-Accuracy@3,cos_sim-Accuracy@5,cos_sim-Accuracy@10,cos_sim-Precision@1,cos_sim-Recall@1,cos_sim-Precision@3,cos_sim-Recall@3,...,dot_score-Recall@1,dot_score-Precision@3,dot_score-Recall@3,dot_score-Precision@5,dot_score-Recall@5,dot_score-Precision@10,dot_score-Recall@10,dot_score-MRR@10,dot_score-NDCG@10,dot_score-MAP@100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bge,-1,-1,0.810811,0.905405,0.945946,0.962838,0.810811,0.810811,0.301802,0.905405,...,0.810811,0.301802,0.905405,0.189189,0.945946,0.096284,0.962838,0.865019,0.889065,0.86658
fine_tuned,-1,-1,0.814189,0.918919,0.956081,0.972973,0.814189,0.814189,0.306306,0.918919,...,0.814189,0.306306,0.918919,0.191216,0.956081,0.097297,0.972973,0.873713,0.898297,0.874807


In [39]:
# Keep the index when exporting: it holds the model label ('bge' /
# 'fine_tuned'), and with index=False the two rows in the CSV cannot be told
# apart.
df_st_all.to_csv("DermaReQA.csv")