In [1]:
# Import libraries
import os
import json
import pandas as pd
from tqdm.notebook import tqdm
from dotenv import load_dotenv
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from llama_index.embeddings import OpenAIEmbedding
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer

# Load environment variables for API key
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Load datasets
TRAIN_DATASET_FPATH = './data/train_dataset.json'
VAL_DATASET_FPATH = './data/val_dataset.json'

# The datasets are only parsed, never written back, so open them read-only:
# 'r+' needs write permission and fails on read-only files. The encoding is
# pinned so parsing does not depend on the platform's default locale.
with open(TRAIN_DATASET_FPATH, 'r', encoding='utf-8') as f:
    train_dataset = json.load(f)

with open(VAL_DATASET_FPATH, 'r', encoding='utf-8') as f:
    val_dataset = json.load(f)

In [2]:
# Define the evaluation function using hit rate metric
def evaluate(dataset, embed_model, top_k=5, verbose=False):
    """Score an embedding model on a retrieval dataset, one record per query.

    Every corpus entry is wrapped in a ``TextNode`` and indexed in a
    ``VectorStoreIndex`` built with ``embed_model``. Each query is then sent
    through a retriever limited to ``top_k`` results, and counts as a hit
    when the FIRST document listed for it in ``relevant_docs`` is among the
    retrieved node ids (any further relevant documents are ignored).

    Args:
        dataset: Mapping with 'corpus' (doc id -> text), 'queries'
            (query id -> query text) and 'relevant_docs'
            (query id -> sequence of doc ids).
        embed_model: Forwarded unchanged to ``ServiceContext.from_defaults``.
        top_k: Value passed as ``similarity_top_k`` to the retriever.
        verbose: Accepted for the caller's convenience; not used here.

    Returns:
        A list of dicts with keys 'is_hit', 'retrieved', 'expected' and
        'query'. Note that 'query' holds the query id, not the query text.
    """
    corpus, queries, relevant_docs = (
        dataset[section] for section in ('corpus', 'queries', 'relevant_docs')
    )

    # Build the index once; all queries share the same retriever.
    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    index = VectorStoreIndex(
        [TextNode(id_=doc_id, text=doc_text) for doc_id, doc_text in corpus.items()],
        service_context=service_context,
        show_progress=True,
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    def _score(query_id, query_text):
        # One evaluation record: did the expected doc come back for this query?
        retrieved_ids = [hit.node.node_id for hit in retriever.retrieve(query_text)]
        expected_id = relevant_docs[query_id][0]
        return {
            'is_hit': expected_id in retrieved_ids,
            'retrieved': retrieved_ids,
            'expected': expected_id,
            'query': query_id,
        }

    return [
        _score(query_id, query_text)
        for query_id, query_text in tqdm(queries.items())
    ]

# Define the evaluation function using InformationRetrievalEvaluator
def evaluate_st(dataset, model_id, name, output_path='results/'):
    """Evaluate a SentenceTransformer model with InformationRetrievalEvaluator.

    Args:
        dataset: Mapping with 'queries', 'corpus' and 'relevant_docs' entries,
            passed to the evaluator in that order.
        model_id: Identifier or path handed to ``SentenceTransformer``.
        name: Evaluator name; it also ends up in the results file name
            (e.g. name='bge_large' is later read back from
            'results/Information-Retrieval_evaluation_bge_large_results.csv').
        output_path: Directory the evaluator writes its results into.
            Defaults to 'results/', the location used by the rest of the
            notebook.

    Returns:
        Whatever the evaluator call returns (a single score in the recorded
        notebook runs).
    """
    corpus = dataset['corpus']
    queries = dataset['queries']
    relevant_docs = dataset['relevant_docs']

    # Make sure the output directory exists before the evaluator writes to it;
    # on a fresh checkout 'results/' would otherwise be missing.
    os.makedirs(output_path, exist_ok=True)

    evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs, name=name)
    model = SentenceTransformer(model_id)
    return evaluator(model, output_path=output_path)

In [3]:
# Baseline: score the OpenAI embedding model with the hit-rate metric.
ada = OpenAIEmbedding(api_key=openai_api_key)
ada_val_results = evaluate(dataset=val_dataset, embed_model=ada)
df_ada = pd.DataFrame(data=ada_val_results)
# 'is_hit' is one boolean per query, so its mean is the fraction of hits.
hit_rate_ada = df_ada.loc[:, 'is_hit'].mean()
print(f'Hit rate for ada: {hit_rate_ada}')

Generating embeddings:   0%|          | 0/4293 [00:00<?, ?it/s]

  0%|          | 0/8590 [00:00<?, ?it/s]

Hit rate for ada: 0.6714784633294528


In [4]:
# BAAI/bge-large-en baseline. The "local:..." string is passed to evaluate()
# as its embed_model argument; no dedicated embedding class is constructed
# here (presumably llama_index resolves the string itself -- confirm against
# the installed version).
bge_large = "local:BAAI/bge-large-en"
bge_large_val_results = evaluate(dataset=val_dataset, embed_model=bge_large)
df_bge_large = pd.DataFrame(data=bge_large_val_results)
# 'is_hit' is one boolean per query, so its mean is the fraction of hits.
hit_rate_bge_large = df_bge_large.loc[:, 'is_hit'].mean()
print(f'Hit rate for BAAI/bge-large-en: {hit_rate_bge_large}')

# Same model, scored with sentence-transformers' InformationRetrievalEvaluator.
# Kept as the last expression so the returned score is displayed by the cell.
evaluate_st(dataset=val_dataset, model_id="BAAI/bge-large-en", name='bge_large')

Generating embeddings:   0%|          | 0/4293 [00:00<?, ?it/s]

  0%|          | 0/8590 [00:00<?, ?it/s]

Hit rate for BAAI/bge-large-en: 0.6519208381839348


0.511669801273743

In [5]:
# Finetuned model stored under exp_finetune_optimal, scored with the hit-rate metric.
finetuned = "local:exp_finetune_optimal"
val_results_finetuned = evaluate(dataset=val_dataset, embed_model=finetuned)
df_finetuned = pd.DataFrame(data=val_results_finetuned)
# 'is_hit' is one boolean per query, so its mean is the fraction of hits.
hit_rate_finetuned = df_finetuned.loc[:, 'is_hit'].mean()
print(f'Hit rate for finetuned model: {hit_rate_finetuned}')

# Same model, scored with sentence-transformers' InformationRetrievalEvaluator.
# Kept as the last expression so the returned score is displayed by the cell.
evaluate_st(dataset=val_dataset, model_id="exp_finetune_optimal", name='finetuned')

Generating embeddings:   0%|          | 0/4293 [00:00<?, ?it/s]

  0%|          | 0/8590 [00:00<?, ?it/s]

Hit rate for finetuned model: 0.739464493597206


0.6542465972976504

In [7]:
# Tag each per-query result frame with the model that produced it, so the
# frames can be told apart after they are concatenated.
for frame, label in (
    (df_ada, 'ada'),
    (df_bge_large, 'bge_large'),
    (df_finetuned, 'fine_tuned'),
):
    frame['model'] = label

In [8]:
# Stack the per-query results of all three models and report each model's hit rate.
# 'is_hit' is selected explicitly: the first positional parameter of
# GroupBy.mean is `numeric_only`, so mean('is_hit') never selected a column --
# it only worked because a non-empty string is truthy.
df_all = pd.concat([df_ada, df_bge_large, df_finetuned])
df_all.groupby('model')[['is_hit']].mean()

Unnamed: 0_level_0,is_hit
model,Unnamed: 1_level_1
ada,0.671478
bge_large,0.651921
fine_tuned,0.739464


In [13]:
# Relative hit-rate gain of the finetuned model over each baseline, in percent
# of the baseline's own hit rate.
gain_over_ada = (hit_rate_finetuned - hit_rate_ada) / hit_rate_ada * 100
gain_over_bge_large = (hit_rate_finetuned - hit_rate_bge_large) / hit_rate_bge_large * 100

# Improvement of finetuned model over ada
print(f'Improvement of finetuned model over ada: {round(gain_over_ada, 2)}%')

# Improvement of finetuned model over BAAI/bge-large-en
print(f'Improvement of finetuned model over BAAI/bge-large-en: {round(gain_over_bge_large, 2)}%')

Improvement of finetuned model over ada: 10.12%
Improvement of finetuned model over BAAI/bge-large-en: 13.43%


In [9]:
# Read back the result files produced by the evaluate_st() runs named
# 'bge_large' and 'finetuned'.
df_st_bge, df_st_finetuned = (
    pd.read_csv(results_csv)
    for results_csv in (
        'results/Information-Retrieval_evaluation_bge_large_results.csv',
        'results/Information-Retrieval_evaluation_finetuned_results.csv',
    )
)

In [10]:
# Label each result frame, then stack them into one table indexed by model name.
df_st_bge['model'] = 'bge_large'
df_st_finetuned['model'] = 'fine_tuned'
df_st_all = pd.concat([df_st_bge, df_st_finetuned]).set_index('model')
df_st_all

Unnamed: 0_level_0,epoch,steps,cos_sim-Accuracy@1,cos_sim-Accuracy@3,cos_sim-Accuracy@5,cos_sim-Accuracy@10,cos_sim-Precision@1,cos_sim-Recall@1,cos_sim-Precision@3,cos_sim-Recall@3,...,dot_score-Recall@1,dot_score-Precision@3,dot_score-Recall@3,dot_score-Precision@5,dot_score-Recall@5,dot_score-Precision@10,dot_score-Recall@10,dot_score-MRR@10,dot_score-NDCG@10,dot_score-MAP@100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bge_large,-1,-1,0.428172,0.562747,0.609313,0.665076,0.428172,0.428172,0.187582,0.562747,...,0.092782,0.053628,0.160885,0.039558,0.197788,0.024517,0.245169,0.136886,0.162539,0.147194
fine_tuned,-1,-1,0.58475,0.702678,0.739464,0.776251,0.58475,0.58475,0.234226,0.702678,...,0.559953,0.228328,0.684983,0.144587,0.722934,0.076554,0.765541,0.6307,0.663444,0.634284


In [3]:
# Finetuned model stored under exp_finetune_entire_dataset, scored with the
# hit-rate metric.
# NOTE(review): this rebinds `finetuned`, `val_results_finetuned`,
# `df_finetuned` and `hit_rate_finetuned`, which the exp_finetune_optimal cell
# also assigns -- whichever cell ran last determines what those names hold.
finetuned = "local:exp_finetune_entire_dataset"
val_results_finetuned = evaluate(dataset=val_dataset, embed_model=finetuned)
df_finetuned = pd.DataFrame(data=val_results_finetuned)
# 'is_hit' is one boolean per query, so its mean is the fraction of hits.
hit_rate_finetuned = df_finetuned.loc[:, 'is_hit'].mean()
print(f'Hit rate for finetuned model: {hit_rate_finetuned}')

# Same model, scored with sentence-transformers' InformationRetrievalEvaluator.
# Kept as the last expression so the returned score is displayed by the cell.
evaluate_st(dataset=val_dataset, model_id="exp_finetune_entire_dataset", name='finetuned_entire_dataset')

Generating embeddings:   0%|          | 0/4293 [00:00<?, ?it/s]

  0%|          | 0/8590 [00:00<?, ?it/s]

Hit rate for finetuned model: 0.769383003492433


0.6953220765652576