In [3]:
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import MetadataMode
from random import seed, sample
TRAIN = "TrainVal/Train"
VAL = "TrainVal/Val"

In [2]:
from llama_index import SimpleDirectoryReader
from llama_index.core.node_parser import SimpleNodeParser
seed(42)
def nodeCreator(directory, docCount):
  reader = SimpleDirectoryReader(directory, filename_as_id=True)
  docs = reader.load_data()
  docs = sample(docs, docCount)
  node_parser = SimpleNodeParser(chunk_size = 1024, chunk_overlap=50)
  nodes = node_parser.get_nodes_from_documents(docs, show_progress=True)
  return nodes

In [3]:
train_nodes = nodeCreator(TRAIN, 256)
val_nodes = nodeCreator(VAL, 20)

Parsing nodes:   0%|          | 0/256 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
%pip install llama-index-finetuning

In [None]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)

In [None]:
# For llama model
# from llama_index.llms import LlamaCPP

# llm = LlamaCPP(
#     model_url="model_url",
#     model_path="path/to/llama_model") #if locally stored

In [5]:
from llama_index.llms import OpenAI
import os
os.environ["OPENAI_API_KEY"] = "YOUR KEY HERE"
llm = OpenAI(model_name="gpt-3.5-turbo")

In [6]:
train_dataset = generate_qa_embedding_pairs(train_nodes, llm)
train_dataset.save_json("train_dataset.json")

100%|██████████| 723/723 [31:03<00:00,  2.58s/it]  


In [7]:
val_dataset = generate_qa_embedding_pairs(val_nodes, llm)
val_dataset.save_json("val_dataset.json")

100%|██████████| 64/64 [02:59<00:00,  2.80s/it]


In [4]:
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

In [5]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset, 
    model_id='BAAI/bge-base-en-v1.5', 
    model_output_path="OUTPUT PATH", 
    val_dataset=val_dataset, 
    epochs=5
)

In [None]:
finetune_engine.finetune()

In [None]:
finetuned_embedding_model = finetune_engine.get_finetuned_model()

In [None]:
finetuned_embedding_model.to_json()

In [None]:
from llama_index.embeddings import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="OUTPUT PATH")

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path

def evaluate_st(
    dataset,
    model_id,
    name,
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs, name=name, map_at_k=[5])
    model = SentenceTransformer(model_id)
    output_path = "RESULT OUTPUT PATH"
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

In [None]:
def evaluate_st(
    dataset,
    model_id,
    name,
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs, name=name, map_at_k=[5])
    model = SentenceTransformer(model_id)
    output_path = "RESULT OUTPUT PATH"
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

In [None]:
evaluate_st(val_dataset, 'BAAI/bge-base-en-v1.5', name="bge")

0.7862654320987653

In [None]:
evaluate_st(val_dataset, "OUTPUT PATH", name="finetuned")

0.903395061728395