In [1]:
import json

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode

In [6]:
import glob

all_documents = glob.glob('../Downloads/mca_dataset/*.pdf') + glob.glob('../Downloads/mca_dataset/*/*.pdf')

In [None]:
import openai

openai.

In [15]:
TRAIN_FILES = all_documents[:-6]
VAL_FILES = all_documents[-5:]

TRAIN_CORPUS_FPATH = "./data/train_corpus.json"
VAL_CORPUS_FPATH = "./data/val_corpus.json"

In [16]:
def load_corpus(files, verbose=False):
    if verbose:
        print(f"Loading files {files}")

    reader = SimpleDirectoryReader(input_files=files)
    docs = reader.load_data()
    if verbose:
        print(f"Loaded {len(docs)} docs")

    parser = SimpleNodeParser.from_defaults()
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)

    if verbose:
        print(f"Parsed {len(nodes)} nodes")

    return nodes

In [17]:
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)

Loading files ['../Downloads/mca_dataset/List_of_companies_from_FTOK.pdf', '../Downloads/mca_dataset/DisqualifiedDirectorsMumbai_08092020.pdf', '../Downloads/mca_dataset/AnnualReport05English_07012020.pdf', '../Downloads/mca_dataset/Corrigendum1819_19062018.pdf', '../Downloads/mca_dataset/8th-AR-English-20221221.pdf', '../Downloads/mca_dataset/ANDHRAPRADESH-20230809.pdf', '../Downloads/mca_dataset/Innovation_Action_Plan_2014-15.pdf', '../Downloads/mca_dataset/MLMCompanies.pdf', '../Downloads/mca_dataset/RFD_21Nov2010.pdf', '../Downloads/mca_dataset/CBIRC Report_23112021.pdf', '../Downloads/mca_dataset/NOTICE-UNDER-RULE-37(3)-20230822.pdf', '../Downloads/mca_dataset/RTIAppeal_23082021.pdf', '../Downloads/mca_dataset/7thAnnualReport_20220209.pdf', '../Downloads/mca_dataset/Nbfc_Companies.pdf', '../Downloads/mca_dataset/Assam_company_1.pdf', '../Downloads/mca_dataset/MIB_January-2023-20230301.pdf', '../Downloads/mca_dataset/Form_23AC-XBRL_help.pdf', '../Downloads/mca_dataset/List_of_compa

Overwriting cache for 0 7686
Multiple definitions in dictionary at byte 0x5ca15 for key /Info
Multiple definitions in dictionary at byte 0x5ca21 for key /Info
Multiple definitions in dictionary at byte 0x5ca2d for key /Info
incorrect startxref pointer(3)


Loaded 5259 docs


  from .autonotebook import tqdm as notebook_tqdm
Parsing documents into nodes: 100%|████████| 5259/5259 [00:16<00:00, 313.11it/s]


Parsed 9792 nodes
Loading files ['../Downloads/mca_dataset/Form-ADJ-help/Form_ADJ.pdf', '../Downloads/mca_dataset/Form_BEN2_help/Instruction_Kit_eForm_BEN-2.pdf', '../Downloads/mca_dataset/Form_BEN2_help/Form_BEN-2.pdf', '../Downloads/mca_dataset/Form-ADT-2-help/Form_ADT-2.pdf', '../Downloads/mca_dataset/Form-ADT-2-help/Instruction_Kit_eForm ADT-2.pdf']
Loaded 24 docs


Parsing documents into nodes: 100%|████████████| 24/24 [00:00<00:00, 830.91it/s]

Parsed 24 nodes





In [21]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)
train_dataset = generate_qa_embedding_pairs(train_nodes)
val_dataset = generate_qa_embedding_pairs(val_nodes)

train_dataset.save_json("train_dataset.json")
val_dataset.save_json("val_dataset.json")

ImportError: cannot import name 'OpenAIFineTuningHandler' from 'llama_index.callbacks' (/Users/aashaysachdeva/anaconda3/envs/dl_py38/lib/python3.11/site-packages/llama_index/callbacks/__init__.py)

In [20]:
!pip install --upgrade llama-index

Collecting llama-index
  Obtaining dependency information for llama-index from https://files.pythonhosted.org/packages/95/57/2c175d86dd2a85eb3ad74a6fec8afb5353b326cf5eae85ede6313eb14de7/llama_index-0.8.13-py3-none-any.whl.metadata
  Downloading llama_index-0.8.13-py3-none-any.whl.metadata (4.9 kB)
Collecting langchain>=0.0.262 (from llama-index)
  Obtaining dependency information for langchain>=0.0.262 from https://files.pythonhosted.org/packages/1d/63/1e7c55a653c93cc83b642a7cf5893a9caa4a1325c2b33bc7796bd7a0f143/langchain-0.0.276-py3-none-any.whl.metadata
  Downloading langchain-0.0.276-py3-none-any.whl.metadata (14 kB)
Collecting langsmith<0.1.0,>=0.0.21 (from langchain>=0.0.262->llama-index)
  Obtaining dependency information for langsmith<0.1.0,>=0.0.21 from https://files.pythonhosted.org/packages/2b/cb/3525fb0d1bf144840c726345a107ad35998565a05e99d4bfec755c71ffd8/langsmith-0.0.27-py3-none-any.whl.metadata
  Downloading langsmith-0.0.27-py3-none-any.whl.metadata (10 kB)
Downloading l

In [None]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en",
    model_output_path="test_model",
    val_dataset=val_dataset,
)
finetune_engine.finetune()
embed_model = finetune_engine.get_finetuned_model()
embed_model

In [None]:
from llama_index.embeddings import OpenAIEmbedding
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd


def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(nodes, service_context=service_context, show_progress=True)
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]
        expected_id = relevant_docs[query_id][0]
        is_hit = expected_id in retrieved_ids  # assume 1 relevant doc

        eval_result = {
            "is_hit": is_hit,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }
        eval_results.append(eval_result)
    return eval_results

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer


def evaluate_st(
    dataset,
    model_id,
    name,
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs, name=name)
    model = SentenceTransformer(model_id)
    return evaluator(model, output_path="results/")

In [None]:
ada = OpenAIEmbedding()
ada_val_results = evaluate(val_dataset, ada)
df_ada = pd.DataFrame(ada_val_results)
hit_rate_ada = df_ada["is_hit"].mean()




In [None]:
bge = "local:BAAI/bge-small-en"
bge_val_results = evaluate(val_dataset, bge)
df_bge = pd.DataFrame(bge_val_results)
hit_rate_bge = df_bge["is_hit"].mean()

evaluate_st(val_dataset, "BAAI/bge-small-en", name="bge")