In [7]:
import warnings
warnings.filterwarnings("ignore")

import os
from pathlib import Path

import faiss
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import login
from llama_index.core import Document
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.schema import MetadataMode, NodeWithScore, QueryBundle
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore


# Load environment variables (python-dotenv), then authenticate with the
# Hugging Face Hub using the HF_TOKEN variable.
# os.environ[...] raises KeyError if HF_TOKEN is not set.
load_dotenv()
login(os.environ['HF_TOKEN'])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /nfs/home/scg1143/.cache/huggingface/token
Login successful


In [21]:
class Config:
    """Tunable constants shared by the retrieval cells of this notebook."""

    # Hugging Face model used for Settings.embed_model.
    EMBED_MODEL = "baconnier/Finance_embedding_large_en-V0.1"
    # Dimensionality handed to faiss.IndexFlatL2.
    # NOTE(review): presumably the output size of EMBED_MODEL — confirm when swapping models.
    EMBED_DIMENSION = 1024

    # Cross-encoder model passed to SentenceTransformerRerank.
    RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"

    # Number of chunks requested from the vector retriever per query.
    SIM_TOP_K = 50
    # Number of chunks the reranker keeps (its top_n).
    RERANKER_TOP_N = 30


cfg = Config()

In [3]:
# LlamaIndex global settings for the LLM and the embedding model.
# No LLM is configured here; the embedding model is loaded from cfg.EMBED_MODEL.
Settings.llm = None
Settings.embed_model = HuggingFaceEmbedding(model_name=cfg.EMBED_MODEL)

LLM is explicitly disabled. Using MockLLM.


## FinQABench

In [9]:
# Dataset files are looked up in <cwd>/../data, i.e. the `data` directory that
# sits beside the current working directory. List it as a quick sanity check.
data_dir = Path.cwd().parent / 'data'
print(os.listdir(data_dir))

['convfinqa_queries.jsonl', 'ConvFinQA_qrels.tsv', 'FinanceBench_qrels.tsv', 'FinDER_qrels.tsv', 'TATQA_qrels.tsv', 'finqabench_corpus.jsonl', 'finder_corpus.jsonl', 'tatqa_corpus.jsonl', 'tatqa_queries.jsonl', 'multiheirtt_corpus.jsonl', 'convfinqa_corpus.jsonl', 'finqa_queries.jsonl', 'multiheirtt_queries.jsonl', 'finqa_corpus.jsonl', 'financebench_queries.jsonl', 'FinQA_qrels.tsv', 'FinQABench_qrels.tsv', 'sample_submission_.csv', 'MultiHeirtt_qrels.tsv', 'finder_queries.jsonl', 'financebench_corpus.jsonl', 'finqabench_queries.jsonl']


In [10]:
# FinQABench inputs. The corpus and query files are nested inside directories
# whose names themselves end in `.jsonl`; the qrels file is a TSV at the top level.
finq_bench_corpus_path = data_dir / 'finqabench_corpus.jsonl/corpus.jsonl'
finq_bench_query_path = data_dir / 'finqabench_queries.jsonl/queries.jsonl'
finq_bench_tsv_path = data_dir / 'FinQABench_qrels.tsv'

In [11]:
# Read the corpus and the queries as line-delimited JSON (one record per line)
# and report how many rows each frame holds.
finq_bench_corpus = pd.read_json(finq_bench_corpus_path, lines=True)
finq_bench_queries = pd.read_json(finq_bench_query_path, lines=True)
print("Dataset:FinQ Bench\nTotal Corpus:{}\nTotal Queries:{}".format(finq_bench_corpus.shape[0], finq_bench_queries.shape[0]))

Dataset:FinQ Bench
Total Corpus:92
Total Queries:100


In [23]:
def create_documents(df):
    """Turn every row of ``df`` into a ``Document``.

    ``df`` must provide the columns ``text``, ``_id`` and ``title``. The row's
    ``text`` becomes the document text; ``_id`` and ``title`` are attached as
    metadata under the same keys.

    Returns a list of ``Document`` objects in row order.
    """
    return [
        Document(
            text=record['text'],
            metadata={'_id' : record['_id'], 'title' : record['title']},
        )
        for _, record in df.iterrows()
    ]

# TODO: Create Custom Retriever Class after finalizing the experiment
# https://docs.llamaindex.ai/en/stable/examples/query_engine/CustomRetrievers/


class RetrievalAgent:
    """Vector retriever over a set of documents with an optional reranking stage.

    On construction the documents are split into chunks, indexed in a FAISS
    backed ``VectorStoreIndex`` and a ``SentenceTransformerRerank`` reranker is
    loaded. ``retrieve_nodes`` then answers individual queries.

    Args:
        cfg: Object exposing ``EMBED_DIMENSION``, ``RERANKER_MODEL``,
            ``RERANKER_TOP_N`` and ``SIM_TOP_K``.
        documents: Documents to chunk and index.
    """

    def __init__(self, cfg, documents):
        self.cfg = cfg
        self.documents = documents

        self.index, self.reranker = self.initialise_retrieval_components()

    def initialise_retrieval_components(self):
        """Build the vector index and the reranker.

        Returns:
            Tuple ``(index, reranker)``.
        """
        # Create the FaissVectorStore that holds the chunk embeddings.
        # NOTE(review): EMBED_DIMENSION must equal the embedding model's output
        # size, otherwise FAISS rejects the vectors on insert — confirm per model.
        faiss_index = faiss.IndexFlatL2(self.cfg.EMBED_DIMENSION)
        vector_store = FaissVectorStore(faiss_index=faiss_index)
        # The store only takes effect when it is handed to the index through a
        # StorageContext; VectorStoreIndex(nodes) on its own falls back to the
        # default in-memory store and leaves the FAISS index empty.
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        print("Vector Store Created")

        ## Can experiment with different transformations
        # The pipeline only chunks the documents; embedding and insertion into
        # the vector store are done by VectorStoreIndex below, so the store is
        # deliberately not passed to the pipeline (avoids double insertion if an
        # embedding transformation is added later).
        base_pipeline = IngestionPipeline(
            transformations=[SentenceSplitter(chunk_size=256, chunk_overlap=20)],
            documents=self.documents
        )
        nodes = base_pipeline.run()

        # Create vector index from base nodes, persisting embeddings in FAISS
        index = VectorStoreIndex(nodes, storage_context=storage_context)
        print("Vector Index Initialised")

        # Create Reranker
        reranker = SentenceTransformerRerank(
                    model=self.cfg.RERANKER_MODEL,
                    top_n=self.cfg.RERANKER_TOP_N
                )
        print("Reranker Initialised")
        return index, reranker

    def retrieve_nodes(self, query_str, with_reranker=True):
        """Retrieve chunks for ``query_str``.

        Args:
            query_str: Query text.
            with_reranker: When true (default) the retrieved chunks are passed
                through the reranker, which keeps ``cfg.RERANKER_TOP_N`` of them.

        Returns:
            The retrieved (and optionally reranked) nodes with scores.
        """
        query_bundle = QueryBundle(query_str)
        # configure retriever
        retriever = VectorIndexRetriever(
            index=self.index,
            similarity_top_k=self.cfg.SIM_TOP_K
        )
        retrieved_nodes = retriever.retrieve(query_bundle)

        # NOTE(review): without the reranker the scores come straight from the
        # FAISS L2 index — presumably distances (lower is better); verify before
        # ranking on them downstream.
        if with_reranker:
            retrieved_nodes = self.reranker.postprocess_nodes(
                retrieved_nodes, query_bundle
            )

        return retrieved_nodes

In [60]:
def create_df_from_nodes(nodes, extract_unique=True):
    """Tabulate retrieved nodes, optionally keeping one row per corpus document.

    Args:
        nodes: Iterable of objects exposing ``score``, ``text`` and
            ``metadata['_id']``.
        extract_unique: When false, return one row per node in input order with
            columns ``score``, ``text``, ``corpus_id``. When true (default),
            keep only the highest-scoring node of every ``corpus_id`` and
            return columns ``corpus_id``, ``text``, ``score`` sorted by score,
            best first, with a fresh 0..n-1 index.

    Returns:
        A ``pandas.DataFrame``. An empty ``nodes`` input yields an empty frame
        that still carries the expected columns.
    """
    # Pinning the columns keeps the frame well-formed when `nodes` is empty;
    # without them the later column lookups raise KeyError.
    chunk_df = pd.DataFrame(
        [
            {
                "score" : node.score,
                "text" : node.text,
                "corpus_id" : node.metadata['_id'],
            }
            for node in nodes
        ],
        columns=["score", "text", "corpus_id"],
    )

    if not extract_unique:
        return chunk_df

    # A stable descending sort followed by "keep first per corpus_id" selects
    # the best-scoring chunk of each document; on score ties the chunk that
    # appeared first in `nodes` wins.
    return (
        chunk_df
        .sort_values(by="score", ascending=False, kind="mergesort")
        .drop_duplicates(subset="corpus_id", keep="first")
        .loc[:, ["corpus_id", "text", "score"]]
        .reset_index(drop=True)
    )

In [57]:
# Create FinQ Bench Documents (one Document per corpus row)
finq_bench_documents = create_documents(finq_bench_corpus)

# Initialize Retrieval Agent: chunks the documents, builds the vector index and
# loads the reranker (progress is printed by the agent itself).
ret_agent = RetrievalAgent(cfg=cfg, documents=finq_bench_documents)

Vector Store Created
Vector Index Initialised
Reranker Initialised


In [58]:
# Smoke test on a single query: retrieve (reranker on by default), then collapse
# the chunks to one row per corpus document.
sample_query = finq_bench_queries.iloc[10].text
nodes = ret_agent.retrieve_nodes(sample_query)
node_df = create_df_from_nodes(nodes)

In [61]:
# Maximum number of unique corpus ids kept per query.
TOP_K_PER_QUERY = 10

query_id_list = []
corpus_id_list = []
score_list = []

for idx,row in finq_bench_queries.iterrows():
    query_id = row['_id']
    query_text = row['text']

    nodes = ret_agent.retrieve_nodes(query_text)
    # Extract the top unique corpus ids (at most TOP_K_PER_QUERY of them)
    node_df = create_df_from_nodes(nodes).head(TOP_K_PER_QUERY)

    # Repeat the query id once per row actually kept: after de-duplication a
    # query can yield fewer than TOP_K_PER_QUERY documents, and a fixed repeat
    # count would leave the three lists with different lengths.
    query_id_list.extend([query_id] * len(node_df))
    corpus_id_list.extend(node_df.corpus_id.tolist())
    score_list.extend(node_df.score.tolist())


final_df = pd.DataFrame({
    "query_id" : query_id_list,
    "corpus_id" : corpus_id_list,
    "score" : score_list
})

In [62]:
# Preview the first 20 rows of the per-query ranking.
final_df.head(20)

Unnamed: 0,query_id,corpus_id,score
0,q4aa0b116,d4aa0b1f2,8.651594
1,q4aa0b116,d4aa0a52c,4.465078
2,q4aa0b116,d4aa0a9e6,4.373746
3,q4aa0b116,d4aa18e92,4.332072
4,q4aa0b116,d4aa1b854,4.299079
5,q4aa0b116,d4aa10314,4.128644
6,q4aa0b116,d4aa0a7d4,3.507901
7,q4aa0b116,d4aa09b4a,2.562739
8,q4aa0b116,d4aa1afee,1.981896
9,q4aa0b116,d4aa0a66c,0.916237
