In [8]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys
from pathlib import Path
from typing import List

from dotenv import load_dotenv
from llama_index.core import Document
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex
from llama_index.core.postprocessor import SentenceTransformerRerank, LLMRerank
from llama_index.core import QueryBundle
import faiss
from huggingface_hub import hf_hub_download
from huggingface_hub import login
import pandas as pd


# Load variables from a local .env file into the process environment, then
# authenticate against the Hugging Face Hub with the token held in HF_TOKEN.
# If HF_TOKEN is not set, the lookup below raises KeyError.
load_dotenv()
login(token=os.environ['HF_TOKEN'])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /nfs/home/scg1143/.cache/huggingface/token
Login successful


In [31]:
# LlamaIndex global settings for the LLM and the embedding model.
# EMBED_DIMENSION is the vector size handed to the FAISS index when it is created;
# NOTE(review): assumed to equal the output size of `model_id` — confirm against the model card.
EMBED_DIMENSION=1024
model_id="baconnier/Finance_embedding_large_en-V0.1"
# No LLM is configured; LlamaIndex reports that it substitutes MockLLM.
Settings.llm = None
# All embedding work that relies on the global Settings uses this Hugging Face model.
Settings.embed_model = HuggingFaceEmbedding(model_name=model_id)

LLM is explicitly disabled. Using MockLLM.


## FinQABench

In [10]:
# The dataset files are read from a `data` directory that is a sibling of the
# current working directory (i.e. <cwd>/../data).
data_dir = Path.cwd().parent / 'data'
# Quick sanity check: list what is actually present in that directory.
print(os.listdir(data_dir))

['convfinqa_queries.jsonl', 'ConvFinQA_qrels.tsv', 'FinanceBench_qrels.tsv', 'FinDER_qrels.tsv', 'TATQA_qrels.tsv', 'finqabench_corpus.jsonl', 'finder_corpus.jsonl', 'tatqa_corpus.jsonl', 'tatqa_queries.jsonl', 'multiheirtt_corpus.jsonl', 'convfinqa_corpus.jsonl', 'finqa_queries.jsonl', 'multiheirtt_queries.jsonl', 'finqa_corpus.jsonl', 'financebench_queries.jsonl', 'FinQA_qrels.tsv', 'FinQABench_qrels.tsv', 'sample_submission_.csv', 'MultiHeirtt_qrels.tsv', 'finder_queries.jsonl', 'financebench_corpus.jsonl', 'finqabench_queries.jsonl']


In [13]:
# Locations of the FinQABench corpus, queries and qrels files under data_dir.
# The corpus and query files live one level down, inside entries whose own
# names end in `.jsonl`.
finq_bench_corpus_path = data_dir / 'finqabench_corpus.jsonl/corpus.jsonl'
finq_bench_query_path = data_dir / 'finqabench_queries.jsonl/queries.jsonl'
# NOTE(review): this path is defined here but not read in the cells shown.
finq_bench_tsv_path = data_dir / 'FinQABench_qrels.tsv'

In [26]:
# Load corpus and queries into DataFrames; lines=True makes pandas treat every
# line of the file as a separate JSON record.
finq_bench_corpus = pd.read_json(finq_bench_corpus_path, lines=True)
finq_bench_queries = pd.read_json(finq_bench_query_path, lines=True)
# Preview the first rows of the corpus.
finq_bench_corpus.head()

Unnamed: 0,_id,title,text
0,d4aa0660c,,Apple Inc.\nCONSOLIDATED STATEMENTS OF OPERATI...
1,d4aa04834,,to \ncover all losses or all types of claims ...
2,d4aa01288,,The Company’s operations are also subject to t...
3,d4aa0a270,,"such election by written consent, other than ..."
4,d4aa09b4a,,"rant, dated as of August 20, 2020, including \..."


In [20]:
def create_documents(df):
    """Turn every row of a corpus DataFrame into a ``Document``.

    Args:
        df: DataFrame that provides ``text``, ``_id`` and ``title`` columns.

    Returns:
        A list with one ``Document`` per row, in row order. Each document takes
        the row's ``text`` as its text and carries ``_id`` and ``title`` in its
        metadata.
    """
    return [
        Document(
            text=record['text'],
            metadata={'_id': record['_id'], 'title': record['title']},
        )
        for _, record in df.iterrows()
    ]

In [22]:
# Wrap every row of the FinQABench corpus in a Document via create_documents.
finq_bench_documents = create_documents(finq_bench_corpus)

In [23]:
# Create FaissVectorStore to store embeddings
# The flat L2 FAISS index is created with EMBED_DIMENSION as its vector size.
fais_index = faiss.IndexFlatL2(EMBED_DIMENSION)
vector_store = FaissVectorStore(faiss_index=fais_index)

# Ingestion pipeline over the FinQABench documents; the only transformation is
# SentenceSplitter with its default settings.
# NOTE(review): no embedding transformation is listed here, so the nodes
# presumably come back without embeddings and nothing may be written to
# `vector_store` by this run — confirm against the IngestionPipeline docs.
base_pipeline = IngestionPipeline(
    transformations=[SentenceSplitter()],
    vector_store=vector_store,
    documents=finq_bench_documents
)

# Run the pipeline and keep the resulting nodes for indexing.
nodes = base_pipeline.run()

In [29]:
# Show the query strings as a plain Python list.
finq_bench_queries['text'].tolist()

['What is the redemption price for the 0.875% 2025 Notes and the 1.375% 2029 Notes if they are redeemed prior to the applicable Par Call Date?',
 'What is the maturity date of the 0.875% 2025 Notes issued by Apple Inc.?',
 'What are the conditions under which the Company is not required to make any payment for any Tax imposed by any government or a political subdivision or taxing authority of or in any government or political subdivision?',
 "What is the Company's investment policy and strategy focused on?",
 "What is the purpose of the Company's internal control over financial reporting?",
 'What was the net income for Apple Inc. in 2020?',
 "What is the Company's fiscal year and how does it align with calendar quarters?",
 "What risks may the COVID-19 pandemic pose to the Company's operational and financial performance?",
 'What is the total net deferred tax assets as of September 24, 2022?',
 'Who signed the Annual Report on Form 10-K on behalf of the Registrant on October 27, 2022?

In [32]:
# Create vector index from base nodes
index = VectorStoreIndex(nodes)

query_engine_w_cross_encoder = index.as_query_engine(
    similarity_top_k=10,
    node_postprocessors=[
        SentenceTransformerRerank(
            model='cross-encoder/ms-marco-MiniLM-L-6-v2',
            top_n=5
        )
    ],
)

resp = query_engine_w_cross_encoder.query("What is the maturity date of the 0.875% 2025 Notes issued by Apple Inc.?")
print(resp)

Context information is below.
---------------------
_id: d4aa0a52c
title:

the maturity date of the 0.875% 2025 Notes), (iii) with respect to the 1.375% 2029 Notes, February 
24, 2029 (three months prior to the maturity date of 1.375% 2029 Notes) and (iv) with respect to the 2031 Notes, 
August 15, 2031 (three months prior to the maturity of the 2031 Notes).
7

_id: d4aa0b1f2
title:

000% 2025 Notes, the 0.875% 2025 Notes, the 1.375% 2029 Notes and the 2031 
Notes at our option, at any time in whole or from time to time in part, prior to the applicable Par Call Date at a 
redemption price equal to the greater of: 
•100% of the principal amount of the Notes to be redeemed; or 
•the sum of the present values of the remaining scheduled payments of principal and interest thereon 
assuming that the Notes matured on the applicable Par Call Date (not including any portion of such 
payments of interest accrued as of the date of redemption), discounted to the date of redemption on an 
annual ba