In [1]:
import os
from pathlib import Path
import sys
sys.path.insert(0, str(Path(os.getcwd()).parent))

In [2]:
import logging
from financerag.retrieval import BM25Retriever, BM25Processor
import financerag.tasks as tasks_module

import importlib
import inspect
import os
import json
import pandas as pd
from datasets import load_dataset

from nltk.tokenize import word_tokenize, TweetTokenizer
from rank_bm25 import BM25Okapi
import nltk

tweet_tokenizer = TweetTokenizer()
logging.basicConfig(level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
retrieval = ['FinDER', 'FinQABench', 'FinanceBench', 'TATQA', 'FinQA', 'ConvFinQA', 'MultiHiertt']
tabular_retrieval = ['TATQA', 'FinQA', 'ConvFinQA', 'MultiHiertt']

In [4]:
from typing import Any, Callable, Dict, List, Literal, Optional

def tokenize_list_tweet(input_list: List[str]) -> List[List[str]]:
    """
    Tokenizes a list of strings using the `nltk.word_tokenize` function.

    Args:
        input_list (`List[str]`):
            A list of input strings to be tokenized.

    Returns:
        `List[List[str]]`:
            A list where each element is a list of tokens corresponding to an input string.
    """
    return list(map(tweet_tokenizer.tokenize, input_list))

In [5]:
output_dir = './BM25'
os.makedirs(output_dir, exist_ok=True)

def process_task(task_class, corpus_documents, output_dir, finder_task):
    # Tokenize corpus
    processor = BM25Processor(tokenizer=tweet_tokenizer.tokenize)
    # Tokenize and prepare the corpus
    tokenized_corpus = processor.build_corpus(corpus_documents)
    
    print(len(tokenized_corpus))
    # tokenized_corpus = [tweet_tokenizer.tokenize(doc) for doc in corpus_documents]
    
    # Initialize BM25 and retrieval model
    bm25_model = BM25Okapi(tokenized_corpus)
    
    retrieval_model = BM25Retriever(bm25_model, tokenize_list_tweet)
    
    # Retrieve documents
    retrieval_result = finder_task.retrieve(
        retriever=retrieval_model,
        # top_k=500
    )
    
    # Save retrieval result
    file_name = f"{output_dir}/{task_class}.json"
    with open(file_name, "w") as json_file:
        json.dump(retrieval_result, json_file, indent=4)
    
    # Save evaluation result
    df = pd.read_csv(f'../data/{task_class.split("_")[0]}_qrels.tsv', sep='\t')
    qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
    
    eval_result = finder_task.evaluate(qrels_dict, retrieval_result, [1, 5, 10])
    combined_result = {**eval_result[0], **eval_result[1], **eval_result[2], **eval_result[3]}
    df_eval = pd.DataFrame([combined_result])
    df_eval.to_csv(f'{output_dir}/{task_class.split("_")[0]}_eval.csv', index=False)


In [12]:
processor = BM25Processor(tokenizer=tweet_tokenizer.tokenize)
# Tokenize and prepare the corpus
tokenized_corpus = processor.build_corpus(corpus)

In [17]:
corpus_list = [{"title": value["title"], "text": value["text"]} for value in finder_task.corpus.values()]


In [19]:
token = processor.build_corpus(corpus_list)
len(token)

13863

In [6]:
task_class = 'FinDER'
task_class_obj = getattr(tasks_module, task_class)
finder_task = task_class_obj()

# corpus = pd.read_csv(f"./data/{task_class}_corpus_convert.csv")
corpus = load_dataset("Linq-AI-Research/FinanceRAG", task_class, split="corpus")
# process_task(task_class, corpus, output_dir, finder_task)

INFO:financerag.common.loader:Loading Corpus...
INFO:financerag.common.loader:Loaded 13867 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'ADBE20230004', 'title': 'ADBE OVERVIEW', 'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsi

In [11]:
finder_task.corpus

{'ADBE20230004': {'title': 'ADBE OVERVIEW',
  'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsible. Our products and services help unleash creativity, accelerate document productivity and power businesses in a digital world.'},
 'ADBE20230006': {

In [6]:

for task_class in retrieval:
    print(f"Running for {task_class}")
    if task_class in tabular_retrieval:
        pass
    task_class_obj = getattr(tasks_module, task_class)
    finder_task = task_class_obj()

    # corpus = pd.read_csv(f"./data/{task_class}_corpus_convert.csv")
    corpus = load_dataset("Linq-AI-Research/FinanceRAG", task_class, split="corpus")
    process_task(task_class, corpus, output_dir, finder_task)

INFO:financerag.common.loader:Loading Corpus...


Running for FinDER


INFO:financerag.common.loader:Loaded 13867 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'ADBE20230004', 'title': 'ADBE OVERVIEW', 'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsible. Our products and services help unleash crea

13867


  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.0000
INFO:financerag.tasks.BaseTask:NDCG@5: 0.0048
INFO:financerag.tasks.BaseTask:NDCG@10: 0.0116
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.0000
INFO:financerag.tasks.BaseTask:MAP@5: 0.0026
INFO:financerag.tasks.BaseTask:MAP@10: 0.0050
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.0000
INFO:financerag.tasks.BaseTask:Recall@5: 0.0078
INFO:financerag.tasks.BaseTask:Recall@10: 0.0273
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.0000
INFO:financerag.tasks.BaseTask:P@5: 0.0031
INFO:financerag.tasks.BaseTask:P@10: 0.0047
INFO:financerag.common.loader:Loading Corpus.

Running for FinQABench


INFO:financerag.common.loader:Loaded 92 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd4aa0660c', 'title': '', 'text': 'Apple Inc.\nCONSOLIDATED STATEMENTS OF OPERATIONS\n(In millions, except number of shares which are reflected in thousands and per share amounts)\nYears ended\nSeptember 24,\n2022September 25,\n2021September 26,\n2020\nNet sales:\n   Products $ 316,199 $ 297,392 $ 220,747 \n   Services  78,129  68,425  53,768 \nTotal net sales  394,328  365,817  274,515 \nCost of sales:\n   Products  201,471  192,266  151,286 \n   Services  22,075  20,715  18,273 \nTotal cost of sales  223,546  212,981  169,559 \nGross margin  170,782  152,836  104,956 \nOperating expenses:\nResearch and development  26,251  21,914  18,752 \nSelling, general and administrative  25,094  21,973  19,916 \nTotal operating expenses  51,345  43,887  38,668 \nOperating income  119,437  108,949  66,288 \nOther income/(expense), net  (334)  258  803 \nIncome before provision for income taxes 

92
Running for FinanceBench


INFO:financerag.common.loader:Loaded 180 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'dd2af2336', 'title': 'PEPSICO_2022_10K', 'text': '6) Africa, Middle East and South Asia (AMESA), which includes all of our beverage and convenient food businesses in\nAfrica, the Middle East and South Asia; and\n7) Asia Pacific, Australia and New Zealand and China Region (APAC), which includes all of our beverage and convenient\nfood businesses in Asia Pacific, Australia and New Zealand, and China region.'}
INFO:financerag.common.loader:Loading Queries...
INFO:financerag.common.loader:Loaded 150 Queries.
INFO:financerag.common.loader:Query Example: {'id': 'qd2ac917a', 'text': 'What is the FY2019 - FY2020 total revenue growth rate for Block (formerly known as Square)? Answer in units of percents and round to one decimal place. Approach the question asked by assuming the standpoint of an investment banking analyst who only has access to the statement of income.'}
INFO:financerag.retr

180
Running for TATQA


INFO:financerag.common.loader:Loaded 2756 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd1b2e74c0', 'title': '', 'text': 'The following tables present the recorded investment by portfolio segment and by class, excluding commercial financing receivables and other miscellaneous financing receivables at December 31, 2019 and 2018. Commercial financing receivables are excluded from the presentation of financing receivables by portfolio segment, as they are short term in nature and the current estimated risk of loss and resulting impact to the company’s financing results are not material.\nWrite-offs of lease receivables and loan receivables were $16 million and $47 million, respectively, for the year ended December 31, 2019. Provisions for credit losses recorded for lease receivables and loan receivables were a release of $6 million and an addition of $2 million, respectively, for the year ended December 31, 2019.\nThe average recorded investment of impaired leases and l

2756


  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.1647
INFO:financerag.tasks.BaseTask:NDCG@5: 0.2378
INFO:financerag.tasks.BaseTask:NDCG@10: 0.2646
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.1647
INFO:financerag.tasks.BaseTask:MAP@5: 0.2149
INFO:financerag.tasks.BaseTask:MAP@10: 0.2261
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.1647
INFO:financerag.tasks.BaseTask:Recall@5: 0.3072
INFO:financerag.tasks.BaseTask:Recall@10: 0.3896
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.1647
INFO:financerag.tasks.BaseTask:P@5: 0.0614
INFO:financerag.tasks.BaseTask:P@10: 0.0390
INFO:financerag.common.loader:Loading Corpus.

Running for FinQA


INFO:financerag.common.loader:Loaded 2789 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd61d9e858', 'title': '', 'text': 'performance graph the performance graph below shows the five-year cumulative total stockholder return on applied common stock during the period from october 25 , 2009 through october 26 , 2014 .\nthis is compared with the cumulative total return of the standard & poor 2019s 500 stock index and the rdg semiconductor composite index over the same period .\nthe comparison assumes $ 100 was invested on october 25 , 2009 in applied common stock and in each of the foregoing indices and assumes reinvestment of dividends , if any .\ndollar amounts in the graph are rounded to the nearest whole dollar .\nthe performance shown in the graph represents past performance and should not be considered an indication of future performance .\ncomparison of 5 year cumulative total return* among applied materials , inc. , the s&p 500 index 201cs&p 201d is a registered 

2789


  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.2849
INFO:financerag.tasks.BaseTask:NDCG@5: 0.4228
INFO:financerag.tasks.BaseTask:NDCG@10: 0.4585
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.2849
INFO:financerag.tasks.BaseTask:MAP@5: 0.3826
INFO:financerag.tasks.BaseTask:MAP@10: 0.3978
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.2849
INFO:financerag.tasks.BaseTask:Recall@5: 0.5436
INFO:financerag.tasks.BaseTask:Recall@10: 0.6512
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.2849
INFO:financerag.tasks.BaseTask:P@5: 0.1087
INFO:financerag.tasks.BaseTask:P@10: 0.0651
INFO:financerag.common.loader:Loading Corpus.

Running for ConvFinQA


INFO:financerag.common.loader:Loaded 2066 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'dd4bff516', 'title': '', 'text': 'containerboard , kraft papers and saturating kraft .\nkapstone also owns victory packaging , a packaging solutions distribution company with facilities in the u.s. , canada and mexico .\nwe have included the financial results of kapstone in our corrugated packaging segment since the date of the acquisition .\non september 4 , 2018 , we completed the acquisition ( the 201cschl fcter acquisition 201d ) of schl fcter print pharma packaging ( 201cschl fcter 201d ) .\nschl fcter is a leading provider of differentiated paper and packaging solutions and a german-based supplier of a full range of leaflets and booklets .\nthe schl fcter acquisition allowed us to further enhance our pharmaceutical and automotive platform and expand our geographical footprint in europe to better serve our customers .\nwe have included the financial results of the acquired op

2066


  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.2381
INFO:financerag.tasks.BaseTask:NDCG@5: 0.3824
INFO:financerag.tasks.BaseTask:NDCG@10: 0.4224
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.2381
INFO:financerag.tasks.BaseTask:MAP@5: 0.3452
INFO:financerag.tasks.BaseTask:MAP@10: 0.3612
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.2381
INFO:financerag.tasks.BaseTask:Recall@5: 0.4921
INFO:financerag.tasks.BaseTask:Recall@10: 0.6190
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.2381
INFO:financerag.tasks.BaseTask:P@5: 0.0984
INFO:financerag.tasks.BaseTask:P@10: 0.0619
INFO:financerag.common.loader:Loading Corpus.

Running for MultiHiertt


INFO:financerag.common.loader:Loaded 10475 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd8e4ea4ac', 'title': '', 'text': '|  | Years Ended December 31, |\n|  | 2006 | 2005 |\n|  | (In millions) |\n| Investment return | $192 | $-26 |\n| Expense | 45 | 11 |\n| In-force/Persistency | -7 | -33 |\n| Policyholder dividends and other | -39 | -11 |\n| Total | $191 | $-59 |\nAs of December 31, 2006 and 2005, DAC and VOBA for the Individual segment were $14.0 billion and $13.5 billion, respectively, and for the total Company were $20.8 billion and $19.7 billion, respectively.\nGoodwill Goodwill is the excess of cost over the fair value of net assets acquired.\nThe Company tests goodwill for impairment at least annually or more frequently if events or circumstances, such as adverse changes in the business climate, indicate that there may be justification for conducting an interim test.\nImpairment testing is performed using the fair value approach, which requires the use of es

10475


INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.0822
INFO:financerag.tasks.BaseTask:NDCG@5: 0.0576
INFO:financerag.tasks.BaseTask:NDCG@10: 0.0643
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.0220
INFO:financerag.tasks.BaseTask:MAP@5: 0.0324
INFO:financerag.tasks.BaseTask:MAP@10: 0.0351
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.0220
INFO:financerag.tasks.BaseTask:Recall@5: 0.0500
INFO:financerag.tasks.BaseTask:Recall@10: 0.0683
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.0822
INFO:financerag.tasks.BaseTask:P@5: 0.0390
INFO:financerag.tasks.

In [19]:
output_dir = './BM25'

master_result = pd.DataFrame()
for task_class in retrieval:
    df_eval = pd.read_csv(f'{output_dir}/{task_class.split("_")[0]}_eval.csv')
    df_eval['task'] = task_class
    master_result = pd.concat([master_result, df_eval])
master_result.to_csv(f'{output_dir}/master_eval.csv')


In [20]:
master_result

Unnamed: 0,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,Recall@1,Recall@5,Recall@10,P@1,P@5,P@10,task
0,0.0,0.00479,0.01156,0.0,0.0026,0.00499,0.0,0.00781,0.02734,0.0,0.00313,0.00469,FinDER
0,0.56667,0.71819,0.74058,0.56667,0.67,0.67972,0.56667,0.86667,0.93333,0.56667,0.17333,0.09333,FinQABench
0,0.08889,0.14304,0.16246,0.06667,0.11741,0.12552,0.06667,0.2,0.25556,0.08889,0.04444,0.02889,FinanceBench
0,0.16466,0.23783,0.26463,0.16466,0.21493,0.22609,0.16466,0.30723,0.38956,0.16466,0.06145,0.03896,TATQA
0,0.28488,0.42283,0.45848,0.28488,0.38261,0.39783,0.28488,0.5436,0.65116,0.28488,0.10872,0.06512,FinQA
0,0.2381,0.38241,0.42243,0.2381,0.34524,0.36115,0.2381,0.49206,0.61905,0.2381,0.09841,0.0619,ConvFinQA
0,0.08219,0.05764,0.06433,0.02203,0.03245,0.03506,0.02203,0.04995,0.0683,0.08219,0.03904,0.02671,MultiHiertt


In [14]:
output_dir = './BM25_ms-marco-MiniLM-L-12-v2'
os.makedirs(output_dir, exist_ok=True)
from financerag.rerank import CrossEncoderReranker
from sentence_transformers import CrossEncoder

# reranker = CrossEncoderReranker(
#     model=CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
# )
def process_task_reranker(task_class, corpus_documents, output_dir, finder_task):
    # Tokenize corpus
    reranker = CrossEncoderReranker(
    model=CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
    )
    processor = BM25Processor(tokenizer=tweet_tokenizer.tokenize)
    # Tokenize and prepare the corpus
    tokenized_corpus = processor.build_corpus(corpus_documents)
    
    print(len(tokenized_corpus))
    # tokenized_corpus = [tweet_tokenizer.tokenize(doc) for doc in corpus_documents]
    
    # Initialize BM25 and retrieval model
    bm25_model = BM25Okapi(tokenized_corpus)
    
    retrieval_model = BM25Retriever(bm25_model, tokenize_list_tweet)
    
    # Retrieve documents
    retrieval_result = finder_task.retrieve(
        retriever=retrieval_model,
        # top_k=500
    )
    reranking_result = finder_task.rerank(
                        reranker=reranker,
                        results=retrieval_result,
                        top_k=100,  # Rerank the top 100 documents
                        batch_size=32
                    )
    # Save retrieval result
    file_name = f"{output_dir}/{task_class}.json"
    with open(file_name, "w") as json_file:
        json.dump(reranking_result, json_file, indent=4)
    
    # Save evaluation result
    df = pd.read_csv(f'../data/{task_class.split("_")[0]}_qrels.tsv', sep='\t')
    qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
    
    eval_result = finder_task.evaluate(qrels_dict, reranking_result, [1, 5, 10])
    combined_result = {**eval_result[0], **eval_result[1], **eval_result[2], **eval_result[3]}
    df_eval = pd.DataFrame([combined_result])
    df_eval.to_csv(f'{output_dir}/{task_class.split("_")[0]}_eval.csv', index=False)


In [15]:
for task_class in retrieval:
    print(f"Running for {task_class}")
    if task_class in tabular_retrieval:
        pass
    task_class_obj = getattr(tasks_module, task_class)
    finder_task = task_class_obj()

    # corpus = pd.read_csv(f"./data/{task_class}_corpus_convert.csv")
    corpus = load_dataset("Linq-AI-Research/FinanceRAG", task_class, split="corpus")
    process_task_reranker(task_class, corpus, output_dir, finder_task)

INFO:financerag.common.loader:Loading Corpus...


Running for FinDER


INFO:financerag.common.loader:Loaded 13867 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'ADBE20230004', 'title': 'ADBE OVERVIEW', 'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsible. Our products and services help unleash crea

13867


INFO:financerag.rerank.cross_encoder:Starting To Rerank Top-100....
Batches: 100%|██████████| 675/675 [06:58<00:00,  1.61it/s]
  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.1094
INFO:financerag.tasks.BaseTask:NDCG@5: 0.1417
INFO:financerag.tasks.BaseTask:NDCG@10: 0.1446
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.0820
INFO:financerag.tasks.BaseTask:MAP@5: 0.1247
INFO:financerag.tasks.BaseTask:MAP@10: 0.1264
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.0820
INFO:financerag.tasks.BaseTask:Recall@5: 0.1641
INFO:financerag.tasks.BaseTask:Recall@10: 0.1719
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.1094
INFO:

Running for FinQABench


INFO:financerag.common.loader:Loaded 92 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd4aa0660c', 'title': '', 'text': 'Apple Inc.\nCONSOLIDATED STATEMENTS OF OPERATIONS\n(In millions, except number of shares which are reflected in thousands and per share amounts)\nYears ended\nSeptember 24,\n2022September 25,\n2021September 26,\n2020\nNet sales:\n   Products $ 316,199 $ 297,392 $ 220,747 \n   Services  78,129  68,425  53,768 \nTotal net sales  394,328  365,817  274,515 \nCost of sales:\n   Products  201,471  192,266  151,286 \n   Services  22,075  20,715  18,273 \nTotal cost of sales  223,546  212,981  169,559 \nGross margin  170,782  152,836  104,956 \nOperating expenses:\nResearch and development  26,251  21,914  18,752 \nSelling, general and administrative  25,094  21,973  19,916 \nTotal operating expenses  51,345  43,887  38,668 \nOperating income  119,437  108,949  66,288 \nOther income/(expense), net  (334)  258  803 \nIncome before provision for income taxes 

92


INFO:financerag.rerank.cross_encoder:Starting To Rerank Top-100....
Batches: 100%|██████████| 288/288 [03:17<00:00,  1.46it/s]
  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.8333
INFO:financerag.tasks.BaseTask:NDCG@5: 0.8667
INFO:financerag.tasks.BaseTask:NDCG@10: 0.8763
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.8333
INFO:financerag.tasks.BaseTask:MAP@5: 0.8556
INFO:financerag.tasks.BaseTask:MAP@10: 0.8589
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.8333
INFO:financerag.tasks.BaseTask:Recall@5: 0.9000
INFO:financerag.tasks.BaseTask:Recall@10: 0.9333
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.8333
INFO:

Running for FinanceBench


INFO:financerag.common.loader:Loaded 180 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'dd2af2336', 'title': 'PEPSICO_2022_10K', 'text': '6) Africa, Middle East and South Asia (AMESA), which includes all of our beverage and convenient food businesses in\nAfrica, the Middle East and South Asia; and\n7) Asia Pacific, Australia and New Zealand and China Region (APAC), which includes all of our beverage and convenient\nfood businesses in Asia Pacific, Australia and New Zealand, and China region.'}
INFO:financerag.common.loader:Loading Queries...
INFO:financerag.common.loader:Loaded 150 Queries.
INFO:financerag.common.loader:Query Example: {'id': 'qd2ac917a', 'text': 'What is the FY2019 - FY2020 total revenue growth rate for Block (formerly known as Square)? Answer in units of percents and round to one decimal place. Approach the question asked by assuming the standpoint of an investment banking analyst who only has access to the statement of income.'}
INFO:sentence_transf

180


Batches: 100%|██████████| 469/469 [05:04<00:00,  1.54it/s]
  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.5111
INFO:financerag.tasks.BaseTask:NDCG@5: 0.5172
INFO:financerag.tasks.BaseTask:NDCG@10: 0.5218
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.4111
INFO:financerag.tasks.BaseTask:MAP@5: 0.4889
INFO:financerag.tasks.BaseTask:MAP@10: 0.4921
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.4111
INFO:financerag.tasks.BaseTask:Recall@5: 0.5444
INFO:financerag.tasks.BaseTask:Recall@10: 0.5556
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.5111
INFO:financerag.tasks.BaseTask:P@5: 0.1422
INFO:financerag.tasks.BaseTask

Running for TATQA


INFO:financerag.common.loader:Loaded 2756 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd1b2e74c0', 'title': '', 'text': 'The following tables present the recorded investment by portfolio segment and by class, excluding commercial financing receivables and other miscellaneous financing receivables at December 31, 2019 and 2018. Commercial financing receivables are excluded from the presentation of financing receivables by portfolio segment, as they are short term in nature and the current estimated risk of loss and resulting impact to the company’s financing results are not material.\nWrite-offs of lease receivables and loan receivables were $16 million and $47 million, respectively, for the year ended December 31, 2019. Provisions for credit losses recorded for lease receivables and loan receivables were a release of $6 million and an addition of $2 million, respectively, for the year ended December 31, 2019.\nThe average recorded investment of impaired leases and l

2756


INFO:financerag.rerank.cross_encoder:Starting To Rerank Top-100....
Batches: 100%|██████████| 5197/5197 [59:37<00:00,  1.45it/s]  
  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.2349
INFO:financerag.tasks.BaseTask:NDCG@5: 0.3118
INFO:financerag.tasks.BaseTask:NDCG@10: 0.3379
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.2349
INFO:financerag.tasks.BaseTask:MAP@5: 0.2871
INFO:financerag.tasks.BaseTask:MAP@10: 0.2979
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.2349
INFO:financerag.tasks.BaseTask:Recall@5: 0.3876
INFO:financerag.tasks.BaseTask:Recall@10: 0.4679
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.2349
I

Running for FinQA


INFO:financerag.common.loader:Loaded 2789 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd61d9e858', 'title': '', 'text': 'performance graph the performance graph below shows the five-year cumulative total stockholder return on applied common stock during the period from october 25 , 2009 through october 26 , 2014 .\nthis is compared with the cumulative total return of the standard & poor 2019s 500 stock index and the rdg semiconductor composite index over the same period .\nthe comparison assumes $ 100 was invested on october 25 , 2009 in applied common stock and in each of the foregoing indices and assumes reinvestment of dividends , if any .\ndollar amounts in the graph are rounded to the nearest whole dollar .\nthe performance shown in the graph represents past performance and should not be considered an indication of future performance .\ncomparison of 5 year cumulative total return* among applied materials , inc. , the s&p 500 index 201cs&p 201d is a registered 

2789


INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.rerank.cross_encoder:Starting To Rerank Top-100....
Batches: 100%|██████████| 3585/3585 [34:52<00:00,  1.71it/s]
  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.2238
INFO:financerag.tasks.BaseTask:NDCG@5: 0.3183
INFO:financerag.tasks.BaseTask:NDCG@10: 0.3485
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.2238
INFO:financerag.tasks.BaseTask:MAP@5: 0.2892
INFO:financerag.tasks.BaseTask:MAP@10: 0.3017
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.2238
INFO:financerag.tasks.BaseTask:Recall@5: 0.4070
INFO:financerag.tasks.BaseTask:Recall@10: 0.5000
INFO:finance

Running for ConvFinQA


INFO:financerag.common.loader:Loaded 2066 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'dd4bff516', 'title': '', 'text': 'containerboard , kraft papers and saturating kraft .\nkapstone also owns victory packaging , a packaging solutions distribution company with facilities in the u.s. , canada and mexico .\nwe have included the financial results of kapstone in our corrugated packaging segment since the date of the acquisition .\non september 4 , 2018 , we completed the acquisition ( the 201cschl fcter acquisition 201d ) of schl fcter print pharma packaging ( 201cschl fcter 201d ) .\nschl fcter is a leading provider of differentiated paper and packaging solutions and a german-based supplier of a full range of leaflets and booklets .\nthe schl fcter acquisition allowed us to further enhance our pharmaceutical and automotive platform and expand our geographical footprint in europe to better serve our customers .\nwe have included the financial results of the acquired op

2066


INFO:financerag.rerank.cross_encoder:Starting To Rerank Top-100....
Batches: 100%|██████████| 1316/1316 [12:48<00:00,  1.71it/s]
  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.2143
INFO:financerag.tasks.BaseTask:NDCG@5: 0.3376
INFO:financerag.tasks.BaseTask:NDCG@10: 0.3818
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.2143
INFO:financerag.tasks.BaseTask:MAP@5: 0.3019
INFO:financerag.tasks.BaseTask:MAP@10: 0.3204
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.2143
INFO:financerag.tasks.BaseTask:Recall@5: 0.4444
INFO:financerag.tasks.BaseTask:Recall@10: 0.5794
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.2143
INF

Running for MultiHiertt


INFO:financerag.common.loader:Loaded 10475 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd8e4ea4ac', 'title': '', 'text': '|  | Years Ended December 31, |\n|  | 2006 | 2005 |\n|  | (In millions) |\n| Investment return | $192 | $-26 |\n| Expense | 45 | 11 |\n| In-force/Persistency | -7 | -33 |\n| Policyholder dividends and other | -39 | -11 |\n| Total | $191 | $-59 |\nAs of December 31, 2006 and 2005, DAC and VOBA for the Individual segment were $14.0 billion and $13.5 billion, respectively, and for the total Company were $20.8 billion and $19.7 billion, respectively.\nGoodwill Goodwill is the excess of cost over the fair value of net assets acquired.\nThe Company tests goodwill for impairment at least annually or more frequently if events or circumstances, such as adverse changes in the business climate, indicate that there may be justification for conducting an interim test.\nImpairment testing is performed using the fair value approach, which requires the use of es

10475


INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.rerank.cross_encoder:Starting To Rerank Top-100....
Batches: 100%|██████████| 3044/3044 [32:20<00:00,  1.57it/s]
  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.1507
INFO:financerag.tasks.BaseTask:NDCG@5: 0.0847
INFO:financerag.tasks.BaseTask:NDCG@10: 0.0843
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.0368
INFO:financerag.tasks.BaseTask:MAP@5: 0.0478
INFO:financerag.tasks.BaseTask:MAP@10: 0.0491
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.0368
INFO:financerag.tasks.BaseTask:Recall@5: 0.0647
INFO:financerag.tasks.BaseTask:Recall@10: 0.0727
INFO:finance

In [16]:
master_result = pd.DataFrame()
for task_class in retrieval:
    df_eval = pd.read_csv(f'{output_dir}/{task_class.split("_")[0]}_eval.csv')
    df_eval['task'] = task_class
    master_result = pd.concat([master_result, df_eval])

In [18]:
master_result.to_csv(f'{output_dir}/master_eval.csv')
