In [1]:
import os
from pathlib import Path
import sys
sys.path.insert(0, str(Path(os.getcwd()).parent))

In [2]:
import logging
from financerag.retrieval import BM25Retriever, BM25Processor
import financerag.tasks as tasks_module

import importlib
import inspect
import os
import json
import pandas as pd
from datasets import load_dataset

from nltk.tokenize import word_tokenize, TweetTokenizer
from rank_bm25 import BM25Okapi
import nltk
from financerag.retrieval import DenseRetrieval, SentenceTransformerEncoder
from sentence_transformers import CrossEncoder
import logging

from financerag.rerank import CrossEncoderReranker
tweet_tokenizer = TweetTokenizer()
logging.basicConfig(level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
retrieval = ['FinDER', 'FinQABench', 'FinanceBench', 'TATQA', 'FinQA', 'ConvFinQA', 'MultiHiertt']
retrieval = ['FinQA', 'ConvFinQA', 'MultiHiertt']

tabular_retrieval = ['TATQA', 'FinQA', 'ConvFinQA', 'MultiHiertt']

In [4]:
output_dir = './e5-large-v2'
reranker_name = 'ms-marco-MiniLM-L-12-v2'
os.makedirs(output_dir, exist_ok=True)

def process_task(task_class, corpus_documents, output_dir, finder_task):
    # Tokenize corpus

    # processor = BM25Processor(tokenizer=tweet_tokenizer.tokenize)
    # # Tokenize and prepare the corpus
    # tokenized_corpus = processor.build_corpus(corpus_documents)
    
    # print(len(tokenized_corpus))
    # # tokenized_corpus = [tweet_tokenizer.tokenize(doc) for doc in corpus_documents]
    
    # # Initialize BM25 and retrieval model
    # bm25_model = BM25Okapi(tokenized_corpus)
    
    # retrieval_model = BM25Retriever(bm25_model, tokenize_list_tweet)
    
    encoder_model = SentenceTransformerEncoder(
        model_name_or_path='intfloat/e5-large-v2',
        query_prompt='query: ',
        doc_prompt='passage: ',
    )

    retrieval_model = DenseRetrieval(
        model=encoder_model
    )

    # Retrieve documents
    retrieval_result = finder_task.retrieve(
        retriever=retrieval_model,
        # top_k=500
    )

    reranker = CrossEncoderReranker(
    model=CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
    )
    reranking_result = finder_task.rerank(
                        reranker=reranker,
                        results=retrieval_result,
                        top_k=100,  # Rerank the top 100 documents
                        batch_size=32
                    )
    
    # Save retrieval result
    os.makedirs(output_dir, exist_ok=True)
    file_name = f"{output_dir}/{task_class}.json"
    with open(file_name, "w") as json_file:
        json.dump(retrieval_result, json_file, indent=4)
    
    # Save evaluation result
    df = pd.read_csv(f'../data/{task_class.split("_")[0]}_qrels.tsv', sep='\t')
    qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
    
    eval_result = finder_task.evaluate(qrels_dict, retrieval_result, [1, 5, 10])
    combined_result = {**eval_result[0], **eval_result[1], **eval_result[2], **eval_result[3]}
    df_eval = pd.DataFrame([combined_result])
    df_eval.to_csv(f'{output_dir}/{task_class.split("_")[0]}_eval.csv', index=False)
    
    output_dir_ = output_dir+'_'+reranker_name
    os.makedirs(output_dir_, exist_ok=True)
    file_name = f"{output_dir_}/{task_class}.json"
    with open(file_name, "w") as json_file:
        json.dump(reranking_result, json_file, indent=4)
    
    # Save evaluation result
    df = pd.read_csv(f'../data/{task_class.split("_")[0]}_qrels.tsv', sep='\t')
    qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
    
    eval_result = finder_task.evaluate(qrels_dict, reranking_result, [1, 5, 10])
    combined_result = {**eval_result[0], **eval_result[1], **eval_result[2], **eval_result[3]}
    df_eval = pd.DataFrame([combined_result])
    df_eval.to_csv(f'{output_dir_}/{task_class.split("_")[0]}_eval.csv', index=False)


In [5]:

for task_class in retrieval:
    print(f"Running for {task_class}")
    if task_class in tabular_retrieval:
        pass
    task_class_obj = getattr(tasks_module, task_class)
    finder_task = task_class_obj()

    # corpus = pd.read_csv(f"./data/{task_class}_corpus_convert.csv")
    corpus = load_dataset("Linq-AI-Research/FinanceRAG", task_class, split="corpus")
    process_task(task_class, corpus, output_dir, finder_task)

INFO:financerag.common.loader:Loading Corpus...


Running for FinQA


INFO:financerag.common.loader:Loaded 2789 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd61d9e858', 'title': '', 'text': 'performance graph the performance graph below shows the five-year cumulative total stockholder return on applied common stock during the period from october 25 , 2009 through october 26 , 2014 .\nthis is compared with the cumulative total return of the standard & poor 2019s 500 stock index and the rdg semiconductor composite index over the same period .\nthe comparison assumes $ 100 was invested on october 25 , 2009 in applied common stock and in each of the foregoing indices and assumes reinvestment of dividends , if any .\ndollar amounts in the graph are rounded to the nearest whole dollar .\nthe performance shown in the graph represents past performance and should not be considered an indication of future performance .\ncomparison of 5 year cumulative total return* among applied materials , inc. , the s&p 500 index 201cs&p 201d is a registered 

Running for ConvFinQA


INFO:financerag.common.loader:Loaded 2066 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'dd4bff516', 'title': '', 'text': 'containerboard , kraft papers and saturating kraft .\nkapstone also owns victory packaging , a packaging solutions distribution company with facilities in the u.s. , canada and mexico .\nwe have included the financial results of kapstone in our corrugated packaging segment since the date of the acquisition .\non september 4 , 2018 , we completed the acquisition ( the 201cschl fcter acquisition 201d ) of schl fcter print pharma packaging ( 201cschl fcter 201d ) .\nschl fcter is a leading provider of differentiated paper and packaging solutions and a german-based supplier of a full range of leaflets and booklets .\nthe schl fcter acquisition allowed us to further enhance our pharmaceutical and automotive platform and expand our geographical footprint in europe to better serve our customers .\nwe have included the financial results of the acquired op

Running for MultiHiertt


INFO:financerag.common.loader:Loaded 10475 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd8e4ea4ac', 'title': '', 'text': '|  | Years Ended December 31, |\n|  | 2006 | 2005 |\n|  | (In millions) |\n| Investment return | $192 | $-26 |\n| Expense | 45 | 11 |\n| In-force/Persistency | -7 | -33 |\n| Policyholder dividends and other | -39 | -11 |\n| Total | $191 | $-59 |\nAs of December 31, 2006 and 2005, DAC and VOBA for the Individual segment were $14.0 billion and $13.5 billion, respectively, and for the total Company were $20.8 billion and $19.7 billion, respectively.\nGoodwill Goodwill is the excess of cost over the fair value of net assets acquired.\nThe Company tests goodwill for impairment at least annually or more frequently if events or circumstances, such as adverse changes in the business climate, indicate that there may be justification for conducting an interim test.\nImpairment testing is performed using the fair value approach, which requires the use of es

In [6]:
output_dir = './e5-large-v2'

master_result = pd.DataFrame()
for task_class in retrieval:
    df_eval = pd.read_csv(f'{output_dir}/{task_class.split("_")[0]}_eval.csv')
    df_eval['task'] = task_class
    master_result = pd.concat([master_result, df_eval])
master_result.to_csv(f'{output_dir}/master_eval.csv')


In [8]:
output_dir = './e5-large-v2_ms-marco-MiniLM-L-12-v2'
master_result = pd.DataFrame()
for task_class in retrieval:
    df_eval = pd.read_csv(f'{output_dir}/{task_class.split("_")[0]}_eval.csv')
    df_eval['task'] = task_class
    master_result = pd.concat([master_result, df_eval])
master_result.to_csv(f'{output_dir}/master_eval.csv')


In [9]:
master_result

Unnamed: 0,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,Recall@1,Recall@5,Recall@10,P@1,P@5,P@10,task
0,0.24128,0.3179,0.34974,0.24128,0.29511,0.30816,0.24128,0.38663,0.48547,0.24128,0.07733,0.04855,FinQA
0,0.23016,0.34283,0.38143,0.23016,0.30688,0.32288,0.23016,0.45238,0.57143,0.23016,0.09048,0.05714,ConvFinQA
0,0.19178,0.11577,0.12063,0.04546,0.06534,0.06893,0.04546,0.08956,0.11182,0.19178,0.07466,0.04658,MultiHiertt


In [7]:
master_result

Unnamed: 0,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,Recall@1,Recall@5,Recall@10,P@1,P@5,P@10,task
0,0.19477,0.27934,0.30548,0.19477,0.25262,0.2633,0.19477,0.36047,0.44186,0.19477,0.07209,0.04419,FinQA
0,0.19841,0.29131,0.32099,0.19841,0.26177,0.27487,0.19841,0.38095,0.46825,0.19841,0.07619,0.04683,ConvFinQA
0,0.13356,0.08635,0.09029,0.03354,0.04748,0.05037,0.03354,0.06749,0.08564,0.13356,0.0589,0.03699,MultiHiertt


In [2]:
import pandas as pd
pd.read_csv('../data/FinDER_qrels.tsv')

Unnamed: 0,query_id\tcorpus_id\tscore
0,q00001\tMSFT20230014\t1
1,q00001\tMSFT20230015\t1
2,q00007\tMSFT20231529\t1
3,q00008\tMSFT20231529\t1
4,q00010\tADBE20231571\t1
...,...
98,q00197\tUNH20230438\t1
99,q00200\tGOOGL20230050\t1
100,q00204\tGOOGL20230680\t1
101,q00210\tBRK.A20230396\t1
