In [101]:
import os
from pathlib import Path
import sys
# Put the parent of the current working directory at the front of the import path
# (presumably so the local `financerag` package imported below resolves without installation).
sys.path.insert(0, str(Path(os.getcwd()).parent))


In [102]:
import logging
from financerag.retrieval import BM25Retriever, BM25Processor
import financerag.tasks as tasks_module

import importlib
import inspect
import os
import json
import pandas as pd
from datasets import load_dataset

from nltk.tokenize import word_tokenize, TweetTokenizer
from rank_bm25 import BM25Okapi
import nltk

import os
import pdb
import re
import shutil
import json
from pathlib import Path
# from dotenv import load_dotenv
from financerag.rerank import CrossEncoderReranker
from sentence_transformers import CrossEncoder
from financerag.retrieval import DenseRetrieval, SentenceTransformerEncoder

# Shared tokenizer instance; the BM25 helpers defined in later cells read this global.
tweet_tokenizer = TweetTokenizer()
# Show INFO-level log records in the cell outputs.
logging.basicConfig(level=logging.INFO)

In [103]:
from typing import Any, Callable, Dict, List, Literal, Optional

def tokenize_list_tweet(input_list: List[str]) -> List[List[str]]:
    """
    Tokenize every string with the module-level ``tweet_tokenizer`` (an NLTK ``TweetTokenizer``).

    Args:
        input_list (`List[str]`):
            The strings to tokenize.

    Returns:
        `List[List[str]]`:
            One list of tokens per input string, in the same order as ``input_list``.
    """
    return [tweet_tokenizer.tokenize(text) for text in input_list]


def clean_text(text):
    r"""
    Collapse each run of literal ``\uXXXX`` escape sequences into one space.

    The match is on the six-character textual form (a backslash, ``u`` and four
    hex digits), e.g. ``\u2019`` or ``\u0080``; consecutive sequences are replaced
    together by a single space. Text without such sequences is returned unchanged.
    """
    escape_run = re.compile(r"(\\u[0-9A-Fa-f]{4})+")
    return escape_run.sub(" ", text)


def load_jsonl(file_path):
    """
    Load a JSONL file and return its content as a list of decoded JSON values.

    Each non-blank line is stripped, passed through ``clean_text`` and then
    decoded with ``json.loads``. Blank lines (for example a trailing newline at
    the end of the file) are skipped instead of raising a decode error.

    Args:
        file_path (`str` or `pathlib.Path`):
            Location of the JSONL file.

    Returns:
        `list`:
            One decoded object per non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON after cleaning.
    """
    # Accept plain strings as well as Path objects (str has no .exists()).
    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"File not found at {file_path}")

    records = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(clean_text(line)))
    return records


In [104]:
def first_stage_retrieval(output_dir, task_class, finder_task, ranker_model, dataset_dir = '../data', query_file = '', corpus_file='', overwrite = False):
    """
    Run (or load from cache) first-stage retrieval for one task.

    The queries and corpus are read from JSONL files under ``dataset_dir`` and
    assigned to ``finder_task.queries`` / ``finder_task.corpus``. Every corpus
    document is ranked (``top_k`` is the corpus size). The result is cached as
    ``<output_dir>/<ranker>[_<query_file>][_<corpus_file>]/<task_class>.json``.

    Args:
        output_dir (`str`): Root directory for result files.
        task_class (`str`): Task name; its lower-cased form selects the dataset
            sub-directories and it names the result file.
        finder_task: Task object exposing ``queries``, ``corpus`` and ``retrieve``.
        ranker_model (`str`): ``'BM25'`` for lexical retrieval, otherwise a model
            name/path handed to ``SentenceTransformerEncoder``.
        dataset_dir (`str`): Root directory of the datasets.
        query_file (`str`): Optional suffix selecting ``queries_<suffix>.jsonl``.
        corpus_file (`str`): Optional suffix selecting ``corpus_<suffix>.jsonl``.
        overwrite (`bool`): When False, an existing cached result is returned
            instead of recomputing.

    Returns:
        The value returned by ``finder_task.retrieve`` (or its JSON-decoded cached
        copy). Presumably a mapping query id -> {document id: score}; confirm
        against the ``financerag`` task API.
    """
    # Results directory name: <ranker>[_<query_file>][_<corpus_file>]
    output_dir_name = ranker_model.replace("/", "_")
    if query_file != '':
        output_dir_name += '_'+query_file
    if corpus_file != '':
        output_dir_name += '_'+corpus_file

    current_output_dir = output_dir+'/'+output_dir_name
    os.makedirs(current_output_dir, exist_ok=True)

    # The task object must carry the selected queries/corpus even on a cache
    # hit, because later stages (rerank/evaluate) read them from finder_task.
    query_file_name = 'queries'
    if query_file != '':
        query_file_name += '_'+query_file
    queries = load_jsonl(Path(f"{dataset_dir}/{task_class.lower()}_queries/{query_file_name}.jsonl"))
    finder_task.queries = {item['_id']: item['text'] for item in queries}

    corpus_file_name = 'corpus'
    if corpus_file != '':
        corpus_file_name += '_'+corpus_file
    corpus = load_jsonl(Path(f"{dataset_dir}/{task_class.lower()}_corpus/{corpus_file_name}.jsonl"))
    corpus_dict = {item['_id']: {'title': item['title'], 'text': item['text']} for item in corpus}
    top_k = len(corpus_dict)  # rank the whole corpus so a reranker can pick any cut-off
    finder_task.corpus = corpus_dict

    file_name = f"{current_output_dir}/{task_class}.json"

    if not overwrite and os.path.exists(file_name):
        try:
            with open(file_name, "r", encoding="utf-8") as json_file:
                return json.load(json_file)
        except json.JSONDecodeError:
            # A truncated cache (e.g. from an interrupted earlier run) is recomputed
            # rather than crashing every subsequent run.
            logging.warning("Cached retrieval result %s is not valid JSON; recomputing.", file_name)

    if ranker_model == 'BM25':
        processor = BM25Processor(tokenizer=tweet_tokenizer.tokenize)
        corpus_documents = [{"title": value["title"], "text": value["text"]} for value in corpus_dict.values()]
        tokenized_corpus = processor.build_corpus(corpus_documents)

        bm25_model = BM25Okapi(tokenized_corpus)
        retrieval_model = BM25Retriever(bm25_model, tokenize_list_tweet)
    else:
        # NOTE(review): instruct-style prompts are applied to every dense model;
        # confirm they match the prefixes the chosen encoder was trained with.
        encoder_model = SentenceTransformerEncoder(
            model_name_or_path=ranker_model,
            query_prompt= "Instruct: Given a financial question, relevant passages that best answer the question. \nQuery: ",
            doc_prompt="Passage: ",
        )
        retrieval_model = DenseRetrieval(model=encoder_model)

    # Retrieve documents
    retrieval_result = finder_task.retrieve(
        retriever=retrieval_model,
        top_k=top_k
    )

    # Save retrieval result: write to a temporary file and rename it into place
    # so an interrupted dump never leaves a half-written cache file behind.
    tmp_file_name = file_name + ".tmp"
    with open(tmp_file_name, "w", encoding="utf-8") as json_file:
        json.dump(retrieval_result, json_file, indent=4)
    os.replace(tmp_file_name, file_name)

    return retrieval_result

def rerank(output_dir, task_class, finder_task, ranker_model, reranker_model = 'cross-encoder/ms-marco-MiniLM-L-12-v2', dataset_dir = '../data', query_file = '', corpus_file='', overwrite = False, top_k = 100, batch_size = 32):
    """
    Rerank first-stage retrieval results with a cross-encoder (or load them from cache).

    The queries and corpus are read from JSONL files under ``dataset_dir`` and
    assigned to ``finder_task``. First-stage results are obtained through
    ``first_stage_retrieval`` and reranked with ``finder_task.rerank``. The
    result is cached as
    ``<output_dir>/<ranker>_<reranker>[_<query_file>][_<corpus_file>]/<task_class>.json``.

    Args:
        output_dir (`str`): Root directory for result files.
        task_class (`str`): Task name; selects dataset sub-directories and names
            the result file.
        finder_task: Task object exposing ``queries``, ``corpus`` and ``rerank``.
        ranker_model (`str`): First-stage model name (see ``first_stage_retrieval``).
        reranker_model (`str`): Cross-encoder name, accepted with or without the
            ``cross-encoder/`` prefix; the prefix is added exactly once when
            the model is loaded and is left out of the directory name.
        dataset_dir (`str`): Root directory of the datasets.
        query_file (`str`): Optional suffix selecting ``queries_<suffix>.jsonl``.
        corpus_file (`str`): Optional suffix selecting ``corpus_<suffix>.jsonl``.
        overwrite (`bool`): When False, an existing cached result is returned.
            The flag is also forwarded to ``first_stage_retrieval``.
        top_k (`int`): Number of first-stage documents to rerank per query.
        batch_size (`int`): Batch size handed to ``finder_task.rerank``.

    Returns:
        The value returned by ``finder_task.rerank`` (or its JSON-decoded cached copy).

    Raises:
        ValueError: If ``reranker_model`` is None.
    """
    if reranker_model is None:
        # Without a reranker name the cache path would coincide with the
        # first-stage result file, so first-stage scores could be returned as
        # "reranked" results.
        raise ValueError("rerank() requires a reranker_model; got None")

    # Accept both 'ms-marco-...' and 'cross-encoder/ms-marco-...' so the prefix
    # is never doubled and the directory name never contains a '/'.
    hub_prefix = 'cross-encoder/'
    if reranker_model.startswith(hub_prefix):
        reranker_short_name = reranker_model[len(hub_prefix):]
    else:
        reranker_short_name = reranker_model
    reranker_hub_id = hub_prefix + reranker_short_name

    output_dir_name = ranker_model.replace("/", "_") + '_' + reranker_short_name
    if query_file != '':
        output_dir_name += '_'+query_file
    if corpus_file != '':
        output_dir_name += '_'+corpus_file

    current_output_dir = output_dir+'/'+output_dir_name
    os.makedirs(current_output_dir, exist_ok=True)

    query_file_name = 'queries'
    if query_file != '':
        query_file_name += '_'+query_file
    queries = load_jsonl(Path(f"{dataset_dir}/{task_class.lower()}_queries/{query_file_name}.jsonl"))
    finder_task.queries = {item['_id']: item['text'] for item in queries}

    corpus_file_name = 'corpus'
    if corpus_file != '':
        corpus_file_name += '_'+corpus_file
    corpus = load_jsonl(Path(f"{dataset_dir}/{task_class.lower()}_corpus/{corpus_file_name}.jsonl"))
    finder_task.corpus = {item['_id']: {'title': item['title'], 'text': item['text']} for item in corpus}

    file_name = f"{current_output_dir}/{task_class}.json"

    if not overwrite and os.path.exists(file_name):
        with open(file_name, "r") as json_file:
            return json.load(json_file)

    retrieval_result = first_stage_retrieval(output_dir, task_class, finder_task, ranker_model, dataset_dir = dataset_dir, query_file = query_file, corpus_file=corpus_file, overwrite = overwrite)

    reranker = CrossEncoderReranker(model=CrossEncoder(reranker_hub_id))
    reranking_result = finder_task.rerank(
        reranker=reranker,
        results=retrieval_result,
        top_k=top_k,  # rerank only the top_k first-stage documents per query
        batch_size=batch_size
    )

    # Save reranking result
    with open(file_name, "w") as json_file:
        json.dump(reranking_result, json_file, indent=4)

    return reranking_result



def evaluate(retrieval_result, output_dir, task_class, finder_task, ranker_model, reranker_model = None, dataset_dir = '../data', query_file = '', corpus_file='',):
    """
    Score a retrieval/reranking result against the task's qrels and save the metrics.

    Relevance judgements are read from ``<dataset_dir>/<task>_qrels.tsv`` (``<task>``
    being the part of ``task_class`` before the first underscore), which must
    provide the columns ``query_id``, ``corpus_id`` and ``score``. The metrics
    are written to
    ``<output_dir>/<ranker>[_<reranker>][_<query_file>][_<corpus_file>]/<task>_eval.csv``.

    Args:
        retrieval_result: Result to evaluate, passed unchanged to ``finder_task.evaluate``.
        output_dir (`str`): Root directory for result files.
        task_class (`str`): Task name.
        finder_task: Task object exposing ``evaluate(qrels, results, k_values)``,
            which must return at least four dicts (presumably NDCG, MAP, recall
            and precision; confirm against the ``financerag`` task API).
        ranker_model (`str`): First-stage model name (used for the directory name).
        reranker_model (`str`, optional): Reranker name, with or without the
            ``cross-encoder/`` prefix; None when evaluating first-stage results.
        dataset_dir (`str`): Root directory of the datasets.
        query_file (`str`): Optional query-file suffix (used for the directory name).
        corpus_file (`str`): Optional corpus-file suffix (used for the directory name).

    Returns:
        `pandas.DataFrame`: A single-row frame with one column per metric.
    """
    output_dir_name = ranker_model.replace("/", "_")
    if reranker_model is not None:
        # Drop an optional hub prefix so the directory name has no '/' in it.
        hub_prefix = 'cross-encoder/'
        if reranker_model.startswith(hub_prefix):
            reranker_short_name = reranker_model[len(hub_prefix):]
        else:
            reranker_short_name = reranker_model
        output_dir_name += '_'+reranker_short_name
    if query_file != '':
        output_dir_name += '_'+query_file
    if corpus_file != '':
        output_dir_name += '_'+corpus_file

    current_output_dir = output_dir+'/'+output_dir_name
    # Do not rely on an earlier pipeline stage having created the directory.
    os.makedirs(current_output_dir, exist_ok=True)

    task_prefix = task_class.split("_")[0]
    df = pd.read_csv(f'{dataset_dir}/{task_prefix}_qrels.tsv', sep='\t')
    # Build {query_id: {corpus_id: score}} by iterating the groups directly;
    # groupby(...).apply over the grouping column is deprecated in recent pandas.
    qrels_dict = {
        query_id: dict(zip(group['corpus_id'], group['score']))
        for query_id, group in df.groupby('query_id')
    }

    eval_result = finder_task.evaluate(qrels_dict, retrieval_result, [1, 5, 10])
    combined_result = {**eval_result[0], **eval_result[1], **eval_result[2], **eval_result[3]}

    # Save evaluation result
    df_eval = pd.DataFrame([combined_result])
    df_eval.to_csv(f'{current_output_dir}/{task_prefix}_eval.csv', index=False)

    return df_eval

In [105]:


def process_task(output_dir, task_class, finder_task, ranker_model, reranker_model = 'ms-marco-MiniLM-L-12-v2', dataset_dir = '../data', query_file = '', corpus_file='', overwrite = False):
    """
    Run the full pipeline for one task: first-stage retrieval, optional reranking, evaluation.

    Args:
        output_dir (`str`): Root directory for result files.
        task_class (`str`): Task name.
        finder_task: Task object passed through to the pipeline stages.
        ranker_model (`str`): First-stage model name (``'BM25'`` or an encoder name).
        reranker_model (`str`, optional): Cross-encoder name; pass None to skip reranking.
        dataset_dir (`str`): Root directory of the datasets.
        query_file (`str`): Optional query-file suffix.
        corpus_file (`str`): Optional corpus-file suffix.
        overwrite (`bool`): Forwarded to the retrieval and reranking stages.

    Returns:
        `tuple`: ``(retrieval_eval, reranking_eval)``, the evaluation frames for
        the first-stage and the reranked results. ``reranking_eval`` is None
        when ``reranker_model`` is None.
    """
    retrieval_result = first_stage_retrieval(output_dir, task_class, finder_task, ranker_model, dataset_dir = dataset_dir, query_file = query_file, corpus_file=corpus_file, overwrite = overwrite)

    retrieval_eval = evaluate(retrieval_result, output_dir, task_class, finder_task, ranker_model, reranker_model = None, dataset_dir = dataset_dir, query_file = query_file, corpus_file=corpus_file)

    reranking_eval = None
    if reranker_model is not None:
        reranking_result = rerank(output_dir, task_class, finder_task, ranker_model, reranker_model = reranker_model, dataset_dir = dataset_dir, query_file = query_file, corpus_file=corpus_file, overwrite = overwrite)
        # Evaluate the reranked scores, not the first-stage ones.
        reranking_eval = evaluate(reranking_result, output_dir, task_class, finder_task, ranker_model, reranker_model = reranker_model, dataset_dir = dataset_dir, query_file = query_file, corpus_file=corpus_file)

    return retrieval_eval, reranking_eval


    

In [109]:
# Run configuration for the BM25 experiment executed in the task loop below.
output_dir = '../results'  # root directory for cached results and evaluation CSVs
ranker_model = 'BM25'  # first-stage retriever; 'BM25' selects the lexical branch
# ranker_model = 'intfloat/e5-large-v2'
# reranker_model = None
reranker_model = 'ms-marco-MiniLM-L-12-v2'  # short name; rerank() prepends 'cross-encoder/'
dataset_dir = '../data'  # root directory of the task datasets
query_file = 'para'  # selects queries_para.jsonl instead of queries.jsonl
corpus_file = ''  # empty string selects the default corpus.jsonl
overwrite = False  # reuse cached result files when they exist

In [108]:
# NOTE(review): leftover inspection cell. `task_class` is only assigned by the task loop
# in a later cell, so on a fresh kernel (Restart & Run All) this line raises NameError.
task_class

'ConvFinQA'

In [110]:

# Run the pipeline for each listed task using the configuration variables
# set in the configuration cell above.
task_list = ['FinQABench', 'FinDER',  'FinanceBench', 'TATQA', 'FinQA', 'ConvFinQA', 'MultiHiertt']

for task_class in task_list:
    print(f"Running for {task_class}")
    # Look up the task class by name on `financerag.tasks` and instantiate it with defaults.
    task_class_obj = getattr(tasks_module, task_class)
    finder_task = task_class_obj()
    process_task(output_dir, task_class, finder_task, ranker_model, reranker_model = reranker_model, dataset_dir = dataset_dir, query_file = query_file, corpus_file=corpus_file, overwrite = overwrite)


INFO:financerag.common.loader:Loading Corpus...


Running for FinQABench


INFO:financerag.common.loader:Loaded 92 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd4aa0660c', 'title': '', 'text': 'Apple Inc.\nCONSOLIDATED STATEMENTS OF OPERATIONS\n(In millions, except number of shares which are reflected in thousands and per share amounts)\nYears ended\nSeptember 24,\n2022September 25,\n2021September 26,\n2020\nNet sales:\n   Products $ 316,199 $ 297,392 $ 220,747 \n   Services  78,129  68,425  53,768 \nTotal net sales  394,328  365,817  274,515 \nCost of sales:\n   Products  201,471  192,266  151,286 \n   Services  22,075  20,715  18,273 \nTotal cost of sales  223,546  212,981  169,559 \nGross margin  170,782  152,836  104,956 \nOperating expenses:\nResearch and development  26,251  21,914  18,752 \nSelling, general and administrative  25,094  21,973  19,916 \nTotal operating expenses  51,345  43,887  38,668 \nOperating income  119,437  108,949  66,288 \nOther income/(expense), net  (334)  258  803 \nIncome before provision for income taxes 

Running for FinDER


INFO:financerag.common.loader:Loaded 13867 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'ADBE20230004', 'title': 'ADBE OVERVIEW', 'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsible. Our products and services help unleash crea

Running for FinanceBench


INFO:financerag.common.loader:Loaded 180 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'dd2af2336', 'title': 'PEPSICO_2022_10K', 'text': '6) Africa, Middle East and South Asia (AMESA), which includes all of our beverage and convenient food businesses in\nAfrica, the Middle East and South Asia; and\n7) Asia Pacific, Australia and New Zealand and China Region (APAC), which includes all of our beverage and convenient\nfood businesses in Asia Pacific, Australia and New Zealand, and China region.'}
INFO:financerag.common.loader:Loading Queries...
INFO:financerag.common.loader:Loaded 150 Queries.
INFO:financerag.common.loader:Query Example: {'id': 'qd2ac917a', 'text': 'What is the FY2019 - FY2020 total revenue growth rate for Block (formerly known as Square)? Answer in units of percents and round to one decimal place. Approach the question asked by assuming the standpoint of an investment banking analyst who only has access to the statement of income.'}
  qrels_dict = df.gr

Running for TATQA


INFO:financerag.common.loader:Loaded 2756 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd1b2e74c0', 'title': '', 'text': 'The following tables present the recorded investment by portfolio segment and by class, excluding commercial financing receivables and other miscellaneous financing receivables at December 31, 2019 and 2018. Commercial financing receivables are excluded from the presentation of financing receivables by portfolio segment, as they are short term in nature and the current estimated risk of loss and resulting impact to the company’s financing results are not material.\nWrite-offs of lease receivables and loan receivables were $16 million and $47 million, respectively, for the year ended December 31, 2019. Provisions for credit losses recorded for lease receivables and loan receivables were a release of $6 million and an addition of $2 million, respectively, for the year ended December 31, 2019.\nThe average recorded investment of impaired leases and l

Running for FinQA


INFO:financerag.common.loader:Loaded 2789 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd61d9e858', 'title': '', 'text': 'performance graph the performance graph below shows the five-year cumulative total stockholder return on applied common stock during the period from october 25 , 2009 through october 26 , 2014 .\nthis is compared with the cumulative total return of the standard & poor 2019s 500 stock index and the rdg semiconductor composite index over the same period .\nthe comparison assumes $ 100 was invested on october 25 , 2009 in applied common stock and in each of the foregoing indices and assumes reinvestment of dividends , if any .\ndollar amounts in the graph are rounded to the nearest whole dollar .\nthe performance shown in the graph represents past performance and should not be considered an indication of future performance .\ncomparison of 5 year cumulative total return* among applied materials , inc. , the s&p 500 index 201cs&p 201d is a registered 

Running for ConvFinQA


INFO:financerag.common.loader:Loaded 2066 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'dd4bff516', 'title': '', 'text': 'containerboard , kraft papers and saturating kraft .\nkapstone also owns victory packaging , a packaging solutions distribution company with facilities in the u.s. , canada and mexico .\nwe have included the financial results of kapstone in our corrugated packaging segment since the date of the acquisition .\non september 4 , 2018 , we completed the acquisition ( the 201cschl fcter acquisition 201d ) of schl fcter print pharma packaging ( 201cschl fcter 201d ) .\nschl fcter is a leading provider of differentiated paper and packaging solutions and a german-based supplier of a full range of leaflets and booklets .\nthe schl fcter acquisition allowed us to further enhance our pharmaceutical and automotive platform and expand our geographical footprint in europe to better serve our customers .\nwe have included the financial results of the acquired op

Running for MultiHiertt


INFO:financerag.common.loader:Loaded 10475 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd8e4ea4ac', 'title': '', 'text': '|  | Years Ended December 31, |\n|  | 2006 | 2005 |\n|  | (In millions) |\n| Investment return | $192 | $-26 |\n| Expense | 45 | 11 |\n| In-force/Persistency | -7 | -33 |\n| Policyholder dividends and other | -39 | -11 |\n| Total | $191 | $-59 |\nAs of December 31, 2006 and 2005, DAC and VOBA for the Individual segment were $14.0 billion and $13.5 billion, respectively, and for the total Company were $20.8 billion and $19.7 billion, respectively.\nGoodwill Goodwill is the excess of cost over the fair value of net assets acquired.\nThe Company tests goodwill for impairment at least annually or more frequently if events or circumstances, such as adverse changes in the business climate, indicate that there may be justification for conducting an interim test.\nImpairment testing is performed using the fair value approach, which requires the use of es

In [111]:
# Same pipeline as the BM25 run, but with a dense first-stage encoder.
# NOTE(review): this cell duplicates the earlier configuration and task loop; only
# `ranker_model` differs. Consider a single helper taking the ranker as a parameter.
output_dir = '../results'  # root directory for cached results and evaluation CSVs
# ranker_model = 'BM25'
ranker_model = 'intfloat/e5-large-v2'  # any value other than 'BM25' selects the dense branch
# reranker_model = None
reranker_model = 'ms-marco-MiniLM-L-12-v2'  # short name; rerank() prepends 'cross-encoder/'
dataset_dir = '../data'  # root directory of the task datasets
query_file = 'para'  # selects queries_para.jsonl instead of queries.jsonl
corpus_file = ''  # empty string selects the default corpus.jsonl
overwrite = False  # reuse cached result files when they exist


task_list = ['FinQABench', 'FinDER',  'FinanceBench', 'TATQA', 'FinQA', 'ConvFinQA', 'MultiHiertt']

for task_class in task_list:
    print(f"Running for {task_class}")
    # Look up the task class by name on `financerag.tasks` and instantiate it with defaults.
    task_class_obj = getattr(tasks_module, task_class)
    finder_task = task_class_obj()
    process_task(output_dir, task_class, finder_task, ranker_model, reranker_model = reranker_model, dataset_dir = dataset_dir, query_file = query_file, corpus_file=corpus_file, overwrite = overwrite)


INFO:financerag.common.loader:Loading Corpus...


Running for FinQABench


INFO:financerag.common.loader:Loaded 92 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd4aa0660c', 'title': '', 'text': 'Apple Inc.\nCONSOLIDATED STATEMENTS OF OPERATIONS\n(In millions, except number of shares which are reflected in thousands and per share amounts)\nYears ended\nSeptember 24,\n2022September 25,\n2021September 26,\n2020\nNet sales:\n   Products $ 316,199 $ 297,392 $ 220,747 \n   Services  78,129  68,425  53,768 \nTotal net sales  394,328  365,817  274,515 \nCost of sales:\n   Products  201,471  192,266  151,286 \n   Services  22,075  20,715  18,273 \nTotal cost of sales  223,546  212,981  169,559 \nGross margin  170,782  152,836  104,956 \nOperating expenses:\nResearch and development  26,251  21,914  18,752 \nSelling, general and administrative  25,094  21,973  19,916 \nTotal operating expenses  51,345  43,887  38,668 \nOperating income  119,437  108,949  66,288 \nOther income/(expense), net  (334)  258  803 \nIncome before provision for income taxes 

Running for FinDER


INFO:financerag.common.loader:Loaded 13867 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'ADBE20230004', 'title': 'ADBE OVERVIEW', 'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsible. Our products and services help unleash crea

Running for FinanceBench


INFO:financerag.common.loader:Loaded 180 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'dd2af2336', 'title': 'PEPSICO_2022_10K', 'text': '6) Africa, Middle East and South Asia (AMESA), which includes all of our beverage and convenient food businesses in\nAfrica, the Middle East and South Asia; and\n7) Asia Pacific, Australia and New Zealand and China Region (APAC), which includes all of our beverage and convenient\nfood businesses in Asia Pacific, Australia and New Zealand, and China region.'}
INFO:financerag.common.loader:Loading Queries...
INFO:financerag.common.loader:Loaded 150 Queries.
INFO:financerag.common.loader:Query Example: {'id': 'qd2ac917a', 'text': 'What is the FY2019 - FY2020 total revenue growth rate for Block (formerly known as Square)? Answer in units of percents and round to one decimal place. Approach the question asked by assuming the standpoint of an investment banking analyst who only has access to the statement of income.'}
INFO:sentence_transf

Running for TATQA


INFO:financerag.common.loader:Loaded 2756 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd1b2e74c0', 'title': '', 'text': 'The following tables present the recorded investment by portfolio segment and by class, excluding commercial financing receivables and other miscellaneous financing receivables at December 31, 2019 and 2018. Commercial financing receivables are excluded from the presentation of financing receivables by portfolio segment, as they are short term in nature and the current estimated risk of loss and resulting impact to the company’s financing results are not material.\nWrite-offs of lease receivables and loan receivables were $16 million and $47 million, respectively, for the year ended December 31, 2019. Provisions for credit losses recorded for lease receivables and loan receivables were a release of $6 million and an addition of $2 million, respectively, for the year ended December 31, 2019.\nThe average recorded investment of impaired leases and l

KeyboardInterrupt: 