In [None]:
%%bash
# Clone the FinanceRAG repository into the current working directory.
# The clone is skipped when the directory already exists so the cell can be
# re-run without `git clone` failing on a non-empty destination.
if [ ! -d FinanceRAG ]; then
  git clone https://github.com/linq-rag/FinanceRAG.git
fi
# This `cd` only applies to this cell's own shell process.
cd FinanceRAG

In [None]:
%pip install datasets pytrec_eval nltk sentence-transformers openai

In [3]:
import sys
from pathlib import Path

# Make the cloned repository importable. The clone cell puts the checkout in
# ./FinanceRAG relative to the working directory, so the path is resolved from
# there instead of being tied to one machine's absolute home directory.
FINANCERAG_DIR = Path("FinanceRAG").resolve()
if str(FINANCERAG_DIR) not in sys.path:  # avoid stacking duplicates on re-run
    sys.path.insert(0, str(FINANCERAG_DIR))

In [16]:
from sentence_transformers import CrossEncoder
import logging

from financerag.rerank import CrossEncoderReranker
from financerag.retrieval import DenseRetrieval, SentenceTransformerEncoder
from financerag.tasks import FinDER, FinQABench, FinanceBench,  ConvFinQA, FinQA, MultiHiertt, TATQA

# Setup basic logging configuration to show info level messages.
logging.basicConfig(level=logging.INFO)

In [17]:
# Sentence-transformer encoder built on intfloat/e5-large-v2, configured with
# separate prompts for queries and for documents.
encoder_model = SentenceTransformerEncoder(
    model_name_or_path="intfloat/e5-large-v2",
    doc_prompt="passage: ",
    query_prompt="query: ",
)

# Reranker wrapping the ms-marco MiniLM cross-encoder.
reranker = CrossEncoderReranker(model=CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2"))

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: intfloat/e5-large-v2
INFO:sentence_transformers.cross_encoder.CrossEncoder:Use pytorch device: mps


In [None]:
task = FinDER()

# Flatten the task corpus into a list of records (id, title, text).
corpus_records = [
  {'id': str(doc_id), 'title': task.corpus[doc_id]['title'], 'text': task.corpus[doc_id]['text']}
  for doc_id in task.corpus
]

# One searchable string per document: title followed by body text.
corpus_lst = [r["title"] + " " + r["text"] for r in corpus_records]

# Show a short preview only; displaying the full list would dump every
# document of the corpus into the cell output.
corpus_records[:5]

In [None]:
import bm25s

# Build the BM25 index in this cell so it does not depend on a later cell
# having been executed first (the notebook must run top to bottom).
retriever = bm25s.BM25(corpus=corpus_records)
retriever.index(bm25s.tokenize(corpus_lst))

# You can now search the corpus with a query
query = "Who are Microsoft`s key customers?"
query_tokens = bm25s.tokenize(query)
results = retriever.retrieve(query_tokens, k=10)
# results.documents[0] holds the hits for the first (and only) query.
for doc in results.documents[0]:
  print(doc['id'], doc['title'])

In [20]:
# List every query of the FinDER task as "<query id> <query text>".
task = FinDER()
for query_id in task.queries:
    print(query_id, task.queries[query_id])

INFO:financerag.common.loader:Loading Corpus...
INFO:financerag.common.loader:Loaded 13867 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'ADBE20230004', 'title': 'ADBE OVERVIEW', 'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsi

q00001 What are the service and product offerings from Microsoft
q00002 MSFT segment breakdown
q00003 Who are Microsoft`s key customers?
q00004 What is Microsoft`s business model
q00005 MSFT Capex commitment
q00006 Which recent M&A activities has Microsoft been involved in
q00007 How much revenue does Microsoft generate from contracts with customers?
q00008 MSFT remaining performance obligation
q00009 Adobe subsidiaries of trademarks
q00010 ADBE share repurchase
q00011 fully diluted shares outstanding ADBE
q00012 Who are the members of Adobe`s management team
q00013 ADBE RPO
q00014 ADBE KPI
q00015 How are Coupang`s KPIs?"
q00016 Coupang segment margin
q00017 CPNG capital expenditure
q00018 CPNG any recent M&A activities
q00019 When did Coupang`s Farfetch consolidation start
q00020 What is FLC, and how is its revenue recognized by Coupang
q00021 When did new FLC contract begin CPNG
q00022 CPNG free cash flow
q00023 Class of Shares CPNG
q00024 any highlights from Linde`s 2023 earnings re

In [None]:
import bm25s

task = FinDER()

bm25_retrieval_results = {}

# Tokenize the corpus and index it
retriever = bm25s.BM25(corpus=corpus_records)
corpus_tokens = bm25s.tokenize(corpus_lst)
retriever.index(corpus_tokens)

for qq in task.queries:
  query = task.queries[qq]
  print(qq, query)
  query_tokens = bm25s.tokenize(query)
  results = retriever.retrieve(query_tokens, k=10)

  # Every hit gets the same score of 1.0; BM25 scores are not propagated.
  bm25_retrieval_results_per_query = {}
  for doc in results.documents[0]:
    print(doc['id'], doc['title'])
    bm25_retrieval_results_per_query[doc['id']] = 1.0
  # Key the results by query id, not by query text: the other retrieval
  # result dicts in this notebook are keyed by id, and looking results up
  # by text is what produces a KeyError such as 'Float as of 2023 BRK'.
  bm25_retrieval_results[qq] = bm25_retrieval_results_per_query
  


In [19]:
from collections import defaultdict
import bm25s

# CSV file that collects the final (query id, corpus id) pairs.
OUTPUT_FILE_NAME = "result.csv"

# Truncate any previous file and start it with the header row.
with open(OUTPUT_FILE_NAME, mode='w') as csv_out:
    csv_out.write("query_id,corpus_id\n")

# NOTE(review): this flag is not read by any cell visible here - confirm it is still needed.
KEYWORD_RETRIEVAL = True

# Merging the two dictionaries
def merge_dicts(dict1: dict, dict2: dict) -> dict:
    """Merge two retrieval-result mappings of the form {query_id: {doc_id: score}}.

    Args:
        dict1: First result mapping.
        dict2: Second result mapping.

    Returns:
        A new plain ``dict`` containing every query id found in either input.
        For each query, the documents of both inputs are combined; when a
        document appears in both, the larger score is kept. A query whose
        result dict is empty is preserved with an empty dict instead of being
        dropped. Neither input is modified.
    """
    merged: dict = {}

    for source in (dict1, dict2):
        for query_id, doc_scores in source.items():
            # setdefault registers the query even if it has no documents.
            combined = merged.setdefault(query_id, {})
            for doc_id, score in doc_scores.items():
                if doc_id in combined:
                    # Resolve conflict by taking the maximum score (you can change this logic)
                    combined[doc_id] = max(combined[doc_id], score)
                else:
                    combined[doc_id] = score

    return merged

def bm25_search(task, k: int = 100):
    """Run BM25 keyword retrieval for every query of ``task``.

    Args:
        task: Object exposing ``corpus`` (doc id -> record with 'title' and
            'text') and ``queries`` (query id -> query text).
        k: Maximum number of documents to retrieve per query. It is clamped
            to the corpus size so that small corpora can still be searched.

    Returns:
        ``{query_id: {doc_id: 1.0}}``. Every retrieved document gets the same
        score of 1.0; the BM25 scores themselves are not propagated. When the
        corpus is empty (or ``k`` is not positive) each query maps to an
        empty dict.
    """
    # Build the records in a local list. Appending to a notebook-level list
    # would make documents from earlier tasks (and earlier runs) pile up in
    # the index of every later task.
    corpus_records = [
        {'id': str(doc_id), 'title': task.corpus[doc_id]['title'], 'text': task.corpus[doc_id]['text']}
        for doc_id in task.corpus
    ]

    # Nothing to index or nothing requested: report "no hits" for each query.
    if not corpus_records or k <= 0:
        return {query_id: {} for query_id in task.queries}

    corpus_lst = [r["title"] + " " + r["text"] for r in corpus_records]

    # Tokenize the corpus and index it
    retriever = bm25s.BM25(corpus=corpus_records)
    retriever.index(bm25s.tokenize(corpus_lst))

    # Never ask for more documents than the corpus holds.
    top_k = min(k, len(corpus_records))

    bm25_retrieval_results = {}
    for query_id in task.queries:
        query_tokens = bm25s.tokenize(task.queries[query_id])
        results = retriever.retrieve(query_tokens, k=top_k)
        # results.documents[0] holds the hits for the single query passed in.
        bm25_retrieval_results[query_id] = {doc['id']: 1.0 for doc in results.documents[0]}
    return bm25_retrieval_results

# Retrieve, rerank and export results for every benchmark task.
# Tip: shrink the tuple to (FinDER,) for a quick single-task run.
# Each task is instantiated inside the loop so that only one dataset is
# loaded at a time instead of all of them up front.
for task_cls in (FinDER, FinQABench, FinanceBench, ConvFinQA, FinQA, MultiHiertt, TATQA):
  task = task_cls()
  print("### Processing: ", task.metadata.name)

  # Dense retrieval is currently disabled. To enable it:
  #   retrieval_model = DenseRetrieval(model=encoder_model, batch_size=256)
  #   dense_retrieval_results = task.retrieve(retriever=retrieval_model)
  dense_retrieval_results = {}

  bm25_retrieval_results = bm25_search(task)
  retrieval_results = merge_dicts(bm25_retrieval_results, dense_retrieval_results)

  reranking_result = task.rerank(
      reranker=reranker,
      results=retrieval_results,
      top_k=200,  # Rerank up to the top 200 candidates
      batch_size=512
  )

  # Append this task's rows; the context manager closes the file even if
  # writing fails part-way through.
  with open(OUTPUT_FILE_NAME, 'a') as out_file:
    for q_id, result in reranking_result.items():
      # Highest reranker score first; keep the 10 best documents per query.
      sorted_results = sorted(result.items(), key=lambda x: x[1], reverse=True)
      for doc_id, _score in sorted_results[:10]:
        out_file.write(f"{q_id},{doc_id}\n")




INFO:financerag.common.loader:Loading Corpus...
INFO:financerag.common.loader:Loaded 13867 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'ADBE20230004', 'title': 'ADBE OVERVIEW', 'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsi

### Processing:  FinDER


DEBUG:bm25s:Building index from IDs objects                            
                                                                              

KeyError: 'Float as of 2023 BRK'