In [1]:
import os
from pathlib import Path
import sys
sys.path.insert(0, str(Path(os.getcwd()).parent))

In [2]:
# Step 1: Import necessary libraries
# --------------------------------------
# Import required libraries for document retrieval, reranking, and logging setup.
from sentence_transformers import CrossEncoder
import logging

from financerag.rerank import CrossEncoderReranker
from financerag.retrieval import DenseRetrieval, SentenceTransformerEncoder
from financerag.tasks import FinDER

# Setup basic logging configuration to show info level messages.
logging.basicConfig(level=logging.INFO)

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Step 2: Initialize FinDER Task
# --------------------------
# In this baseline example, we are using the FinDER task, one of the seven available tasks in this project.
# If you want to use a different task, for example, 'OtherTask', you can change the task initialization as follows:
#
# Example:
# from financerag.tasks import OtherTask
# finder_task = OtherTask()
#
# For this baseline, we proceed with FinDER.
finder_task = FinDER()

INFO:financerag.common.loader:Loading Corpus...
INFO:financerag.common.loader:Loaded 13867 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'ADBE20230004', 'title': 'ADBE OVERVIEW', 'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsi

In [8]:
# Step 3: Initialize DenseRetriever model
# -------------------------------------
# Initialize the retrieval model using SentenceTransformers. This model will be responsible
# for encoding both the queries and documents into embeddings.
#
# You can replace 'intfloat/e5-large-v2' with any other model supported by SentenceTransformers.
# For example: 'BAAI/bge-large-en-v1.5', 'Linq-AI-Research/Linq-Embed-Mistral', etc.
encoder_model = SentenceTransformerEncoder(
    model_name_or_path='intfloat/e5-large-v2',
    query_prompt='query: ',
    doc_prompt='passage: ',
)

retrieval_model = DenseRetrieval(
    model=encoder_model
)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: intfloat/e5-large-v2


In [4]:
from rank_bm25 import BM25Okapi
from financerag.retrieval import BM25Retriever

retrieval_model = BM25Retriever(model=BM25Okapi())

TypeError: BM25Okapi.__init__() missing 1 required positional argument: 'corpus'

In [5]:
import json

def load_jsonl(file_path):
    """
    Loads a JSONL file and returns a list of dictionaries.

    Args:
        file_path (str): Path to the JSONL file.

    Returns:
        List[dict]: A list of JSON objects.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return data




In [4]:
from datasets import load_dataset

# Loading a specific subset (i.e. FinDER) and a split (corpus, queries)
corpus = load_dataset("Linq-AI-Research/FinanceRAG", "FinDER", split="corpus")
corpus


Dataset({
    features: ['_id', 'title', 'text'],
    num_rows: 13867
})

In [5]:
# Initialize BM25Processor
from nltk.tokenize import word_tokenize
from financerag.retrieval import BM25Processor
processor = BM25Processor(tokenizer=word_tokenize)
# Tokenize and prepare the corpus
tokenized_corpus = processor.build_corpus(corpus)

# # Initialize BM25 model
from rank_bm25 import BM25Okapi
bm25_model = BM25Okapi(tokenized_corpus)

In [8]:
# Step 4: Perform retrieval
# ---------------------
# Use the model to retrieve relevant documents for given queries.
retrieval_model = DenseRetrieval(
    model=encoder_model
)

retrieval_result = finder_task.retrieve(
    retriever=retrieval_model
)

# Print a portion of the retrieval results to verify the output.
print(f"Retrieved results for {len(retrieval_result)} queries. Here's an example of the top 5 documents for the first query:")

for q_id, result in retrieval_result.items():
    print(f"\nQuery ID: {q_id}")
    # Sort the result to print the top 5 document ID and its score
    sorted_results = sorted(result.items(), key=lambda x: x[1], reverse=True)

    for i, (doc_id, score) in enumerate(sorted_results[:5]):
        print(f"  Document {i + 1}: Document ID = {doc_id}, Score = {score}")

    break  # Only show the first query


INFO:financerag.retrieval.dense:Encoding queries...


AttributeError: 'BM25Okapi' object has no attribute 'encode_queries'

In [13]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sajayudhay/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

{'_id': 'ADBE20230004', 'title': 'ADBE OVERVIEW', 'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsible. Our products and services help unleash creativity, accelerate document productivity and power businesses in a digital world.'}


In [9]:
from nltk.tokenize import word_tokenize, TweetTokenizer
tweet_tokenizer = TweetTokenizer()


In [6]:
from financerag.retrieval import BM25Retriever

# Step 4: Perform retrieval
# ---------------------
# Use the model to retrieve relevant documents for given queries.
retrieval_model = BM25Retriever(model=bm25_model)

retrieval_result = finder_task.retrieve(
    retriever=retrieval_model
)

# Print a portion of the retrieval results to verify the output.
print(f"Retrieved results for {len(retrieval_result)} queries. Here's an example of the top 5 documents for the first query:")

for q_id, result in retrieval_result.items():
    print(f"\nQuery ID: {q_id}")
    # Sort the result to print the top 5 document ID and its score
    sorted_results = sorted(result.items(), key=lambda x: x[1], reverse=True)

    for i, (doc_id, score) in enumerate(sorted_results[:5]):
        print(f"  Document {i + 1}: Document ID = {doc_id}, Score = {score}")

    break  # Only show the first query


INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases


what are the service and product offerings from microsoft
msft segment breakdown
who are microsoft`s key customers?
what is microsoft`s business model
msft capex commitment
which recent m&a activities has microsoft been involved in
how much revenue does microsoft generate from contracts with customers?
msft remaining performance obligation
adobe subsidiaries of trademarks
adbe share repurchase
fully diluted shares outstanding adbe
who are the members of adobe`s management team
adbe rpo
adbe kpi
how are coupang`s kpis?"
coupang segment margin
cpng capital expenditure
cpng any recent m&a activities
when did coupang`s farfetch consolidation start
what is flc, and how is its revenue recognized by coupang
when did new flc contract begin cpng
cpng free cash flow
class of shares cpng
any highlights from linde`s 2023 earnings result
linde segment breakdown
linde shareholder return
asset divestitures linde
what is the total number of leases held by linde
linde outstanding share
the top 3 risks 

In [7]:
# Step 5: Initialize CrossEncoder Reranker
# --------------------------------------
# The CrossEncoder model will be used to rerank the retrieved documents based on relevance.
#
# You can replace 'cross-encoder/ms-marco-MiniLM-L-12-v2' with any other model supported by CrossEncoder.
# For example: 'cross-encoder/ms-marco-TinyBERT-L-2', 'cross-encoder/stsb-roberta-large', etc.
reranker = CrossEncoderReranker(
    model=CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
)

INFO:sentence_transformers.cross_encoder.CrossEncoder:Use pytorch device: mps


In [8]:
# Step 6: Perform reranking
# -------------------------
# Rerank the top 100 retrieved documents using the CrossEncoder model.
reranking_result = finder_task.rerank(
    reranker=reranker,
    results=retrieval_result,
    top_k=100,  # Rerank the top 100 documents
    batch_size=32
)

# Print a portion of the reranking results to verify the output.
print(f"Reranking results for {len(reranking_result)} queries. Here's an example of the top 5 documents for the first query:")

for q_id, result in reranking_result.items():
    print(f"\nQuery ID: {q_id}")
    # Sort the result to print the top 5 document ID and its score
    sorted_results = sorted(result.items(), key=lambda x: x[1], reverse=True)

    for i, (doc_id, score) in enumerate(sorted_results[:5]):
        print(f"  Document {i + 1}: Document ID = {doc_id}, Score = {score}")

    break  # Only show the first query


INFO:financerag.rerank.cross_encoder:Starting To Rerank Top-100....
Batches: 100%|██████████| 675/675 [07:25<00:00,  1.51it/s]


Reranking results for 216 queries. Here's an example of the top 5 documents for the first query:

Query ID: q00001
  Document 1: Document ID = MSFT20230331, Score = 5.333600997924805
  Document 2: Document ID = MSFT20230134, Score = 3.972005844116211
  Document 3: Document ID = ORCL20230429, Score = 1.7133623361587524
  Document 4: Document ID = ORCL20230003, Score = 0.6058417558670044
  Document 5: Document ID = ORCL20230160, Score = 0.3706078827381134


In [8]:
# Step 7: Save results
# -------------------
# Save the results to the specified output directory as a CSV file.
output_dir = './results'
finder_task.save_results(output_dir=output_dir)

# Confirm the results have been saved.
print(f"Results have been saved to {output_dir}/FinDER/results.csv")

INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinDER
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/FinDER/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/FinDER/results.csv


Results have been saved to ./results/FinDER/results.csv


In [12]:
import pandas as pd
# Load the TSV file containing the 30% answer labels
df = pd.read_csv('../data/FinDER_qrels.tsv', sep='\t')

# Convert the TSV data into a dictionary format for evaluation
qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()

# If the retrieval or reranking result is stored in the `results` variable
# Evaluate the model on various metrics such as Recall, Precision, MAP, and nDCG
finder_task.evaluate(qrels_dict, retrieval_result, [1, 5, 10])

  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.0000
INFO:financerag.tasks.BaseTask:NDCG@5: 0.0060
INFO:financerag.tasks.BaseTask:NDCG@10: 0.0130
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.0000
INFO:financerag.tasks.BaseTask:MAP@5: 0.0039
INFO:financerag.tasks.BaseTask:MAP@10: 0.0065
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.0000
INFO:financerag.tasks.BaseTask:Recall@5: 0.0078
INFO:financerag.tasks.BaseTask:Recall@10: 0.0273
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.0000
INFO:financerag.tasks.BaseTask:P@5: 0.0031
INFO:financerag.tasks.BaseTask:P@10: 0.0047


({'NDCG@1': 0.0, 'NDCG@5': 0.00604, 'NDCG@10': 0.01302},
 {'MAP@1': 0.0, 'MAP@5': 0.00391, 'MAP@10': 0.00653},
 {'Recall@1': 0.0, 'Recall@5': 0.00781, 'Recall@10': 0.02734},
 {'P@1': 0.0, 'P@5': 0.00313, 'P@10': 0.00469})

In [None]:
# ({'NDCG@1': 0.0, 'NDCG@5': 0.00604, 'NDCG@10': 0.01302},
#  {'MAP@1': 0.0, 'MAP@5': 0.00391, 'MAP@10': 0.00653},
#  {'Recall@1': 0.0, 'Recall@5': 0.00781, 'Recall@10': 0.02734},
#  {'P@1': 0.0, 'P@5': 0.00313, 'P@10': 0.00469})

In [None]:
# ({'NDCG@1': 0.10938, 'NDCG@5': 0.14169, 'NDCG@10': 0.14634},
#  {'MAP@1': 0.08203, 'MAP@5': 0.12467, 'MAP@10': 0.12797},
#  {'Recall@1': 0.08203, 'Recall@5': 0.16406, 'Recall@10': 0.17578},
#  {'P@1': 0.10938, 'P@5': 0.05, 'P@10': 0.02813})

In [None]:
# ({'NDCG@1': 0.25, 'NDCG@5': 0.3363, 'NDCG@10': 0.36985},
#  {'MAP@1': 0.21875, 'MAP@5': 0.30514, 'MAP@10': 0.32151},
#  {'Recall@1': 0.21875, 'Recall@5': 0.39687, 'Recall@10': 0.48646},
#  {'P@1': 0.25, 'P@5': 0.1125, 'P@10': 0.07188})

In [10]:
import pandas as pd
# Load the TSV file containing the 30% answer labels
df = pd.read_csv('../data/FinDER_qrels.tsv', sep='\t')

# Convert the TSV data into a dictionary format for evaluation
qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()

# If the retrieval or reranking result is stored in the `results` variable
# Evaluate the model on various metrics such as Recall, Precision, MAP, and nDCG
finder_task.evaluate(qrels_dict, reranking_result, [1, 5, 10])

  qrels_dict = df.groupby('query_id').apply(lambda x: dict(zip(x['corpus_id'], x['score']))).to_dict()
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.2500
INFO:financerag.tasks.BaseTask:NDCG@5: 0.3363
INFO:financerag.tasks.BaseTask:NDCG@10: 0.3699
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.2188
INFO:financerag.tasks.BaseTask:MAP@5: 0.3051
INFO:financerag.tasks.BaseTask:MAP@10: 0.3215
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.2188
INFO:financerag.tasks.BaseTask:Recall@5: 0.3969
INFO:financerag.tasks.BaseTask:Recall@10: 0.4865
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.2500
INFO:financerag.tasks.BaseTask:P@5: 0.1125
INFO:financerag.tasks.BaseTask:P@10: 0.0719


({'NDCG@1': 0.25, 'NDCG@5': 0.3363, 'NDCG@10': 0.36985},
 {'MAP@1': 0.21875, 'MAP@5': 0.30514, 'MAP@10': 0.32151},
 {'Recall@1': 0.21875, 'Recall@5': 0.39687, 'Recall@10': 0.48646},
 {'P@1': 0.25, 'P@5': 0.1125, 'P@10': 0.07188})