In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys
#from dotenv import load_dotenv
from llama_index.core import Document
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex
from llama_index.core.postprocessor import SentenceTransformerRerank
import faiss
from huggingface_hub import login
import pandas as pd 
from pathlib import Path
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.schema import MetadataMode, NodeWithScore, QueryBundle
from tqdm import tqdm

# Make the parent of the current working directory importable so that the
# project-local `tasks` package resolves from inside this notebook's folder.
sys.path.append(os.path.dirname(os.getcwd()))
from tasks.eval_rag import evaluate_rag

# Challenge data is expected in a sibling folder of the notebook's working directory.
data_dir = Path.cwd().parent / 'financerag/icaif-24-finance-rag-challenge'
# Quick sanity check that the dataset folder is where we expect it.
print(os.listdir(data_dir))

['convfinqa_queries.jsonl', 'ConvFinQA_qrels.tsv', 'FinanceBench_qrels.tsv', 'FinDER_qrels.tsv', 'TATQA_qrels.tsv', 'finqabench_corpus.jsonl', 'finder_corpus.jsonl', 'tatqa_corpus.jsonl', 'tatqa_queries.jsonl', 'multiheirtt_corpus.jsonl', 'convfinqa_corpus.jsonl', 'finqa_queries.jsonl', 'multiheirtt_queries.jsonl', 'finqa_corpus.jsonl', 'financebench_queries.jsonl', 'FinQA_qrels.tsv', 'FinQABench_qrels.tsv', 'sample_submission_.csv', 'MultiHeirtt_qrels.tsv', 'finder_queries.jsonl', 'financebench_corpus.jsonl', 'finqabench_queries.jsonl']


In [2]:
# Corpus, queries and relevance judgements (qrels) for the dataset under test.
# NOTE(review): the variables are named `finq_bench_*`, but every path here
# points at the ConvFinQA files — confirm which dataset is intended.
finq_bench_corpus_path = data_dir / 'convfinqa_corpus.jsonl/corpus.jsonl'
finq_bench_query_path = data_dir / 'convfinqa_queries.jsonl/queries.jsonl'
finq_bench_tsv_path = data_dir / 'ConvFinQA_qrels.tsv'

In [3]:
# Corpus and queries are JSON-lines files; the qrels file is tab-separated.
finq_bench_corpus = pd.read_json(finq_bench_corpus_path, lines=True)
finq_bench_queries = pd.read_json(finq_bench_query_path, lines=True)
finq_bench_gt = pd.read_csv(finq_bench_tsv_path, sep='\t')
# NOTE(review): the printed label says "FinQ Bench" while the files loaded are
# the ConvFinQA ones — confirm the label.
print("Dataset:FinQ Bench\nTotal Corpus:{}\nTotal Queries:{}".format(finq_bench_corpus.shape[0], finq_bench_queries.shape[0]))

Dataset:FinQ Bench
Total Corpus:2066
Total Queries:421


In [4]:
finq_bench_corpus.columns

Index(['_id', 'title', 'text'], dtype='object')

In [6]:
import os
import pickle
from llama_index.core import Document
import pandas as pd

# Function to process documents from DataFrame
def create_documents(df):
    """Turn every row of ``df`` into a llama_index ``Document``.

    Args:
        df: DataFrame with ``_id``, ``title`` and ``text`` columns.

    Returns:
        A list with one ``Document`` per row, in row order. The row's ``text``
        becomes the document text; ``_id`` and ``title`` are stored as metadata.
    """
    return [
        Document(
            text=record['text'],
            metadata={'_id': record['_id'], 'title': record['title']},
        )
        for _, record in df.iterrows()
    ]

# Build one Document per corpus row (raw text; cleaning happens in a later cell)
documents = create_documents(finq_bench_corpus)


In [7]:
import re
from nltk.corpus import stopwords

def clean_text(text):
    """Normalise free text before it is embedded.

    Steps, in order: lowercase, collapse whitespace, drop percentages, drop
    four-digit runs (optionally followed by ``s``), drop every character that
    is not alphanumeric / whitespace / ``.`` / ``,``, drop remaining isolated
    numbers, and finally drop English stopwords.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text as a single space-separated string.
    """
    # Lowercase the text for uniformity
    text = text.lower()

    # Collapse any run of whitespace (including newlines) into a single space
    text = re.sub(r'\s+', ' ', text)

    # Remove percentages like "12.3%". This has to run BEFORE the
    # special-character pass below: that pass deletes '%', after which a
    # percentage can no longer be recognised. No trailing \b is used because
    # '%' followed by a space or end-of-string is not a word boundary.
    text = re.sub(r'\b\d+(?:\.\d+)?%', '', text)

    # Remove four-digit runs such as years and decade forms (e.g., "2019", "2019s")
    text = re.sub(r'\d{4}s?', '', text)

    # Keep only alphanumeric chars, whitespace, periods, and commas
    text = re.sub(r'[^a-zA-Z0-9\s\.,]', '', text)

    # Remove isolated numbers
    text = re.sub(r'\b\d+\b', '', text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in stop_words)

def process_table_to_text(table_str):
    """Flatten a pipe/space-delimited text table into one sentence per cell.

    The first non-separator line is treated as the header and every following
    line as a data row whose first cell is the row label. Each remaining cell
    yields ``"<column header> <row label> value is <value>."`` with ``$``
    stripped from the value.

    Args:
        table_str: The table as a newline-separated string.

    Returns:
        A list of sentences, or an empty list when the table has no header
        or no data rows.
    """
    def split_cells(line):
        # Cells are delimited by a pipe or by a run of two or more whitespace chars.
        pieces = re.split(r'\||\s{2,}', line)
        return [piece.strip() for piece in pieces if piece.strip()]

    # Keep non-blank lines that are not pure separators (dashes, pipes, spaces).
    rows = []
    for raw_line in table_str.strip().split('\n'):
        stripped = raw_line.strip()
        if stripped and not re.match(r'^[-\s|]+$', stripped):
            rows.append(stripped)

    # A usable table needs a header line plus at least one data line.
    if len(rows) < 2:
        return []

    header_cells = split_cells(rows[0])

    # When the header is shorter than the first data row, the row-label column
    # has no header of its own: insert a placeholder so the columns line up.
    if len(header_cells) < len(split_cells(rows[1])):
        header_cells = ['Row Label'] + header_cells

    parsed_rows = [cells for cells in map(split_cells, rows[1:]) if cells]
    if not parsed_rows:
        return []

    sentences = []
    for cells in parsed_rows:
        label = cells[0]
        for position, value in enumerate(cells[1:], start=1):
            if position < len(header_cells):
                heading = header_cells[position]
            else:
                # The row has more cells than the header; fall back to a generic name.
                heading = f"Column {position}"
            sentences.append(f"{heading} {label} value is {value.replace('$', '').strip()}.")
    return sentences

def separate_and_clean_text(doc):
    """Split a document into prose and tables, then recombine them as plain text.

    Lines containing a pipe are treated as table rows; consecutive table rows
    form one table. Prose lines are joined and passed through ``clean_text``;
    each table is converted to sentences with ``process_table_to_text``.

    Args:
        doc: The ``Document`` whose ``text`` should be processed.

    Returns:
        The cleaned prose, a newline, then the table sentences joined by newlines.

    Raises:
        TypeError: If ``doc`` is not a ``Document``.
    """
    if not isinstance(doc, Document):
        raise TypeError("Input must be a Document object.")

    is_table_row = re.compile(r"\|").search

    prose_lines = []
    table_blocks = []
    pending_rows = []  # rows of the table currently being collected

    for raw_line in doc.text.split("\n"):
        if is_table_row(raw_line):
            pending_rows.append(raw_line)
            continue
        # A prose line ends whatever table was being collected.
        if pending_rows:
            table_blocks.append("\n".join(pending_rows))
            pending_rows = []
        prose_lines.append(raw_line)

    # The document may end while a table is still open.
    if pending_rows:
        table_blocks.append("\n".join(pending_rows))

    cleaned_prose = clean_text("\n".join(prose_lines))

    table_sentences = [
        sentence
        for block in table_blocks
        for sentence in process_table_to_text(block)
    ]

    return cleaned_prose + "\n" + "\n".join(table_sentences)

# Assuming 'documents[2]' is your document, call the function:
#all_text = separate_and_clean_text(documents[0])
#print(all_text)

In [8]:
# Assuming 'documents' is a list of Document objects
# NOTE: each Document's text is overwritten in place, so re-running this cell
# cleans already-cleaned text; re-create `documents` first if you need to re-run.
for i, doc in enumerate(documents):
    # Apply the separate and clean function to each document
    cleaned_combined_text = separate_and_clean_text(doc)
    
    # Update the document's text with the cleaned and combined text
    documents[i].text = cleaned_combined_text

# Example: check the updated text of the third document (index 2)
print("Updated Text for Document[2]:\n", documents[2].text)

Updated Text for Document[2]:
 five year million revolving , multi currency , senior unsecured credit facility maturing november , senior credit facility . . million outstanding senior credit facility december , , availability . million . senior credit facility contains provisions increase line million . also available uncommitted credit facilities totaling . million . may use excess cash borrow senior credit facility , subject limits set board directors , repurchase additional common stock . billion program expires december , . approximately . million remains authorized future repurchases plan . management believes cash flows operations available borrowings senior credit facility sufficient meet expected working capital , capital expenditure debt service needs . investment opportunities arise , believe earnings , balance sheet cash flows allow us obtain additional capital , necessary . contractual obligations entered contracts various third parties normal course business require futur

In [9]:
class Config:
    """Tunable settings for the retrieval pipeline."""

    # Dimension used to size the FAISS index.
    # Presumably the output size of EMBED_MODEL — verify if the model changes.
    EMBED_DIMENSION = 1024
    # Model name passed to HuggingFaceEmbedding.
    EMBED_MODEL = "baconnier/Finance_embedding_large_en-V0.1"
    # Model name passed to SentenceTransformerRerank.
    RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    # Number of nodes fetched by the vector retriever per query.
    SIM_TOP_K = 50
    # Number of nodes kept by the reranker.
    RERANKER_TOP_N = 30


cfg = Config()

In [10]:
# LlamaIndex global settings for LLM and embeddings.
# No LLM is configured: this notebook only evaluates retrieval.
Settings.llm = None
# Global embedding model, taken from the config above.
Settings.embed_model = HuggingFaceEmbedding(model_name=cfg.EMBED_MODEL)

LLM is explicitly disabled. Using MockLLM.


In [11]:
from transformers import pipeline
from typing import List, Dict
from sentence_transformers import SentenceTransformer


class RetrievalAgent:
    """Vector retriever over a fixed document set, with optional reranking."""

    def __init__(self, cfg, documents):
        """Store the inputs and build the index and reranker immediately.

        Args:
            cfg: Settings object providing EMBED_DIMENSION, RERANKER_MODEL,
                RERANKER_TOP_N and SIM_TOP_K.
            documents: Documents to ingest into the index.
        """
        self.cfg = cfg
        self.documents = documents

        components = self.initialise_retrieval_components()
        self.index, self.reranker = components

    def initialise_retrieval_components(self):
        """Ingest ``self.documents`` and create the index and reranker.

        Returns:
            A ``(index, reranker)`` tuple.
        """
        # FAISS flat L2 index sized from the configured embedding dimension.
        flat_l2_index = faiss.IndexFlatL2(self.cfg.EMBED_DIMENSION)
        faiss_store = FaissVectorStore(faiss_index=flat_l2_index)
        print("Vector Store Created")

        ## Can experiment with different transformations
        # chunk_size=256, chunk_overlap=20
        ingestion = IngestionPipeline(
            transformations=[SentenceSplitter()],
            vector_store=faiss_store,
            documents=self.documents,
        )
        chunk_nodes = ingestion.run()

        # NOTE(review): the FAISS store is handed to the ingestion pipeline, but
        # the index below is built from the nodes alone, with no storage context.
        # Confirm whether retrieval is actually served from FAISS.
        vector_index = VectorStoreIndex(chunk_nodes)
        print("Vector Index Initialised")

        cross_encoder = SentenceTransformerRerank(
            model=self.cfg.RERANKER_MODEL,
            top_n=self.cfg.RERANKER_TOP_N,
        )
        print("Reranker Initialised")
        return vector_index, cross_encoder

    def retrieve_nodes(self, query_str, with_reranker=True):
        """Retrieve nodes for a query, optionally reranking them.

        Args:
            query_str: The query text.
            with_reranker: When true, pass the retrieved nodes through the
                reranker before returning them.

        Returns:
            The retrieved nodes (reranked when ``with_reranker`` is true).
        """
        bundle = QueryBundle(query_str)
        vector_retriever = VectorIndexRetriever(
            index=self.index,
            similarity_top_k=self.cfg.SIM_TOP_K,
        )
        candidates = vector_retriever.retrieve(bundle)

        if not with_reranker:
            return candidates

        return self.reranker.postprocess_nodes(candidates, bundle)

In [12]:
def create_df_from_nodes(nodes, extract_unique=True):
    """Tabulate retrieved nodes, optionally keeping the best chunk per document.

    Args:
        nodes: Iterable of retrieved nodes. Each must expose ``score``, ``text``
            and a ``metadata`` mapping containing ``'_id'``.
        extract_unique: When false, return one row per node with columns
            ``score``, ``text``, ``corpus_id``. When true (default), keep only
            the highest-scoring row for each ``corpus_id``.

    Returns:
        A DataFrame. With ``extract_unique`` the columns are ``corpus_id``,
        ``text``, ``score``, sorted by score descending. An empty ``nodes``
        yields an empty frame with the expected columns instead of raising.
    """
    chunk_rows = [
        {
            "score": node.score,
            "text": node.text,
            "corpus_id": node.metadata['_id'],
        }
        for node in nodes
    ]
    # Naming the columns keeps them present even when there are no rows, so the
    # groupby-free pipeline below (and callers slicing the result) still work.
    chunk_df = pd.DataFrame(chunk_rows, columns=["score", "text", "corpus_id"])

    if not extract_unique:
        return chunk_df

    # Stable descending sort: equal scores keep their retrieval order, so the
    # outcome is deterministic and the first chunk wins on ties.
    ranked = chunk_df.sort_values(by='score', ascending=False, kind='mergesort')

    # After sorting, the first row seen for each corpus_id is its best chunk.
    best_per_doc = ranked.drop_duplicates(subset='corpus_id', keep='first')

    return best_per_doc[["corpus_id", "text", "score"]].reset_index(drop=True)


def evaluate_on_dataset(cfg, documents, queries, gt, with_reranker=True, top_k=10):
    """Run every query through a fresh RetrievalAgent and print retrieval metrics.

    Args:
        cfg: Settings object forwarded to ``RetrievalAgent``.
        documents: Documents to index.
        queries: DataFrame with ``_id`` and ``text`` columns, one query per row.
        gt: Relevance judgements with ``query_id``, ``corpus_id`` and ``score``
            columns.
        with_reranker: Whether retrieved nodes are reranked before scoring.
        top_k: Maximum number of unique corpus ids kept per query (default 10).

    Returns:
        None. The output of ``evaluate_rag`` at cutoffs 1, 5 and 10 is printed.
    """
    # Initialize Retrieval Agent (this ingests the documents and builds the index)
    ret_agent = RetrievalAgent(cfg=cfg, documents=documents)

    query_id_list = []
    corpus_id_list = []
    score_list = []

    for _, row in queries.iterrows():
        query_id = row['_id']
        query_text = row['text']

        nodes = ret_agent.retrieve_nodes(query_text, with_reranker=with_reranker)
        if not nodes:
            # Nothing retrieved for this query; it contributes no result rows.
            continue

        # Extract the top unique corpus_ids
        node_df = create_df_from_nodes(nodes)[:top_k]

        # Several chunks can belong to one document, so fewer than top_k unique
        # ids may come back. Repeat the query id once per row actually kept,
        # otherwise the three lists end up with different lengths.
        query_id_list.extend([query_id] * len(node_df))
        corpus_id_list.extend(node_df.corpus_id.tolist())
        score_list.extend(node_df.score.tolist())

    final_df = pd.DataFrame({
        "query_id": query_id_list,
        "corpus_id": corpus_id_list,
        "score": score_list
    })

    # Convert both tables into {query_id: {corpus_id: score}} for evaluation
    qrels_dict = {
        query_id: dict(zip(group['corpus_id'], group['score']))
        for query_id, group in gt.groupby('query_id')
    }
    results = {
        query_id: dict(zip(group['corpus_id'], group['score']))
        for query_id, group in final_df.groupby('query_id')
    }
    print(evaluate_rag(qrels_dict, results, [1, 5, 10]))

   

## ConvFinQA

### With Reranker Algorithm

In [14]:
# Builds a new RetrievalAgent over `documents`, then prints the evaluation metrics.
evaluate_on_dataset(cfg=cfg,
                    documents=documents, 
                    queries=finq_bench_queries,
                    gt=finq_bench_gt,
                    with_reranker=True)

Vector Store Created


### Without Reranker Algorithm

In [38]:
# evaluate_on_dataset takes the prepared Document list via `documents`; it has
# no `corpus` parameter, so passing the raw corpus DataFrame raised TypeError.
evaluate_on_dataset(cfg=cfg,
                    documents=documents,
                    queries=finq_bench_queries,
                    gt=finq_bench_gt,
                    with_reranker=False)

Vector Store Created
Vector Index Initialised
Reranker Initialised
({'NDCG@1': 0.73333, 'NDCG@5': 0.8293, 'NDCG@10': 0.8293}, {'MAP@1': 0.73333, 'MAP@5': 0.79556, 'MAP@10': 0.79556}, {'Recall@1': 0.73333, 'Recall@5': 0.93333, 'Recall@10': 0.93333}, {'P@1': 0.73333, 'P@5': 0.18667, 'P@10': 0.09333})
