In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch
from dataset import *
from data_handler import *
from embeddings import *
from vector_store import *
from RAG_pipeline import *

In [2]:
dataset_manager = FinanceRAGDataset("../data")
# List available datasets
print("Available datasets:", dataset_manager.list_datasets())

# Load corpus and queries from a specific dataset
# DATASET_NAME = "ConvFinQA"
# corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)

Available datasets: ['ConvFinQA', 'FinQA', 'MultiHeritt', 'TATQA']


In [3]:
text_processor = DataHandler(Tokenizer(AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")),
                             Embedder(AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")))
EMBEDDING_DIM = 384
MODEL_INPUT_SIZE = 256

In [4]:
# Experiment without tables (and without summarization) (_nt_ns: df_results_nt_ns)
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    pipeline.manage_corpus(chunk_size=MODEL_INPUT_SIZE, remove_tables=True)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    best_chunks_nt_ns = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_nt_ns)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
df_results_nt_ns = pd.DataFrame(experiment_results).T
df_results_nt_ns['setting'] = 'NT_NS'
df_results_nt_ns

Token indices sequence length is longer than the specified maximum sequence length for this model (837 > 512). Running this sequence through the model will result in indexing errors


Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,ndcg,recall,precision,f1,mrr,map,setting
ConvFinQA,"{'@5': 0.5113, '@10': 0.5286}","{'@5': 0.6429, '@10': 0.6984}","{'@5': 0.1286, '@10': 0.0698}","{'@5': 0.2143, '@10': 0.127}","{'@5': 0.4676, '@10': 0.4744}","{'@5': 0.4676, '@10': 0.4744}",NT_NS
FinQA,"{'@5': 0.4265, '@10': 0.4579}","{'@5': 0.5116, '@10': 0.6105}","{'@5': 0.1023, '@10': 0.061}","{'@5': 0.1705, '@10': 0.111}","{'@5': 0.3983, '@10': 0.4109}","{'@5': 0.3983, '@10': 0.4109}",NT_NS
MultiHeritt,"{'@5': 0.2308, '@10': 0.2585}","{'@5': 0.0828, '@10': 0.1178}","{'@5': 0.0678, '@10': 0.0459}","{'@5': 0.0716, '@10': 0.0637}","{'@5': 0.0529, '@10': 0.0576}","{'@5': 0.0552, '@10': 0.0611}",NT_NS
TATQA,"{'@5': 0.3594, '@10': 0.383}","{'@5': 0.4438, '@10': 0.5161}","{'@5': 0.0888, '@10': 0.0516}","{'@5': 0.1479, '@10': 0.0938}","{'@5': 0.3312, '@10': 0.341}","{'@5': 0.3312, '@10': 0.341}",NT_NS


In [5]:
# Experiment with text table but without table summaries with LLM (_ns: no summarization)
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME}\tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    pipeline.manage_corpus(chunk_size=MODEL_INPUT_SIZE)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    best_chunks_ns = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_ns)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
df_results_ns = pd.DataFrame(experiment_results)
df_results_ns['setting'] = 'NS'
df_results_ns

Dataset: ConvFinQA	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,ConvFinQA,FinQA,MultiHeritt,TATQA,setting
ndcg,"{'@5': 0.6203, '@10': 0.6482}","{'@5': 0.5292, '@10': 0.565}","{'@5': 0.3549, '@10': 0.3808}","{'@5': 0.4767, '@10': 0.5106}",NS
recall,"{'@5': 0.754, '@10': 0.8413}","{'@5': 0.6453, '@10': 0.7587}","{'@5': 0.1228, '@10': 0.1602}","{'@5': 0.5904, '@10': 0.6928}",NS
precision,"{'@5': 0.1508, '@10': 0.0841}","{'@5': 0.1291, '@10': 0.0759}","{'@5': 0.0986, '@10': 0.063}","{'@5': 0.1181, '@10': 0.0693}",NS
f1,"{'@5': 0.2513, '@10': 0.153}","{'@5': 0.2151, '@10': 0.1379}","{'@5': 0.1053, '@10': 0.0874}","{'@5': 0.1968, '@10': 0.126}",NS
mrr,"{'@5': 0.5761, '@10': 0.5874}","{'@5': 0.4903, '@10': 0.5046}","{'@5': 0.0859, '@10': 0.0908}","{'@5': 0.4394, '@10': 0.4537}",NS
map,"{'@5': 0.5761, '@10': 0.5874}","{'@5': 0.4903, '@10': 0.5046}","{'@5': 0.0889, '@10': 0.0955}","{'@5': 0.4394, '@10': 0.4537}",NS


In [6]:
# df_combined = pd.concat([df_results_nt_ns, df_results_ns], axis=0)
# df_combined

In [7]:
# import plotly.express as px

# # Reshape the dataframe for better plotting (melt the dataframe to long format)
# df_plot = df_combined.melt(id_vars=['setting'], 
#                            value_vars=[col for col in df_combined.columns if col.endswith('@5') or col.endswith('@10')],
#                            var_name='Metric', value_name='Score')
# df_combined

In [8]:
# Experiment with short table summary with LLM without table text (_nt_ss: no table short summary)
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")


    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_short.npy')
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=True)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    best_chunks_nt_ss = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_nt_ss)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
df_results_nt_ss = pd.DataFrame(experiment_results)
df_results_nt_ss['setting'] = 'NT_NS'
df_results_nt_ss

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,ConvFinQA,FinQA,MultiHeritt,TATQA,setting
ndcg,"{'@5': 0.6869, '@10': 0.6997}","{'@5': 0.5857, '@10': 0.6133}","{'@5': 0.3201, '@10': 0.343}","{'@5': 0.4674, '@10': 0.5036}",NT_NS
recall,"{'@5': 0.8095, '@10': 0.8492}","{'@5': 0.6744, '@10': 0.7587}","{'@5': 0.1113, '@10': 0.1402}","{'@5': 0.5843, '@10': 0.6968}",NT_NS
precision,"{'@5': 0.1619, '@10': 0.0849}","{'@5': 0.1349, '@10': 0.0759}","{'@5': 0.0918, '@10': 0.0568}","{'@5': 0.1169, '@10': 0.0697}",NT_NS
f1,"{'@5': 0.2698, '@10': 0.1544}","{'@5': 0.2248, '@10': 0.1379}","{'@5': 0.0971, '@10': 0.0785}","{'@5': 0.1948, '@10': 0.1267}",NT_NS
mrr,"{'@5': 0.6463, '@10': 0.6516}","{'@5': 0.5558, '@10': 0.5675}","{'@5': 0.0739, '@10': 0.0778}","{'@5': 0.4281, '@10': 0.443}",NT_NS
map,"{'@5': 0.6463, '@10': 0.6516}","{'@5': 0.5558, '@10': 0.5675}","{'@5': 0.0774, '@10': 0.0823}","{'@5': 0.4281, '@10': 0.443}",NT_NS


In [9]:
# Experiment with short table summary with LLM (_ss: short summary)
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_short.npy')
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=False)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    best_chunks_ss = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_ss)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
df_results_ss = pd.DataFrame(experiment_results)
df_results_ss['setting'] = 'SS'
df_results_ss

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,ConvFinQA,FinQA,MultiHeritt,TATQA,setting
ndcg,"{'@5': 0.6936, '@10': 0.7198}","{'@5': 0.6129, '@10': 0.6418}","{'@5': 0.384, '@10': 0.4109}","{'@5': 0.5032, '@10': 0.5391}",SS
recall,"{'@5': 0.8175, '@10': 0.8968}","{'@5': 0.7238, '@10': 0.814}","{'@5': 0.132, '@10': 0.1677}","{'@5': 0.6225, '@10': 0.7329}",SS
precision,"{'@5': 0.1635, '@10': 0.0897}","{'@5': 0.1448, '@10': 0.0814}","{'@5': 0.1062, '@10': 0.0661}","{'@5': 0.1245, '@10': 0.0733}",SS
f1,"{'@5': 0.2725, '@10': 0.1631}","{'@5': 0.2413, '@10': 0.148}","{'@5': 0.1134, '@10': 0.0917}","{'@5': 0.2075, '@10': 0.1333}",SS
mrr,"{'@5': 0.652, '@10': 0.6631}","{'@5': 0.5757, '@10': 0.5876}","{'@5': 0.0919, '@10': 0.0966}","{'@5': 0.4633, '@10': 0.4782}",SS
map,"{'@5': 0.652, '@10': 0.6631}","{'@5': 0.5757, '@10': 0.5876}","{'@5': 0.0948, '@10': 0.1009}","{'@5': 0.4633, '@10': 0.4782}",SS


In [10]:
# Experiment with long table summary with LLM (_nt_ls: no table long summary)
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_long.npy')
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=True)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    best_chunks_nt_ls = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_nt_ls)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
df_results_nt_ls = pd.DataFrame(experiment_results)
df_results_nt_ls['setting'] = 'NT_LS'
df_results_nt_ls

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,ConvFinQA,FinQA,MultiHeritt,TATQA,setting
ndcg,"{'@5': 0.7139, '@10': 0.7297}","{'@5': 0.58, '@10': 0.6093}","{'@5': 0.3283, '@10': 0.3585}","{'@5': 0.4789, '@10': 0.5067}",NT_LS
recall,"{'@5': 0.8333, '@10': 0.881}","{'@5': 0.6831, '@10': 0.7762}","{'@5': 0.1144, '@10': 0.1524}","{'@5': 0.5763, '@10': 0.6606}",NT_LS
precision,"{'@5': 0.1667, '@10': 0.0881}","{'@5': 0.1366, '@10': 0.0776}","{'@5': 0.0938, '@10': 0.0613}","{'@5': 0.1153, '@10': 0.0661}",NT_LS
f1,"{'@5': 0.2778, '@10': 0.1602}","{'@5': 0.2277, '@10': 0.1411}","{'@5': 0.0994, '@10': 0.0848}","{'@5': 0.1921, '@10': 0.1201}",NT_LS
mrr,"{'@5': 0.6741, '@10': 0.6809}","{'@5': 0.5453, '@10': 0.5569}","{'@5': 0.0768, '@10': 0.082}","{'@5': 0.4463, '@10': 0.4582}",NT_LS
map,"{'@5': 0.6741, '@10': 0.6809}","{'@5': 0.5453, '@10': 0.5569}","{'@5': 0.0806, '@10': 0.0871}","{'@5': 0.4463, '@10': 0.4582}",NT_LS


In [11]:
# Experiment with long table summary with LLM (_ls: long summary)
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_long.npy')
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=False)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    best_chunks_ls = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_ls)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
df_results_ls = pd.DataFrame(experiment_results)
df_results_ls['setting'] = 'LS'
df_results_ls

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,ConvFinQA,FinQA,MultiHeritt,TATQA,setting
ndcg,"{'@5': 0.711, '@10': 0.7348}","{'@5': 0.602, '@10': 0.6324}","{'@5': 0.3976, '@10': 0.4234}","{'@5': 0.5124, '@10': 0.5489}",LS
recall,"{'@5': 0.8333, '@10': 0.9048}","{'@5': 0.718, '@10': 0.814}","{'@5': 0.1348, '@10': 0.1748}","{'@5': 0.6225, '@10': 0.7369}",LS
precision,"{'@5': 0.1667, '@10': 0.0905}","{'@5': 0.1436, '@10': 0.0814}","{'@5': 0.1096, '@10': 0.0682}","{'@5': 0.1245, '@10': 0.0737}",LS
f1,"{'@5': 0.2778, '@10': 0.1645}","{'@5': 0.2393, '@10': 0.148}","{'@5': 0.1165, '@10': 0.0947}","{'@5': 0.2075, '@10': 0.134}",LS
mrr,"{'@5': 0.6698, '@10': 0.6801}","{'@5': 0.563, '@10': 0.5752}","{'@5': 0.0965, '@10': 0.1017}","{'@5': 0.4758, '@10': 0.4906}",LS
map,"{'@5': 0.6698, '@10': 0.6801}","{'@5': 0.563, '@10': 0.5752}","{'@5': 0.1003, '@10': 0.107}","{'@5': 0.4758, '@10': 0.4906}",LS


In [12]:
for i, k in best_chunks_ls.items():
    print(f"Query ID: {i}")
    # Extract texts and display a subset
    texts = [str(text) for text in k["texts"]]
    print(f"Texts (Top 5):")
    print(", ".join(texts[:5]) + (" ..." if len(texts) > 5 else ""))

    # Extract distances and display a subset
    distances = np.array(k["distances"]).flatten()
    print("\nDistances (Top 5):")
    print(", ".join(f"{d:.4f}" for d in distances[:5]) + ("..." if len(distances) > 5 else ""))
    
    true_doc = pipeline.get_true_docs(i)
    # Print the evaluation result in a more readable format
    print(f"\nEvaluation Results: \t[true doc(s): {true_doc}]")
    evaluation_metrics = evaluation.get('mrr', {})
    
    # Additional evaluation metrics
    for metric_group, values in evaluation.items():
        # print(f"{metric_group.capitalize()}:")
        str_m = ""
        for metric, value in values.items():
            str_m += f"{metric_group.capitalize()}\t {metric}: {value:.4f}, "
        print(str_m)
    print("="*50 + "\n") 
    break


Query ID: q1a731ee6
Texts (Top 5):
d1a731ff4, d1a731ff4, d1a731ff4, d1a731ff4, d1a73a852 ...

Distances (Top 5):
0.6097, 0.8689, 0.9558, 1.0072, 1.0076...

Evaluation Results: 	[true doc(s): ['d1a731ff4']]
Ndcg:
@5: 0.5124, @10: 0.5489, 
Recall:
@5: 0.6225, @10: 0.7369, 
Precision:
@5: 0.1245, @10: 0.0737, 
F1:
@5: 0.2075, @10: 0.1340, 
Mrr:
@5: 0.4758, @10: 0.4906, 
Map:
@5: 0.4758, @10: 0.4906, 



In [13]:
# Combine the dataframes df_results_ns, df_results_ss, and df_results_ls
df_combined = pd.concat([df_results_ns, df_results_ss, df_results_ls], axis=0)
df_combined

Unnamed: 0,ConvFinQA,FinQA,MultiHeritt,TATQA,setting
ndcg,"{'@5': 0.6203, '@10': 0.6482}","{'@5': 0.5292, '@10': 0.565}","{'@5': 0.3549, '@10': 0.3808}","{'@5': 0.4767, '@10': 0.5106}",NS
recall,"{'@5': 0.754, '@10': 0.8413}","{'@5': 0.6453, '@10': 0.7587}","{'@5': 0.1228, '@10': 0.1602}","{'@5': 0.5904, '@10': 0.6928}",NS
precision,"{'@5': 0.1508, '@10': 0.0841}","{'@5': 0.1291, '@10': 0.0759}","{'@5': 0.0986, '@10': 0.063}","{'@5': 0.1181, '@10': 0.0693}",NS
f1,"{'@5': 0.2513, '@10': 0.153}","{'@5': 0.2151, '@10': 0.1379}","{'@5': 0.1053, '@10': 0.0874}","{'@5': 0.1968, '@10': 0.126}",NS
mrr,"{'@5': 0.5761, '@10': 0.5874}","{'@5': 0.4903, '@10': 0.5046}","{'@5': 0.0859, '@10': 0.0908}","{'@5': 0.4394, '@10': 0.4537}",NS
map,"{'@5': 0.5761, '@10': 0.5874}","{'@5': 0.4903, '@10': 0.5046}","{'@5': 0.0889, '@10': 0.0955}","{'@5': 0.4394, '@10': 0.4537}",NS
ndcg,"{'@5': 0.6936, '@10': 0.7198}","{'@5': 0.6129, '@10': 0.6418}","{'@5': 0.384, '@10': 0.4109}","{'@5': 0.5032, '@10': 0.5391}",SS
recall,"{'@5': 0.8175, '@10': 0.8968}","{'@5': 0.7238, '@10': 0.814}","{'@5': 0.132, '@10': 0.1677}","{'@5': 0.6225, '@10': 0.7329}",SS
precision,"{'@5': 0.1635, '@10': 0.0897}","{'@5': 0.1448, '@10': 0.0814}","{'@5': 0.1062, '@10': 0.0661}","{'@5': 0.1245, '@10': 0.0733}",SS
f1,"{'@5': 0.2725, '@10': 0.1631}","{'@5': 0.2413, '@10': 0.148}","{'@5': 0.1134, '@10': 0.0917}","{'@5': 0.2075, '@10': 0.1333}",SS
