In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch
from dataset import *
from data_handler import *
from embeddings import *
from vector_store import *
from RAG_pipeline import *

In [2]:
# Dataset manager rooted at the data directory one level above the notebook.
dataset_manager = FinanceRAGDataset("../data")

# Ask the manager which datasets it exposes and show them.
available_datasets = dataset_manager.list_datasets()
print("Available datasets:", available_datasets)

# Loading one specific dataset by name (kept for reference, not executed):
# DATASET_NAME = "ConvFinQA"
# corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)

Available datasets: ['ConvFinQA', 'FinQA', 'MultiHeritt', 'TATQA']


In [3]:
# Single source of truth for the checkpoint, so the tokenizer and the encoder
# can never be loaded from two different models by accident.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

hf_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
hf_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

# DataHandler bundles the project's Tokenizer and Embedder wrappers around the
# Hugging Face objects; it is shared by every RAGPipeline built below.
text_processor = DataHandler(Tokenizer(hf_tokenizer), Embedder(hf_model))

# Width of the vectors stored in the vector store. Read from the model config
# instead of hard-coding it (384 for all-MiniLM-L6-v2) so it follows the checkpoint.
# NOTE(review): assumes Embedder returns vectors of the encoder's hidden size
# (i.e. pooling without a projection) -- confirm in embeddings.py.
EMBEDDING_DIM = hf_model.config.hidden_size

# Token budget passed to manage_corpus (chunk size) and embed_queries.
# NOTE(review): 256 is below the 512 limit the tokenizer reports; presumably it
# mirrors the sentence-transformers max_seq_length of this checkpoint -- confirm.
MODEL_INPUT_SIZE = 256

In [4]:
def split_column_metrics(df, metrics=('ndcg', 'recall', 'mrr'), cutoffs=(5, 10)):
    """Flatten per-metric score dicts into one scalar column per cutoff.

    Each column named in ``metrics`` is expected to hold mappings keyed by
    ``'@<k>'`` (e.g. ``{'@5': 0.51, '@10': 0.53}``). For every metric and every
    ``k`` in ``cutoffs`` a column ``'<metric>_<k>'`` is appended, the original
    metric column is dropped, and the index is moved into a leading
    ``'dataset'`` column.

    The input frame is left untouched, so the function can be called again on
    the same frame (e.g. when a notebook cell is re-run).

    Args:
        df: DataFrame indexed by dataset name with one column per metric.
        metrics: Names of the metric columns to expand.
        cutoffs: Cutoff values ``k``; each is looked up as the key ``'@k'``.

    Returns:
        A new DataFrame with columns ``dataset``, any untouched input columns,
        and the expanded ``<metric>_<k>`` columns, using a fresh integer index.

    Raises:
        KeyError: If a requested metric column is missing from ``df`` or a
            score mapping lacks one of the requested ``'@k'`` keys.
    """
    # Fail before doing any work so a bad call never yields a half-built frame.
    missing = [metric for metric in metrics if metric not in df.columns]
    if missing:
        raise KeyError(f"missing metric column(s): {missing}")

    flat = df.copy()
    for metric in metrics:
        for k in cutoffs:
            # Bind the key as a default argument so each lambda keeps its own cutoff.
            flat[f'{metric}_{k}'] = flat[metric].apply(
                lambda scores, key=f'@{k}': scores[key]
            )
        flat = flat.drop(columns=metric)

    # Naming the index first makes the resulting column 'dataset' even when
    # the index already carries a name.
    return flat.rename_axis('dataset').reset_index()

### Experiments: removing the tables from the text

In [5]:
# Experiment without tables (and without summarization) (_nt_ns: df_results_nt_ns)
# For every dataset: load it, shrink it with reduce_dataset_size, build a fresh
# RAGPipeline, index the corpus with remove_tables=True, embed the queries,
# retrieve and evaluate. No table summaries are loaded in this variant.

# Maps dataset name -> whatever pipeline.evaluate returns for that dataset.
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    # corpus and queries are rebound to the reduced versions; qrels is only read.
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A new pipeline per dataset, all sharing the notebook-level text_processor.
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # remove_tables=True is what distinguishes the "no table" experiments;
    # presumably table content is stripped before chunking -- confirm in RAG_pipeline.
    pipeline.manage_corpus(chunk_size=MODEL_INPUT_SIZE, remove_tables=True)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # NOTE(review): best_chunks_nt_ns is rebound on every iteration, so after the
    # loop it only holds the retrieval result of the last dataset.
    best_chunks_nt_ns = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_nt_ns)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# The dict is keyed by dataset, so transposing puts one dataset per row.
df_results_nt_ns = pd.DataFrame(experiment_results).T
# df_results_nt_ns['setting'] = 'NT_NS'
# Expand each metric column into its @5/@10 columns and move the dataset
# names from the index into a 'dataset' column.
df_results_nt_ns = split_column_metrics(df_results_nt_ns)
# Bare last expression: rendered as the cell's output table.
df_results_nt_ns

Token indices sequence length is longer than the specified maximum sequence length for this model (837 > 512). Running this sequence through the model will result in indexing errors


Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.5113,0.5286,0.6429,0.6984,0.4676,0.4744
1,FinQA,0.4265,0.4579,0.5116,0.6105,0.3983,0.4109
2,MultiHeritt,0.2308,0.2585,0.0828,0.1178,0.0529,0.0576
3,TATQA,0.3594,0.383,0.4438,0.5161,0.3312,0.341


In [7]:
# Experiment with short table summary with LLM without table text (_nt_ss: no table short summary)
# Same loop as the other experiments, but the "short" table summaries are
# loaded into the pipeline and the corpus is indexed with remove_tables=True.

# Maps dataset name -> whatever pipeline.evaluate returns for that dataset.
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    # corpus and queries are rebound to the reduced versions; qrels is only read.
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A new pipeline per dataset, all sharing the notebook-level text_processor.
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # Summaries are read from a per-dataset .npy file and loaded before
    # manage_corpus; presumably manage_corpus folds them into the indexed
    # text -- confirm in RAG_pipeline.
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_short.npy')
    # First positional argument is the chunk size used by the other cells
    # (passed there as chunk_size=MODEL_INPUT_SIZE) -- TODO confirm signature.
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=True)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # NOTE(review): best_chunks_nt_ss is rebound on every iteration, so after the
    # loop it only holds the retrieval result of the last dataset.
    best_chunks_nt_ss = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_nt_ss)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# The dict is keyed by dataset, so transposing puts one dataset per row.
df_results_nt_ss = pd.DataFrame(experiment_results).T
# df_results_nt_ss['setting'] = 'NT_SS'
# Expand each metric column into its @5/@10 columns and move the dataset
# names from the index into a 'dataset' column.
df_results_nt_ss = split_column_metrics(df_results_nt_ss)
# Bare last expression: rendered as the cell's output table.
df_results_nt_ss

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.6869,0.6997,0.8095,0.8492,0.6463,0.6516
1,FinQA,0.5857,0.6133,0.6744,0.7587,0.5558,0.5675
2,MultiHeritt,0.3201,0.343,0.1113,0.1402,0.0739,0.0778
3,TATQA,0.4674,0.5036,0.5843,0.6968,0.4281,0.443


In [9]:
# Experiment with long table summary with LLM, without table text (_nt_ls: no table long summary)
# Same loop as the other experiments, but the "long" table summaries are
# loaded into the pipeline and the corpus is indexed with remove_tables=True.

# Maps dataset name -> whatever pipeline.evaluate returns for that dataset.
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    # corpus and queries are rebound to the reduced versions; qrels is only read.
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A new pipeline per dataset, all sharing the notebook-level text_processor.
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # Summaries are read from a per-dataset .npy file and loaded before
    # manage_corpus; presumably manage_corpus folds them into the indexed
    # text -- confirm in RAG_pipeline.
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_long.npy')
    # First positional argument is the chunk size used by the other cells
    # (passed there as chunk_size=MODEL_INPUT_SIZE) -- TODO confirm signature.
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=True)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # NOTE(review): best_chunks_nt_ls is rebound on every iteration, so after the
    # loop it only holds the retrieval result of the last dataset.
    best_chunks_nt_ls = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_nt_ls)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# The dict is keyed by dataset, so transposing puts one dataset per row.
df_results_nt_ls = pd.DataFrame(experiment_results).T
# df_results_nt_ls['setting'] = 'NT_LS'
# Expand each metric column into its @5/@10 columns and move the dataset
# names from the index into a 'dataset' column.
df_results_nt_ls = split_column_metrics(df_results_nt_ls)
# Bare last expression: rendered as the cell's output table.
df_results_nt_ls

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.7139,0.7297,0.8333,0.881,0.6741,0.6809
1,FinQA,0.58,0.6093,0.6831,0.7762,0.5453,0.5569
2,MultiHeritt,0.3283,0.3585,0.1144,0.1524,0.0768,0.082
3,TATQA,0.4789,0.5067,0.5763,0.6606,0.4463,0.4582


### Experiments: keeping the tables in the text

In [6]:
# Experiment with text table but without table summaries with LLM (_ns: no summarization)
# Baseline that keeps the tables: manage_corpus is called without remove_tables
# (so its default applies) and no table summaries are loaded.

# Maps dataset name -> whatever pipeline.evaluate returns for that dataset.
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    # corpus and queries are rebound to the reduced versions; qrels is only read.
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME}\tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A new pipeline per dataset, all sharing the notebook-level text_processor.
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # NOTE(review): relies on manage_corpus defaulting to keeping tables -- confirm.
    pipeline.manage_corpus(chunk_size=MODEL_INPUT_SIZE)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # NOTE(review): best_chunks_ns is rebound on every iteration, so after the
    # loop it only holds the retrieval result of the last dataset.
    best_chunks_ns = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_ns)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# The dict is keyed by dataset, so transposing puts one dataset per row.
df_results_ns = pd.DataFrame(experiment_results).T
# df_results_ns['setting'] = 'NS'
# Expand each metric column into its @5/@10 columns and move the dataset
# names from the index into a 'dataset' column.
df_results_ns = split_column_metrics(df_results_ns)
# Persist the table next to the notebook; index=False drops the integer index.
df_results_ns.to_excel("df_results_ns.xlsx", index=False)
# Bare last expression: rendered as the cell's output table.
df_results_ns

Dataset: ConvFinQA	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.6203,0.6482,0.754,0.8413,0.5761,0.5874
1,FinQA,0.5292,0.565,0.6453,0.7587,0.4903,0.5046
2,MultiHeritt,0.3549,0.3808,0.1228,0.1602,0.0859,0.0908
3,TATQA,0.4767,0.5106,0.5904,0.6928,0.4394,0.4537


In [8]:
# Experiment with short table summary with LLM (_ss: short summary)
# Tables are kept in the text (remove_tables=False) and the "short" table
# summaries are additionally loaded into the pipeline.

# Maps dataset name -> whatever pipeline.evaluate returns for that dataset.
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    # corpus and queries are rebound to the reduced versions; qrels is only read.
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A new pipeline per dataset, all sharing the notebook-level text_processor.
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # Summaries are read from a per-dataset .npy file and loaded before
    # manage_corpus; presumably manage_corpus folds them into the indexed
    # text -- confirm in RAG_pipeline.
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_short.npy')
    # First positional argument is the chunk size used by the other cells
    # (passed there as chunk_size=MODEL_INPUT_SIZE) -- TODO confirm signature.
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=False)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # NOTE(review): best_chunks_ss is rebound on every iteration, so after the
    # loop it only holds the retrieval result of the last dataset.
    best_chunks_ss = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_ss)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# The dict is keyed by dataset, so transposing puts one dataset per row.
df_results_ss = pd.DataFrame(experiment_results).T
# df_results_ss['setting'] = 'SS'
# Expand each metric column into its @5/@10 columns and move the dataset
# names from the index into a 'dataset' column.
df_results_ss = split_column_metrics(df_results_ss)
# Persist the table next to the notebook; index=False drops the integer index.
df_results_ss.to_excel("df_results_ss.xlsx", index=False)
# Bare last expression: rendered as the cell's output table.
df_results_ss

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.6936,0.7198,0.8175,0.8968,0.652,0.6631
1,FinQA,0.6129,0.6418,0.7238,0.814,0.5757,0.5876
2,MultiHeritt,0.384,0.4109,0.132,0.1677,0.0919,0.0966
3,TATQA,0.5032,0.5391,0.6225,0.7329,0.4633,0.4782


In [10]:
# Experiment with long table summary with LLM (_ls: long summary)
# Tables are kept in the text (remove_tables=False) and the "long" table
# summaries are additionally loaded into the pipeline.

# Maps dataset name -> whatever pipeline.evaluate returns for that dataset.
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    # corpus and queries are rebound to the reduced versions; qrels is only read.
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A new pipeline per dataset, all sharing the notebook-level text_processor.
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # Summaries are read from a per-dataset .npy file and loaded before
    # manage_corpus; presumably manage_corpus folds them into the indexed
    # text -- confirm in RAG_pipeline.
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_long.npy')
    # First positional argument is the chunk size used by the other cells
    # (passed there as chunk_size=MODEL_INPUT_SIZE) -- TODO confirm signature.
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=False)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # NOTE(review): best_chunks_ls is rebound on every iteration, so after the
    # loop it only holds the retrieval result of the last dataset.
    best_chunks_ls = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_ls)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# The dict is keyed by dataset, so transposing puts one dataset per row.
df_results_ls = pd.DataFrame(experiment_results).T
# df_results_ls['setting'] = 'LS'
# Expand each metric column into its @5/@10 columns and move the dataset
# names from the index into a 'dataset' column.
df_results_ls = split_column_metrics(df_results_ls)
# Persist the table next to the notebook; index=False drops the integer index.
df_results_ls.to_excel("df_results_ls.xlsx", index=False)
# Bare last expression: rendered as the cell's output table.
df_results_ls

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.711,0.7348,0.8333,0.9048,0.6698,0.6801
1,FinQA,0.602,0.6324,0.718,0.814,0.563,0.5752
2,MultiHeritt,0.3976,0.4234,0.1348,0.1748,0.0965,0.1017
3,TATQA,0.5124,0.5489,0.6225,0.7369,0.4758,0.4906
