# Project Work in NLP

Alessio Conti alessio.conti3@studio.unibo.it

Alice Turrini alice.turrini@studio.unibo.it

In [41]:
%load_ext autoreload
%autoreload 2
import os

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from lib.dataset import *   
from lib.data_handler import *
from lib.embeddings import *
from lib.vector_store import *
from lib.RAG_pipeline import *
from lib.utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the dataset

In [2]:
# Entry point to the datasets stored under ../data. FinanceRAGDataset comes from one of
# the lib.* star imports above (presumably lib.dataset — confirm).
dataset_manager = FinanceRAGDataset("../data")
# Show which datasets can be loaded by name
print("Available datasets:", dataset_manager.list_datasets())

# Usage example for loading a single dataset (left commented out: every cell below
# loads its own data inside a loop over dataset_manager.list_datasets()):
#   DATASET_NAME = "ConvFinQA"
#   corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)

Available datasets: ['ConvFinQA', 'FinQA', 'MultiHeritt', 'TATQA']


#### Dataset exploration

Check the word counts of documents and queries in each dataset

In [3]:
# For every available dataset: load it, shrink it with reduce_dataset_size, and plot
# word-count statistics for documents and queries.
for DATASET_NAME in dataset_manager.list_datasets():
    print("========= DATASET:", DATASET_NAME, " =========\n")
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    print(f"Starting number of documents: {len(corpus)} and number of queries: {len(queries)}")

    # Keep only a subset of documents/queries; the selection logic lives in
    # reduce_dataset_size (imported from lib) and is not visible in this notebook.
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"After reducing: number of documents: {len(corpus)} and number of queries: {len(queries)}\n")

    # One row per corpus entry: (id, text)
    corpus_df = pd.DataFrame(list(corpus.items()), columns=["id", "text"])
    # A non-empty list is replaced by its first element; every other value (including a
    # plain string) becomes "".
    # NOTE(review): assumes corpus values are lists whose first element is the text — confirm.
    corpus_df["text"] = corpus_df["text"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else "")
    # Query values are used as-is (no list unwrapping)
    queries_df = pd.DataFrame(list(queries.items()), columns=["id", "text"])
    plot_count_words(corpus_df, queries_df)


Starting number of documents: 2066 and number of queries: 421
After reducing: number of documents: 101 and number of queries: 126

Max word count: 1586
Average word count: 685.39
Max word count in queries: 35
Average word count in queries: 13.89



Starting number of documents: 2789 and number of queries: 1147
After reducing: number of documents: 247 and number of queries: 344

Max word count: 1661
Average word count: 685.09
Max word count in queries: 43
Average word count in queries: 16.71



Starting number of documents: 10475 and number of queries: 974
After reducing: number of documents: 876 and number of queries: 292

Max word count: 2654
Average word count: 474.03
Max word count in queries: 46
Average word count in queries: 17.97



Starting number of documents: 2756 and number of queries: 1663
After reducing: number of documents: 248 and number of queries: 498

Max word count: 1100
Average word count: 287.44
Max word count in queries: 32
Average word count in queries: 12.35


Check the table presence in the datasets

In [4]:
# Report, for each dataset, how many corpus documents contain 1, 2, 3, 4 or more than 4 tables
check_table_presence(dataset_manager)

Dataset: ConvFinQA has 101 documents in corpus
	101 documents with 1 table
            0 documents with 2 tables
            0 documents with 3 tables
            0 documents with 4 tables 
            0 documents with more than 4 tables

Dataset: FinQA has 247 documents in corpus
	247 documents with 1 table
            0 documents with 2 tables
            0 documents with 3 tables
            0 documents with 4 tables 
            0 documents with more than 4 tables

Dataset: MultiHeritt has 876 documents in corpus
	491 documents with 1 table
            46 documents with 2 tables
            7 documents with 3 tables
            6 documents with 4 tables 
            1 documents with more than 4 tables

Dataset: TATQA has 248 documents in corpus
	248 documents with 1 table
            0 documents with 2 tables
            0 documents with 3 tables
            0 documents with 4 tables 
            0 documents with more than 4 tables



## Load the sentence transformer all-MiniLM-L6-v2

In [5]:
# Wrap the all-MiniLM-L6-v2 tokenizer and model in the project's Tokenizer / Embedder
# helpers and bundle them in a DataHandler; this single object is passed to every
# RAGPipeline created below.
text_processor = DataHandler(Tokenizer(AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")),
                             Embedder(AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")))
# Passed to pipeline.populate_vector_store below; 384 is the embedding size listed on the
# all-MiniLM-L6-v2 model card.
EMBEDDING_DIM = 384
# Passed as chunk size to manage_corpus and as the size argument of embed_queries below;
# 256 is the input length stated on the model card.
MODEL_INPUT_SIZE = 256

In [6]:
# Folder where the experiment cells below write their metric tables (df_results_*.xlsx)
# and from which the "Final results" section reads them back.
results_folder = "../results"
# Create the folder up front: DataFrame.to_excel does not create missing directories, so
# without this a fresh checkout would fail only after a full experiment run has finished.
os.makedirs(results_folder, exist_ok=True)

## Table summaries

### Experiments removing the tables from the text

#### No summary (baseline)

In [7]:
# Experiment without tables and without summarization (_nt_ns: no tables, no summary).
# NOTE: unlike the "keeping the tables" experiments further down, the results of this
# cell are only kept in memory (no to_excel call).
# Maps dataset name -> value returned by pipeline.evaluate
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A fresh pipeline per dataset; the tokenizer/embedder pair (text_processor) is shared
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # remove_tables=True: this is the variant with the tables taken out of the text
    pipeline.manage_corpus(chunk_size=MODEL_INPUT_SIZE, remove_tables=True)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # Overwritten on every iteration: after the loop this holds the last dataset only
    best_chunks_nt_ns = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_nt_ns)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# Transpose so that each dataset becomes one row; split_column_metrics (from lib) then
# post-processes the metric columns — the displayed table has ndcg/recall/mrr at 5 and 10.
df_results_nt_ns = pd.DataFrame(experiment_results).T
df_results_nt_ns = split_column_metrics(df_results_nt_ns)
df_results_nt_ns

Token indices sequence length is longer than the specified maximum sequence length for this model (837 > 512). Running this sequence through the model will result in indexing errors


Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.5113,0.5286,0.6429,0.6984,0.4676,0.4744
1,FinQA,0.4265,0.4579,0.5116,0.6105,0.3983,0.4109
2,MultiHeritt,0.2308,0.2585,0.0828,0.1178,0.0529,0.0576
3,TATQA,0.3594,0.383,0.4438,0.5161,0.3312,0.341


#### Short Summary

In [8]:
# Experiment with LLM-generated short table summaries, with the table text removed
# (_nt_ss: no tables, short summary).
# NOTE: the results of this cell are only kept in memory (no to_excel call).
# Maps dataset name -> value returned by pipeline.evaluate
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A fresh pipeline per dataset; the tokenizer/embedder pair (text_processor) is shared
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # Summaries are read from a per-dataset .npy file stored next to the data;
    # they are loaded before manage_corpus is called.
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_short.npy')
    # remove_tables=True: the original tables are taken out of the text
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=True)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # Overwritten on every iteration: after the loop this holds the last dataset only
    best_chunks_nt_ss = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_nt_ss)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# Transpose so that each dataset becomes one row, then post-process the metric columns
df_results_nt_ss = pd.DataFrame(experiment_results).T
df_results_nt_ss = split_column_metrics(df_results_nt_ss)
df_results_nt_ss

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.6869,0.6997,0.8095,0.8492,0.6463,0.6516
1,FinQA,0.5857,0.6133,0.6744,0.7587,0.5558,0.5675
2,MultiHeritt,0.3201,0.343,0.1113,0.1402,0.0739,0.0778
3,TATQA,0.4674,0.5036,0.5843,0.6968,0.4281,0.443


#### Long summary

In [9]:
# Experiment with LLM-generated long table summaries, with the table text removed
# (_nt_ls: no tables, long summary).
# NOTE: the results of this cell are only kept in memory (no to_excel call).
# Maps dataset name -> value returned by pipeline.evaluate
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A fresh pipeline per dataset; the tokenizer/embedder pair (text_processor) is shared
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # Summaries are read from a per-dataset .npy file stored next to the data;
    # they are loaded before manage_corpus is called.
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_long.npy')
    # remove_tables=True: the original tables are taken out of the text
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=True)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # Overwritten on every iteration: after the loop this holds the last dataset only
    best_chunks_nt_ls = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_nt_ls)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# Transpose so that each dataset becomes one row, then post-process the metric columns
df_results_nt_ls = pd.DataFrame(experiment_results).T
df_results_nt_ls = split_column_metrics(df_results_nt_ls)
df_results_nt_ls

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.7139,0.7297,0.8333,0.881,0.6741,0.6809
1,FinQA,0.58,0.6093,0.6831,0.7762,0.5453,0.5569
2,MultiHeritt,0.3283,0.3585,0.1144,0.1524,0.0768,0.082
3,TATQA,0.4789,0.5067,0.5763,0.6606,0.4463,0.4582


### Experiments keeping the tables in the text

#### No summary (baseline)

In [11]:
# Experiment with the tables kept in the text and no LLM table summaries
# (_ns: no summarization). This is the baseline for the "keeping the tables" group.
# Maps dataset name -> value returned by pipeline.evaluate
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME}\tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A fresh pipeline per dataset; the tokenizer/embedder pair (text_processor) is shared
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # remove_tables is not passed here, so manage_corpus uses its default
    # (presumably False, i.e. tables stay in the text — confirm in lib.RAG_pipeline)
    pipeline.manage_corpus(chunk_size=MODEL_INPUT_SIZE)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # Overwritten on every iteration: after the loop this holds the last dataset only
    best_chunks_ns = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_ns)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# Transpose so that each dataset becomes one row, then post-process the metric columns
df_results_ns = pd.DataFrame(experiment_results).T
df_results_ns = split_column_metrics(df_results_ns)
# Persist the table so the "Final results" section can reload it without re-running this cell
df_results_ns.to_excel(f"{results_folder}/df_results_ns.xlsx", index=False)
df_results_ns

Dataset: ConvFinQA	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.6203,0.6482,0.754,0.8413,0.5761,0.5874
1,FinQA,0.5292,0.565,0.6453,0.7587,0.4903,0.5046
2,MultiHeritt,0.3549,0.3808,0.1228,0.1602,0.0859,0.0908
3,TATQA,0.4767,0.5106,0.5904,0.6928,0.4394,0.4537


#### Short Summary

In [12]:
# Experiment with LLM-generated short table summaries, tables kept in the text
# (_ss: short summary).
# Maps dataset name -> value returned by pipeline.evaluate
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A fresh pipeline per dataset; the tokenizer/embedder pair (text_processor) is shared
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # Summaries are read from a per-dataset .npy file stored next to the data;
    # they are loaded before manage_corpus is called.
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_short.npy')
    # remove_tables=False: the original tables stay in the text alongside the summaries
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=False)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # Overwritten on every iteration: after the loop this holds the last dataset only
    best_chunks_ss = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_ss)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# Transpose so that each dataset becomes one row, then post-process the metric columns
df_results_ss = pd.DataFrame(experiment_results).T
df_results_ss = split_column_metrics(df_results_ss)
# Persist the table so the "Final results" section can reload it without re-running this cell
df_results_ss.to_excel(f"{results_folder}/df_results_ss.xlsx", index=False)
df_results_ss

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.6936,0.7198,0.8175,0.8968,0.652,0.6631
1,FinQA,0.6129,0.6418,0.7238,0.814,0.5757,0.5876
2,MultiHeritt,0.384,0.4109,0.132,0.1677,0.0919,0.0966
3,TATQA,0.5032,0.5391,0.6225,0.7329,0.4633,0.4782


#### Long summary

In [13]:
# Experiment with LLM-generated long table summaries, tables kept in the text
# (_ls: long summary).
# Maps dataset name -> value returned by pipeline.evaluate
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A fresh pipeline per dataset; the tokenizer/embedder pair (text_processor) is shared
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # Summaries are read from a per-dataset .npy file stored next to the data;
    # they are loaded before manage_corpus is called.
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_long.npy')
    # remove_tables=False: the original tables stay in the text alongside the summaries
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=False)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # Overwritten on every iteration: after the loop this holds the last dataset only
    best_chunks_ls = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_ls)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# Transpose so that each dataset becomes one row, then post-process the metric columns
df_results_ls = pd.DataFrame(experiment_results).T
df_results_ls = split_column_metrics(df_results_ls)
# Persist the table so the "Final results" section can reload it without re-running this cell
df_results_ls.to_excel(f"{results_folder}/df_results_ls.xlsx", index=False)
df_results_ls

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.711,0.7348,0.8333,0.9048,0.6698,0.6801
1,FinQA,0.602,0.6324,0.718,0.814,0.563,0.5752
2,MultiHeritt,0.3976,0.4234,0.1348,0.1748,0.0965,0.1017
3,TATQA,0.5124,0.5489,0.6225,0.7369,0.4758,0.4906


## Query expansion
All query-expansion experiments use the best setting found above: long table summaries, with the original tables kept in the text.

#### Rephrase the original query

In [14]:
# Experiment with query expansion by rephrasing, on top of the long table summaries with
# the tables kept in the text (_qr: query rephrase).
# The results are stored under their own names (best_chunks_qr / df_results_qr) so that
# the long-summary results computed earlier (best_chunks_ls / df_results_ls) are not
# overwritten in the kernel.
# Maps dataset name -> value returned by pipeline.evaluate
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A fresh pipeline per dataset; the tokenizer/embedder pair (text_processor) is shared
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_long.npy')
    # Pre-computed rephrased queries; the second argument (10) is forwarded unchanged.
    # NOTE(review): its meaning is defined in lib.RAG_pipeline and not visible here — confirm.
    pipeline.load_query_expansion(f'../data/{DATASET_NAME}/queries_expanded_{DATASET_NAME}_rephrase.npy', 10)
    # remove_tables=False: the original tables stay in the text alongside the summaries
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=False)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    # NOTE(review): mode='trunk' presumably selects truncation of over-long expanded
    # queries — confirm against RAGPipeline.embed_queries.
    pipeline.embed_queries(MODEL_INPUT_SIZE, mode='trunk')

    # Overwritten on every iteration: after the loop this holds the last dataset only
    best_chunks_qr = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_qr)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# Transpose so that each dataset becomes one row, then post-process the metric columns
df_results_qr = pd.DataFrame(experiment_results).T
df_results_qr = split_column_metrics(df_results_qr)
# Persist the table so the "Final results" section can reload it without re-running this cell
df_results_qr.to_excel(f"{results_folder}/df_results_qr.xlsx", index=False)
df_results_qr

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.7445,0.7554,0.8492,0.881,0.7086,0.7134
1,FinQA,0.576,0.6088,0.6919,0.7936,0.5374,0.5509
2,MultiHeritt,0.4067,0.4259,0.1384,0.1696,0.0987,0.103
3,TATQA,0.4935,0.5302,0.6004,0.7149,0.4582,0.4731


#### Q2D: expand each query with a pseudo-document answer

In [15]:
# Experiment with Q2D query expansion (a pseudo-document answer per query), on top of the
# long table summaries with the tables kept in the text (_q2d).
# The results are stored under their own names (best_chunks_q2d / df_results_q2d) so that
# the long-summary results computed earlier (best_chunks_ls / df_results_ls) are not
# overwritten in the kernel.
# Maps dataset name -> value returned by pipeline.evaluate
experiment_results = {}
for DATASET_NAME in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(DATASET_NAME)
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)
    print(f"Dataset: {DATASET_NAME} \tNumber of queries: {len(queries)} and Number of documents: {len(corpus)}")

    # A fresh pipeline per dataset; the tokenizer/embedder pair (text_processor) is shared
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    pipeline.load_table_summaries(f'../data/{DATASET_NAME}/table_summaries_{DATASET_NAME}_long.npy')
    # Pre-computed Q2D expansions; the second argument (10) is forwarded unchanged.
    # NOTE(review): its meaning is defined in lib.RAG_pipeline and not visible here — confirm.
    pipeline.load_query_expansion(f'../data/{DATASET_NAME}/queries_expanded_{DATASET_NAME}_Q2D.npy', 10)
    # remove_tables=False: the original tables stay in the text alongside the summaries
    pipeline.manage_corpus(MODEL_INPUT_SIZE, remove_tables=False)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    # NOTE(review): mode='trunk' presumably selects truncation of over-long expanded
    # queries — confirm against RAGPipeline.embed_queries.
    pipeline.embed_queries(MODEL_INPUT_SIZE, mode='trunk')

    # Overwritten on every iteration: after the loop this holds the last dataset only
    best_chunks_q2d = pipeline.retrieve()
    evaluation = pipeline.evaluate(best_chunks_q2d)
    experiment_results[DATASET_NAME] = evaluation
    print("\tFinished evaluation!")
# Transpose so that each dataset becomes one row, then post-process the metric columns
df_results_q2d = pd.DataFrame(experiment_results).T
df_results_q2d = split_column_metrics(df_results_q2d)
# Persist the table so the "Final results" section can reload it without re-running this cell
df_results_q2d.to_excel(f"{results_folder}/df_results_q2d.xlsx", index=False)
df_results_q2d

Dataset: ConvFinQA 	Number of queries: 126 and Number of documents: 101
	Finished evaluation!
Dataset: FinQA 	Number of queries: 344 and Number of documents: 247
	Finished evaluation!
Dataset: MultiHeritt 	Number of queries: 292 and Number of documents: 876
	Finished evaluation!
Dataset: TATQA 	Number of queries: 498 and Number of documents: 248
	Finished evaluation!


Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.7226,0.7412,0.8175,0.873,0.6901,0.6982
1,FinQA,0.5666,0.5989,0.6744,0.7762,0.5308,0.5437
2,MultiHeritt,0.3864,0.4142,0.1331,0.1733,0.0942,0.0996
3,TATQA,0.4746,0.5099,0.5944,0.7048,0.435,0.4494


## Final results

In [None]:
# Reload the metric tables written by the experiment cells above, so this section can be
# run without recomputing the experiments (the .xlsx files must already exist).
# Only the runs that keep the tables in the text are available on disk; the table-removal
# runs (_nt_*) were never saved.
df_results_ns = pd.read_excel(f"{results_folder}/df_results_ns.xlsx")    # no summary (baseline)
df_results_ss = pd.read_excel(f"{results_folder}/df_results_ss.xlsx")    # short summary
df_results_ls = pd.read_excel(f"{results_folder}/df_results_ls.xlsx")    # long summary
df_results_qr = pd.read_excel(f"{results_folder}/df_results_qr.xlsx")    # long summary + query rephrase
df_results_q2d = pd.read_excel(f"{results_folder}/df_results_q2d.xlsx")  # long summary + Q2D

In [49]:
# Sanity check of the reloaded baseline table (tables kept in the text, no summary)
df_results_ns

Unnamed: 0,dataset,ndcg_5,ndcg_10,recall_5,recall_10,mrr_5,mrr_10
0,ConvFinQA,0.6203,0.6482,0.754,0.8413,0.5761,0.5874
1,FinQA,0.5292,0.565,0.6453,0.7587,0.4903,0.5046
2,MultiHeritt,0.3549,0.3808,0.1228,0.1602,0.0859,0.0908
3,TATQA,0.4767,0.5106,0.5904,0.6928,0.4394,0.4537


Let's see the table handling approaches

In [46]:
# Compare the three table-handling variants that keep the tables in the text:
# no summary, short summary and long summary
plot_table_results(df_results_ns, df_results_ss, df_results_ls)

The long summary is the best approach overall!
Only FinQA prefers the short summaries.

In [None]:
# Same comparison, extended with the two query-expansion runs (query rephrase and Q2D)
plot_table_results_all(df_results_ns, df_results_ss, df_results_ls, df_results_qr, df_results_q2d)

Plot the best overall approach, which appears to be the long summary combined with query rephrasing, to compare its performance across the datasets.

In [51]:
# Metrics of the long-summary + query-rephrase run only; the string is passed as the
# second argument (presumably used as the plot label — confirm in the lib helper)
plot_one_approach(df_results_qr, "LS + Query Rephrase")