In [4]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch
from dataset import *
from data_handler import *
from embeddings import *
from vector_store import *
from RAG_pipeline import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Entry point to the benchmark datasets stored under ../data.
dataset_manager = FinanceRAGDataset("../data")

# Show which datasets the manager can see before running any experiment.
available_datasets = dataset_manager.list_datasets()
print("Available datasets:", available_datasets)

# To work with a single dataset, pass one of the names printed above to
# dataset_manager.load_dataset(name); it returns (corpus, queries, qrels).

Available datasets: ['ConvFinQA', 'FinQA', 'MultiHeritt', 'TATQA']


In [34]:
# Hugging Face checkpoint used for BOTH the tokenizer and the embedding model.
# Keeping the id in a single constant guarantees the two are always loaded from
# the same checkpoint when the model is swapped.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Text-processing front end handed to every RAGPipeline below.
text_processor = DataHandler(
    Tokenizer(AutoTokenizer.from_pretrained(MODEL_NAME)),
    Embedder(AutoModel.from_pretrained(MODEL_NAME)),
)

# Passed to RAGPipeline.populate_vector_store(). Presumably this must equal the
# width of the vectors produced by the embedder (384 for this checkpoint) --
# TODO confirm and update together with MODEL_NAME.
EMBEDDING_DIM = 384

# Passed to RAGPipeline.embed_corpus() / embed_queries(). Presumably the maximum
# number of tokens per model input -- TODO confirm against DataHandler.
MODEL_INPUT_SIZE = 256

In [30]:
# Experiment 1 (baseline): retrieval on every dataset WITHOUT loading any
# LLM-generated table summaries into the pipeline.
#
# The metrics are stored under a dedicated name because the other experiment
# cells in this notebook also assign `experiment_results`; a shared name would
# discard these numbers as soon as the next experiment runs.
baseline_results = {}
for dataset_name in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(dataset_name)
    # Shrinks corpus/queries using the relevance judgements; exact policy lives
    # in reduce_dataset_size (not visible here).
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)

    # A fresh pipeline per dataset so no embeddings leak between datasets.
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    pipeline.embed_corpus(MODEL_INPUT_SIZE)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # evaluate() yields the nDCG / recall @5 and @10 dictionary shown in the output.
    baseline_results[dataset_name] = pipeline.evaluate()

# Keep the original variable name bound for any code that still reads it.
experiment_results = baseline_results
baseline_results

{'ConvFinQA': {'ndcg': {'@5': 0.6203079672174378, '@10': 0.6482060685886607},
  'recall': {'@5': 0.753968253968254, '@10': 0.8412698412698413}},
 'FinQA': {'ndcg': {'@5': 0.5291651355233714, '@10': 0.565027020238846},
  'recall': {'@5': 0.6453488372093024, '@10': 0.7587209302325582}},
 'MultiHeritt': {'ndcg': {'@5': 0.35486964947430705,
   '@10': 0.3808145154850899},
  'recall': {'@5': 0.12278693528693527, '@10': 0.16019044691989898}},
 'TATQA': {'ndcg': {'@5': 0.4767318924611678, '@10': 0.5105819003850763},
  'recall': {'@5': 0.5903614457831325, '@10': 0.6927710843373494}}}

In [None]:
# Experiment 2: same retrieval pipeline, but with the SHORT LLM-generated table
# summaries loaded into the pipeline before the corpus is embedded.
#
# The metrics are stored under a dedicated name because the other experiment
# cells in this notebook also assign `experiment_results`; a shared name would
# make it impossible to compare this run with the others afterwards.
short_summary_results = {}
for dataset_name in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(dataset_name)
    # Shrinks corpus/queries using the relevance judgements; exact policy lives
    # in reduce_dataset_size (not visible here).
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)

    # A fresh pipeline per dataset so no embeddings leak between datasets.
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # Summaries must be loaded BEFORE embed_corpus so they can be part of what
    # gets embedded (ordering taken from the original cell).
    pipeline.load_table_summaries(f'../data/{dataset_name}/table_summaries_{dataset_name}_short.npy')
    pipeline.embed_corpus(MODEL_INPUT_SIZE)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # evaluate() yields the nDCG / recall @5 and @10 dictionary shown in the output.
    short_summary_results[dataset_name] = pipeline.evaluate()

# Keep the original variable name bound for any code that still reads it.
experiment_results = short_summary_results
short_summary_results

{'ConvFinQA': {'ndcg': {'@5': 0.6936408876086481, '@10': 0.7198161662557143},
  'recall': {'@5': 0.8174603174603174, '@10': 0.8968253968253969}},
 'FinQA': {'ndcg': {'@5': 0.6128869038473453, '@10': 0.6418173092154987},
  'recall': {'@5': 0.7238372093023255, '@10': 0.813953488372093}},
 'MultiHeritt': {'ndcg': {'@5': 0.3839623199665212,
   '@10': 0.41085262128752975},
  'recall': {'@5': 0.1319805099770853, '@10': 0.1676716928429257}},
 'TATQA': {'ndcg': {'@5': 0.5032479648916659, '@10': 0.5391244336550254},
  'recall': {'@5': 0.6224899598393574, '@10': 0.7329317269076305}}}

In [35]:
# Experiment 3: same retrieval pipeline, but with the LONG LLM-generated table
# summaries loaded into the pipeline before the corpus is embedded.
#
# The metrics are stored under a dedicated name because the other experiment
# cells in this notebook also assign `experiment_results`; a shared name would
# make it impossible to compare this run with the others afterwards.
#
# NOTE(review): the recorded run of this cell emitted a tokenizer warning about
# a 1801-token sequence exceeding the model maximum of 512. Confirm that the
# data handler splits or truncates inputs to MODEL_INPUT_SIZE before they reach
# the model.
long_summary_results = {}
for dataset_name in dataset_manager.list_datasets():
    corpus, queries, qrels = dataset_manager.load_dataset(dataset_name)
    # Shrinks corpus/queries using the relevance judgements; exact policy lives
    # in reduce_dataset_size (not visible here).
    corpus, queries = reduce_dataset_size(corpus, queries, qrels)

    # A fresh pipeline per dataset so no embeddings leak between datasets.
    pipeline = RAGPipeline(corpus, queries, qrels, text_processor)
    # Summaries must be loaded BEFORE embed_corpus so they can be part of what
    # gets embedded (ordering taken from the original cell).
    pipeline.load_table_summaries(f'../data/{dataset_name}/table_summaries_{dataset_name}_long.npy')
    pipeline.embed_corpus(MODEL_INPUT_SIZE)
    pipeline.populate_vector_store(EMBEDDING_DIM)
    pipeline.embed_queries(MODEL_INPUT_SIZE)

    # evaluate() yields the nDCG / recall @5 and @10 dictionary shown in the output.
    long_summary_results[dataset_name] = pipeline.evaluate()

# Keep the original variable name bound for any code that still reads it.
experiment_results = long_summary_results
long_summary_results

Token indices sequence length is longer than the specified maximum sequence length for this model (1801 > 512). Running this sequence through the model will result in indexing errors


{'ConvFinQA': {'ndcg': {'@5': 0.7110419674913173, '@10': 0.7348281191886693},
  'recall': {'@5': 0.8333333333333334, '@10': 0.9047619047619048}},
 'FinQA': {'ndcg': {'@5': 0.6019617723665021, '@10': 0.6323604451900163},
  'recall': {'@5': 0.7180232558139535, '@10': 0.813953488372093}},
 'MultiHeritt': {'ndcg': {'@5': 0.3975642943217569, '@10': 0.4233684141461655},
  'recall': {'@5': 0.13475536069714153, '@10': 0.1748048279726362}},
 'TATQA': {'ndcg': {'@5': 0.5123626228747631, '@10': 0.5488969508922538},
  'recall': {'@5': 0.6224899598393574, '@10': 0.7369477911646586}}}