In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch
from dataset import *
from data_handler import *
from embeddings import *
from vector_store import *

In [80]:
dataset_manager = FinanceRAGDataset("../data")

# List available datasets
print("Available datasets:", dataset_manager.list_datasets())

# Load corpus and queries from a specific dataset
DATASET_NAME = "TATQA"
convfinqa_corpus, convfinqa_queries, convfinqa_qrels = dataset_manager.load_dataset(DATASET_NAME)

Available datasets: ['ConvFinQA', 'FinQA', 'MultiHeritt', 'TATQA']


In [81]:
# Only docs with labels
doc_ids = np.array([query["_id"] for query in convfinqa_corpus])
doc_texts = np.array([query["text"] for query in convfinqa_corpus])
mask_queries = np.isin(doc_ids, np.unique(convfinqa_qrels["corpus_id"]))
doc_ids = doc_ids[mask_queries]
doc_texts = [[doc] for doc in doc_texts[mask_queries]]
corpus = dict(zip(doc_ids, doc_texts))

# Get the queries
query_ids = np.array([query["_id"] for query in convfinqa_queries])
query_texts = np.array([query["text"] for query in convfinqa_queries])
mask_queries = np.isin(query_ids, np.unique(convfinqa_qrels["query_id"]))
query_ids = query_ids[mask_queries]
query_texts = query_texts[mask_queries]
queries = dict(zip(query_ids, query_texts))

In [82]:
print(len(corpus), len(queries))

248 498


In [83]:
# # Extract tables from the corpus
# tables = {}
# for id, doc in corpus.items():
#     tables[id] = extract_tables(doc[0])

# # Save
# np.save(f'extracted_tables_{DATASET_NAME}.npy', tables) 
# # Load
# # read_dictionary = np.load('extracted_tables_{DATASET_NAME}.npy',allow_pickle='TRUE').item()

In [53]:
# Load extracted tables
table_summaries = np.load('table_summaries.npy', allow_pickle='TRUE').item()
# Add table summaries to corpus
for idx, text in table_summaries.items():
    corpus[idx].append(text)

In [None]:
text_processor = DataHandler(Tokenizer(AutoTokenizer.from_pretrained("ProsusAI/finbert")),
                             Embedder(AutoModel.from_pretrained("ProsusAI/finbert")))

corpus_embedding = {}
for idx, text in corpus.items():
    embedded_doc = (
        text_processor.load_data(text)
        # .to_lowercase()
        # .remove_punctuation()
        # .remove_numbers()
        # .remove_stopwords()
        # .lemmatize_text()
        # .remove_extra_whitespace()
        .tokenize()
        .chunk_split(max_length=512)
        .embed(method='last_layer')
        .get_data()
    )
    corpus_embedding[idx] = embedded_doc

EMBEDDING_DIM = 768
vector_store = FaissVectorStore(dimension=EMBEDDING_DIM)
# Populate vector store
for i, doc in corpus_embedding.items():
    # Add vectors to store
    vector_store.add_embeddings(np.array(doc), [i for e in range(len(doc))], [{'chunk': e} for e in range(len(doc))])
# Save vector store to file
vector_store.save(f'../vector_store/{DATASET_NAME}')

queries_embedding = {}
for idx, text in queries.items():
    embedded_query = (
        text_processor.load_data(text)
        # .to_lowercase()
        # .remove_punctuation()
        # .remove_stopwords()
        # .lemmatize_text()
        # .remove_extra_whitespace()
        .tokenize()
        .chunk_split(max_length=512)
        .embed(method='last_layer')
        .get_data()
    )
    queries_embedding[idx] = embedded_query

best_match = {}
for query_id, query in queries_embedding.items():
    distances, indices, result_texts, result_metadata = vector_store.similarity_search(
        query, 
        k=10
    )
    best_match[query_id] = result_texts

n_match = 0
for query_id, result in best_match.items():
    idx = np.argmax(convfinqa_qrels['query_id']==query_id)
    if convfinqa_qrels['corpus_id'][idx] in result:
        n_match += 1

n_match

Token indices sequence length is longer than the specified maximum sequence length for this model (760 > 512). Running this sequence through the model will result in indexing errors


4

### all-MiniLM-L6-v2

In [87]:
text_processor = DataHandler(Tokenizer(AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")),
                             Embedder(AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")))
corpus_embedding = {}
for idx, text in corpus.items():
    embedded_doc = (
        text_processor.load_data(text)
        # .to_lowercase()
        # .remove_numbers()
        # .remove_punctuation()
        # .remove_stopwords()
        # .lemmatize_text()
        # .remove_extra_whitespace()
        .tokenize()
        .chunk_split(max_length=256)
        .embed()
        .get_data()
    )
    # embedded_doc = model.encode(text)
    corpus_embedding[idx] = embedded_doc

EMBEDDING_DIM = 384
vector_store = FaissVectorStore(dimension=EMBEDDING_DIM)
# Populate vector store
for i, doc in corpus_embedding.items():
    # Add vectors to store
    vector_store.add_embeddings(np.array(doc), [i for e in range(len(doc))], [{'chunk': e} for e in range(len(doc))])
# Save vector store to file
vector_store.save(f'../vector_store/{DATASET_NAME}')

queries_embedding = {}
for idx, text in queries.items():
    embedded_query = (
        text_processor.load_data(text)
        # .to_lowercase()
        # .remove_numbers()
        # .remove_punctuation()
        # .remove_stopwords()
        # .lemmatize_text()
        # .remove_extra_whitespace()
        .tokenize()
        .chunk_split(max_length=256)
        .embed()
        .get_data()
    )
    queries_embedding[idx] = embedded_query

best_match = {}
for query_id, query in queries_embedding.items():
    distances, indices, result_texts, result_metadata = vector_store.similarity_search(
        query, 
        k=10
    )
    best_match[query_id] = result_texts

n_match = 0
for query_id, result in best_match.items():
    idx = np.argmax(convfinqa_qrels['query_id']==query_id)
    if convfinqa_qrels['corpus_id'][idx] in result:
        n_match += 1

n_match

Token indices sequence length is longer than the specified maximum sequence length for this model (878 > 512). Running this sequence through the model will result in indexing errors


331

In [88]:
n_match/len(queries)

0.6646586345381527