In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from dataset import *
from data_handler import *
from embeddings import *
from vector_store import *

In [3]:
# Name of the dataset this notebook works on; it is also reused further down
# when building the vector-store save path.
DATASET_NAME = "ConvFinQA"

# Dataset manager constructed with the relative "../data" path.
dataset_manager = FinanceRAGDataset("../data")

# Show which datasets the manager reports.
available_datasets = dataset_manager.list_datasets()
print("Available datasets:", available_datasets)

# Load the selected dataset; the call returns three objects that are unpacked
# as corpus, queries and qrels (query/document relevance pairs).
(
    convfinqa_corpus,
    convfinqa_queries,
    convfinqa_qrels,
) = dataset_manager.load_dataset(DATASET_NAME)

Available datasets: ['ConvFinQA', 'FinQA', 'MultiHeritt', 'TATQA']


In [None]:
# Extract tables from the corpus: map every document id to whatever
# `extract_tables` returns for that document's text.
tables = {
    document['_id']: extract_tables(document['text'])
    for document in convfinqa_corpus
}

# # Save
# np.save('extracted_tables.npy', tables) 
# # Load
# read_dictionary = np.load('extracted_tables.npy',allow_pickle='TRUE').item()
# print(read_dictionary['dd4bff516'][0])

In [None]:
# Build the FinBERT text pipeline: a project `Tokenizer` wrapping the
# Hugging Face tokenizer and a project `Embedder` wrapping the model.
finbert_tokenizer = Tokenizer(AutoTokenizer.from_pretrained("ProsusAI/finbert"))
finbert_embedder = Embedder(AutoModel.from_pretrained("ProsusAI/finbert"))
text_processor = DataHandler(finbert_tokenizer, finbert_embedder)

# Only docs with labels: keep the corpus documents whose id appears in the
# qrels "corpus_id" column.
labelled_doc_ids = np.unique(convfinqa_qrels["corpus_id"])
doc_ids = np.array([document["_id"] for document in convfinqa_corpus])
doc_texts = np.array([document["text"] for document in convfinqa_corpus])
is_labelled = np.isin(doc_ids, labelled_doc_ids)
doc_ids, doc_texts = doc_ids[is_labelled], doc_texts[is_labelled]
corpus = dict(zip(doc_ids, doc_texts))


def embed_corpus_text(raw_text):
    """Run one corpus document through the cleaning + FinBERT pipeline.

    The text is loaded into `text_processor`, normalised step by step,
    tokenized, embedded, and the resulting data is returned via `get_data()`.

    NOTE(review): this pipeline calls `.remove_numbers()`, whereas the query
    pipeline later in this notebook does not -- confirm the asymmetry between
    document and query preprocessing is intentional.
    NOTE(review): the recorded run of this cell emitted a tokenizer warning
    about a sequence longer than the model maximum (818 > 512) -- confirm long
    documents are chunked or truncated before embedding.
    """
    return (
        text_processor.load_data(raw_text)
        .to_lowercase()
        .remove_punctuation()
        .remove_numbers()
        .remove_stopwords()
        .lemmatize_text()
        .remove_extra_whitespace()
        .tokenize()
        .embed()
        .get_data()
    )


# Document id -> embedding output for every labelled document.
corpus_embedding = {
    doc_id: embed_corpus_text(doc_text)
    for doc_id, doc_text in corpus.items()
}

# All documents
# corpus_embedding = {}
# for idx, documnet in enumerate(convfinqa_corpus[:50]):
#     print(idx)
#     embedded_doc = (
#         text_processor.load_data(documnet['text'])
#         .to_lowercase()
#         .remove_punctuation()
#         .remove_stopwords()
#         .lemmatize_text()
#         .remove_extra_whitespace()
#         .tokenize()
#         .embed()
#         .get_data()
#     )
#     corpus_embedding[documnet['_id']] = embedded_doc

Token indices sequence length is longer than the specified maximum sequence length for this model (818 > 512). Running this sequence through the model will result in indexing errors


In [75]:
# Dimension passed to the FAISS store for the FinBERT embeddings.
EMBEDDING_DIM = 768
vector_store = FaissVectorStore(dimension=EMBEDDING_DIM)

# Populate the vector store. Every entry of a document's embedding is added
# under the same document id, with its position recorded in the metadata
# under the 'chunk' key.
# (presumably each embedding is a (n_chunks, EMBEDDING_DIM) array -- TODO confirm)
for doc_id, chunk_vectors in corpus_embedding.items():
    n_chunks = len(chunk_vectors)
    vector_store.add_embeddings(
        np.array(chunk_vectors),
        [doc_id] * n_chunks,
        [{'chunk': chunk_no} for chunk_no in range(n_chunks)],
    )

# Save the vector store to a file named after the dataset.
vector_store.save(f'../vector_store/{DATASET_NAME}')

In [76]:
# Test the retrieval

# Collect the queries, keeping only those whose id appears in the qrels
# "query_id" column, and index their text by query id.
labelled_query_ids = np.unique(convfinqa_qrels["query_id"])
query_ids = np.array([entry["_id"] for entry in convfinqa_queries])
query_texts = np.array([entry["text"] for entry in convfinqa_queries])
mask_queries = np.isin(query_ids, labelled_query_ids)
query_ids, query_texts = query_ids[mask_queries], query_texts[mask_queries]
queries = dict(zip(query_ids, query_texts))

In [78]:
def embed_query_text(raw_text):
    """Run one query through the cleaning + embedding pipeline.

    The text is loaded into `text_processor`, normalised step by step,
    tokenized, embedded, and the resulting data is returned via `get_data()`.

    NOTE(review): unlike the corpus pipeline earlier in this notebook, numbers
    are NOT removed here -- confirm the asymmetry is intentional.
    """
    return (
        text_processor.load_data(raw_text)
        .to_lowercase()
        .remove_punctuation()
        .remove_stopwords()
        .lemmatize_text()
        .remove_extra_whitespace()
        .tokenize()
        .embed()
        .get_data()
    )


# Query id -> embedding output for every labelled query.
queries_embedding = {
    query_id: embed_query_text(query_text)
    for query_id, query_text in queries.items()
}

In [83]:
# Retrieve the 5 nearest stored vectors for every query and remember the
# document ids that came back.
best_match = {}
for query_id, query in queries_embedding.items():
    distances, indices, result_texts, result_metadata = vector_store.similarity_search(
        query,
        k=5
    )
    print(f"Query: {query_id}")
    print(f"Best match: {result_texts}")
    print(f"Distance: {distances}")
    print(f"indices: {indices}")
    best_match[query_id] = result_texts

# Materialise the qrels columns as plain arrays so that the lookup below is
# purely positional (no dependence on a pandas index) and works for any
# array-like column type.
qrels_query_ids = np.asarray(convfinqa_qrels['query_id'])
qrels_corpus_ids = np.asarray(convfinqa_qrels['corpus_id'])

# Count the queries for which at least one relevant document is among the
# retrieved ids.
# The previous `np.argmax(mask)` lookup returned position 0 when no qrels row
# matched (scoring the query against an unrelated document) and only ever
# looked at the first relevant document of a query; selecting with the boolean
# mask handles zero, one, or many relevant documents.
# NOTE(review): every qrels row is treated as relevant; if the qrels carry a
# score column with non-relevant rows, filter on it here -- TODO confirm.
n_match = 0
for query_id, result in best_match.items():
    relevant_doc_ids = qrels_corpus_ids[qrels_query_ids == query_id]
    if any(doc_id in result for doc_id in relevant_doc_ids):
        n_match += 1

n_match

Query: qd496f102
Best match: ['dd497692a', 'dd4970aa2', 'dd4980376', 'dd4987f04', 'dd49763da']
Distance: [[82.24646  82.74661  85.83011  87.57293  88.800026]]
indices: [[ 3 14 62 18 98]]
Query: qd49859fc
Best match: ['dd497692a', 'dd4980376', 'dd4970aa2', 'dd4987f04', 'dd496d30c']
Distance: [[ 78.655426  81.87396  100.3492   110.28862  115.15927 ]]
indices: [[ 3 62 14 18 86]]
Query: qd498d6fc
Best match: ['dd4970aa2', 'dd497692a', 'dd49763da', 'dd4980376', 'dd4987f04']
Distance: [[115.74117  116.29761  118.263916 119.39206  120.51479 ]]
indices: [[14  3 98 62 18]]
Query: qd496d118
Best match: ['dd4980376', 'dd498a524', 'dd497692a', 'dd4970aa2', 'dd4980772']
Distance: [[69.09505  75.395096 83.41676  84.371574 84.49885 ]]
indices: [[ 62 133   3  14  31]]
Query: qd498f394
Best match: ['dd49870ae', 'dd4980376', 'dd49763da', 'dd497e602', 'dd498a524']
Distance: [[ 88.61063 108.28818 114.49191 119.74736 129.94211]]
indices: [[ 20  62  98 121 133]]
Query: qd4980326
Best match: ['dd497692a', 'd

7

### all-MiniLM-L6-v2

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for this section.
# NOTE(review): `similarity_fn_name='cosine'` configures the model's own
# similarity function; which metric `FaissVectorStore` uses is not visible
# here -- confirm the two agree (or normalise the vectors).
model = SentenceTransformer('all-MiniLM-L6-v2', similarity_fn_name='cosine')

# Document id -> embedding of the raw (unprocessed) document text.
corpus_embedding = {
    doc_id: model.encode(doc_text)
    for doc_id, doc_text in corpus.items()
}


# Dimension passed to the FAISS store for the all-MiniLM-L6-v2 embeddings.
EMBEDDING_DIM = 384
vector_store = FaissVectorStore(dimension=EMBEDDING_DIM)

# Populate the vector store: one vector per document, keyed by document id.
for doc_id, doc_vector in corpus_embedding.items():
    vector_store.add_embeddings([np.array(doc_vector)], [doc_id])

# Save the vector store to file.
# NOTE(review): this is the same path the FinBERT cell above saves to, so the
# earlier file is replaced -- confirm that is intended.
vector_store.save(f'../vector_store/{DATASET_NAME}')

# model = SentenceTransformer('all-MiniLM-L6-v2')
# text_processor = DataHandler(Tokenizer(AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")),
#                              SentenceTransformer('all-MiniLM-L6-v2', similarity_fn_name='cosine'))
# corpus_embedding = {}
# for idx, text in corpus.items():
#     embedded_doc = (
#         text_processor.load_data(text)
#         .to_lowercase()
#         .remove_punctuation()
#         .remove_stopwords()
#         .lemmatize_text()
#         .remove_extra_whitespace()
#         .tokenize()
#         .embed()
#         .get_data()
#     )
#     # embedded_doc = model.encode(text)
#     corpus_embedding[idx] = embedded_doc

# EMBEDDING_DIM = 384
# vector_store = FaissVectorStore(dimension=EMBEDDING_DIM)
# # Populate vector store
# for i, doc in corpus_embedding.items():
#     # Add vectors to store
#     vector_store.add_embeddings(np.array(doc), [i for e in range(len(doc))], [{'chunk': e} for e in range(len(doc))])
# # Save vector store to file
# vector_store.save(f'../vector_store/{DATASET_NAME}')

In [None]:
# Get the queries: keep only those whose id appears in the qrels "query_id"
# column and index their text by query id.
labelled_query_ids = np.unique(convfinqa_qrels["query_id"])
query_ids = np.array([entry["_id"] for entry in convfinqa_queries])
query_texts = np.array([entry["text"] for entry in convfinqa_queries])
mask_queries = np.isin(query_ids, labelled_query_ids)
query_ids, query_texts = query_ids[mask_queries], query_texts[mask_queries]
queries = dict(zip(query_ids, query_texts))

In [None]:
# Query id -> embedding of the raw query text, produced by `model.encode`.
queries_embedding = {
    query_id: model.encode(query_text)
    for query_id, query_text in queries.items()
}

In [None]:
# Retrieve the 5 nearest stored vectors for every query and remember the
# document ids that came back.
best_match = {}
for query_id, query in queries_embedding.items():
    distances, indices, result_texts, result_metadata = vector_store.similarity_search(
        query,
        k=5
    )
    print(f"Query: {query_id}")
    print(f"Best match: {result_texts}")
    print(f"Distance: {distances}")
    print(f"indices: {indices}")

    best_match[query_id] = result_texts

# Materialise the qrels columns as plain arrays so that the lookup below is
# purely positional (no dependence on a pandas index) and works for any
# array-like column type.
qrels_query_ids = np.asarray(convfinqa_qrels['query_id'])
qrels_corpus_ids = np.asarray(convfinqa_qrels['corpus_id'])

# Count the queries for which at least one relevant document is among the
# retrieved ids.
# The previous `np.argmax(mask)` lookup returned position 0 when no qrels row
# matched (scoring the query against an unrelated document) and only ever
# looked at the first relevant document of a query; selecting with the boolean
# mask handles zero, one, or many relevant documents.
# NOTE(review): every qrels row is treated as relevant; if the qrels carry a
# score column with non-relevant rows, filter on it here -- TODO confirm.
n_match = 0
for query_id, result in best_match.items():
    relevant_doc_ids = qrels_corpus_ids[qrels_query_ids == query_id]
    if any(doc_id in result for doc_id in relevant_doc_ids):
        n_match += 1
n_match