In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from dataset import *
from data_handler import *
from embeddings import *
from vector_store import *

In [2]:
# Dataset manager rooted at ../data; show which datasets it reports.
dataset_manager = FinanceRAGDataset("../data")
available_datasets = dataset_manager.list_datasets()
print("Available datasets:", available_datasets)

# Choose one dataset by name and unpack the three parts load_dataset returns
# (named here as corpus, queries and qrels).
DATASET_NAME = "ConvFinQA"
(
    convfinqa_corpus,
    convfinqa_queries,
    convfinqa_qrels,
) = dataset_manager.load_dataset(DATASET_NAME)

Available datasets: ['ConvFinQA', 'FinQA', 'MultiHeritt', 'TATQA']


In [None]:
# Build a lookup from each corpus document's '_id' to whatever
# extract_tables returns for that document's 'text'.
tables = {
    entry['_id']: extract_tables(entry['text'])
    for entry in convfinqa_corpus
}

# Optional persistence, left disabled:
# np.save('extracted_tables.npy', tables)
# read_dictionary = np.load('extracted_tables.npy', allow_pickle='TRUE').item()
# print(read_dictionary['dd4bff516'][0])

In [3]:
# Tokenizer and encoder are both loaded from the same "ProsusAI/finbert" checkpoint
# and wrapped in the project's Tokenizer / Embedder before being handed to DataHandler.
finbert_tokenizer = Tokenizer(AutoTokenizer.from_pretrained("ProsusAI/finbert"))
finbert_embedder = Embedder(AutoModel.from_pretrained("ProsusAI/finbert"))
text_processor = DataHandler(finbert_tokenizer, finbert_embedder)

# Maps each document '_id' to the output of the processing chain below.
embedding = {}
for idx, document in enumerate(convfinqa_corpus):
    # Progress indicator: index of the document being processed.
    print(idx)

    # Run the chain one step at a time, in the same order as a fluent call:
    # normalise text, then tokenize, then embed, then pull out the result.
    stage = text_processor.load_data(document['text'])
    stage = stage.to_lowercase()
    stage = stage.remove_punctuation()
    stage = stage.remove_stopwords()
    stage = stage.lemmatize_text()
    stage = stage.remove_extra_whitespace()
    stage = stage.tokenize()
    stage = stage.embed()

    embedding[document['_id']] = stage.get_data()

Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [6]:
# Dimensionality passed to the store.
# NOTE(review): presumably the hidden size of the FinBERT encoder — confirm.
EMBEDDING_DIM = 768
vector_store = FaissVectorStore(dimension=EMBEDDING_DIM)

# One add_embeddings call per document. Alongside the vectors go two parallel
# lists of the same length: the document id repeated for every vector, and a
# metadata dict recording each vector's position within the document.
for doc_id, chunk_vectors in embedding.items():
    chunk_count = len(chunk_vectors)
    vector_store.add_embeddings(
        np.array(chunk_vectors),
        [doc_id] * chunk_count,
        [{'chunk': chunk_no} for chunk_no in range(chunk_count)],
    )

# Persist the populated store under a path named after the dataset.
vector_store.save(f'../vector_store/{DATASET_NAME}')

In [17]:

# Simulate a query embedding.
# A seeded generator makes the printed neighbours reproducible on
# "Restart Kernel -> Run All"; float32 is the dtype FAISS indexes work with,
# so the query is created in that dtype rather than NumPy's default float64.
QUERY_SEED = 42
rng = np.random.default_rng(QUERY_SEED)
query_embedding = rng.random((1, EMBEDDING_DIM), dtype=np.float32)

# Perform similarity search for the 2 nearest stored vectors.
# NOTE(review): the only metadata key written when the store is populated is
# 'chunk', and the recorded output shows hits whose metadata has no 'category'
# key, so this filter does not appear to restrict anything. Confirm the
# intended filter (or drop the argument) against FaissVectorStore.similarity_search.
distances, indices, result_texts, result_metadata = vector_store.similarity_search(
    query_embedding,
    k=2,
    filter_metadata={"category": "AI"}
)

# Print results: distances is indexed by query row (single query -> row 0),
# while texts and metadata are iterated as returned.
for dist, text, meta in zip(distances[0], result_texts, result_metadata):
    print(f"Distance: {dist}")
    print(f"Text: {text}")
    print(f"Metadata: {meta}")
    print("---")

# Save vector store
# vector_store.save(f'../vector_store/{DATASET_NAME}')

# Load vector store
# loaded_store = FaissVectorStore.load(f'../vector_store/{DATASET_NAME}', dimension=EMBEDDING_DIM)

Distance: 445.6521911621094
Text: dd4bff516
Metadata: {'chunk': 1}
---
Distance: 451.95758056640625
Text: dd4b93b5e
Metadata: {'chunk': 1}
---
