In [151]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from dataset import *
from data_handler import *
from embeddings import *
from vector_store import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [152]:
dataset_manager = FinanceRAGDataset("../data")

# List available datasets
print("Available datasets:", dataset_manager.list_datasets())

# Load corpus and queries from a specific dataset
DATASET_NAME = "ConvFinQA"
convfinqa_corpus, convfinqa_queries, convfinqa_qrels = dataset_manager.load_dataset(DATASET_NAME)

Available datasets: ['ConvFinQA', 'FinQA', 'MultiHeritt', 'TATQA']


In [None]:
text_processor = DataHandler(Tokenizer(AutoTokenizer.from_pretrained("ProsusAI/finbert")),
                             Embedder(AutoModel.from_pretrained("ProsusAI/finbert")))

embedding = {}
for idx, documnet in enumerate(convfinqa_corpus):
    print(idx)
    embedded_doc = (
        text_processor.load_data(documnet['text'])
        .to_lowercase()
        .remove_punctuation()
        .remove_stopwords()
        .remove_numbers()
        .lemmatize_text()
        .remove_extra_whitespace()
        .tokenize()
        .embed()
        .get_data()
    )
    embedding[idx] = embedded_doc

In [None]:
EMBEDDING_DIM = 768
vector_store = FaissVectorStore(dimension=EMBEDDING_DIM)
# Populate vector store
for i, doc in embedding.items():
    # Add vectors to store
    print(doc)
    print(i)
    vector_store.add_embeddings(np.array(doc), [str(i) for e in range(len(doc))])
# Save vector store to file
vector_store.save(f'../vector_store/{DATASET_NAME}')

[[ 0.1349003   0.22586876 -0.39407843 ... -0.49132106  0.21237546
   0.65658605]
 [-0.18108624  0.05043052 -0.5433197  ... -0.63953054 -0.78564227
   0.51162016]]
0
[[-6.10675551e-02  1.14977777e-01 -9.44313630e-02 -2.51915663e-01
   3.53511363e-01  3.93662184e-01  2.26264834e-01 -1.22988269e-01
  -4.45489705e-01  4.33152020e-02 -2.41951734e-01 -1.48390919e-01
  -1.45658404e-01  3.84687454e-01 -4.24942285e-01  3.98328304e-01
   2.97812521e-01  3.92830282e-01 -1.51607677e-01  7.99889386e-01
   9.88714397e-02 -4.54491377e-01  8.27151656e-01 -1.17308132e-01
  -2.14935951e-02  4.60084081e-02 -9.86451149e-01 -1.52195215e-01
  -2.61653543e-01  6.65963292e-01 -1.06717184e-01  3.39078486e-01
   4.35479879e-01 -2.90915340e-01  9.66252685e-01 -8.60326469e-01
   9.43163276e-01 -4.87437308e-01  8.27625394e-01 -1.10918455e-01
   4.51289415e-02  6.98374331e-01  2.53335178e-01 -7.46211708e-01
   5.03286541e-01  2.24708945e-01 -3.92508364e+00  3.46162528e-01
  -6.58662170e-02 -4.95137393e-01  6.571761

In [149]:

# Simulate a query embedding
query_embedding = np.random.rand(1, EMBEDDING_DIM)

# Perform similarity search
distances, indices, result_texts, result_metadata = vector_store.similarity_search(
    query_embedding, 
    k=2,
    filter_metadata={"category": "AI"}
)

# Print results
for dist, text, meta in zip(distances[0], result_texts, result_metadata):
    print(f"Distance: {dist}")
    print(f"Text: {text}")
    print(f"Metadata: {meta}")
    print("---")

# Save vector store
# vector_store.save(f'../vector_store/{DATASET_NAME}')

# Load vector store
# loaded_store = FaissVectorStore.load(f'../vector_store/{DATASET_NAME}', dimension=EMBEDDING_DIM)

Distance: 303.53717041015625
Text: 289
Metadata: {}
---
Distance: 455.27191162109375
Text: 308
Metadata: {}
---
