# Create various vector databases and other retrieval mechanisms
Scratch notebook for building and testing the clients for the retrieval mechanisms we need: vector search, full-text search, and reranking.

In [1]:
import json
import os

# Relies on the data being in the ConvFinQA/data/ directory.
# Please run scripts/unzip_data.sh to unzip the data first.
data_dir = os.path.join('..', 'ConvFinQA/data/')

# Open through a context manager so the file handle is closed as soon as the
# JSON has been parsed, and read it as UTF-8 regardless of the platform default.
with open(os.path.join(data_dir, 'train.json'), encoding='utf-8') as train_file:
    train_data = json.load(train_file)

In [2]:
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from lancedb.rerankers import CohereReranker

from notebook_utils import format_data

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]



## Create the vector DB

Get our documents formatted into a list

In [3]:
# Run every raw training record through format_data and collect the results in a list.
docs = list(map(format_data, train_data))

Define our embedding model

In [4]:
# Look up the "sentence-transformers" provider in the embedding registry and
# instantiate it with the BAAI/bge-small-en-v1.5 checkpoint.
sentence_model = (
    get_registry()
    .get("sentence-transformers")
    .create(name="BAAI/bge-small-en-v1.5")
)

Define the schema we'll use to save our data in the vector DB

In [5]:
class Document(LanceModel):
    """Row schema for the documents stored in the vector DB table.

    The ``text`` and ``vector`` fields are declared through ``sentence_model``,
    tying the source text column and the embedding column to that model.
    """

    # Identifier of the document.
    id: str
    # Document text; declared as the source field for ``sentence_model``.
    text: str = sentence_model.SourceField()
    # Embedding column with 384 dimensions; declared as the vector field for
    # ``sentence_model``.
    vector: Vector(384) = sentence_model.VectorField() # type: ignore
    # presumably the year the document relates to — TODO confirm against format_data
    year: int

Create the DB

In [6]:
# Connect to the local LanceDB database directory.
db = lancedb.connect("../.my_db")

TABLE_NAME = "financial_docs"

# Reuse the table when it is already on disk; only build it from `docs` when it
# is missing. This keeps the cell runnable both on a fresh checkout (no table
# yet) and on repeat runs (no duplicate-table error, no rebuild).
if TABLE_NAME in db.table_names():
    tbl = db.open_table(TABLE_NAME)
else:
    tbl = db.create_table(
        TABLE_NAME,
        data=docs,
        schema=Document,
    )

LanceDB makes it very easy to add a full-text search (FTS) index to the table (based on BM25: https://lancedb.github.io/lancedb/fts/).
This adds almost no latency and is great for picking out keywords in the query that a vector search might miss.

In [4]:
# Add a full-text search (FTS) index over the "text" column.
# replace=True is passed so that re-running this cell replaces an existing index.
tbl.create_fts_index("text", replace=True)

Query the DB

In [7]:
query = "what is the percentage change in estimated fair value of the cash flow hedges from 2005 to 2006?"

# Hybrid search uses both the vector index and the full-text index.
# Keep the top 10 hits and materialise them as a plain Python list.
results = (
    tbl.search(query, query_type="hybrid")
    .limit(10)
    .to_list()
)


In [8]:
# Display the raw hybrid-search hits.
results

[{'id': 'Double_MAR/2005/page_52.pdf',
  'text': 'fair value of financial instruments we believe that the fair values of current assets and current liabilities approximate their reported carrying amounts .\nthe fair values of non-current financial assets , liabilities and derivatives are shown in the following table. .\n\n|                                                | 2005 carrying amount   | 2005 fair value   | 2005 carrying amount   | fair value   |\n|:-----------------------------------------------|:-----------------------|:------------------|:-----------------------|:-------------|\n| notes and other long-term assets               | $ 1374                 | $ 1412            | $ 1702                 | $ 1770       |\n| long-term debt and other long-term liabilities | $ 1636                 | $ 1685            | $ 848                  | $ 875        |\n| derivative instruments                         | $ 6                    | $ 6               | $ 2014                 | $ 2014 

## Rerank the results
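Using Cohere's `rerank-english-v3.0` model to re-score the search hits against the query, because it is one of the strongest off-the-shelf rerankers.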

In [11]:
import cohere

# Cohere client; the API key is read from the COHERE_API_KEY environment variable.
co = cohere.Client(os.environ.get("COHERE_API_KEY"))

# Keep only the text and id of each hit, in the same order as `results`, so the
# indices in the rerank response map straight back to this list.
unranked_results = [{"text": hit['text'], "id": hit['id']} for hit in results]

# Ask the reranker to score the candidates against the original query.
ranked_results = co.rerank(
    query=query,
    documents=unranked_results,
    model="rerank-english-v3.0",
    top_n=10,
)

In [12]:
# Inspect the reranked items: each carries an index into `unranked_results`
# and a relevance score.
ranked_results.results

[RerankResponseResultsItem(document=None, index=5, relevance_score=0.9815244),
 RerankResponseResultsItem(document=None, index=7, relevance_score=0.9815244),
 RerankResponseResultsItem(document=None, index=9, relevance_score=0.952131),
 RerankResponseResultsItem(document=None, index=3, relevance_score=0.81844676),
 RerankResponseResultsItem(document=None, index=1, relevance_score=0.7572562),
 RerankResponseResultsItem(document=None, index=0, relevance_score=0.68563384),
 RerankResponseResultsItem(document=None, index=2, relevance_score=0.23335676),
 RerankResponseResultsItem(document=None, index=4, relevance_score=0.23335676),
 RerankResponseResultsItem(document=None, index=6, relevance_score=0.030100338),
 RerankResponseResultsItem(document=None, index=8, relevance_score=0.000981172)]

In [13]:
# Walk the reranked items in the order returned, map each one back to its
# source document via the response index, and print the score, the document
# ID and a 200-character preview of the text.
for ranked_item in ranked_results.results:
    source_doc = unranked_results[ranked_item.index]
    print(f"\nScore: {ranked_item.relevance_score:.4f}")
    print(f"Document ID: {source_doc['id']}")
    print(f"Text: {source_doc['text'][:200]}...")


Score: 0.9815
Document ID: Single_FIS/2006/page_88.pdf-4
Text: through the certegy merger , the company has an obligation to service $ 200 million ( aggregate principal amount ) of unsecured 4.75% ( 4.75 % ) fixed-rate notes due in 2008 .
the notes were recorded ...

Score: 0.9815
Document ID: Double_FIS/2006/page_88.pdf
Text: through the certegy merger , the company has an obligation to service $ 200 million ( aggregate principal amount ) of unsecured 4.75% ( 4.75 % ) fixed-rate notes due in 2008 .
the notes were recorded ...

Score: 0.9521
Document ID: Single_RCL/2006/page_37.pdf-2
Text: note 9 .
retirement plan we maintain a defined contribution pension plan covering full-time shoreside employees who have completed the minimum period of continuous service .
annual contributions to th...

Score: 0.8184
Document ID: Single_AMT/2003/page_85.pdf-2
Text: american tower corporation and subsidiaries notes to consolidated financial statements 2014 ( continued ) maturities 2014as of decembe