In [1]:
import chromadb
from chromadb.config import Settings
import os
from chromadb.utils import embedding_functions
import pandas as pd
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# Lift every pandas display limit so notebook output is never truncated
for option_name in ('display.max_rows', 'display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# https://cookbook.openai.com/examples/vector_databases/chroma/hyde-with-chroma-and-openai

In [2]:
# Create embedding function.
# The model name is defined once and passed through, so the constant always
# matches the model that actually produces the embeddings. The value is the
# model this cell was already using ("text-embedding-ada-002"); switching models
# on an existing persisted collection would mix incompatible embeddings.
EMBEDDING_MODEL = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(
    api_key=os.environ.get('OPENAI_API_KEY'),
    model_name=EMBEDDING_MODEL,
)

# Create Chroma client (EphemeralClient or PersistentClient)
chroma_client = chromadb.PersistentClient(path="./data/chroma/")

# The client persists to disk, so the collection outlives the kernel.
# get_or_create_collection keeps this cell re-runnable (Restart & Run All);
# create_collection raises once the collection already exists.
rkb_collection = chroma_client.get_or_create_collection(
    name="RegulatoryKnowledgeBase3",
    embedding_function=embedding_function,
)

In [3]:
# Load the regulatory requirements workbook and keep only its last 10 rows
df = pd.read_excel('eu_reg_requirements.xlsx').tail(10)

# Preview the first two of the retained rows
df.head(2)

Unnamed: 0,ID,Document,Paragraph_Article,Domain,Reference,Main_text,cl100k_base_tokens,Ontology_classes
2773,2810,"ECB Guide to internal models, 2024",Paragraph 125,Other,Annex,"Note that the methods below apply to banks that use a pseudo Monte Carlo simulation method and not to banks that apply a quasi Monte Carlo simulation method. In this context, a pseudo Monte Carlo simulation method is defined as a method that utilises a random number generator based on an algorithm creating a sequence of desired length N of numbers that mimic independent samples drawn from a uniform distribution. A quasi Monte Carlo simulation method is defined as a method that utilises a low-discrepancy sequence of numbers, which is deterministically uniformly distributed (e.g. Sobol).",113,Other
2774,2811,"ECB Guide to internal models, 2024",Paragraph 125,Other,Annex,"Irrespective of the numerical method implemented for its estimation of the EEPE (e.g. types of random number generators), the institution should provide an analysis as part of its validation framework demonstrating that its approach has a reasonable accuracy as required by Article 368(1)(f) of the CRR (as referenced by Article 293(1)(a) of the CRR). This analysis should include an assessment of convergence and an error estimation.",92,Other


In [4]:
# Build the three parallel lists Chroma expects: ids, document texts, metadata
ids = df['ID'].astype(str).tolist()

# Each document is the reference path, the domain and the main text in one string
texts = (
    'Reference: ' + df['Reference']
    + ' | Domain: ' + df['Domain']
    + ' | Main text: ' + df['Main_text']
).tolist()

# One metadata dict per row, taken from the descriptive columns
metadata_frame = df[['Document', 'Paragraph_Article', 'Reference', 'Domain']]
metadatas = metadata_frame.to_dict(orient='records')

# Store the records in the Chroma collection
rkb_collection.add(ids=ids, documents=texts, metadatas=metadatas)

In [5]:
# Read the stored records back from the collection, embeddings included
data = rkb_collection.get(include=["embeddings", "documents", "metadatas"])

# Replace each embedding with its string form so it is compatible with a DataFrame column
data['embeddings'] = list(map(str, data['embeddings']))

# Map each DataFrame column to the key of the Chroma result that feeds it
column_sources = {
    'id': 'ids',
    'document': 'documents',
    'metadata': 'metadatas',
    'embeddings': 'embeddings',
}
df = pd.DataFrame({column: data[key] for column, key in column_sources.items()})

df.head()

Unnamed: 0,id,document,metadata,embeddings
0,2810,"Reference: Annex | Domain: Other | Main text: Note that the methods below apply to banks that use a pseudo Monte Carlo simulation method and not to banks that apply a quasi Monte Carlo simulation method. In this context, a pseudo Monte Carlo simulation method is defined as a method that utilises a random number generator based on an algorithm creating a sequence of desired length N of numbers that mimic independent samples drawn from a uniform distribution. A quasi Monte Carlo simulation method is defined as a method that utilises a low-discrepancy sequence of numbers, which is deterministically uniformly distributed (e.g. Sobol).","{'Document': 'ECB Guide to internal models, 2024', 'Domain': 'Other', 'Paragraph_Article': 'Paragraph 125', 'Reference': 'Annex'}",[-0.03082872 -0.00796316 0.01106956 ... -0.00404039 -0.01330422\n -0.02986013]
1,2811,"Reference: Annex | Domain: Other | Main text: Irrespective of the numerical method implemented for its estimation of the EEPE (e.g. types of random number generators), the institution should provide an analysis as part of its validation framework demonstrating that its approach has a reasonable accuracy as required by Article 368(1)(f) of the CRR (as referenced by Article 293(1)(a) of the CRR). This analysis should include an assessment of convergence and an error estimation.","{'Document': 'ECB Guide to internal models, 2024', 'Domain': 'Other', 'Paragraph_Article': 'Paragraph 125', 'Reference': 'Annex'}",[-0.01393504 -0.01012784 -0.00273243 ... 0.00502686 0.01423153\n -0.02792399]
2,2812,"Reference: Annex | Domain: Other | Main text: In the following, ""MC run"" refers to a pseudo Monte Carlo simulation with N scenarios calculated with one particular set of random numbers.","{'Document': 'ECB Guide to internal models, 2024', 'Domain': 'Other', 'Paragraph_Article': 'Paragraph 125', 'Reference': 'Annex'}",[-0.0155222 -0.02197821 0.01169024 ... -0.01770197 -0.0171605\n -0.02168665]
3,2813,Reference: Annex > A.1 Method 1 | Domain: Other | Main text: Let $_{𝐸𝐸𝑃𝐸}$$_{𝑁}$$_{(𝛼}$ ̂ ) denote the estimator of the EEPE for one given netting set $_{𝛼}$ obtained from one MC run with $_{𝑁}$ simulations (e.g. $_{𝑁 = 2000)}$.,"{'Document': 'ECB Guide to internal models, 2024', 'Domain': 'Other', 'Paragraph_Article': 'Paragraph 125', 'Reference': 'Annex > A.1 Method 1'}",[-0.00971035 -0.01784643 0.01868232 ... -0.01383412 0.00392176\n -0.05670176]
4,2814,"Reference: Annex > A.1 Method 1 | Domain: Other | Main text: The institution can estimate an MC error on $_{𝐸𝐸𝑃𝐸}$$_{𝑁}$$_{(𝛼}$ ̂ $_{)}$, on the basis of a 95% confidence level, by using a set of several MC runs. In what follows, notations are simplified: $_{𝐸𝐸𝑃𝐸}$$_{𝑁}$$_{(𝛼}$ ̂ ) is replaced by $_{𝐸𝐸𝑃𝐸}$ ̂ ; $_{𝛼}$ and $_{𝑁}$ are dropped, since the calculations detailed below are performed on the same netting set $_{𝛼}$ and with the same number of simulations per MC run, $_{𝑁}$.","{'Document': 'ECB Guide to internal models, 2024', 'Domain': 'Other', 'Paragraph_Article': 'Paragraph 125', 'Reference': 'Annex > A.1 Method 1'}",[-0.01270331 -0.00617464 -0.00262223 ... -0.00159572 -0.00619547\n -0.03629122]


In [7]:
# Ask the collection for the 3 stored passages closest to a natural-language question
query_text = "What analysis requirements apply to EEPE estimation?"
results = rkb_collection.query(query_texts=[query_text], n_results=3)

results

{'ids': [['2811', '2813', '2818']],
 'embeddings': None,
 'documents': [['Reference: Annex | Domain: Other | Main text: Irrespective of the numerical method implemented for its estimation of the EEPE (e.g. types of random number generators), the institution should provide an analysis as part of its validation framework demonstrating that its approach has a reasonable accuracy as required by Article 368(1)(f) of the CRR (as referenced by Article 293(1)(a) of the CRR). This analysis should include an assessment of convergence and an error estimation.',
   'Reference: Annex > A.1 Method 1 | Domain: Other | Main text: Let $_{𝐸𝐸𝑃𝐸}$$_{𝑁}$$_{(𝛼}$ ̂ ) denote the estimator of the EEPE for one given netting set $_{𝛼}$ obtained from one MC run with $_{𝑁}$ simulations (e.g. $_{𝑁 = 2000)}$.',
   'Reference: Annex > A.1 Method 1 | Domain: Other | Main text: The rationale of this formula is as follows.']],
 'uris': None,
 'data': None,
 'metadatas': [[{'Document': 'ECB Guide to internal models, 2024