In [1]:
!python3.10 -m venv chromadb_env

In [2]:
!source chromadb_env/bin/activate

In [None]:
!pip install --upgrade chromadb

In [None]:
!pip install sentence_transformers

In [8]:
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

client = chromadb.Client(settings=chromadb.config.Settings(persist_directory="/home/jupyter/rbi-bot/chromadb_data"))

# Load SentenceTransformer model
model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name)

# Get or create your collection
collection_name = "rbi_embeddings"
collection = client.get_or_create_collection(collection_name)

def read_embeddings(file_path):
    """Reads embeddings from a JSON file and converts them to lists of floats."""
    try:
        with open(file_path, 'r') as file:
            for line in file:
                data = json.loads(line)
                # Convert embeddings to a list of floats
                data['chunk-embedding'] = [float(x) for x in data['chunk-embedding']] 
                yield data
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except json.JSONDecodeError:
        print(f"Invalid JSON format in file: {file_path}")
        
def purge_data(collection):
    try:
        # Fetch all existing IDs in the collection
        existing_ids = collection.get()["ids"]

        # If there are IDs to delete, delete them
        if existing_ids:
            collection.delete(ids=existing_ids) 

    except KeyError:
        # If the collection is empty, the 'ids' key won't exist
        pass

def add_embeddings_to_chroma(embeddings_directory, collection, batch_size=1000):
    """Adds embeddings from JSON files to a ChromaDB collection."""
    
    # Delete all existing data from the collection
    purge_data(collection)
    
    seen_ids = set()
    batch_data = []

    total_embeddings_added = 0 
    for file_path in embeddings_directory.glob("*.json"):
        for embedding_data in read_embeddings(file_path):
            try:
                unique_id = f"{embedding_data['document-id']}_{embedding_data['chunk-id']}"

                if unique_id in seen_ids:
                    # print(f"Skipping duplicate ID: {unique_id}")
                    continue  # Skip this embedding

                seen_ids.add(unique_id)

                batch_data.append(
                    {
                        "ids": unique_id,
                        "embedding": embedding_data["chunk-embedding"],
                        "metadata": {
                            "document_id": embedding_data["document-id"],
                            "text": embedding_data["chunk-text"],
                        },
                        "document": embedding_data["chunk-text"],
                    }
                )
                
                if len(batch_data) >= batch_size:
                    # Add all the data in batch_data to the collection
                    ids = []
                    embeddings = []
                    metadatas = []
                    documents = []
                    for data in batch_data:
                        ids.append(data["ids"])
                        embeddings.append(data["embedding"])
                        metadatas.append(data["metadata"])
                        documents.append(data["document"])
                    collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas, documents=documents)
                    total_embeddings_added += len(batch_data)
                    batch_data = []  

            except KeyError as e:
                print(f"Missing key in embedding data: {e}")

    # Add any remaining embeddings
    if batch_data:
         # Add all the data in batch_data to the collection
        ids = []
        embeddings = []
        metadatas = []
        documents = []
        for data in batch_data:
            ids.append(data["ids"])
            embeddings.append(data["embedding"])
            metadatas.append(data["metadata"])
            documents.append(data["document"])
        collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas, documents=documents)
        total_embeddings_added += len(batch_data)
        
    print("Embeddings added to Chroma DB successfully!")
    print(f"Total Embeddings Added: {total_embeddings_added}")  # Print total count

def search_rbi_documents(query_text, n_results=3, collection=collection):
    """Searches RBI documents based on a query."""
    query_embedding = model.encode(query_text).tolist()
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=["documents", "distances", "metadatas"]
    )
    return results

In [9]:
# Main execution
embeddings_directory = Path("/home/jupyter/rbi-bot/embeddings/")
add_embeddings_to_chroma(embeddings_directory, collection)

Embeddings added to Chroma DB successfully!
Total Embeddings Added: 1503


In [None]:
query_text = "What are the guidelines for opening a new bank account?"
search_results = search_rbi_documents(query_text)

for i in range(len(search_results['ids'][0])):  # Iterate over the indices of the results
    document_id = search_results['ids'][0][i]      # Get the document ID at the index
    document = search_results['documents'][0][i]   # Get the document text
    distance = search_results['distances'][0][i]   # Get the distance
    metadata = search_results['metadatas'][0][i]   # Get the metadata (dictionary)
    
    print(f"\nDocument: {document}")
    print(f"Distance: {distance}")
    print(f"Metadata: {metadata}")  # No need for [0] here, as metadata is already a dictionary

In [None]:
collection_info = collection.get()
print(collection_info)

In [6]:
print(client.get_settings())

environment='' chroma_api_impl='chromadb.api.segment.SegmentAPI' chroma_server_nofile=None chroma_server_thread_pool_size=40 tenant_id='default' topic_namespace='default' chroma_server_host=None chroma_server_headers=None chroma_server_http_port=None chroma_server_ssl_enabled=False chroma_server_ssl_verify=None chroma_server_api_default_path='/api/v1' chroma_server_cors_allow_origins=[] is_persistent=False persist_directory='/home/jupyter/rbi-bot/chromadb_data' chroma_memory_limit_bytes=0 chroma_segment_cache_policy=None allow_reset=False chroma_auth_token_transport_header=None chroma_client_auth_provider=None chroma_client_auth_credentials=None chroma_server_auth_ignore_paths={'/api/v1': ['GET'], '/api/v1/heartbeat': ['GET'], '/api/v1/version': ['GET']} chroma_overwrite_singleton_tenant_database_access_from_auth=False chroma_server_authn_provider=None chroma_server_authn_credentials=None chroma_server_authn_credentials_file=None chroma_server_authz_provider=None chroma_server_authz_co