### Storing chunks into a vector database


In [1]:
import chromadb
from chromadb.utils import embedding_functions
import json
import os

# 1. Initialize the Persistent Client
# PersistentClient stores the database in a folder on disk (here ./medcare_vector_db,
# relative to the current working directory), so the data persists after this
# notebook/script is closed.
# Inside that folder you should find a SQLite file (metadata) and HNSW file(s) (the vectors).
client = chromadb.PersistentClient(path="./medcare_vector_db")



In [2]:
# 2. Define the Embedding Function Explicitly
# We use 'all-MiniLM-L6-v2', which is fast, lightweight, and runs locally.
# To switch to a medical-specific model later, just change 'model_name'.
# On first use the model is downloaded and cached on disk. The cache location
# depends on the installed library versions; it is something like:
#    ~/.cache/torch/sentence_transformers/   (TODO confirm for your environment)
emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)



  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1174.34it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [3]:
# 3. Create (or get) the Collection
# In a vector database such as ChromaDB, a collection is roughly the equivalent
# of a table in a traditional relational (SQL) database.
# Organization:
#   A collection groups related documents together with their vector embeddings.
#   For example, you might have one collection called "medcare_internal_docs"
#   for employee policies and a completely separate collection called
#   "patient_records" for medical data.
#
# We pass the embedding function explicitly so Chroma knows exactly how to turn
# text into vectors for this collection.
collection = client.get_or_create_collection(
    name="medcare_internal_docs", 
    embedding_function=emb_fn    # Used both when inserting documents and when searching
)



In [4]:
# 4. Load your JSONL file and insert in batches
#
# Each non-blank line of the file is expected to be a JSON object of the form
#   {"text": "...", "metadata": {"source_file": "..."}}
# Records without text are skipped. Ids are derived from the 0-based line
# number, so they stay stable across runs as long as the file is unchanged.
jsonl_file_path = "medcare_knowledge_base.jsonl" # <--- Update this to your filename

batch_size = 100  # Number of chunks sent to the collection per call
documents = []
metadatas = []
ids = []
total_inserted = 0


def _write_batch(batch_documents, batch_metadatas, batch_ids):
    """Write one batch to the collection and return the number of chunks written.

    Uses ``upsert`` rather than ``add``: the database is persistent and the ids
    are deterministic, so re-running this cell would otherwise hit ids that
    already exist and leave stale content in place. With ``upsert`` a re-run
    refreshes existing chunks and creates missing ones.
    """
    if not batch_documents:
        return 0
    collection.upsert(
        documents=batch_documents,
        metadatas=batch_metadatas,
        ids=batch_ids,
    )
    return len(batch_documents)


with open(jsonl_file_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        # Tolerate blank lines (e.g. a trailing newline at the end of the file);
        # json.loads would raise on them.
        if not line.strip():
            continue

        chunk_data = json.loads(line)

        text_content = chunk_data.get("text", "")
        # `or {}` also covers an explicit `"metadata": null` in the record.
        metadata_part = chunk_data.get("metadata") or {}
        source_info = metadata_part.get("source_file", "unknown_document")

        if not text_content:
            continue  # Nothing to embed for this record

        documents.append(text_content)
        metadatas.append({"source": source_info, "chunk_index": i})
        ids.append(f"medcare_{i}")

        # Insert in batches to prevent memory issues
        if len(documents) >= batch_size:
            total_inserted += _write_batch(documents, metadatas, ids)
            documents, metadatas, ids = [], [], []  # Reset batch
            print(f"Inserted {total_inserted} chunks so far...")

# Insert any remaining documents after the loop finishes (the file is already closed here)
total_inserted += _write_batch(documents, metadatas, ids)

print(f"\n✅ Success! Loaded a total of {total_inserted} Medcare chunks into the vector database.")


Inserted 100 chunks so far...
Inserted 200 chunks so far...
Inserted 300 chunks so far...
Inserted 400 chunks so far...
Inserted 500 chunks so far...

✅ Success! Loaded a total of 523 Medcare chunks into the vector database.
