In [None]:
# Import necessary libraries and modules
import os   
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
import chromadb
from langchain_experimental.text_splitter import SemanticChunker 
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings

In [None]:
# Directory path for text judgments
TXT_DIRECTORY = 'Judgement_txt'
# Variable for collection name
COLLECTION_NAME = "legal_judgments"
# Hugging Face model id shared by the chunker and the encoder. Keeping it in
# one place ensures chunk boundaries and stored embeddings always come from
# the same model instead of two independently edited string literals.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Initialize models for embedding, semantic chunking and encoding
# embedding_model is only used to drive the semantic chunker.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
# encoder_model produces the vectors that are stored in ChromaDB.
encoder_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
semantic_chunker = SemanticChunker(embedding_model)

In [None]:
def init_chromadb(collection_name, persist_directory='./chromadb'):
    """
    Initializes ChromaDB with persistence settings.

    Parameters:
    - collection_name: Name of the collection to initialize in ChromaDB.
      The collection is created if it does not exist yet, otherwise the
      existing one is returned.
    - persist_directory: Directory handed to ChromaDB as its persistence
      location. Defaults to './chromadb', the value that was previously
      hard-coded, so existing callers are unaffected.

    Output:
    - Returns an initialized collection from ChromaDB.
    """
    settings = Settings(is_persistent=True, persist_directory=persist_directory)
    client = chromadb.Client(settings)
    collection = client.get_or_create_collection(collection_name)
    print(f"ChromaDB initialized and collection '{collection_name}' ready")
    return collection

In [None]:
def add_documents_to_chromadb(collection, txt_directory, encoder_model, semantic_chunker):
    """
    Adds documents to ChromaDB after semantic chunking.

    Every file in txt_directory whose name ends with '.txt' is read as UTF-8,
    split into chunks by semantic_chunker, embedded with encoder_model and
    stored in collection. Each chunk is stored under the id
    '<filename>_chunk_<index>', where index is the chunk's position within
    its file. Sub-directories are not traversed.

    Parameters:
    - collection: The ChromaDB collection where documents are added.
    - txt_directory: Directory containing judgment text files.
    - encoder_model: Sentence transformer model for encoding chunks. It is
      called once per file with the list of all chunks of that file.
    - semantic_chunker: Object for semantic chunking of documents. Its
      create_documents() result must expose the text as .page_content.

    Output:
    - Returns encoder_model, the same object that was passed in. The model is
      not modified here; the return value is kept for backward compatibility.
    """
    # os.listdir() returns names in arbitrary order; sorting makes runs
    # reproducible and the progress output easy to follow.
    for filename in sorted(os.listdir(txt_directory)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(txt_directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            judgment_text = file.read()

        docs = semantic_chunker.create_documents([judgment_text])
        chunks = [doc.page_content for doc in docs]
        if not chunks:
            # Nothing to embed or store; avoid handing empty lists to the
            # encoder and to collection.add().
            print(f"No chunks produced for {filename}, skipping")
            continue

        # Encode all chunks of the file in one call so the model can batch
        # them, instead of invoking it once per chunk.
        chunk_embeddings = encoder_model.encode(chunks).tolist()
        chunk_ids = [f"{filename}_chunk_{idx}" for idx in range(len(chunks))]

        # One insert per file rather than one per chunk.
        collection.add(
            documents=chunks,
            embeddings=chunk_embeddings,
            ids=chunk_ids
        )

        print(f"Added semantic chunks of {filename} to ChromaDB")
    return encoder_model

In [None]:
# Open the target ChromaDB collection by name
collection = init_chromadb(collection_name=COLLECTION_NAME)

# Chunk, embed and store every judgment text file in that collection
add_documents_to_chromadb(
    collection=collection,
    txt_directory=TXT_DIRECTORY,
    encoder_model=encoder_model,
    semantic_chunker=semantic_chunker,
)