In [9]:
# Import necessary libraries and modules
import os
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
import pickle

In [10]:
# Directory and file paths (all relative, so they resolve against the
# current working directory of the kernel).

# Folder passed to chunk_and_store_documents(); its *.txt files are the input corpus.
JUDGMENTS_DIR = "Judgement_txt"
# Pickle file that receives the list of chunked documents.
CHUNKED_DOCS_FILE = "chunked_documents.pkl"
# Pickle file that receives the BM25Okapi index built over those chunks.
BM25_RETRIEVER_FILE = "bm25_retriever.pkl"

In [11]:
def chunk_and_store_documents(judgments_dir, chunk_size=1000, chunk_overlap=100):
    """
    Split judgment text files into chunks and persist the chunks plus a BM25 index.

    Every file directly inside ``judgments_dir`` whose name ends with ``.txt``
    is read as UTF-8 and wrapped in a ``Document`` whose ``source`` metadata is
    the file name. The documents are split with
    ``RecursiveCharacterTextSplitter``; the resulting chunks are pickled to
    ``CHUNKED_DOCS_FILE`` and a ``BM25Okapi`` index built from the
    whitespace-tokenized chunk texts is pickled to ``BM25_RETRIEVER_FILE``.

    Files are processed in sorted name order so the chunk order (and therefore
    the position of each chunk in the index) is reproducible between runs.

    Parameters:
    - judgments_dir: The directory containing judgment text files.
    - chunk_size: Forwarded to the splitter as ``chunk_size`` (default 1000).
    - chunk_overlap: Forwarded to the splitter as ``chunk_overlap`` (default 100).

    Output:
    - None: Outputs are the chunked documents and the BM25 index saved to disk.

    Raises:
    - ValueError: If ``judgments_dir`` contains no ``.txt`` files, or if the
      files produce no chunks. Nothing is written to disk in either case.
    - OSError: Propagated from ``os.listdir`` / ``open`` (e.g. missing directory).
    """
    # os.listdir() returns names in arbitrary order; sort for reproducibility.
    txt_filenames = sorted(
        name for name in os.listdir(judgments_dir) if name.endswith(".txt")
    )
    if not txt_filenames:
        raise ValueError(f"No .txt files found in {judgments_dir!r}")

    documents = []
    for filename in txt_filenames:
        file_path = os.path.join(judgments_dir, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
        documents.append(Document(page_content=content, metadata={"source": filename}))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunked_docs = text_splitter.split_documents(documents)
    if not chunked_docs:
        # An empty corpus cannot be indexed: rank_bm25 averages document length
        # over the corpus and would fail with a ZeroDivisionError.
        raise ValueError(
            f"The .txt files in {judgments_dir!r} produced no chunks to index"
        )

    # Plain str.split() tokenization: whitespace-delimited, case-sensitive,
    # punctuation kept attached to the words.
    tokenized_chunks = [doc.page_content.split() for doc in chunked_docs]

    # Build the index before writing anything, so a failure here cannot leave
    # a chunk file on disk without its matching index.
    retriever = BM25Okapi(tokenized_chunks)

    with open(CHUNKED_DOCS_FILE, "wb") as f:
        pickle.dump(chunked_docs, f)
    print(f"Chunked documents stored in {CHUNKED_DOCS_FILE}")

    with open(BM25_RETRIEVER_FILE, "wb") as f:
        pickle.dump(retriever, f)
    print(f"BM25Retriever stored in {BM25_RETRIEVER_FILE}")



In [None]:
# Build the on-disk artifacts: read every judgment file under JUDGMENTS_DIR,
# chunk it, and pickle both the chunks and the BM25 index.
chunk_and_store_documents(judgments_dir=JUDGMENTS_DIR)

In [None]:
# def find_query(query, top_k=5): 
#     with open(BM25_RETRIEVER_FILE, "rb") as f:
#         retriever = pickle.load(f)

#     with open(CHUNKED_DOCS_FILE, "rb") as f:
#         corpus = pickle.load(f)
    
#     processed_query = query.split() 
#     results = retriever.get_top_n(processed_query, corpus, top_k) 

#     for i, result in enumerate(results):
#         print(f"Result {i + 1}:")
#         print(f"Content: {result.page_content[:500]}...")
#         print(f"Source: {result.metadata['source']}")
#         print("-" * 50)

#     return results

In [None]:
# query = "What is the main constitutional challenge presented in the petition under Article 32 of the Indian Constitution regarding the U.P. Land Tenures (Regulation of Transfers) Act 1952 and the Indian Forest (U.P. Amendment) Act 1956?"
# results = find_query(query)