In [1]:
import os
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
import chromadb
from chromadb.utils import embedding_functions

In [2]:
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
# PROCESSED_CHUNKS_PATH = os.path.join(DATA_DIR, "pakistan_laws_chunks.csv")
PROCESSED_CHUNKS_PATH = os.path.join(DATA_DIR, "pakistan_laws_chunks_with_embeddings.csv")
VECTOR_DB_DIR = os.path.join(DATA_DIR, "chroma_db") # Directory to store ChromaDB data

# Ensure the data directory exists (already should from previous steps)
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(VECTOR_DB_DIR, exist_ok=True) # Create directory for the vector DB

print(f"Project root: {PROJECT_ROOT}")
print(f"Data directory: {DATA_DIR}")
print(f"Processed chunks path: {PROCESSED_CHUNKS_PATH}")
print(f"Vector DB directory: {VECTOR_DB_DIR}")

Project root: /Users/apple/PycharmProjects/LLM_Production_01_RAG/Haq_ooq_RAG
Data directory: /Users/apple/PycharmProjects/LLM_Production_01_RAG/Haq_ooq_RAG/data
Processed chunks path: /Users/apple/PycharmProjects/LLM_Production_01_RAG/Haq_ooq_RAG/data/pakistan_laws_chunks_with_embeddings.csv
Vector DB directory: /Users/apple/PycharmProjects/LLM_Production_01_RAG/Haq_ooq_RAG/data/chroma_db


In [4]:
# --- Load the Processed Chunks (now with embeddings) ---
try:
    chunks_df = pd.read_csv(PROCESSED_CHUNKS_PATH)
    # Convert the 'embedding' column from string representation of list to actual list of floats
    # This is crucial because Pandas saves lists as strings in CSVs
    chunks_df['embedding'] = chunks_df['embedding'].apply(eval) # eval() safely converts string representation of list to list
    
    print(f"\nSuccessfully loaded {len(chunks_df)} chunks from {PROCESSED_CHUNKS_PATH}")
    print("\nFirst 5 chunks:")
    print(chunks_df.head())
    print("\nDataFrame columns:", chunks_df.columns.tolist())
    print("\nExample chunk content (first 200 chars):")
    if not chunks_df.empty:
        print(chunks_df['chunk_content'].iloc[0][:200])
        print(f"\nShape of the first embedding: {len(chunks_df['embedding'].iloc[0])}")
    else:
        print("DataFrame is empty.")

except FileNotFoundError:
    print(f"Error: {PROCESSED_CHUNKS_PATH} not found.")
    print("Please ensure 'pakistan_laws_chunks_with_embeddings.csv' is in your 'data/' directory.")
    chunks_df = pd.DataFrame() # Initialize an empty DataFrame to prevent further errors
except Exception as e:
    print(f"An error occurred while loading or processing the CSV: {e}")
    chunks_df = pd.DataFrame()


Successfully loaded 30122 chunks from /Users/apple/PycharmProjects/LLM_Production_01_RAG/Haq_ooq_RAG/data/pakistan_laws_chunks_with_embeddings.csv

First 5 chunks:
                                         source_file          section_title  \
0  administrator00532129aba2e10fe634ab8fbd94c50b.pdf  Preamble/Introduction   
1  administrator00532129aba2e10fe634ab8fbd94c50b.pdf  Preamble/Introduction   
2  administrator00532129aba2e10fe634ab8fbd94c50b.pdf  Preamble/Introduction   
3  administrator00532129aba2e10fe634ab8fbd94c50b.pdf  Preamble/Introduction   
4  administrator00532129aba2e10fe634ab8fbd94c50b.pdf  Preamble/Introduction   

                                       chunk_content  chunk_length  \
0  THE PRIVATISATION COMMISSION ORDINANCE, 2000 P...          1499   
1  Annual Report. 38. Information to Public. 39. ...          1498   
2  day of October, 1999, and the Provisional Cons...          1495   
3  (g) “person” includes an individual, partnersh...          1499   
4  Act, 19

In [4]:
model = SentenceTransformer("BAAI/bge-large-en-v1.5")

try:
    # Set device to 'cuda' if GPU is available, otherwise 'cpu'
    model = SentenceTransformer("BAAI/bge-large-en-v1.5", device='cuda' if torch.cuda.is_available() else 'cpu')
    print("Embedding model loaded successfully.")
    print(f"Model will run on: {model.device}")
except Exception as e:
    print(f"Error loading embedding model: {e}")
    # ... (rest of your fallback logic)

Embedding model loaded successfully.
Model will run on: cpu


## Generate Embeddings

In [None]:
import numpy
print(f"\nGenerating embeddings for {len(chunks_df)} chunks...")
tqdm.pandas(desc='Generating embeddings...')

chunks_df['embedding'] = chunks_df['chunk_content'].progress_apply(lambda x: model.encode(x).tolist())
print("\nEmbeddings generated successfully!")
print("\nDataFrame with embeddings (first chunk's embedding snippet):")



Generating embeddings for 30122 chunks...


Generating embeddings...:   0%|          | 0/30122 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


In [5]:
# Print a snippet of the first embedding to confirm its presence and type
if not chunks_df.empty and 'embedding' in chunks_df.columns:
    print(chunks_df[['source_file', 'section_title', 'chunk_content', 'embedding']].head(1))
    print(f"\nShape of the first embedding: {len(chunks_df['embedding'].iloc[0])}")
else:
    print("No chunks or embedding column found.")

                                         source_file          section_title  \
0  administrator00532129aba2e10fe634ab8fbd94c50b.pdf  Preamble/Introduction   

                                       chunk_content  \
0  THE PRIVATISATION COMMISSION ORDINANCE, 2000 P...   

                                           embedding  
0  [-0.020446470007300377, -0.03650873154401779, ...  

Shape of the first embedding: 1024


In [6]:
# --- Initialize ChromaDB Client ---
print(f"\nInitializing ChromaDB client at: {VECTOR_DB_DIR}")

client = chromadb.PersistentClient(path=VECTOR_DB_DIR)
COLLECTION_NAME = "pakistan_laws_chunks_collection"
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"

try:
    collection = client.get_or_create_collection(COLLECTION_NAME, embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME, device='cpu'))
    print(f"ChromaDB collection '{COLLECTION_NAME}' accessed/created.")
    print(f"Current count in collection: {collection.count()} chunks.")
except Exception as e:
    print(f"Error accessing/creating ChromaDB collection: {e}")
    print("Please ensure ChromaDB is correctly installed and accessible.")
    # Exit or handle error appropriately if collection cannot be created/accessed
    exit()


Initializing ChromaDB client at: /Users/apple/PycharmProjects/LLM_Production_01_RAG/Haq_ooq_RAG/data/chroma_db
ChromaDB collection 'pakistan_laws_chunks_collection' accessed/created.
Current count in collection: 0 chunks.


In [7]:
# prepare data for chroma db
ids = [f"chunk_{i}" for i in range(len(chunks_df))]
documents = chunks_df['chunk_content'].tolist()
metadatas = chunks_df[['source_file', 'section_title', 'chunk_length', 'start_index_in_section']].to_dict(orient='records')
embeddings = chunks_df['embedding'].tolist()

In [8]:
# --- Add Embeddings to the Collection ---
if collection.count() == len(ids):
    print("All chunks already appear to be in the collection. Skipping add operation.")
else:
    print(f"\nAdding {len(ids)} chunks to the ChromaDB collection. This might take a moment...")
    BATCH_SIZE = 500
    for i in tqdm(range(0, len(ids), BATCH_SIZE), desc="Adding chunks to ChromaDB"):
        batch_ids = ids[i:i+BATCH_SIZE]
        batch_documents = documents[i:i+BATCH_SIZE]
        batch_metadatas = metadatas[i:i+BATCH_SIZE]
        batch_embeddings = embeddings[i:i+BATCH_SIZE]
        
        collection.add(
            ids=batch_ids,
            documents=batch_documents,
            metadatas=batch_metadatas,
            embeddings=batch_embeddings
        )
        print("\nAll chunks successfully added to ChromaDB!")
        
print(f"\nFinal count in ChromaDB collection '{COLLECTION_NAME}': {collection.count()} chunks.")
print("\n--- Task 3: Embedding Generation and Vector Database Indexing complete! ---")


Adding 30122 chunks to the ChromaDB collection. This might take a moment...


Adding chunks to ChromaDB:   0%|          | 0/61 [00:00<?, ?it/s]


All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added to ChromaDB!

All chunks successfully added t