# Task 2: Text Chunking, Embedding, and Vector Store Indexing

This notebook processes cleaned complaint narratives, generates embeddings, and indexes them in a ChromaDB vector store with persistence. It uses LangChain's RecursiveCharacterTextSplitter for chunking.

**Objectives**:
- Chunk cleaned narratives from `filtered_complaints.csv` using RecursiveCharacterTextSplitter
- Generate embeddings with the SentenceTransformers `all-MiniLM-L6-v2` model (384-dimensional vectors)
- Index embeddings and metadata (Complaint ID, Mapped_Product) in a persistent ChromaDB store

**File Paths**:
- Input: `data/processed/filtered_complaints.csv`
- Output: ChromaDB collection in `data/embeddings/`

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from sentence_transformers import SentenceTransformer
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Optional utility module for chunking and embedding (not used in this run)
# from embedding_utils import chunk_text, generate_embeddings
# Notebook-wide logging: INFO level so library and progress messages show up in cell output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Sentence-embedding model shared by the embedding and query cells below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Resolve project paths.
# Path.cwd() is the kernel's working directory; the project root is taken to be
# its parent, so this assumes the kernel was started from a first-level
# subfolder of the project.
# (In a plain .py script, Path(__file__).parent would be the equivalent anchor.)
PROJECT_ROOT = Path.cwd().parent

CSV_PATH = PROJECT_ROOT.joinpath('data', 'processed', 'filtered_complaints.csv')
EMBEDDINGS_DIR = PROJECT_ROOT.joinpath('data', 'embeddings')
# Create the vector-store directory up front; a no-op when it already exists.
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Input CSV: {CSV_PATH}")
print(f"Output directory: {EMBEDDINGS_DIR}")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Input CSV: c:\Users\hp\Desktop\projects\10 Acadamy -KAIM5\financial-complaint-chatbot\data\processed\filtered_complaints.csv
Output directory: c:\Users\hp\Desktop\projects\10 Acadamy -KAIM5\financial-complaint-chatbot\data\embeddings


In [2]:
# Load the cleaned complaints, split every narrative into overlapping chunks,
# and embed the chunks.
#
# Produces three parallel, index-aligned objects used by the indexing cell:
#   all_chunks      - list[str], one entry per text chunk
#   chunk_metadata  - list[dict], complaint id / product for the chunk at the same index
#   embeddings      - numpy array with one row per chunk
df = pd.read_csv(CSV_PATH)

required_columns = ['Cleaned_Narrative', 'Complaint ID', 'Mapped_Product']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    logger.error("Required columns (Cleaned_Narrative, Complaint ID, Mapped_Product) not found in CSV")
    logger.error("Missing columns: %s", missing_columns)
else:
    # Rows with a missing or blank narrative cannot be chunked (a NaN cell is a
    # float, not a string), so they are skipped instead of crashing the splitter.
    narratives = df['Cleaned_Narrative']
    has_text = narratives.notna() & narratives.astype(str).str.strip().ne('')
    skipped_rows = int((~has_text).sum())
    if skipped_rows:
        logger.warning("Skipping %d rows with missing or blank Cleaned_Narrative", skipped_rows)
    valid_rows = df.loc[has_text, required_columns]

    # Initialize RecursiveCharacterTextSplitter (sizes are measured in characters via len)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=50,
        length_function=len,
        add_start_index=True
    )

    # Chunk narratives and associate each chunk with its source complaint.
    # Column-wise iteration avoids building a Series per row (as iterrows does),
    # and .tolist() yields native Python scalars for the metadata values.
    all_chunks = []
    chunk_metadata = []
    row_values = zip(
        valid_rows['Cleaned_Narrative'].astype(str).tolist(),
        valid_rows['Complaint ID'].tolist(),
        valid_rows['Mapped_Product'].tolist(),
    )
    for narrative, complaint_id, mapped_product in row_values:
        for chunk in text_splitter.split_text(narrative):
            all_chunks.append(chunk)
            chunk_metadata.append({
                'complaint_id': complaint_id,
                'mapped_product': mapped_product
            })

    # Generate embeddings; skip the model call entirely when nothing was chunked
    # so `embeddings` is always defined and has zero rows in that case.
    if all_chunks:
        embeddings = model.encode(all_chunks, convert_to_numpy=True)
    else:
        embeddings = np.empty((0, 0), dtype=np.float32)

    if len(all_chunks) == 0 or embeddings.size == 0:
        logger.error("No chunks or embeddings generated")
    else:
        logger.info(f"Generated {len(all_chunks)} chunks and {embeddings.shape[0]} embeddings")
        print(f"First chunk: {all_chunks[0][:100]}...")
        print(f"Embedding shape: {embeddings.shape}")  # Should be (n_chunks, 384) for all-MiniLM-L6-v2

Batches:   0%|          | 0/24073 [00:00<?, ?it/s]

INFO:__main__:Generated 770308 chunks and 770308 embeddings


First chunk: apt transunion consumer solutions security freeze request dear sir madam social security resident su...
Embedding shape: (770308, 384)


In [4]:
# Persist the chunk embeddings, texts and metadata in a ChromaDB collection.
#
# Expects `all_chunks`, `chunk_metadata` and `embeddings` from the chunking cell
# to be index-aligned (entry k of each describes the same chunk).

# Initialize PersistentClient and create (or reopen) the collection
client = chromadb.PersistentClient(path=str(EMBEDDINGS_DIR))
collection = client.get_or_create_collection(name="complaint_embeddings", metadata={"hnsw:space": "cosine"})

# Batch size for each write; kept below the max batch size noted for this
# setup (5461). Adjust if needed, but stay under that limit.
batch_size = 5000

# The three inputs are sliced in parallel below, so a length mismatch would
# silently attach the wrong metadata/vector to a chunk. Fail early instead.
total_chunks = len(all_chunks)
if not (total_chunks == len(chunk_metadata) == len(embeddings)):
    raise ValueError(
        f"Misaligned inputs: {total_chunks} chunks, "
        f"{len(chunk_metadata)} metadata entries, {len(embeddings)} embeddings"
    )

total_batches = (total_chunks + batch_size - 1) // batch_size

# Write embeddings, chunks, and metadata to ChromaDB in batches
for batch_number, i in enumerate(range(0, total_chunks, batch_size), start=1):
    # End index (exclusive) of the current batch, clipped to the data length
    end_index = min(i + batch_size, total_chunks)

    # Slice the data for the current batch
    batch_embeddings = embeddings[i:end_index].tolist()
    batch_documents = all_chunks[i:end_index]
    batch_metadatas = chunk_metadata[i:end_index]
    # IDs are derived from the global chunk position, so they are stable across runs
    batch_ids = [f"chunk_{j}" for j in range(i, end_index)]

    logger.info(f"Adding batch {batch_number}/{total_batches} (from index {i} to {end_index-1})...")
    # upsert (rather than add) keeps this cell safe to re-run against the
    # persistent store: existing IDs are overwritten with the current data.
    # NOTE(review): with `add`, re-using existing IDs either errors or leaves the
    # old vectors in place depending on the Chroma version - confirm for the
    # pinned version. Chunks from an earlier, larger run are not removed here.
    collection.upsert(
        embeddings=batch_embeddings,
        documents=batch_documents,
        metadatas=batch_metadatas,
        ids=batch_ids
    )
    logger.info(f"Batch {batch_number} added. Current total vectors: {collection.count()}")


logger.info(f"Indexed {collection.count()} vectors in ChromaDB")
print(f"Total vectors in collection: {collection.count()}")

INFO:__main__:Adding batch 1/155 (from index 0 to 4999)...
INFO:__main__:Batch 1 added. Current total vectors: 5000
INFO:__main__:Adding batch 2/155 (from index 5000 to 9999)...
INFO:__main__:Batch 2 added. Current total vectors: 10000
INFO:__main__:Adding batch 3/155 (from index 10000 to 14999)...
INFO:__main__:Batch 3 added. Current total vectors: 15000
INFO:__main__:Adding batch 4/155 (from index 15000 to 19999)...
INFO:__main__:Batch 4 added. Current total vectors: 20000
INFO:__main__:Adding batch 5/155 (from index 20000 to 24999)...
INFO:__main__:Batch 5 added. Current total vectors: 25000
INFO:__main__:Adding batch 6/155 (from index 25000 to 29999)...
INFO:__main__:Batch 6 added. Current total vectors: 30000
INFO:__main__:Adding batch 7/155 (from index 30000 to 34999)...
INFO:__main__:Batch 7 added. Current total vectors: 35000
INFO:__main__:Adding batch 8/155 (from index 35000 to 39999)...
INFO:__main__:Batch 8 added. Current total vectors: 40000
INFO:__main__:Adding batch 9/155

Total vectors in collection: 770308


In [None]:
# Optional sanity check: embed a sample query with the same model used for the
# chunks and list its nearest neighbours from the collection.
query_text = "issue on "
# encode() is given a one-element list, so take row 0 as the query vector
query_embedding = model.encode([query_text], convert_to_numpy=True)[0]

results = collection.query(query_embeddings=[query_embedding.tolist()], n_results=5)

# Results are nested per query; only one query was sent, so read index 0.
top_documents = results['documents'][0]
top_metadatas = results['metadatas'][0]

print("\nTop 5 similar chunks:")
for i, (doc, metadata) in enumerate(zip(top_documents, top_metadatas)):
    complaint_id = metadata['complaint_id']
    mapped_product = metadata['mapped_product']
    print(f"Rank {i+1}: Complaint ID = {complaint_id}, Product = {mapped_product}, Chunk = {doc[:100]}...")