In [None]:
# notebooks/vector_store.ipynb - Cell 1: Imports
import pandas as pd
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import hashlib
import json
import warnings
warnings.filterwarnings('ignore')

print("All packages imported successfully!")

In [None]:
# Cell 2: Load data
data_path = Path('../data/processed/filtered_complaints.csv')

# Fail fast: every later cell needs `df`, so a missing input file must stop
# the run here instead of surfacing later as a confusing NameError.
if not data_path.exists():
    raise FileNotFoundError(
        f"File not found: {data_path}. "
        "Run EDA notebook first to create filtered_complaints.csv"
    )

df = pd.read_csv(data_path)
print(f"Loaded {len(df)} complaints")
print("\nProduct distribution:")
print(df['Product'].value_counts())

# Kept as the last top-level expression so Jupyter renders the preview;
# nested inside an `else:` branch its value was silently discarded.
df.head()

In [None]:
# Cell 3: Create stratified sample
def create_sample(df, sample_size=15000, random_state=42):
    """Create a proportionally stratified sample of complaints by product.

    Each product gets a share of ``sample_size`` proportional to its share of
    ``df``. The share is rounded down, but is never less than one row and
    never more than the number of rows the product actually has. Because of
    this rounding the result can hold slightly fewer (or, with many tiny
    products, slightly more) rows than ``sample_size``.

    Args:
        df: DataFrame with a 'Product' column. Rows whose 'Product' is
            missing are not sampled, since ``groupby`` drops NaN keys by
            default.
        sample_size: Target total number of rows in the sample.
        random_state: Seed passed to ``DataFrame.sample`` for every product,
            so repeated calls return the same rows.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. When there is nothing to
        sample (e.g. ``df`` is empty) an empty frame with the columns of
        ``df`` is returned.
    """
    print(f"Creating stratified sample of {sample_size} complaints...")

    total_rows = len(df)
    samples = []
    for _, group in df.groupby('Product'):
        # Proportional allocation, floored, with a minimum of one row per product.
        n_samples = max(1, int(sample_size * len(group) / total_rows))
        samples.append(
            group.sample(n=min(n_samples, len(group)), random_state=random_state)
        )

    if samples:
        sampled_df = pd.concat(samples, ignore_index=True)
    else:
        # pd.concat raises on an empty list; return an empty frame instead.
        sampled_df = df.iloc[0:0].copy()

    print(f"Sampled {len(sampled_df)} complaints from {total_rows} total")
    print("\nSample distribution:")
    print(sampled_df['Product'].value_counts())
    return sampled_df

# Draw the working sample used by every later cell.
sampled_df = create_sample(df, sample_size=15000)

In [None]:
# Cell 4: Text chunking function
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping, fixed-size character chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Splitting is purely character-based and may cut through a word.

    Args:
        text: String to split. Empty or ``None`` input yields no chunks.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Characters shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings. Text no longer than ``chunk_size`` is returned
        as a single chunk.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0 or overlap >= chunk_size:
        # overlap >= chunk_size would never advance the window (infinite loop).
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    if not text:
        return []

    text_length = len(text)
    if text_length <= chunk_size:
        return [text]

    step = chunk_size - overlap
    chunks = []
    start = 0
    while True:
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop as soon as a chunk reaches the end of the text; anything a
        # further window could contain is already inside this chunk.
        if end >= text_length:
            break
        start += step

    return chunks

# Smoke-test the chunker on a synthetic, repetitive complaint text
test_text = 30 * "This is a test complaint about credit card issues. "
test_chunks = chunk_text(test_text)
first_chunk_preview = test_chunks[0][:100]
print(f"Test: {len(test_text)} characters -> {len(test_chunks)} chunks")
print(f"Chunk 1 preview: {first_chunk_preview}...")

In [None]:
# Cell 5: Create all chunks
print("Creating chunks for all complaints...")

# Three parallel lists: position k in each describes the same chunk.
all_chunks = []
all_metadatas = []
all_ids = []

# Complaints that actually produced chunks. Skipped narratives must not
# dilute the per-complaint average reported at the end of this cell.
complaints_with_chunks = 0

for idx, row in sampled_df.iterrows():
    # Fall back to the row index when the frame has no 'Complaint ID' column.
    complaint_id = str(row.get('Complaint ID', idx))
    narrative = str(row.get('cleaned_narrative', ''))

    # Skip narratives too short to be useful. str() turns a missing value
    # (NaN) into 'nan', which this length check rejects as well.
    if len(narrative.strip()) < 10:
        continue

    chunks = chunk_text(narrative)
    complaints_with_chunks += 1

    for chunk_idx, chunk in enumerate(chunks):
        chunk_metadata = {
            'complaint_id': complaint_id,
            'product': str(row.get('Product', 'Unknown')),
            'date_received': str(row.get('Date received', 'Unknown')),
            'chunk_index': chunk_idx,
            'total_chunks': len(chunks)
        }

        all_chunks.append(chunk)
        all_metadatas.append(chunk_metadata)
        all_ids.append(f"{complaint_id}_{chunk_idx}")

skipped_complaints = len(sampled_df) - complaints_with_chunks
print(f"\nCreated {len(all_chunks)} chunks from {complaints_with_chunks} complaints "
      f"({skipped_complaints} skipped: narrative shorter than 10 characters)")

if complaints_with_chunks:
    print(f"Average chunks per complaint: {len(all_chunks)/complaints_with_chunks:.1f}")
else:
    # Avoids ZeroDivisionError when the sample is empty or fully filtered out.
    print("Average chunks per complaint: n/a (no usable narratives)")

# Chunk IDs are derived from the complaint ID, so repeated complaint IDs in
# the sample produce colliding vector-store IDs. Surface that here.
duplicate_id_count = len(all_ids) - len(set(all_ids))
if duplicate_id_count:
    print(f"WARNING: {duplicate_id_count} duplicate chunk IDs detected; "
          "check 'Complaint ID' for repeated values")

In [None]:
# Cell 6: Initialize embedding model
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_dim = model.get_sentence_embedding_dimension()
print(f"Model loaded: {embedding_dim} dimensions")

# Sanity check: encode a single sentence and report the resulting array shape
test_embedding = model.encode(["Test sentence"])
print(f"Test embedding shape: {test_embedding.shape}")

In [None]:
# Cell 7: Create embeddings
print(f"Creating embeddings for {len(all_chunks)} chunks...")

# Encode in fixed-size slices to avoid memory issues
batch_size = 1000
total_batches = (len(all_chunks) + batch_size - 1) // batch_size  # ceiling division
all_embeddings = []

for batch_number, i in enumerate(range(0, len(all_chunks), batch_size), start=1):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    batch = all_chunks[i:i + batch_size]
    all_embeddings.append(model.encode(batch, show_progress_bar=False))
    print(f"Processed batch {batch_number}/{total_batches}")

# Stack the per-batch arrays into one array with one row per chunk
embeddings_array = np.vstack(all_embeddings)
print(f"\nEmbeddings shape: {embeddings_array.shape}")

In [None]:
# Cell 8: Initialize ChromaDB
print("Initializing ChromaDB...")
chroma_client = chromadb.PersistentClient(
    path="../vector_store",
    settings=Settings(anonymized_telemetry=False)
)

# Open the collection, creating it on first run. get_or_create_collection
# replaces the previous bare `except:` fallback, which would also have hidden
# unrelated failures (corrupt store, permissions, KeyboardInterrupt).
collection_name = "complaints"
collection = chroma_client.get_or_create_collection(
    name=collection_name,
    metadata={"hnsw:space": "cosine", "description": "CFPB Complaints"}
)

# The store is persistent, so an earlier run may already have populated it.
existing_documents = collection.count()
print(f"Collection ready: {collection_name} ({existing_documents} existing documents)")

In [None]:
# Cell 9: Add to vector store
print(f"Adding {len(all_chunks)} documents to vector store...")

# The lists and the embedding matrix are built in different cells; if only
# some of them were re-run, the slices below would pair the wrong rows.
assert len(all_chunks) == len(all_metadatas) == len(all_ids) == len(embeddings_array), (
    "chunks, metadatas, ids and embeddings are out of sync - re-run the chunking and embedding cells"
)

# Write in batches
batch_size = 1000
total_batches = (len(all_chunks) + batch_size - 1) // batch_size
for i in range(0, len(all_chunks), batch_size):
    end_idx = min(i + batch_size, len(all_chunks))

    # upsert rather than add: the collection is persisted on disk, so on a
    # re-run the same IDs already exist. upsert overwrites them, which keeps
    # this cell idempotent under Restart & Run All.
    collection.upsert(
        embeddings=embeddings_array[i:end_idx].tolist(),
        documents=all_chunks[i:end_idx],
        metadatas=all_metadatas[i:end_idx],
        ids=all_ids[i:end_idx]
    )

    print(f"Added batch {i//batch_size + 1}/{total_batches}")

print(f"\nTotal documents in collection: {collection.count()}")

In [None]:
# Cell 10: Save metadata
# Run summary stored next to the vector store.
metadata = {
    'total_complaints': len(sampled_df),
    'total_chunks': len(all_chunks),
    # Cast counts to plain int so json.dump never sees a numpy integer.
    'products_distribution': {
        product: int(count)
        for product, count in sampled_df['Product'].value_counts().items()
    },
    # NOTE(review): these two values are written by hand; keep them in sync
    # with the chunk_text defaults used when the chunks were created.
    'chunk_size': 500,
    'chunk_overlap': 50,
    'embedding_model': 'all-MiniLM-L6-v2',
    'vector_db': 'chromadb'
}

metadata_path = Path('../vector_store/metadata.json')
# Do not rely on an earlier cell having created the directory.
metadata_path.parent.mkdir(parents=True, exist_ok=True)
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print(f"Metadata saved to: {metadata_path}")
print("\n=== Summary ===")
print(f"Complaints processed: {len(sampled_df)}")
print(f"Total chunks created: {len(all_chunks)}")
print("Vector store saved to: ../vector_store/")

In [None]:
# Cell 11: Test query
test_queries = [
    "credit card payment issues",
    "loan application problems",
    "money transfer delays"
]

print("Testing vector store queries...\n")

for query in test_queries:
    print(f"Query: '{query}'")

    # Embed the query with the same model that embedded the documents.
    # Passing query_texts would use the collection's own embedding function
    # instead, which is not guaranteed to live in the same vector space.
    query_embedding = model.encode([query]).tolist()

    results = collection.query(
        query_embeddings=query_embedding,
        n_results=2
    )

    # One query was sent, so its hits are at index 0 of each result list.
    # The loop variable is named `hit_metadata` so it does not overwrite the
    # notebook-level `metadata` dict.
    hits = zip(results['documents'][0], results['metadatas'][0])
    for rank, (doc, hit_metadata) in enumerate(hits, start=1):
        print(f"  Result {rank}:")
        print(f"    Product: {hit_metadata.get('product', 'N/A')}")
        print(f"    Text: {doc[:100]}...")
    print()