In [None]:
# Install third-party packages into the kernel's environment. langchain,
# sentence-transformers and chromadb are imported in the next cell.
# NOTE(review): versions are unpinned, nltk is not imported in the visible cells,
# and pandas (imported below) is not installed here - presumably preinstalled; confirm.
%pip install  langchain sentence-transformers chromadb nltk

In [5]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
import os
from uuid import uuid4

In [6]:
# Read the preprocessed complaints CSV into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='../data/processed/preprocessed_complaints.csv',
)


In [7]:
# Print a heading, then leave `df.columns` as the cell's last expression so the
# notebook displays the column Index as the cell output.
print("\nColumns in the DataFrame:")
df.columns


Columns in the DataFrame:


Index(['Complaint ID', 'Product', 'Sub-product', 'Cleaned_Narrative'], dtype='object')

In [8]:
# Tunable settings for chunking, embedding, storage and sampling.
CHUNK_SIZE = 500  # Target chunk size in characters
CHUNK_OVERLAP = 50  # Characters shared by neighbouring chunks to maintain context
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
VECTOR_STORE_PATH = "./vector_store"
COLLECTION_NAME = "cfpb_complaints"
SAMPLE_SIZE = 100  # Rows to index; kept small to save memory
RANDOM_SEED = 42  # Fixed seed so the same rows are sampled on every run

# Initialize the text splitter. Chunk length is measured with len(), i.e. in
# characters; the separators are listed from paragraph breaks down to the
# empty string.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""],
)

# Initialize the embedding model
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Initialize ChromaDB client and create/get collection
client = chromadb.PersistentClient(path=VECTOR_STORE_PATH)
collection = client.get_or_create_collection(name=COLLECTION_NAME)

# SAMPLE ONLY A SUBSET OF ROWS TO SAVE MEMORY.
# The size is capped at len(df): DataFrame.sample raises ValueError when asked
# for more rows than exist (sampling without replacement).
sample_df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)

# Function to chunk text and generate metadata
def process_narrative(row):
    """Split one complaint narrative into chunks and embed every chunk.

    Args:
        row: A mapping-like record (for example a row yielded by
            ``DataFrame.iterrows``) providing the keys 'Cleaned_Narrative',
            'Complaint ID', 'Product' and 'Sub-product'.

    Returns:
        A tuple ``(documents, embeddings, metadatas, ids)`` of four parallel
        lists with one entry per chunk:
        - documents: the chunk texts produced by ``text_splitter``;
        - embeddings: one list of floats per chunk from ``embedding_model``;
        - metadatas: dicts with 'Complaint ID', 'Product', 'Sub-product'
          (empty string when null) and 'Chunk Index';
        - ids: strings of the form ``"<complaint id>_<chunk index>"``.
        All four lists are empty when the narrative is not a string (e.g. the
        NaN pandas uses for a missing cell) or when it yields no chunks.
    """
    narrative = row['Cleaned_Narrative']
    complaint_id = row['Complaint ID']
    product = row['Product']
    sub_product = row['Sub-product'] if pd.notnull(row['Sub-product']) else ""

    # A missing narrative arrives as a float NaN, which the splitter cannot
    # process; skip such rows instead of crashing the whole indexing run.
    if not isinstance(narrative, str):
        return [], [], [], []

    # Split the narrative into chunks
    documents = list(text_splitter.split_text(narrative))
    if not documents:
        return [], [], [], []

    # Embed all chunks of this narrative in one batched call rather than one
    # model call per chunk. NOTE(review): batched and per-chunk encoding may
    # differ in the last floating-point digits - confirm that is acceptable.
    embeddings = embedding_model.encode(documents).tolist()

    chunk_indices = range(len(documents))

    # Unique ID per chunk: complaint ID plus the chunk's position
    ids = [f"{complaint_id}_{i}" for i in chunk_indices]

    # Metadata stored alongside each chunk in the vector store
    metadatas = [
        {
            "Complaint ID": complaint_id,
            "Product": product,
            "Sub-product": sub_product,
            "Chunk Index": i,
        }
        for i in chunk_indices
    ]

    return documents, embeddings, metadatas, ids

# Process only the sample: collect the chunks, embeddings, metadata and IDs
# returned by process_narrative for every sampled row into four parallel lists.
all_documents = []
all_embeddings = []
all_metadatas = []
all_ids = []

for _, row in sample_df.iterrows():
    docs, embs, metas, ids = process_narrative(row)
    all_documents.extend(docs)
    all_embeddings.extend(embs)
    all_metadatas.extend(metas)
    all_ids.extend(ids)

# Write to the ChromaDB collection in bounded batches.
# - upsert (instead of add) overwrites records whose IDs already exist, so
#   re-running this cell against the persistent collection refreshes them
#   rather than depending on how add treats duplicate IDs.
# - Slicing keeps each request bounded if the sample size is raised, and the
#   loop makes no call at all when no chunks were produced.
UPSERT_BATCH_SIZE = 1000
for start in range(0, len(all_ids), UPSERT_BATCH_SIZE):
    stop = start + UPSERT_BATCH_SIZE
    collection.upsert(
        documents=all_documents[start:stop],
        embeddings=all_embeddings[start:stop],
        metadatas=all_metadatas[start:stop],
        ids=all_ids[start:stop],
    )
print(f"Indexed {len(all_documents)} chunks into ChromaDB vector store at './vector_store'.")

Indexed 200 chunks into ChromaDB vector store at './vector_store'.
