In [5]:
import re

import numpy as np
import pandas as pd
from chromadb import Client, PersistentClient
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer


In [6]:

# Step 1: Load the Cleaned Dataset
# The path is relative to the notebook's working directory.
complaints_csv_path = '../data/filtered_complaints.csv'
df_filtered = pd.read_csv(complaints_csv_path)


In [7]:

# Step 2: Text Cleaning Function
def clean_text(text):
    """Normalise a complaint narrative for chunking and embedding.

    The text is lower-cased, every run of non-word characters (punctuation,
    symbols and whitespace) is collapsed into a single space, and leading and
    trailing spaces are removed. Underscores count as word characters and are
    kept.

    Args:
        text: The raw narrative. Anything that is not a ``str`` (for example
            the ``NaN`` that ``read_csv`` yields for a blank cell, or
            ``None``) is treated as a missing narrative.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing narratives would otherwise crash on ``.lower()``.
        return ''
    # ``\W`` already matches whitespace, so one pass over runs of ``\W`` gives
    # the same result as replacing each ``\W`` and then squeezing the spaces.
    return re.sub(r'\W+', ' ', text.lower()).strip()


In [8]:

# Step 3: Create the Cleaned Narrative Column
# Run clean_text over every raw narrative and store the result next to the
# original text so both stay available.
raw_narratives = df_filtered['Consumer complaint narrative']
df_filtered['cleaned_narrative'] = raw_narratives.apply(clean_text)


In [None]:

# Step 4: Text Chunking
chunk_size = 512    # Number of characters per chunk
chunk_overlap = 20  # Overlap between chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)


def chunk_narratives(narrative):
    """Split one cleaned narrative into chunks with the shared ``text_splitter``.

    Args:
        narrative: The cleaned narrative text to split.

    Returns:
        Whatever ``text_splitter.split_text`` returns for ``narrative``; the
        flattening step iterates over it to get the individual chunks.
    """
    pieces = text_splitter.split_text(narrative)
    return pieces


# Chunk every cleaned narrative; each row of the new column holds the chunks
# produced for that row's narrative.
cleaned_narratives = df_filtered['cleaned_narrative']
df_filtered['chunks'] = cleaned_narratives.apply(chunk_narratives)


In [10]:

# Step 5: Flatten DataFrame for Embedding
# Build one record per chunk, carrying the complaint ID and product of the
# narrative it came from. Iterating the three columns directly avoids the
# per-row Series construction of ``iterrows``. Narratives that produced no
# chunks contribute no records.
chunks = [
    {'id': complaint_id, 'product': product, 'chunk': chunk}
    for complaint_id, product, narrative_chunks in zip(
        df_filtered['Complaint ID'],
        df_filtered['Product'],
        df_filtered['chunks'],
    )
    for chunk in narrative_chunks
]

# Naming the columns keeps 'id', 'product' and 'chunk' present even when no
# chunks were produced, so later cells can still select them.
df_chunks = pd.DataFrame(chunks, columns=['id', 'product', 'chunk'])


In [11]:

# Step 6: Choose an Embedding Model
# The model is loaded once here and reused by the embedding step.
embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)


In [None]:

# Step 7: Generate Embeddings in Batches
batch_size = 16  # Adjust based on your memory capacity

# Pull the chunk texts out once, then encode them slice by slice so only one
# batch is passed to the model at a time.
chunk_texts = df_chunks['chunk'].tolist()
embeddings = []
for start in range(0, len(chunk_texts), batch_size):
    batch_vectors = model.encode(chunk_texts[start:start + batch_size])
    # Store plain Python lists (one per chunk) rather than array rows.
    embeddings.extend(batch_vectors.tolist())

df_chunks['embedding'] = embeddings


In [13]:

# Step 8: Create or Access a Vector Store Using ChromaDB
client = Client()

# get_or_create_collection returns the existing collection when this cell is
# re-run in the same kernel; create_collection would raise there because the
# collection name is already taken.
vector_store = client.get_or_create_collection("complaints_embedding")


In [14]:


# Load every valid chunk into the vector store.
#
# A complaint usually yields several chunks, and Chroma requires record IDs to
# be unique: adding a record whose ID already exists does not store it. Each
# record ID is therefore "<complaint id>_<chunk index>", where the index is
# the chunk's position among that complaint's rows in df_chunks. The bare
# complaint ID stays available in the metadata for filtering.
ADD_BATCH_SIZE = 1000  # records sent per vector_store.add call

chunks_seen = {}  # complaint ID -> number of its chunks met so far
records = []
for complaint_id, product, chunk, embedding in zip(
    df_chunks['id'],
    df_chunks['product'],
    df_chunks['chunk'],
    df_chunks['embedding'],
):
    complaint_key = str(complaint_id)
    # Count before validating so the index reflects the chunk's position in
    # the narrative even when an earlier chunk was skipped.
    chunk_index = chunks_seen.get(complaint_key, 0)
    chunks_seen[complaint_key] = chunk_index + 1

    # Validate data: a non-empty list of floats and a string document.
    if (
        not isinstance(embedding, list)
        or not embedding
        or not all(isinstance(x, float) for x in embedding)
    ):
        print(f"Skipping invalid embedding for ID {complaint_id}: {embedding}")
        continue
    if not isinstance(chunk, str):
        print(f"Skipping invalid chunk for ID {complaint_id}: {chunk}")
        continue

    records.append({
        'id': f"{complaint_key}_{chunk_index}",
        'embedding': embedding,
        'document': chunk,
        'metadata': {
            'id': complaint_key,
            'product': product,
            'chunk_index': chunk_index,
        },
    })

# Add in batches instead of one call per row. A failing batch is reported and
# skipped so the remaining batches are still loaded.
for start in range(0, len(records), ADD_BATCH_SIZE):
    batch = records[start:start + ADD_BATCH_SIZE]
    try:
        vector_store.add(
            ids=[record['id'] for record in batch],
            # Embeddings are passed as plain lists of floats.
            embeddings=[record['embedding'] for record in batch],
            documents=[record['document'] for record in batch],
            metadatas=[record['metadata'] for record in batch],
        )
    except Exception as e:
        print(f"Error adding records {start}-{start + len(batch) - 1}: {e}")
        continue

: 

In [None]:

# Step 9: Persist the Vector Store
# Chroma collections have no persist() method; whether data reaches disk is
# decided by the client that owns the collection. The records held by
# `vector_store` are therefore copied, page by page, into a collection of the
# same name owned by a PersistentClient, which stores its data under
# PERSIST_DIR.
PERSIST_DIR = "vector_store/"
COPY_BATCH_SIZE = 1000  # records copied per get/upsert round trip

persistent_client = PersistentClient(path=PERSIST_DIR)
persisted_store = persistent_client.get_or_create_collection(vector_store.name)

for offset in range(0, vector_store.count(), COPY_BATCH_SIZE):
    page = vector_store.get(
        limit=COPY_BATCH_SIZE,
        offset=offset,
        include=["embeddings", "documents", "metadatas"],
    )
    # upsert keeps the copy idempotent when this cell is run more than once.
    persisted_store.upsert(
        ids=page["ids"],
        embeddings=page["embeddings"],
        documents=page["documents"],
        metadatas=page["metadatas"],
    )

print(f"Persisted {persisted_store.count()} records to {PERSIST_DIR}")