# Cell 1 — Imports

In [5]:
import pandas as pd
import numpy as np


# Cell 2 — Load data

In [6]:
# Location of the pre-filtered complaints export, relative to this notebook.
filtered_csv_path = "../data/processed/filtered_complaints.csv"
df = pd.read_csv(filtered_csv_path)

# Report (rows, columns) first, then preview the leading records.
print(df.shape)
df.head()


(2458054, 8)


Unnamed: 0,Complaint ID,Product,Issue,Sub-issue,cleaned_narrative,Date received,Company,State
0,12351447,Credit reporting,Problem with fraud alerts or security freezes,,xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx apt xx...,2025-03-06,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",TX
1,12203565,Credit reporting,Incorrect information on your report,Information is missing that should be on the r...,subject dispute of unauthorized hard inquiries...,2025-02-26,Experian Information Solutions Inc.,FL
2,14080390,Credit reporting,Improper use of your report,Reporting company used your report improperly,after checking my report i found numerous acco...,2025-06-14,"Fidelity National Information Services, Inc. (...",FL
3,14069121,Credit card,Getting a credit card,Card opened without my consent or knowledge,a xxxx xxxx card was opened under my name by a...,2025-06-13,"CITIBANK, N.A.",TX
4,14061897,Savings account,Managing an account,Deposits and withdrawals,i made the mistake of using my wellsfargo debi...,2025-06-13,WELLS FARGO & COMPANY,ID


# Cell 3 — Sanity checks (important)

In [7]:
# Number of complaints per product; this distribution drives the stratified sample.
product_counts = df["Product"].value_counts()
product_counts


Product
Credit reporting    1930121
Credit card          197126
Savings account      155204
Money transfers       98685
Personal loan         76918
Name: count, dtype: int64

# Decide sample size

In [8]:
# Target number of complaints to keep across all products combined.
SAMPLE_SIZE = 12_000


In [9]:
# Perform stratified sampling (CORE CODE)
#
# Each product contributes rows in proportion to its share of the full
# dataset. The groups are iterated explicitly and the per-group draws are
# concatenated instead of calling groupby(...).apply(...): recent pandas
# releases deprecate apply() operating on the grouping column and emit a
# warning for it. Groups are visited in sorted key order and keep their
# original index, so the sampled rows match the apply-based version.
#
# NOTE: int() truncates every per-product quota, so the total can fall a
# few rows short of SAMPLE_SIZE.

df_sampled = pd.concat(
    [
        product_rows.sample(
            n=int(len(product_rows) / len(df) * SAMPLE_SIZE),
            random_state=42,  # fixed seed keeps the sample reproducible
        )
        for _, product_rows in df.groupby("Product")
    ]
)


  .apply(lambda x: x.sample(


In [10]:
# Validate the result: total row count, then the per-product breakdown.

sampled_row_count = len(df_sampled)
print("Sample size:", sampled_row_count)

sampled_product_counts = df_sampled["Product"].value_counts()
sampled_product_counts


Sample size: 11997


Product
Credit reporting    9422
Credit card          962
Savings account      757
Money transfers      481
Personal loan        375
Name: count, dtype: int64

In [11]:
# Save sampled dataset
# index=False keeps the DataFrame index out of the CSV.

sampled_csv_path = "../data/processed/sampled_complaints.csv"
df_sampled.to_csv(sampled_csv_path, index=False)


In [12]:
# Load sampled data
# NOTE: this rebinds `df`; from here on it refers to the sampled file,
# not the full filtered dataset loaded earlier.

df = pd.read_csv(filepath_or_buffer="../data/processed/sampled_complaints.csv")


In [13]:
# Initialize text splitter

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters, named so they can be tuned in one place.
CHUNK_SIZE = 500      # upper bound on the length of a single chunk
CHUNK_OVERLAP = 100   # length shared between consecutive chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)


In [14]:
# Create text chunks + metadata
#
# Every narrative is split into chunks; each chunk becomes one record that
# carries the complaint id and product it came from.
#
# The three needed columns are zipped instead of using df.iterrows(), which
# builds a full Series for every row and is far slower on large frames.

documents = []

for complaint_id, product, narrative in zip(
    df["Complaint ID"], df["Product"], df["cleaned_narrative"]
):
    # An empty narrative comes back from a CSV round trip as NaN (a float),
    # which split_text cannot handle; skip anything that is not text.
    if not isinstance(narrative, str):
        continue

    for chunk in text_splitter.split_text(narrative):
        documents.append({
            "text": chunk,
            "metadata": {
                "complaint_id": complaint_id,
                "product": product
            }
        })


In [15]:
# Sanity check: how many chunks were produced, and what one record looks like.

chunk_total = len(documents)
first_document = documents[0]
(chunk_total, first_document)


(31480,
 {'text': 'first phone call i called to confirm promo and what credit bureau is used since my credit is frozen they answered my question the promo he had said it is in an ongoing promo which means to me that it is running and something they always have to discover this means that it is something they sometimes have very misleading i call because one of the credit bureaus was unable to be un frozen i called they ran a different and approved prior to this again verified the promotion i did not have time to',
  'metadata': {'complaint_id': 3272302, 'product': 'Credit card'}})

In [16]:
# Load embedding model

from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers checkpoint used to embed the chunks.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)


In [17]:
# Generate embeddings
# One vector per chunk, in the same order as `documents`.

texts = [document["text"] for document in documents]

encode_options = {
    "convert_to_numpy": True,    # return a NumPy array
    "show_progress_bar": True,   # display batch progress while encoding
}
embeddings = model.encode(texts, **encode_options)


Batches:   0%|          | 0/984 [00:00<?, ?it/s]

In [18]:
# Verify embedding shape
 
# Expect (number of chunks, embedding dimension); the first value should
# equal len(documents).
embeddings.shape



(31480, 384)

In [22]:
# Define correct persistent path

from pathlib import Path
import chromadb
from chromadb.config import Settings

# Parent of the current working directory.
# NOTE(review): this depends on where the kernel was started; presumably
# that is the notebook's own folder. Confirm before relying on it.
BASE_DIR = Path.cwd().parent

# Folder that holds the Chroma vector store for the complaints collection.
VECTOR_DIR = BASE_DIR.joinpath("vector_store", "complaints_chroma")


In [20]:
# Create ONE Chroma client (persistent)
#
# chromadb.Client(Settings(persist_directory=...)) builds an in-memory
# client on Chroma 0.4+, so nothing is written under VECTOR_DIR and the
# index is lost when the kernel stops. PersistentClient stores the
# collection on disk at `path` and reloads it on the next run.

client = chromadb.PersistentClient(
    path=str(VECTOR_DIR),
    settings=Settings(anonymized_telemetry=False),
)


In [23]:
# Create / get collection
# Fetches the "complaints" collection, creating it if it does not exist yet.

collection = client.get_or_create_collection(name="complaints")


In [26]:
# Parallel lists for the vector store: position k in each list describes chunk k.
# Ids are the chunk's position in `documents`, rendered as a string.
ids = list(map(str, range(len(documents))))
texts = [entry["text"] for entry in documents]
metadatas = [entry["metadata"] for entry in documents]


In [27]:
# Add embeddings in batches
# The same slice is applied to all four parallel sequences so ids, texts,
# vectors and metadata stay aligned within every batch.

batch_size = 1000

for batch_start in range(0, len(ids), batch_size):
    window = slice(batch_start, batch_start + batch_size)
    collection.add(
        ids=ids[window],
        documents=texts[window],
        embeddings=embeddings[window].tolist(),  # NumPy rows -> plain lists
        metadatas=metadatas[window],
    )

print("All embeddings added successfully ✅")



All embeddings added successfully ✅


In [28]:
# Number of records now stored in the collection; should equal len(documents).
collection.count()


31480