# Chunking, Embedding and Indexing

## Load Modules

In [1]:
import sys
import os
import pandas as pd

# Add the parent directory of the working directory to the import path so the
# `src.*` imports below can resolve (presumably the project root — confirm).
sys.path.append(os.path.abspath(".."))

from src.sampler import ComplaintSampler
from src.vectorizer import VectorPipeline

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
# Locations used throughout the notebook, relative to the notebook's directory.
VECTOR_DB_DIR = "../vector_store"  # where the vector store is persisted later on
INPUT_FILE = "../data/processed/filtered_complaints.csv"  # complaints CSV to load

# Read the complaints into a DataFrame and report how many rows were loaded.
df = pd.read_csv(
    INPUT_FILE,
)
print(f"Loaded {len(df)} rows.")

Loaded 486622 rows.


## Stratified Sampling

In [3]:
# Draw a stratified sample of the loaded complaints. The two distributions
# printed below let the reader check that per-product shares match the full data.
# NOTE(review): the recorded run returned 9999 rows for n_samples=10000 —
# presumably per-group rounding inside the sampler; confirm in src/sampler.py.
sampler = ComplaintSampler(df)
sample_df = sampler.stratified_sample(n_samples=10000)

# value_counts(normalize=True) yields fractions in [0, 1]; the "(%)" in the
# headings is kept as-is, so read 0.405 as 40.5%.
print("\nOriginal Distribution (%):")
# Fixed: a stray trailing comma here turned the statement into a discarded tuple.
print(df['CrediTrust_Product'].value_counts(normalize=True))
print("\nSample Distribution (%):")
print(sample_df['CrediTrust_Product'].value_counts(normalize=True))

[Sampler] Creating stratified sample of 10000 complaints...
[Sampler] Sample shape: (9999, 9)

Original Distribution (%):
CrediTrust_Product
Credit Cards        0.405089
Savings Accounts    0.318937
Money Transfers     0.202827
Personal Loans      0.073147
Name: proportion, dtype: float64

Sample Distribution (%):
CrediTrust_Product
Credit Cards        0.405141
Savings Accounts    0.318932
Money Transfers     0.202820
Personal Loans      0.073107
Name: proportion, dtype: float64


  sample_df = self.df.groupby('CrediTrust_Product', group_keys=False).apply(


## Vector Store Building

In [4]:
# Chunking parameters forwarded to VectorPipeline; named so they are easy to tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

pipeline = VectorPipeline(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Build the vector store from the sampled complaints and persist it under
# VECTOR_DB_DIR. Per the recorded log, an existing store at that path is
# cleared before the new one is written.
vector_db = pipeline.create_vector_store(
    sample_df,
    persist_dir=VECTOR_DB_DIR,
)

[VectorPipeline] Loading embedding model (all-MiniLM-L6-v2)...
[VectorPipeline] Converting 9999 rows to Documents...
[VectorPipeline] Splitting text (Chunk Size: 500)...
[VectorPipeline] Generated 27486 chunks from 9999 original complaints.
[VectorPipeline] Clearing existing vector store at ../vector_store...
[VectorPipeline] Embedding chunks and saving to ChromaDB (this takes time)...
[VectorPipeline] Success! Vector store saved to ../vector_store


## Test Retrieval

In [5]:
# Smoke-test the freshly built store with one natural-language query.
TOP_K = 3  # number of nearest chunks to retrieve
query = "hidden fees on my credit card statement"

res = vector_db.similarity_search(query, k=TOP_K)

# Show the query, then each retrieved chunk with its metadata.
print(f"Query: {query}\n")
for i, doc in enumerate(res):
    print(f"--- Result {i+1} ---")
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}\n")

Query: hidden fees on my credit card statement

--- Result 1 ---
Content: a customer over 1200 he said it was up to me if i wanted to see my credit go down the tubes i stated that i have other charge cards that carry a balance and once paid in full i do not get hit with hidden fees any assistance you can give to get this reversed is greatly appreciated
Metadata: {'issue': 'Fees or interest', 'product': 'Credit Cards', 'state': 'FL', 'complaint_id': '2499336', 'date': '2017-06-01'}

--- Result 2 ---
Content: my card despite having already cleared more than the statement balance and having nearly available credit each time that i do this i am charged fees
Metadata: {'issue': 'Trouble using your card', 'complaint_id': '5855276', 'product': 'Credit Cards', 'date': '2022-08-08', 'state': 'FL'}

--- Result 3 ---
Content: is hiding data about their fees thereby making it hard for me to know just how much they are costing me these are subtle but very concerning changes to online payment user e