In [2]:
import pandas as pd

# Load cleaned complaints CSV with at least columns: 'Complaint ID', 'Consumer complaint narrative', 'Product'
df = pd.read_csv("../data/filtered_complaints.csv")

# Optional: inspect first few rows
print(df.head())


  Date received                      Product  \
0    2025-06-13                  Credit card   
1    2025-06-13  Checking or savings account   
2    2025-06-12                  Credit card   
3    2025-06-12                  Credit card   
4    2025-06-09                  Credit card   

                                  Sub-product  \
0                           Store credit card   
1                            Checking account   
2  General-purpose credit card or charge card   
3  General-purpose credit card or charge card   
4  General-purpose credit card or charge card   

                                             Issue  \
0                            Getting a credit card   
1                              Managing an account   
2               Other features, terms, or problems   
3             Incorrect information on your report   
4  Problem with a purchase shown on your statement   

                                           Sub-issue  \
0        Card opened without my con

Step 2: Define a chunking function

In [3]:
def chunk_text(text, doc_id, product, chunk_size=100, chunk_overlap=20):
    words = text.split()
    chunks = []
    start = 0
    idx = 0
    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunk_words = words[start:end]
        chunk_text = " ".join(chunk_words)
        chunks.append({
            "doc_id": doc_id,
            "chunk_index": idx,
            "chunk_text": chunk_text,
            "product": product
        })
        start += chunk_size - chunk_overlap
        idx += 1
    return chunks


Step 3: Iterate over dataframe rows and create text_chunks


In [4]:
text_chunks = []

for _, row in df.iterrows():
    doc_id = row['Complaint ID']  # or whatever your ID column is named
    product = row['Product']
    narrative = row['Consumer complaint narrative']
    
    # Some narratives might be NaN or empty, skip those
    if pd.isna(narrative) or len(str(narrative).strip()) == 0:
        continue
    
    chunks = chunk_text(str(narrative), doc_id, product)
    text_chunks.extend(chunks)

print(f"Total chunks created: {len(text_chunks)}")


Total chunks created: 956535


In [1]:
# import os
# from dotenv import load_dotenv

# load_dotenv()  # loads variables from .env into environment

# API_TOKEN = os.getenv("HF_API_TOKEN")
# print(API_TOKEN)

In [6]:
import sys
import os

# ✅ Add the project root to the Python path
sys.path.append(os.path.abspath(".."))

# ✅ Import your embedding and vectorstore tools
from app.embeddings.embeddings import embed_text_chunks
from app.embeddings.vectorstore import build_faiss_index, save_faiss_index

# ✅ Assuming `text_chunks` is already defined from your chunking process
# Example format:
# text_chunks = [
#     {"doc_id": "123", "chunk_index": 0, "chunk_text": "some cleaned text", "product": "Credit card"},
#     ...
# ]

# 🔁 Generate embeddings
embedded_chunks = embed_text_chunks(text_chunks)

# 🧠 Build FAISS index and metadata
index, metadata = build_faiss_index(embedded_chunks)

# 💾 Save index and metadata to base path (FAISS will handle .bin and .pkl automatically)
save_faiss_index(index, metadata, "../data/embeddings/faiss_index")


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 29892/29892 [7:32:08<00:00,  1.10it/s]       


In [None]:
# import sys 
# import os

# # Add the project root to the Python path
# sys.path.append(os.path.abspath(".."))

# # Now your imports will work:
# from app.embeddings.embeddings import embed_text_chunks
# from app.embeddings.vectorstore import build_faiss_index, save_faiss_index

# # Suppose you already have text_chunks (from chunking step)
# # Example:
# # text_chunks = [{"doc_id": "123", "chunk_index": 0, "chunk_text": "some cleaned text", "product": "Credit card"}]

# embedded_chunks = embed_text_chunks(text_chunks)

# index, metadata = build_faiss_index(embedded_chunks)

# save_faiss_index(index, metadata, "../data/embeddings/faiss_index.bin", "../data/embeddings/faiss_metadata.pkl")