In [1]:
import os, ast, faiss, pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

# --- Setup Paths ---
# Input CSVs are resolved relative to the working directory; the index and its
# metadata are written under SAVE_DIR.
# NOTE(review): the later loading cell reads from
# "../data/faiss_consumer_vector_store", so these artifacts presumably have to
# be copied there after this cell runs — confirm.
EMBEDDED_CSV = "embedded_chunks_partial.csv"
COMPLAINTS_CSV = "cleaned_complaints.csv"
SAVE_DIR = "/content/drive/MyDrive/faiss_consumer_vector_store"
os.makedirs(SAVE_DIR, exist_ok=True)

INDEX_PATH = os.path.join(SAVE_DIR, "faiss.index")
METADATA_PATH = os.path.join(SAVE_DIR, "metadata.pkl")

# --- Load product metadata (small, safe) ---
# Only the join key and the product label are loaded. The key is cast to str so
# it compares equal to the (also str-cast) key of the embedded chunks, and the
# table is de-duplicated on that key: a repeated complaint ID would otherwise
# fan out the left merge below and insert duplicate vectors into the index.
complaints_df = (
    pd.read_csv(COMPLAINTS_CSV, usecols=['Complaint ID', 'Product'])
    .rename(columns={'Complaint ID': 'original_narrative_id'})
    .astype({'original_narrative_id': str})
    .drop_duplicates(subset='original_narrative_id')
)

# --- Initialize FAISS index ---
DIM = 384  # all-MiniLM-L6-v2 output size
index = faiss.IndexFlatIP(DIM)  # Cosine similarity (after normalization)

# --- Chunked embedding ingestion ---
# The embedded CSV is streamed CHUNK_SIZE rows at a time so the parsed
# embedding lists of the whole file never have to be held at once.
CHUNK_SIZE = 50000
metadata_rows = []  # one dict per indexed vector, in insertion order

print("🚀 Building FAISS index in chunks...")

for chunk in pd.read_csv(EMBEDDED_CSV, chunksize=CHUNK_SIZE):
    # Embeddings are stored in the CSV as Python-literal strings; parse them back.
    chunk['embedding'] = chunk['embedding'].apply(ast.literal_eval)
    chunk['original_narrative_id'] = chunk['original_narrative_id'].astype(str)

    # Merge with complaint product. validate= makes pandas raise if the lookup
    # side ever has duplicate keys, so the row count of `merged` always equals
    # the row count of `chunk`.
    merged = pd.merge(
        chunk,
        complaints_df,
        on='original_narrative_id',
        how='left',
        validate='many_to_one',
    )
    merged['Product'] = merged['Product'].fillna('Unknown Product')

    # Build and normalize chunk matrix
    embeddings = np.vstack(merged['embedding'].values).astype('float32')
    if embeddings.shape[1] != DIM:
        raise ValueError(
            f"Embedding width {embeddings.shape[1]} does not match index dimension {DIM}"
        )
    # L2-normalizing in place makes the inner product of IndexFlatIP a cosine similarity.
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    # Save metadata (excluding embeddings). Rows are appended in the same order
    # the vectors were added, so list position == FAISS row id.
    metadata_rows.extend(merged.drop(columns=['embedding']).to_dict(orient='records'))

    print(f"✅ Processed {len(embeddings)} vectors. Total so far: {index.ntotal}")

# The search side looks metadata up by FAISS row id, so refuse to persist a
# store whose two halves have drifted apart.
if len(metadata_rows) != index.ntotal:
    raise RuntimeError(
        f"Metadata rows ({len(metadata_rows)}) and index vectors ({index.ntotal}) are out of sync"
    )

# --- Save index and metadata ---
faiss.write_index(index, INDEX_PATH)

with open(METADATA_PATH, 'wb') as f:
    pickle.dump(metadata_rows, f)

print(f"\n🎉 FAISS index saved to: {INDEX_PATH}")
print(f"🧠 Metadata saved to: {METADATA_PATH} ({len(metadata_rows)} records)")


🚀 Building FAISS index in chunks...
✅ Processed 50000 vectors. Total so far: 50000
✅ Processed 50000 vectors. Total so far: 100000
✅ Processed 50000 vectors. Total so far: 150000
✅ Processed 50000 vectors. Total so far: 200000
✅ Processed 50000 vectors. Total so far: 250000
✅ Processed 50000 vectors. Total so far: 300000
✅ Processed 50000 vectors. Total so far: 350000
✅ Processed 50000 vectors. Total so far: 400000
✅ Processed 50000 vectors. Total so far: 450000
✅ Processed 50000 vectors. Total so far: 500000
✅ Processed 50000 vectors. Total so far: 550000
✅ Processed 50000 vectors. Total so far: 600000
✅ Processed 50000 vectors. Total so far: 650000
✅ Processed 50000 vectors. Total so far: 700000
✅ Processed 50000 vectors. Total so far: 750000
✅ Processed 50000 vectors. Total so far: 800000
✅ Processed 50000 vectors. Total so far: 850000
✅ Processed 50000 vectors. Total so far: 900000
✅ Processed 50000 vectors. Total so far: 950000
✅ Processed 50000 vectors. Total so far: 1000000
✅ Pr

In [5]:
import pickle
import faiss
from sentence_transformers import SentenceTransformer

# Load FAISS + metadata
# The index holds the vectors; metadata.pkl holds one record per vector and is
# looked up by FAISS row id in the search cell.
index = faiss.read_index("../data/faiss_consumer_vector_store/faiss.index")
# NOTE: pickle.load can execute arbitrary code — only load a metadata file that
# was produced by this project's own index-building step.
with open("../data/faiss_consumer_vector_store/metadata.pkl", 'rb') as f:
    metadata = pickle.load(f)

# Fail fast if the two files do not belong together: a length mismatch would
# make row-id lookups return the wrong record or raise IndexError later on.
if len(metadata) != index.ntotal:
    raise ValueError(
        f"metadata.pkl has {len(metadata)} records but faiss.index has {index.ntotal} vectors"
    )


In [7]:
# Smoke-test the vector store with a single free-text query.
# The query is embedded and L2-normalized before searching, so the
# inner-product scores returned by the index are cosine similarities when the
# stored vectors were normalized as well.
model = SentenceTransformer("all-MiniLM-L6-v2")
query = "bank closed my account"
vec = model.encode([query]).astype('float32')
faiss.normalize_L2(vec)

# D holds the similarity scores, I the matching row ids (one row per query).
D, I = index.search(vec, k=5)

for rank, idx in enumerate(I[0]):
    # FAISS fills unused result slots with -1 when fewer than k vectors are
    # available; metadata[-1] would silently show the last record as a "hit".
    if idx < 0:
        continue
    row = metadata[idx]
    # An empty text cell comes back from CSV as NaN (a float), which cannot be sliced.
    text = row['chunk_text'] if isinstance(row['chunk_text'], str) else ''
    print(f"{rank+1}. Product: {row['Product']} | ID: {row['original_narrative_id']}")
    print(f"   Text: {text[:150]}...\n")


1. Product: Checking or savings account | ID: 12331637
   Text: Truist bank closed my account but still I have not gotten my money from my account...

2. Product: Checking or savings account | ID: 4841999

3. Product: Checking or savings account | ID: 6963821
   Text: bank closed my account and wont return my funds or even tell the reason they closed my account or how to get my funds back...

4. Product: Checking or savings account | ID: 12034340
   Text: Bank closed my bank account with no reason given. After 4 plus years of business. Called to find out they refused to tell me why. They closed account ...

5. Product: Money transfer, virtual currency, or money service | ID: 12024135
   Text: The bank account closed and I cant even see my transactions any more. I need help...

