# Text Chunking, Embedding, and Vector Store Indexing

In [1]:
import pandas as pd

# Load the filtered complaints, parsing only the columns the chunking step uses.
# usecols keeps pandas from reading columns that would be discarded immediately.
REQUIRED_COLUMNS = ['Complaint ID', 'Mapped Product', 'cleaned_narrative']
df = pd.read_csv("../data/processed/filtered_complaints.csv", usecols=REQUIRED_COLUMNS)
# Re-select to pin the column order, then drop rows missing any of the three fields.
df = df[REQUIRED_COLUMNS].dropna()


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings handed to RecursiveCharacterTextSplitter below.
chunk_size = 300
chunk_overlap = 50

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

# One record per chunk, each carrying the complaint id and product of the narrative
# it came from. Iterating the three columns directly avoids building a Series for
# every row, which is what iterrows() does.
chunks = [
    {
        'chunk': chunk,
        'complaint_id': complaint_id,
        'product': product
    }
    for complaint_id, product, text in zip(
        df['Complaint ID'], df['Mapped Product'], df['cleaned_narrative']
    )
    for chunk in text_splitter.split_text(text)
]

# Convert to DataFrame; naming the columns keeps the schema stable even when
# no chunks were produced (an empty list would otherwise yield a column-less frame).
chunk_df = pd.DataFrame(chunks, columns=['chunk', 'complaint_id', 'product'])

In [3]:
# Preview the first five chunk records as the cell's output.
chunk_df.head(n=5)

Unnamed: 0,chunk,complaint_id,product
0,a xxxx xxxx card was opened under my name by a...,14069121,Credit card
1,and immediately closed the card however they h...,14069121,Credit card
2,i made the mistake of using my wellsfargo debi...,14061897,Savings account
3,i went into the branch and was told they could...,14061897,Savings account
4,i waited a few days and got a letter stating m...,14061897,Savings account


In [4]:
# Write the chunk table to disk without the DataFrame index column.
chunks_csv_path = '../data/processed/text_chunks.csv'
chunk_df.to_csv(chunks_csv_path, index=False)


# Load Model and Encode Chunks

In [8]:
import os
# Export the flag as a process-wide environment variable.
# NOTE(review): presumably this has to be in place before the libraries in the next
# cell are imported — confirm that "USE_TF_KERAS" is the name they actually read
# (TensorFlow documents TF_USE_LEGACY_KERAS for selecting tf_keras).
os.environ.update({"USE_TF_KERAS": "1"})


In [6]:
import tf_keras as keras
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd

# Reload the chunk table from disk.
# keep_default_na=False stops pandas from converting chunk text such as "null" or
# "n/a" into NaN, which would put float values into the list passed to the encoder;
# dtype pins the chunk column to text.
chunks_df = pd.read_csv(
    "../data/processed/text_chunks.csv",
    dtype={'chunk': str},
    keep_default_na=False,
)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encoding the full chunk list is the expensive step of this notebook.
print("🔄 Generating embeddings...")
embeddings = model.encode(chunks_df['chunk'].tolist(), show_progress_bar=True)

# FAISS requires float32 input. ascontiguousarray converts/copies only when the
# encoder output is not already a C-contiguous float32 array, instead of always
# duplicating the whole matrix twice (np.array(...) followed by .astype(...)).
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  19%|#8        | 21.0M/112M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:  26%|##5       | 31.5M/122M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔄 Generating embeddings...


Batches:   0%|          | 0/76859 [00:00<?, ?it/s]

KeyboardInterrupt: 