In [1]:
import pandas as pd

df = pd.read_csv("../data/filtered_complaints.csv")

In [2]:
def chunk_text(text, max_words=100):
    words = text.split()
    return [' '.join(words[i:i+max_words]) for i in range(0, len(words), max_words)]

df["chunks"] = df["cleaned_narrative"].apply(lambda x: chunk_text(x, max_words=100))

In [3]:
chunks_df = df[["Product", "chunks"]].explode("chunks").rename(columns={"chunks": "text_chunk"}).reset_index(drop=True)


In [4]:
chunks_df.head()


Unnamed: 0,Product,text_chunk
0,Credit card,a xxxx xxxx card was opened under my name by a...
1,Credit card,dear cfpb i have a secured credit card with ci...
2,Credit card,within 14 days to fill out i called customer s...
3,Credit card,i have a citi rewards cards the credit balance...
4,Credit card,balance of my card from 840000 down to 540000 ...


In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

# Use sentence-transformers model
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


In [7]:
from tqdm import tqdm
tqdm.pandas()
chunks_df["embedding"] = chunks_df["text_chunk"].progress_apply(lambda x: embedding_model.embed_query(x))


100%|██████████| 199732/199732 [2:16:28<00:00, 24.39it/s]   


In [8]:
chunks_df.head()

Unnamed: 0,Product,text_chunk,embedding
0,Credit card,a xxxx xxxx card was opened under my name by a...,"[-0.025095298886299133, 0.0361095629632473, -0..."
1,Credit card,dear cfpb i have a secured credit card with ci...,"[-0.026905223727226257, 0.004757718648761511, ..."
2,Credit card,within 14 days to fill out i called customer s...,"[0.00014222634490579367, -0.001448684372007846..."
3,Credit card,i have a citi rewards cards the credit balance...,"[-0.0588565431535244, 0.016239767894148827, -0..."
4,Credit card,balance of my card from 840000 down to 540000 ...,"[-0.025715159252285957, 0.01586422324180603, 0..."


In [10]:
chunks_df.to_csv("../data/complaint_chunks.csv", index=False)


In [13]:
df = pd.read_csv("../data/complaint_chunks.csv")

In [19]:
import ast

df["embedding"].apply(type).value_counts()


embedding
<class 'list'>    199732
Name: count, dtype: int64

In [20]:
df["embedding"].apply(type).value_counts()


embedding
<class 'list'>    199732
Name: count, dtype: int64

In [21]:
df["embedding"] = df["embedding"].progress_apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

100%|██████████| 199732/199732 [00:00<00:00, 567058.65it/s]


In [24]:
from langchain.schema import Document

docs = []
for i, row in df.iterrows():
    metadata = {
        "source_index": i,
        "product": row["Product"]
    }
    docs.append(
        Document(page_content=row["text_chunk"], metadata=metadata)
    )

In [28]:
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [30]:
from langchain.vectorstores import FAISS
vector_store = FAISS.from_documents(docs, embedding_model)


In [31]:
vector_store.save_local("../vector_store")