In [1]:
!pip install langchain langchain-community faiss-cpu sentence-transformers

import sys
from pathlib import Path
import pandas as pd

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-se

In [2]:
# Switch the working directory to the project folder before the import below.
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# no mount call is visible in this notebook; confirm.
# NOTE(review): presumably this is what makes the local `scripts` package
# importable and what the relative paths used later resolve against — confirm.
# NOTE(review): hard-coded absolute Colab path; must be edited to run elsewhere.
%cd "/content/drive/MyDrive/Tenx program/week-6"

from scripts.chunk_embed_index import run_chunk_embed_index

/content/drive/MyDrive/Tenx program/week-6


In [8]:
# ---- Notebook-wide settings (paths are relative to the working directory) ----

# Embedding model identifier handed to both the index build and the reload.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Splitter settings shared by the demo splitter and the full pipeline run.
CHUNK_SIZE, CHUNK_OVERLAP = 500, 100

# Input table of complaints. NOTE: despite the name, ZIP points at a CSV file.
ZIP = Path("data/filtered_complaints.csv")

# Folder the vector store is written to and later loaded from.
OUTPUT_DIR = Path("vector_store")


In [4]:
# Read only the first five rows of the input CSV for a quick look at its columns;
# the full file is processed later by the pipeline call, not through `df`.
df = pd.read_csv(filepath_or_buffer=ZIP, nrows=5)
df.head(n=5)

Unnamed: 0,Product,Consumer complaint narrative,clean_narrative
0,Credit card,A XXXX XXXX card was opened under my name by a...,a xxxx xxxx card was opened under my name by a...
1,Checking or savings account,I made the mistake of using my wellsfargo debi...,i made the mistake of using my wellsfargo debi...
2,Credit card,"Dear CFPB, I have a secured credit card with c...","dear cfpb, i have a secured credit card with c..."
3,Credit card,I have a Citi rewards cards. The credit balanc...,i have a citi rewards cards. the credit balanc...
4,Credit card,b'I am writing to dispute the following charge...,b i am writing to dispute the following charge...


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Demo splitter configured with the same size/overlap constants that are
# passed to the full pipeline run further down.
splitter = RecursiveCharacterTextSplitter(
    keep_separator=True,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Split just the first previewed narrative to eyeball what chunks look like.
sample_text = df.at[0, "clean_narrative"]
chunks = splitter.split_text(sample_text)

print("Example chunks:")
for i, c in enumerate(chunks):
    # One header line per chunk, then its text, then a blank separator line.
    print(f"--- chunk {i} ---\n{c}\n")


Example chunks:
--- chunk 0 ---
a xxxx xxxx card was opened under my name by a fraudster. i received a notice from xxxx that an account was just opened under my name. i reached out to xxxx xxxx to state that this activity was unauthorized and not me. xxxx xxxx confirmed this was fraudulent and immediately closed the card. however, they have failed to remove this from the three credit agencies and this fraud is now impacting my credit score based on a hard credit pull done by xxxx xxxx that was done by a fraudster.



In [6]:
# Run the project's chunk -> embed -> index pipeline over the whole input CSV,
# using the notebook-level settings. Per the recorded output this chunks every
# complaint and saves a vector store under OUTPUT_DIR.
run_chunk_embed_index(
    # what to embed with, and how to split
    embedding_model=EMBED_MODEL,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    # where to read from and write to
    input_csv=ZIP,
    output_dir=OUTPUT_DIR,
)


→ Created 260362 chunks from 97759 complaints.


  embedder = HuggingFaceEmbeddings(model_name=embedding_model)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Vector store saved to vector_store


In [9]:
# Reload the persisted vector store and report how many vectors it holds.
#
# Imports come from `langchain_community` (installed in the first cell): the
# `langchain.vectorstores` / `langchain.embeddings` paths are deprecated
# re-export shims for these same classes.
# NOTE(review): `HuggingFaceEmbeddings` is itself superseded by the separate
# `langchain-huggingface` package, which this notebook does not install.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# The embedder is built from the same EMBED_MODEL that was passed to the
# index build, so query vectors are comparable with the stored ones.
#
# SECURITY: `allow_dangerous_deserialization=True` lets FAISS unpickle the
# stored docstore. Pickle can execute arbitrary code, so only load an index
# directory that this notebook (or another trusted source) produced.
vs = FAISS.load_local(
    str(OUTPUT_DIR),
    HuggingFaceEmbeddings(model_name=EMBED_MODEL),
    allow_dangerous_deserialization=True,
)
print(f"Loaded FAISS index with {vs.index.ntotal} vectors")

Loaded FAISS index with 260362 vectors
