In [1]:
# CELL 1: Imports
import pandas as pd
import os
import shutil
import sys

# Ensure we can import from src if needed: the absolute path of the parent of
# the current working directory is appended to the module search path.
# NOTE(review): re-running this cell appends the same entry again (redundant).
sys.path.append(os.path.abspath('..'))

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

# Paths (relative, so they resolve against the kernel's working directory)
# CSV read by the data-loading cell.
INPUT_CSV = "../data/processed/filtered_complaints.csv"
# Directory used as Chroma's persist_directory; the indexing cell removes it
# (if present) before rebuilding the store.
VECTOR_STORE_DIR = "../vector_store"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# CELL 2: Load Data and Stratified Sampling
# Reads the processed CSV and, when it has more rows than SAMPLE_SIZE, draws a
# per-category sample using the same fraction for every product category.
print(f"Loading processed data from: {INPUT_CSV}")

if os.path.exists(INPUT_CSV):
    df = pd.read_csv(INPUT_CSV)
    print(f"Total rows loaded: {len(df)}")

    # Target sample size (instructions ask for 10,000 - 15,000 rows)
    SAMPLE_SIZE = 12000

    if len(df) > SAMPLE_SIZE:
        print(f"Sampling {SAMPLE_SIZE} rows stratified by product category...")

        # Same fraction in every category keeps the category proportions
        frac = SAMPLE_SIZE / len(df)
        by_category = df.groupby('product_category', group_keys=False)

        try:
            df_sample = by_category.sample(frac=frac, random_state=42)
        except AttributeError:
            # Older pandas: GroupBy has no .sample(), so sample each group by hand
            df_sample = by_category.apply(
                lambda group: group.sample(frac=frac, random_state=42)
            )
    else:
        # Not enough rows to sub-sample: keep everything
        print(f"Dataset is smaller than sample size ({len(df)} <= {SAMPLE_SIZE}). Using full dataset.")
        df_sample = df.copy()

    print(f"\nSampled dataset size: {len(df_sample)}")
    print("\nSample distribution per category:")
    print(df_sample['product_category'].value_counts())
else:
    print(f"❌ Error: Input file not found at {INPUT_CSV}")
    print("Please run '01_eda_preprocessing.ipynb' first to generate this file.")

Loading processed data from: ../data/processed/filtered_complaints.csv
Total rows loaded: 20000
Sampling 12000 rows stratified by product category...

Sampled dataset size: 11999

Sample distribution per category:
product_category
Credit Card        4653
Savings Account    4579
Money Transfer     2762
Personal Loan         5
Name: count, dtype: int64


In [3]:
# CELL 3: Create LangChain Documents
# Converts every sampled row into a Document whose page_content is the cleaned
# narrative and whose metadata carries the complaint id, product, issue and company.


def clean_field(value, default):
    """Return ``str(value)``, or ``default`` when the value is missing or blank.

    A plain ``str(...)`` turns NaN/None into the literal strings 'nan'/'None',
    which would defeat the empty-narrative check and put junk into metadata.

    Args:
        value: Raw cell value taken from a DataFrame row (may be None/NaN/NA).
        default: String returned when ``value`` is missing or whitespace-only.

    Returns:
        The unmodified string form of ``value`` (not stripped), or ``default``.
    """
    if value is None:
        return default
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    text = str(value)
    return text if text.strip() else default


documents = []

if 'df_sample' in locals():
    # source column -> (metadata key, default used when the value is missing)
    metadata_fields = {
        'Complaint ID': ('complaint_id', 'N/A'),
        'product_category': ('product', 'Unknown'),
        'Issue': ('issue', 'Unknown'),
        'Company': ('company', 'Unknown'),
    }

    # Surface absent columns instead of silently filling every row with defaults
    missing_columns = [
        col for col in [*metadata_fields, 'cleaned_narrative']
        if col not in df_sample.columns
    ]
    if missing_columns:
        print(f"⚠️ Warning: columns not found in df_sample, defaults will be used: {missing_columns}")

    # Plain dict records avoid building a Series per row as iterrows() does
    for row in df_sample.to_dict('records'):
        narrative = clean_field(row.get('cleaned_narrative'), '')

        # Skip empty (or NaN) narratives if any slipped through
        if not narrative:
            continue

        metadata = {
            key: clean_field(row.get(col), default)
            for col, (key, default) in metadata_fields.items()
        }

        documents.append(Document(page_content=narrative, metadata=metadata))

    print(f"Successfully converted {len(documents)} rows into LangChain Documents.")

    if documents:
        print("\n--- Example Document ---")
        print(f"Metadata: {documents[0].metadata}")
        print(f"Content (first 200 chars): {documents[0].page_content[:200]}...")
else:
    print("DataFrame not loaded. Run previous cell.")

Successfully converted 11999 rows into LangChain Documents.

--- Example Document ---
Metadata: {'complaint_id': 'N/A', 'product': 'Credit Card', 'issue': 'Unknown', 'company': 'Unknown'}
Content (first 200 chars): omplete or irrelevant to the purchase in question. bank of americas response to this dispute has been consistently dismissive, with the bank stating that since we received a sofa, the issue is resolve...


In [4]:
# CELL 4: Text Chunking Strategy
# Splits each Document into chunks of at most 500 characters with a 50-character
# overlap, trying paragraph, line, sentence and word boundaries in that order.
if not documents:
    print("No documents to chunk.")
else:
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = text_splitter.split_documents(documents)

    chunks_per_document = len(chunks) / len(documents)
    print("Splitting complete.")
    print(f"Total chunks created: {len(chunks)}")
    print(f"Average chunks per document: {chunks_per_document:.2f}")

    if chunks:
        first_chunk = chunks[0]
        print("\n--- Example Chunk ---")
        print(f"Content: {first_chunk.page_content}")
        print(f"Metadata: {first_chunk.metadata}")

Splitting complete.
Total chunks created: 11999
Average chunks per document: 1.00

--- Example Chunk ---
Content: omplete or irrelevant to the purchase in question. bank of americas response to this dispute has been consistently dismissive, with the bank stating that since we received a sofa, the issue is resolved. this response is not only inaccurate but also unacceptable, especially considering that i have been in direct contact with the manufacturer, who confirmed that the error lies with , not the manufacturer.
Metadata: {'complaint_id': 'N/A', 'product': 'Credit Card', 'issue': 'Unknown', 'company': 'Unknown'}


In [5]:
# CELL 5: Embedding Model Initialization
# Loads the sentence-transformers model used to embed the chunks. A failure is
# reported rather than raised, so `embedding_model` stays undefined in that case.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
print(f"Initializing Embedding Model ({embedding_model_name})...")

try:
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
    print("✅ Model loaded successfully.")
except Exception as load_error:
    print(f"❌ Error loading model: {load_error}")

Initializing Embedding Model (sentence-transformers/all-MiniLM-L6-v2)...
✅ Model loaded successfully.


In [6]:
# CELL 6: Vector Store Creation & Persistence
# Rebuilds the Chroma collection "complaints_rag" from scratch in VECTOR_STORE_DIR.
# Requires a non-empty `chunks` list and a loaded `embedding_model`.
if 'chunks' in locals() and 'embedding_model' in locals() and chunks:
    print(f"Indexing {len(chunks)} chunks into ChromaDB at '{VECTOR_STORE_DIR}'...")
    print("This may take a few minutes...")

    # Only index into a clean directory: writing into a leftover store would add
    # the chunks to the existing collection and duplicate every entry.
    store_is_clean = True
    if os.path.exists(VECTOR_STORE_DIR):
        try:
            shutil.rmtree(VECTOR_STORE_DIR)
            print(f"Removed old vector store at {VECTOR_STORE_DIR}")
        except Exception as e:
            store_is_clean = False
            print(f"❌ Error: Could not remove old directory: {e}")
            # NOTE(review): presumably a client from an earlier run of this cell
            # still holds the files open — confirm on the target platform.
            print("Skipping indexing to avoid duplicate entries. "
                  "Restart the kernel or delete the directory manually, then re-run this cell.")

    if store_is_clean:
        try:
            vector_db = Chroma.from_documents(
                documents=chunks,
                embedding=embedding_model,
                persist_directory=VECTOR_STORE_DIR,
                collection_name="complaints_rag"
            )
            print("Vector store successfully created and persisted!")
            print(f"Database located at: {os.path.abspath(VECTOR_STORE_DIR)}")
        except Exception as e:
            print(f"Error creating vector store: {e}")
else:
    print("Chunks or embedding model not ready.")

Indexing 11999 chunks into ChromaDB at '../vector_store'...
This may take a few minutes...
Removed old vector store at ../vector_store
Vector store successfully created and persisted!
Database located at: d:\10academy\phase5\rag-complaint-chatbot\vector_store


In [7]:
# CELL 7: Verification (Test Query)
# Smoke test: runs one similarity search against the freshly built store and
# prints the product, issue and text of the two closest chunks.
if 'vector_db' not in locals():
    print("Vector database not loaded.")
else:
    query = "They charged me a fee I didn't agree to."
    print(f"\nTest Query: {query}")

    try:
        results = vector_db.similarity_search(query, k=2)
        print(f"Retrieved {len(results)} relevant documents:")

        for rank, hit in enumerate(results, start=1):
            hit_metadata = hit.metadata
            print(f"\n--- Result {rank} ---")
            print(f"Product: {hit_metadata.get('product', 'N/A')}")
            print(f"Issue: {hit_metadata.get('issue', 'N/A')}")
            print(f"Content: {hit.page_content}")

    except Exception as query_error:
        print(f"Error querying database: {query_error}")


Test Query: They charged me a fee I didn't agree to.
Retrieved 2 relevant documents:

--- Result 1 ---
Product: Credit Card
Issue: Unknown
Content: se the charge, which is not a valid justification. this fee is both excessive and unjustified.

--- Result 2 ---
Product: Savings Account
Issue: Unknown
Content: and egregious penalties in the email they sent me. they chose not to do so. they could have highlighted this important subject of a very severe penalty in the body of the message they sent in their messaging center. they chose not to do so. now i could have clicked a link and scrolled through several pages of legalese to find this, but in no way is this clear and conspicuous, not did i have any reason to expect such an outrageous fee.
