In [1]:
import sys
import os
import shutil
import pandas as pd

# Add the project root to system path so we can import from src if needed
# os.path.abspath('..') gets the parent directory of 'notebooks', which is the project root
sys.path.append(os.path.abspath('..'))

# Import LangChain components
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

# --- Configuration ---

# Paths (Relative to the 'notebooks' directory)
# Input: The cleaned CSV created in Task 1
INPUT_CSV = "../data/processed/filtered_complaints.csv"

# Output: Where the ChromaDB database will be saved
VECTOR_STORE_DIR = "../vector_store"

print("Libraries loaded and paths configured.")

  from .autonotebook import tqdm as notebook_tqdm


Libraries loaded and paths configured.


In [2]:
print(f"Loading processed data from: {INPUT_CSV}")

if not os.path.exists(INPUT_CSV):
    print(f"❌ Error: Input file not found at {INPUT_CSV}")
    print("Please run '01_eda_preprocessing.ipynb' first to generate this file.")
else:
    df = pd.read_csv(INPUT_CSV)
    print(f"Total rows loaded: {len(df)}")

    # Define Sample Size (Target: 10,000 - 15,000 as per instructions)
    SAMPLE_SIZE = 12000

    # Check if we have enough data to sample
    if len(df) <= SAMPLE_SIZE:
        print(f"Dataset is smaller than sample size ({len(df)} <= {SAMPLE_SIZE}). Using full dataset.")
        df_sample = df.copy()
    else:
        print(f"Sampling {SAMPLE_SIZE} rows stratified by product category...")
        # Stratified Sampling: Maintains the percentage of each product category
        df_sample = df.groupby('product_category', group_keys=False).apply(
            lambda x: x.sample(n=int(len(x) / len(df) * SAMPLE_SIZE), random_state=42)
        )

    print(f"\nSampled dataset size: {len(df_sample)}")
    print("\nSample distribution per category:")
    print(df_sample['product_category'].value_counts())

Loading processed data from: ../data/processed/filtered_complaints.csv
Total rows loaded: 479143
Sampling 12000 rows stratified by product category...

Sampled dataset size: 11997

Sample distribution per category:
product_category
Credit Card        4933
Savings Account    3883
Money Transfer     2470
Personal Loan       711
Name: count, dtype: int64


  df_sample = df.groupby('product_category', group_keys=False).apply(


In [3]:
# Convert DataFrame rows into LangChain Document objects
# We store metadata (ID, Product, Issue) to help the chatbot cite sources later
documents = []

if 'df_sample' in locals():
    for _, row in df_sample.iterrows():
        # Handle potential missing values in metadata fields
        complaint_id = str(row.get('Complaint ID', 'N/A'))
        product = str(row.get('product_category', 'Unknown'))
        issue = str(row.get('Issue', 'Unknown'))
        company = str(row.get('Company', 'Unknown'))
        narrative = str(row.get('cleaned_narrative', ''))
        
        # Skip empty narratives if any slipped through
        if not narrative.strip():
            continue

        metadata = {
            "complaint_id": complaint_id,
            "product": product,
            "issue": issue,
            "company": company
        }
        
        doc = Document(
            page_content=narrative,
            metadata=metadata
        )
        documents.append(doc)

    print(f"Successfully converted {len(documents)} rows into LangChain Documents.")
    
    # Preview one document
    if documents:
        print("\n--- Example Document ---")
        print(f"Metadata: {documents[0].metadata}")
        print(f"Content (first 200 chars): {documents[0].page_content[:200]}...")
else:
    print("DataFrame not loaded. Run previous cell.")

Successfully converted 11997 rows into LangChain Documents.

--- Example Document ---
Metadata: {'complaint_id': '2897243', 'product': 'Credit Card', 'issue': 'Fees or interest', 'company': 'WELLS FARGO & COMPANY'}
Content (first 200 chars): wells fargo charged a late fee on my credit card on the date the payment was due. i saw the additional 25.00 on my balance and there was no activity whatsoever. not one person i spoke with could tell ...


In [4]:
if documents:
    # Initialize Text Splitter
    # Chunk Size 500: Small enough for specific retrieval (finding exact complaint details)
    # Overlap 50: Prevents cutting sentences in half at boundaries
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", ". ", " ", ""], 
        length_function=len
    )

    # Perform Split
    chunks = text_splitter.split_documents(documents)

    print(f"Splitting complete.")
    print(f"Total chunks created: {len(chunks)}")
    print(f"Average chunks per document: {len(chunks)/len(documents):.2f}")

    # Verify a chunk
    if chunks:
        print("\n--- Example Chunk ---")
        print(f"Content: {chunks[0].page_content}")
        print(f"Metadata: {chunks[0].metadata}")
else:
    print("No documents to chunk.")

Splitting complete.
Total chunks created: 35399
Average chunks per document: 2.95

--- Example Chunk ---
Content: wells fargo charged a late fee on my credit card on the date the payment was due. i saw the additional 25.00 on my balance and there was no activity whatsoever. not one person i spoke with could tell me what the 25.00 additional balance due was. i closed my account due to the extreme hassle of having to do business with wells fargo. i found out a week later that the charge was actually a late fee posted on 2018. my minimum payment was due on 2018
Metadata: {'complaint_id': '2897243', 'product': 'Credit Card', 'issue': 'Fees or interest', 'company': 'WELLS FARGO & COMPANY'}


In [6]:
# Load Embedding Model
# 'all-MiniLM-L6-v2' is efficient and high-quality for general semantic search
# It runs well on CPUs which is good for local development
print("Initializing Embedding Model (sentence-transformers/all-MiniLM-L6-v2)...")

try:
    # Initialize the embedding model
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    print("✅ Model loaded successfully.")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    # In case of connection issues, you might see an error here.
    # Ensure you have an internet connection for the first run to download the model.

Initializing Embedding Model (sentence-transformers/all-MiniLM-L6-v2)...
✅ Model loaded successfully.


In [7]:
if 'chunks' in locals() and 'embedding_model' in locals():
    print(f"Indexing {len(chunks)} chunks into ChromaDB at '{VECTOR_STORE_DIR}'...")
    print("This may take a few minutes...")

    # Clear existing vector store if it exists (to start fresh and avoid duplicates)
    if os.path.exists(VECTOR_STORE_DIR):
        try:
            shutil.rmtree(VECTOR_STORE_DIR)
            print(f"Removed old vector store at {VECTOR_STORE_DIR}")
        except Exception as e:
            print(f"Warning: Could not remove old directory: {e}")

    # Create Vector Store
    # This creates the embeddings and saves them to disk
    try:
        vector_db = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=VECTOR_STORE_DIR,
            collection_name="complaints_rag"
        )
        print("Vector store successfully created and persisted!")
        print(f"Database located at: {os.path.abspath(VECTOR_STORE_DIR)}")
    except Exception as e:
        print(f"Error creating vector store: {e}")
else:
    print("Chunks or embedding model not ready.")

Indexing 35399 chunks into ChromaDB at '../vector_store'...
This may take a few minutes...
Removed old vector store at ../vector_store
Vector store successfully created and persisted!
Database located at: d:\10academy\phase5\rag-complaint-chatbot\vector_store


In [8]:
if 'vector_db' in locals():
    # Simple sanity check to ensure the DB works
    query = "They charged me a fee I didn't agree to."
    print(f"\nTest Query: {query}")

    try:
        results = vector_db.similarity_search(query, k=2)

        print(f"Retrieved {len(results)} relevant documents:")
        for i, res in enumerate(results):
            print(f"\n--- Result {i+1} ---")
            print(f"Product: {res.metadata.get('product', 'N/A')}")
            print(f"Issue: {res.metadata.get('issue', 'N/A')}")
            print(f"Content: {res.page_content}")
            
    except Exception as e:
        print(f"Error querying database: {e}")
else:
    print("Vector database not loaded.")


Test Query: They charged me a fee I didn't agree to.
Retrieved 2 relevant documents:

--- Result 1 ---
Product: Credit Card
Issue: Problem with a purchase shown on your statement
Content: . i did not receive anything from this charge and i called them with in 24 hours of the charge to let them know i did not make that charge so i dont feel i should have to pay that money.

--- Result 2 ---
Product: Savings Account
Issue: Account opening, closing, or management
Content: . also they charged me over draft fees i did  transactions that over drafted my account and they took  fees.
