In [1]:
import sys
import os

# Get the project root directory automatically
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add it to Python path
if project_root not in sys.path:
    sys.path.append(project_root)

print("Project root added to PATH:", project_root)

Project root added to PATH: e:\Intelligent-Complaint-Analysis-for-Financial-Services-week_7


In [2]:
import numpy as np
import pickle
import faiss
import pyarrow.parquet as pq
from sentence_transformers import SentenceTransformer
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


FAISSVectorStore: Build or Load Index

In [5]:
# Assuming faiss_store.py is in the same folder
from src.faiss_store import FAISSVectorStore

# Path to your Parquet file
parquet_file = "../data/raw/complaint_embeddings.parquet"

# Initialize FAISS store
store = FAISSVectorStore(
    parquet_path=parquet_file,
    embedding_dim=384,  # all-MiniLM-L6-v2 embedding dim
    index_path="data/faiss/index.faiss",
    meta_path="data/faiss/meta.pkl"
)


ðŸ”¹ Loading FAISS index and metadata from disk...
âœ… Loaded FAISS index with 1375327 vectors


Initialize RAGPipeline

In [6]:
from src.RAG_Pipeline import RAGPipeline

# Optional: you can change LLM to any HuggingFace model
llm_model_name = "google/flan-t5-small"

rag = RAGPipeline(vector_store=store, llm_model_name=llm_model_name)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


Prompt Template

In [7]:
prompt_template = """
You are a financial analyst assistant for CrediTrust.
Use the following retrieved complaint excerpts to answer the user's question.
If the context doesn't contain the answer, say you don't have enough information.

Context: {context}
Question: {question}
Answer:
"""

Test a Single Question

In [8]:
question = "What issues do customers report about credit card billing?"

answer, retrieved = rag.generate_answer(question, prompt_template)

print("Answer:\n", answer)
print("\nTop 2 Sources:")
for i, r in enumerate(retrieved[:2]):
    print(f"{i+1}. {r['document']}")


Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors


Answer:
 inaccurate or fraudulent credit charges

Top 2 Sources:
1. the customer 's credit report.
2. as both a business owner and a head of household, i have maintained dozens of credit cards during the last 30 years, and i have never had to complain about the service i was getting from a credit card company. a review of my accounts will show that i have always paid my credit card bills in full and on time each month, and in the rare instances where credit card payments have been a few days late, this occurred because of problems with the mail, when i was traveling, or under circumstances beyon


Representative Questions for Evaluation

In [9]:
questions = [
    "What issues do customers report about credit card billing?",
    "How do customers describe problems with personal loans?",
    "Which companies have the most complaints about account management?",
    "What are the main complaints regarding mortgage services?",
    "Are there recurring issues related to bank fees?",
    "How do customers report problems with credit reporting agencies?",
    "What sub-issues are most common in debt collection complaints?",
    "Are there specific states where complaints about credit cards are higher?"
]


Run Qualitative Evaluation

In [11]:
evaluation_results = rag.evaluate_questions(questions, prompt_template)

# Markdown header
print("| Question | Generated Answer | Retrieved Sources (Top 1-2) | Quality Score (1-5) | Comments/Analysis |")
print("|----------|----------------|------------------------------|-------------------|-----------------|")

for res in evaluation_results:
    question = res["question"].replace("\n", " ")
    answer = res["answer"].replace("\n", " ")
    
    # Show only top 2 sources
    sources = "\n".join([s["document"].replace("\n"," ") for s in res["sources"][:2]])
    
    quality = res.get("quality_score", "")  # Fill manually later if needed
    comments = res.get("comments", "")
    
    print(f"| {question} | {answer} | {sources} | {quality} | {comments} |")


| Question | Generated Answer | Retrieved Sources (Top 1-2) | Quality Score (1-5) | Comments/Analysis |
|----------|----------------|------------------------------|-------------------|-----------------|
| What issues do customers report about credit card billing? | inaccurate or fraudulent credit charges | the customer 's credit report.
as both a business owner and a head of household, i have maintained dozens of credit cards during the last 30 years, and i have never had to complain about the service i was getting from a credit card company. a review of my accounts will show that i have always paid my credit card bills in full and on time each month, and in the rare instances where credit card payments have been a few days late, this occurred because of problems with the mail, when i was traveling, or under circumstances beyon | None |  |
| How do customers describe problems with personal loans? | deceptive and misleading | information, so it is not a company wide problem. it is targe