# RAG Pipeline and Evaluation

## Load Modules

In [1]:
import sys
import os
import pandas as pd

# Add the parent of the current working directory to the import search path
# so the local `src` package below can be resolved.
# NOTE(review): assumes the notebook is executed from a subfolder of the
# project root — confirm.
sys.path.append(os.path.abspath(os.pardir))
from src.rag_engine import RAGPipeline

  from .autonotebook import tqdm as notebook_tqdm


## Load Files

In [2]:
# Source Parquet file passed to the pipeline as `parquet_path`.
PARQUET_FILE = "../data/raw/complaint_embeddings.parquet"

# Directory passed to the pipeline as `vector_db_path`.
# NOTE(review): the recorded run logs suggest the store is built from the
# Parquet file when this path does not exist yet — confirm in rag_engine.
vector_db_path = "../chroma_db_full"

In [3]:
# Instantiate the pipeline from the paths configured above.
# NOTE(review): in the recorded run this cell indexed 20000 docs, so the first
# execution may be slow — confirm whether later runs reuse the stored index.
rag = RAGPipeline(
    parquet_path=PARQUET_FILE,
    vector_db_path=vector_db_path,
)

[RAG] Loading embedding model (all-MiniLM-L6-v2)...
[RAG] Vector Store not found at ../chroma_db_full.
[RAG] Building Vector Store from ../data/raw/complaint_embeddings.parquet...
   Streaming Parquet file...
   Indexed 1000 docs (Total: 1000)...
   Indexed 995 docs (Total: 1995)...
   Indexed 1000 docs (Total: 2995)...
   Indexed 997 docs (Total: 3992)...
   Indexed 997 docs (Total: 4989)...
   Indexed 997 docs (Total: 5986)...
   Indexed 999 docs (Total: 6985)...
   Indexed 999 docs (Total: 7984)...
   Indexed 999 docs (Total: 8983)...
   Indexed 999 docs (Total: 9982)...
   Indexed 998 docs (Total: 10980)...
   Indexed 999 docs (Total: 11979)...
   Indexed 997 docs (Total: 12976)...
   Indexed 999 docs (Total: 13975)...
   Indexed 999 docs (Total: 14974)...
   Indexed 997 docs (Total: 15971)...
   Indexed 998 docs (Total: 16969)...
   Indexed 1000 docs (Total: 17969)...
   Indexed 1000 docs (Total: 18969)...
   Indexed 999 docs (Total: 19968)...
   Indexed 32 docs (Total: 20000)...


## Qualitative Evaluation Loop

In [5]:
# 1. Define the questions you want to test
test_questions = [
    "What are the most common complaints regarding Credit Card late fees?",
    "Why are customers upset about Money Transfers?",
    "Are there specific issues with Personal Loan interest rates?",
    "How does the company handle fraud reports for Savings Accounts?",
    "What do customers say about customer service wait times?"
]


def _summarize_top_source(source_documents, preview_chars=150):
    """Build a short, human-readable preview of the first retrieved document.

    Args:
        source_documents: Sequence of retrieved documents; the first item is
            treated as the top match. Each item must expose ``page_content``
            (sliceable text) and ``metadata`` (dict-like, or None).
        preview_chars: Maximum number of characters of the document text to
            include in the preview.

    Returns:
        ``"<text>... (Date: <date>)"`` for the first document, or a
        placeholder message when no documents were retrieved.
    """
    if not source_documents:
        # Retrieval can come back empty; indexing [0] would raise IndexError
        # and abort the whole run, discarding the results gathered so far.
        return "No source documents retrieved."

    top_doc = source_documents[0]
    # Tolerate documents whose metadata is None rather than a dict.
    metadata = top_doc.metadata or {}
    # NOTE(review): every row of the last recorded run shows "Unknown Date",
    # so 'date' is probably not the key actually stored in the metadata —
    # confirm against the vector store schema.
    meta_date = metadata.get('date', 'Unknown Date')
    return f"{top_doc.page_content[:preview_chars]}... (Date: {meta_date})"


print("--- Starting Evaluation Loop ---")
results = []

# 2. Loop through questions
for q in test_questions:
    print(f"Asking: {q}...")

    # Run the RAG pipeline; the result is indexed by 'answer' and
    # 'source_documents'.
    output = rag.answer_question(q)

    results.append({
        "Question": q,
        "Generated Answer": output['answer'],
        "Top Source Evidence": _summarize_top_source(output['source_documents']),
    })

print("\n--- Evaluation Complete! ---")

# 3. Display Results
df_results = pd.DataFrame(results)
# Show full cell text instead of truncating long answers. This is a global
# pandas option and stays in effect for the rest of the kernel session.
pd.set_option('display.max_colwidth', None)
display(df_results)

--- Starting Evaluation Loop ---
Asking: What are the most common complaints regarding Credit Card late fees?...

[Query] What are the most common complaints regarding Credit Card late fees?
Asking: Why are customers upset about Money Transfers?...

[Query] Why are customers upset about Money Transfers?
Asking: Are there specific issues with Personal Loan interest rates?...

[Query] Are there specific issues with Personal Loan interest rates?
Asking: How does the company handle fraud reports for Savings Accounts?...

[Query] How does the company handle fraud reports for Savings Accounts?
Asking: What do customers say about customer service wait times?...

[Query] What do customers say about customer service wait times?

--- Evaluation Complete! ---


Unnamed: 0,Question,Generated Answer,Top Source Evidence
0,What are the most common complaints regarding Credit Card late fees?,"Based on the context provided, the most common complaints regarding Credit Card late fees include:","years and have consistently made timely payments. despite this, over the past three months, i have experienced multiple incidents where late fees were... (Date: Unknown Date)"
1,Why are customers upset about Money Transfers?,"Customers are upset about money transfers due to problems with receiving and sending out money, and","aves me, as a consumer, at an unfair disadvantage. i acted in good faith based on the data provided by the banks app and ensured that funds were trans... (Date: Unknown Date)"
2,Are there specific issues with Personal Loan interest rates?,"Yes, there are specific issues with Personal Loan interest rates mentioned in the context:\n\n* **",interests rates extremely inflated makes it very difficult to pay loan back. reached out multiple times about interest rates customer service reps sta... (Date: Unknown Date)
3,How does the company handle fraud reports for Savings Accounts?,"Based on the context provided:\n\n* **ALLY FINANCIAL INC.** has a ""one-size-fits-all fraud dispute policy which renders your bank account unusable while they investigate the fraud.""\n* **SOFI TECHNOLOGIES, INC.** investigates fraudulent activity and states that fraudulently withdrawn money would be returned to the account, though in one instance, only one of multiple fraudulent withdrawals was refunded.\n* **CITIBANK, N.A.** will report the fraud and close the account, also advising the customer to report the fraud to the Federal Trade Commission website.\n* **CITIZENS FINANCIAL GROUP, INC.** has a fraud line and an office that reviews cases, which can be contacted via email with additional information.",savings accounts and checking accounts available. i will not deal with a company that has a one-size-fits-all fraud dispute policy which renders your ... (Date: Unknown Date)
4,What do customers say about customer service wait times?,One customer mentioned waiting about 10 seconds before being seen and then another two minutes for the agent,"company, they should enable customer self-service. instead, they push customers through frustrating, unnecessary loops with agents who create more pro... (Date: Unknown Date)"


## Save Evaluation Table

In [6]:
# Persist the evaluation table as CSV; index=False keeps the DataFrame's
# row index out of the written file.
df_results.to_csv(
    "../data/task3_evaluation.csv",
    index=False,
)
print("Evaluation saved.")

Evaluation saved.
