In [None]:
import pandas as pd
import numpy as np
import pickle
import os
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline, set_seed, AutoTokenizer, AutoModelForCausalLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
import time

print("Starting Task 3: Building the RAG Core Logic and Evaluation")

# --- Configuration ---------------------------------------------------------
# Directory that holds the FAISS index and the pickled chunk metadata.
vector_store_dir = '../vector_store/'
embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
llm_model_name = 'distilgpt2'
input_path = '../data/filtered_complaints.csv'
# Maximum number of complaint rows kept when the text chunks are re-created.
sample_size = 50000

faiss_index_path = os.path.join(vector_store_dir, 'faiss_index.bin')
metadata_path = os.path.join(vector_store_dir, 'metadata.pkl')

# Check for the artifacts explicitly: faiss.read_index does not raise
# FileNotFoundError for a missing file, so an `except FileNotFoundError`
# around it would never produce the friendly message below.
missing_artifacts = [
    path for path in (faiss_index_path, metadata_path) if not os.path.isfile(path)
]
if missing_artifacts:
    print("Error: Required FAISS index or metadata files not found.")
    print(f"Please ensure Task 2 completed successfully and check '{vector_store_dir}' directory.")
    print(f"Missing: {', '.join(missing_artifacts)}")
    # Non-zero status so callers/automation can tell the run failed.
    raise SystemExit(1)

index = faiss.read_index(faiss_index_path)
print(f"Loaded FAISS index with {index.ntotal} embeddings.")

# SECURITY NOTE: pickle.load can execute arbitrary code. Only load metadata
# files that were produced by a trusted run of the indexing step.
with open(metadata_path, 'rb') as f:
    metadata = pickle.load(f)
print(f"Loaded metadata for {len(metadata)} chunks.")

# The metadata list is indexed with the positions returned by the FAISS
# index, so the two must describe the same number of chunks.
if len(metadata) != index.ntotal:
    print(
        f"Warning: metadata has {len(metadata)} entries but the FAISS index "
        f"holds {index.ntotal} embeddings; retrieved sources may be wrong."
    )

# Loaded outside any FileNotFoundError handler so that a model-loading problem
# is not misreported as a missing vector store.
embedding_model = SentenceTransformer(embedding_model_name)
print(f"Loaded embedding model: {embedding_model_name}")

# Rebuild the chunk texts from the filtered complaints so that positions
# returned by the FAISS index can be mapped back to text.
print("Re-creating text chunks for content retrieval...")
try:
    df_original = pd.read_csv(input_path)
except FileNotFoundError:
    print(f"Error: {input_path} not found. Please ensure Task 1 completed successfully.")
    # Non-zero status so the failure is visible to callers/automation.
    raise SystemExit(1) from None

# NOTE(review): astype(str) runs before fillna(''), so missing narratives
# become the literal string 'nan' and survive the emptiness filter below.
# The order is deliberately left unchanged here: chunk positions must match
# whatever the indexing step (not visible in this file) produced, and changing
# which rows are kept would shift every position. Confirm against Task 2 and
# fix both places together.
df_original['cleaned_narrative'] = df_original['cleaned_narrative'].astype(str).fillna('')
df_original = df_original[df_original['cleaned_narrative'].str.strip() != '']

if len(df_original) > sample_size:
    # Fixed random_state keeps the sample (and therefore chunk order) repeatable.
    df_original = df_original.sample(n=sample_size, random_state=42).reset_index(drop=True)
else:
    df_original = df_original.reset_index(drop=True)

text_splitter_local = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)

# Only the chunk strings are needed, so split each narrative directly instead
# of building Document objects row by row via DataFrame.iterrows().
all_chunks_text = []
for narrative in df_original['cleaned_narrative']:
    all_chunks_text.extend(text_splitter_local.split_text(narrative))
print(f"Re-created {len(all_chunks_text)} text chunks for retrieval.")

# Retrieval looks chunks up by FAISS position; a size mismatch means the text
# at a given position is probably not the text that was embedded there.
if len(all_chunks_text) != index.ntotal:
    print(
        f"Warning: re-created {len(all_chunks_text)} chunks but the FAISS index "
        f"holds {index.ntotal} embeddings. Chunk text and retrieved vectors are "
        "likely misaligned; make sure sampling and splitting settings match the "
        "indexing step."
    )


# Build the Hugging Face text-generation pipeline used to produce answers.
print(f"Loading LLM: {llm_model_name}")

# -1 is reported as CPU in the status message below; any other value as GPU.
device = -1

tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
# Fall back to the end-of-sequence token when the tokenizer has no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(llm_model_name)
generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Fix the random seed once the pipeline exists.
set_seed(42)

device_label = 'CPU' if device == -1 else 'GPU'
print(f"LLM '{llm_model_name}' loaded successfully on {device_label}.")


def retrieve_chunks(query, top_k=5):
    """Return the chunks most similar to *query* according to the FAISS index.

    The query is embedded with the module-level ``embedding_model``,
    L2-normalised, and searched against the module-level ``index``. Each
    returned position is resolved against ``all_chunks_text`` and ``metadata``.

    Args:
        query: Question text to search for.
        top_k: Number of neighbours requested from the index.

    Returns:
        A ``(texts, sources)`` tuple. ``texts`` is a list of chunk strings and
        ``sources`` is a parallel list of dicts with the keys ``original_id``,
        ``product``, ``chunk_id`` and ``content``. Positions that cannot be
        resolved are skipped, so both lists may be shorter than ``top_k``.
    """
    query_embedding = embedding_model.encode([query]).astype('float32')
    faiss.normalize_L2(query_embedding)

    # Distances are not used; only the neighbour positions matter here.
    _, neighbour_ids = index.search(query_embedding, top_k)

    # A position is usable only if BOTH lookups below can resolve it.
    usable_count = min(len(all_chunks_text), len(metadata))

    retrieved_chunks_content = []
    retrieved_sources_info = []

    for raw_id in neighbour_ids[0]:
        chunk_idx = int(raw_id)
        if chunk_idx < 0:
            # FAISS pads the result with -1 when it finds fewer than top_k
            # neighbours; that is not an error, so skip it quietly.
            continue
        if chunk_idx >= usable_count:
            print(
                f"Warning: Retrieved index {chunk_idx} out of bounds for "
                f"all_chunks_text ({len(all_chunks_text)}) or metadata "
                f"({len(metadata)}). Skipping."
            )
            continue

        chunk_content = all_chunks_text[chunk_idx]
        chunk_meta = metadata[chunk_idx]

        retrieved_chunks_content.append(chunk_content)
        retrieved_sources_info.append({
            'original_id': chunk_meta.get('original_id', 'N/A'),
            'product': chunk_meta.get('product', 'N/A'),
            'chunk_id': chunk_meta.get('chunk_id', 'N/A'),
            'content': chunk_content
        })
    return retrieved_chunks_content, retrieved_sources_info

# Prompt scaffold: retrieved excerpts go into {context}, the user question
# into {question}; the model continues the text after "Answer:".
prompt_template = """You are a financial analyst assistant for CrediTrust. Your task is to answer questions about customer complaints. Use the following retrieved complaint excerpts to formulate your answer. If the context doesn't contain the answer, state that you don't have enough information.

Context:
{context}

Question: {question}
Answer:"""


def _fit_context_to_model(context, question, max_new_tokens):
    """Trim *context* so the prompt plus generated tokens fit the model's position limit."""
    position_limit = getattr(generator.model.config, 'max_position_embeddings', None)
    if not position_limit:
        # Limit unknown for this model: leave the context untouched.
        return context

    llm_tokenizer = generator.tokenizer
    scaffold_tokens = len(
        llm_tokenizer.encode(prompt_template.format(context='', question=question))
    )
    # Small safety margin because the pieces are tokenised separately here
    # and token boundaries can differ slightly once they are joined.
    budget = max(position_limit - max_new_tokens - scaffold_tokens - 8, 0)

    context_ids = llm_tokenizer.encode(context)
    if len(context_ids) <= budget:
        return context

    print(
        f"Warning: context truncated from {len(context_ids)} to {budget} tokens "
        "to fit the model's input limit."
    )
    # Chunks are joined best-match first, so cutting the tail drops the
    # lowest-ranked text.
    return llm_tokenizer.decode(context_ids[:budget])


def _extract_answer(generated_text, full_prompt):
    """Return only the model's continuation from the pipeline's full output."""
    if generated_text.startswith(full_prompt):
        # Slice by prompt length rather than searching for "answer:", which
        # could also occur inside the retrieved context or the question.
        return generated_text[len(full_prompt):].strip()

    # Fallback for outputs that do not echo the prompt verbatim.
    marker_pos = generated_text.lower().find("answer:")
    if marker_pos != -1:
        return generated_text[marker_pos + len("answer:"):].strip()
    return generated_text.replace(full_prompt, "").strip()


def generate_answer(question, top_k=5, max_new_tokens=200):
    """Answer *question* using retrieved complaint excerpts as context.

    Args:
        question: The user question.
        top_k: Number of chunks requested from ``retrieve_chunks``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A ``(answer, sources)`` tuple: the generated answer text and the list
        of source dicts returned by ``retrieve_chunks``. All retrieved sources
        are returned even if the context had to be truncated for the prompt.
    """
    retrieved_texts, retrieved_sources = retrieve_chunks(question, top_k)

    context = "\n\n".join(retrieved_texts)
    context = _fit_context_to_model(context, question, max_new_tokens)

    full_prompt = prompt_template.format(context=context, question=question)

    response = generator(
        full_prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        pad_token_id=generator.tokenizer.eos_token_id,
        no_repeat_ngram_size=2
    )

    generated_text = response[0]['generated_text']
    extracted_answer = _extract_answer(generated_text, full_prompt)

    return extracted_answer, retrieved_sources

# Run a fixed set of questions through the RAG pipeline, print the results and
# write them to a Markdown report for manual scoring.
print("\n--- Qualitative Evaluation ---")
test_questions = [
    "What are common complaints about credit cards?",
    "Why are people unhappy with BNPL services?",
    "Are there issues with money transfers related to fraud?",
    "What problems are customers facing with savings accounts?",
    "Tell me about complaints regarding personal loans."
]


def _md_cell(text):
    """Make *text* safe for a single Markdown table cell."""
    # A raw '|' ends the cell and a raw newline ends the row, so escape the
    # former and turn the latter into an HTML line break.
    return (
        str(text)
        .replace('|', '\\|')
        .replace('\r\n', '\n')
        .replace('\n', '<br>')
    )


evaluation_results = []

for question_number, question in enumerate(test_questions, start=1):
    print(f"\nQuestion {question_number}: {question}")
    answer, sources = generate_answer(question, top_k=5)
    print(f"Generated Answer: {answer}")
    print("Retrieved Sources (Top 2):")
    for source_number, source in enumerate(sources[:2], start=1):
        print(f"  Source {source_number} (Product: {source['product']}, Original ID: {source['original_id']}): {source['content'][:200]}...")
    print("-" * 50)

    evaluation_results.append({
        "Question": question,
        "Generated Answer": answer,
        "Retrieved Sources": [
            f"Source {k+1} (Product: {s['product']}, Original ID: {s['original_id']})" for k, s in enumerate(sources)
        ],
        # Placeholders to be filled in by a human reviewer.
        "Quality Score": "N/A",
        "Comments/Analysis": " "
    })

markdown_table = """
## Evaluation Table

| Question | Generated Answer | Retrieved Sources (Top 2) | Quality Score (1-5) | Comments/Analysis |
|---|---|---|---|---|
"""

for result in evaluation_results:
    generated_answer_formatted = f"> {_md_cell(result['Generated Answer'])}"

    # Only the first two sources are listed, matching the column header.
    sources_formatted = "<br>".join(
        f"- {_md_cell(s)}" for s in result["Retrieved Sources"][:2]
    )

    markdown_table += (
        f"| {_md_cell(result['Question'])} "
        f"| {generated_answer_formatted} "
        f"| {sources_formatted} "
        f"| {result['Quality Score']} "
        f"| {result['Comments/Analysis']} |\n"
    )

report_path = os.path.join('..', 'data', 'processed', 'rag_qualitative_evaluation_report.md')
# The target directory may not exist yet; create it so the write cannot fail
# after all the (slow) generation work has already been done.
os.makedirs(os.path.dirname(report_path), exist_ok=True)
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(markdown_table)
print(f"\nQualitative evaluation report saved to: {report_path}")

print("Task 3: Building the RAG Core Logic and Evaluation completed.")


  from .autonotebook import tqdm as notebook_tqdm


Starting Task 3: Building the RAG Core Logic and Evaluation
Loaded FAISS index with 65322 embeddings.
Loaded metadata for 65322 chunks.
Loaded embedding model: sentence-transformers/all-MiniLM-L6-v2
Re-creating text chunks for content retrieval...
Re-created 82236 text chunks for retrieval.
Loading LLM: distilgpt2


Device set to use cpu


LLM 'distilgpt2' loaded successfully on CPU.

--- Qualitative Evaluation ---

Question 1: What are common complaints about credit cards?
Generated Answer: 1) customer support is required in this case i would have to file a formal complaint to get my card refund for a refund. 2) i can take an action to provide my customer with an account at credis as time has come to an end and can only take one day to contact me for help. 3) the customer is entitled to help in contacting me with any issues that make me feel like I have lost my life i tried to leave the business so i could have a better life. 4) he is responsible for how i get paid i am not responsible of how I pay for my service due i are not entitled as he has the right to not need a check i was told i cannot use my credit to pay again so he can have better insurance and that is why i got paid to him without the help of a bank. 5) his personal debt is a big problem as credit is very expensive in many parts of America with the cost of 

NameError: name '__file__' is not defined