In [1]:
# rag_pipeline.py

import os
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.chains import LLMChain

# --- Configuration ---
DATA_DIR = '../data'
VECTOR_STORE_DIR = '../vector_store/chroma_db_credi_trust'
FILTERED_DATA_FILE = os.path.join(DATA_DIR, 'filtered_complaints.csv')
NARRATIVE_COLUMN = 'Consumer complaint narrative'

# --- Load Data ---
df = pd.read_csv(FILTERED_DATA_FILE)
documents = [Document(page_content=text) for text in df[NARRATIVE_COLUMN].dropna().tolist()]

# --- Embedding Setup ---
embedding_model = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2'
)

# --- Vector Store Setup ---
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=VECTOR_STORE_DIR
)
vectorstore.persist()

# --- Retriever Function ---
def retrieve_chunks(question, k=5):
    return vectorstore.similarity_search(question, k=k)

# --- Prompt Template ---
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a financial analyst assistant for CrediTrust. Your task is to answer questions about customer complaints.
Use the following retrieved complaint excerpts to formulate your answer.

If the context doesn't contain the answer, state that you don't have enough information.

Context: {context}
Question: {question}
Answer:
"""
)

# --- Language Model (LLM) Setup ---
llm = HuggingFaceEndpoint(
    repo_id="google/flan-t5-large",  # You can switch to mistralai/Mistral-7B-Instruct or another HF model
    model_kwargs={"temperature": 0.2, "max_length": 512}
)

rag_chain = LLMChain(
    llm=llm,
    prompt=prompt_template
)

# --- RAG Pipeline Function ---
def answer_question(question: str, k: int = 5) -> dict:
    retrieved_docs = retrieve_chunks(question, k=k)
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)
    response = rag_chain.invoke({"context": context, "question": question})
    return {
        "answer": response["text"],
        "sources": [doc.page_content for doc in retrieved_docs]
    }

# --- Test Run ---
if __name__ == "__main__":
    test_question = "How do customers feel about credit card disputes?"
    result = answer_question(test_question)
    print("\n--- Answer ---")
    print(result['answer'])
    print("\n--- Retrieved Sources ---")
    for i, src in enumerate(result['sources']):
        print(f"\nSource {i+1}:\n{src}")


ModuleNotFoundError: No module named 'langchain_core.chains'