#  QA chatbot using RAG pipeline

# Installing libraries

In [1]:
!pip install -q faiss-cpu
!pip install -q sentence-transformers
!pip install -q transformers accelerate

#  Importing libraries

In [2]:
import pandas as pd
import numpy as np
import os
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

2025-07-26 17:30:13.171726: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753551013.198455     632 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753551013.206306     632 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


#  Loading the dataset

In [3]:
df = pd.read_csv("/kaggle/input/loan-approval-prediction/Training Dataset.csv")
df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Formating rows as text documents

In [4]:
def row_to_text(row):
    return (
        f"Loan_ID: {row['Loan_ID']}, Gender: {row['Gender']}, Married: {row['Married']}, "
        f"Dependents: {row['Dependents']}, Education: {row['Education']}, "
        f"Self_Employed: {row['Self_Employed']}, ApplicantIncome: {row['ApplicantIncome']}, "
        f"CoapplicantIncome: {row['CoapplicantIncome']}, LoanAmount: {row['LoanAmount']}, "
        f"Loan_Amount_Term: {row['Loan_Amount_Term']}, Credit_History: {row['Credit_History']}, "
        f"Property_Area: {row['Property_Area']}, Loan_Status: {row['Loan_Status']}"
    )
documents = df.fillna("").apply(row_to_text, axis=1).tolist()

# Create embeddings

In [5]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = embedder.encode(documents, show_progress_bar=True)

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

# Creating FAISS index

In [6]:
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(doc_embeddings))

# Function to get top k relevant docs

In [7]:
def retrieve_docs(query, k=5, filter_rejected=False):
    filtered_docs = documents
    if filter_rejected:
        filtered_docs = [doc for doc in documents if "Loan_Status: N" in doc]

    query_embedding = embedder.encode([query])
    distances, indices = index.search(np.array(query_embedding), k)
    
    return [filtered_docs[i] for i in indices[0] if i < len(filtered_docs)]

#  Loading generator model

In [8]:
generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
    max_length=512
)

Device set to use cpu


#  RAG Function

In [9]:
def format_doc(doc_text):
    fields = doc_text.split(", ")
    return "- " + "\n- ".join(fields)

def rag_chatbot(query, k=5, filter_rejected=False):
    relevant_docs = retrieve_docs(query, k, filter_rejected=filter_rejected)
    formatted_context = "\n\n".join([format_doc(doc) for doc in relevant_docs])
    if len(formatted_context) > 1500:
        formatted_context = formatted_context[:1500]

    prompt = (
        f"You are a helpful data analyst. Here is a sample of loan application data:\n\n"
        f"{formatted_context}\n\n"
        f"Question: {query}\n\n"
        f"Based on the patterns in the data, answer in one or two clear sentences. "
        f"Do not repeat the question. Provide an actual insight if possible."
    )

    response = generator(prompt)[0]['generated_text']
    return response

# Rejection Analysis

In [10]:
def rejection_insight():
    rejected_df = df[df["Loan_Status"] == "N"]
    insights = (
        f"There are {len(rejected_df)} rejected applications. "
        f"{(rejected_df['Credit_History'] == 0.0).sum()} had no credit history. "
        f"{(rejected_df['ApplicantIncome'] < 3000).sum()} had low income. "
        f"{(rejected_df['LoanAmount'] > 150).sum()} had high loan amounts."
    )

    prompt = (
        f"You are a financial assistant. Based on the following summary of rejected loans:\n\n"
        f"{insights}\n\n"
        f"Explain in simple language why these loans might have been rejected."
    )

    response = generator(prompt)[0]['generated_text']
    return response

# Testing the RAG Chatbot

In [11]:
query = "Does income affect loan approval?"
answer = rag_chatbot(query)
print("Answer:", answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (639 > 512). Running this sequence through the model will result in indexing errors


Answer: Applicant income affects loan approval.


In [12]:
query = "What are common reasons for loan rejection?"
answer = rejection_insight()
print("Answer:", answer)

  return op(a, b)


Answer: The applicants did not have a credit history.
