In [1]:
pip install pandas langchain faiss-cpu sentence-transformers transformers openai streamlit


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)

In [13]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re

# Read the loan dataset from disk
df = pd.read_csv("Training Dataset.csv")

# Replace missing values: "Unknown" in text columns, -1 in numeric columns
str_cols = df.select_dtypes(include=["object"]).columns
df[str_cols] = df[str_cols].fillna(value="Unknown")
num_cols = df.select_dtypes(include=["number"]).columns
df[num_cols] = df[num_cols].fillna(value=-1)

# Serialize every row to a JSON string; each string is one retrievable document
json_rows = df.apply(pd.Series.to_json, axis=1)
documents = json_rows.tolist()

# Embed the documents and add the vectors to a faiss.IndexFlatL2 index
# sized to the embedding dimension
embedder = SentenceTransformer("all-MiniLM-L6-v2")
doc_embeddings = embedder.encode(documents, convert_to_numpy=True)
embedding_dim = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(doc_embeddings)

# Answer recognised question types directly with pandas (keyword matching on the question text)
def answer_with_pandas(question):
    """Answer a recognised question directly from the module-level ``df``.

    The question is lower-cased, hyphens are treated as spaces, and the text
    is matched against a small set of keyword rules.

    Args:
        question: Free-text question from the user.

    Returns:
        A formatted answer string when a rule matches, otherwise ``None`` so
        the caller can fall back to another strategy.
    """
    # Normalise hyphens so "self-employed" and "self employed" both match.
    q = question.lower().replace("-", " ")

    # Example 1: Credit History
    if "credit history" in q and "loan" in q and ("affect" in q or "impact" in q):
        approved = df[df['Loan_Status'] == 'Y']
        if approved.empty:
            # Avoid dividing by zero when no loan was approved.
            return "There are no approved loans in the dataset, so the effect of credit history cannot be measured."
        with_credit = approved[approved['Credit_History'] == 1]

        pct = round(len(with_credit) / len(approved) * 100, 2)
        return f"Yes, credit history significantly affects loan approval. About {pct}% of approved loans had a credit history of 1."

    # Example 2: Count of self-employed
    if "how many" in q and "self employed" in q:
        count = df[df['Self_Employed'] == "Yes"].shape[0]
        return f"There are {count} self-employed applicants."

    # Example 3: Overall loan approval rate
    if "loan approval rate" in q:
        if df.empty:
            # Avoid dividing by zero on an empty dataset.
            return "The dataset is empty, so the loan approval rate cannot be computed."
        rate = round(df[df['Loan_Status'] == 'Y'].shape[0] / df.shape[0] * 100, 2)
        return f"The loan approval rate is {rate}%."

    # No rule matched
    return None

from transformers import pipeline
# Text-to-text generation pipeline loaded with google/flan-t5-small;
# generate_answer() calls it to turn a prompt into the answer text.
qa_model = pipeline("text2text-generation", model="google/flan-t5-small")

def generate_answer(context, question):
    """Generate an answer to ``question`` from ``context`` with ``qa_model``.

    Args:
        context: Retrieved text the answer should be based on.
        question: The user's question.

    Returns:
        The generated text with surrounding whitespace stripped.
    """
    # With truncation=True an over-long prompt loses its tail, so the question
    # goes first: a long context then drops trailing rows instead of the
    # question itself.
    prompt = f"Question: {question}\nContext: {context}\nAnswer:"
    outputs = qa_model(prompt, max_new_tokens=128, truncation=True)
    return outputs[0]["generated_text"].strip()

def retrieve_top_docs(query, k=5):
    """Return up to ``k`` documents whose embeddings are nearest to ``query``.

    Uses the module-level ``embedder``, ``index`` and ``documents``.

    Args:
        query: Text to embed and search for.
        k: Maximum number of documents to return (default 5).

    Returns:
        A list of document strings in the order the index returned them;
        empty when there are no documents or ``k`` is not positive.
    """
    # Never ask for more neighbours than there are documents.
    k = min(k, len(documents))
    if k <= 0:
        return []

    # Encoding a one-element list yields a single-row batch for search().
    query_batch = np.asarray(embedder.encode([query]), dtype="float32")
    _distances, indices = index.search(query_batch, k)

    # FAISS pads missing neighbours with -1; skip them, since a negative
    # index would silently pick a document from the end of the list.
    return [documents[i] for i in indices[0] if i >= 0]

def answer_question(question):
    """Answer ``question``, trying the pandas rules before retrieval + generation.

    Args:
        question: The user's question.

    Returns:
        The answer from ``answer_with_pandas`` when it is truthy; otherwise
        the text produced by ``generate_answer`` over the newline-joined
        results of ``retrieve_top_docs``.
    """
    pandas_reply = answer_with_pandas(question)
    if not pandas_reply:
        # No rule matched: retrieve the closest rows and let the model answer from them.
        retrieved_rows = retrieve_top_docs(question)
        return generate_answer("\n".join(retrieved_rows), question)
    return pandas_reply

if __name__ == "__main__":
    # Interactive loop: read a question, print its answer, stop on "exit"/"quit".
    print(" Hybrid Loan Dataset Q&A Chatbot")
    print("Type 'exit' to quit.\n")
    while True:
        try:
            q = input("Ask a question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt ends the session
            # instead of surfacing a traceback.
            print()
            break
        if q.lower() in ["exit", "quit"]:
            break
        if not q:
            # Nothing typed: prompt again rather than sending an empty
            # query through retrieval and generation.
            continue
        print("Answer:", answer_question(q))


Device set to use cpu


 Hybrid Loan Dataset Q&A Chatbot
Type 'exit' to quit.

Ask a question: "What is the loan approval rate?"
Answer: The loan approval rate is 68.73%.
Ask a question: exit
