In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')


In [13]:
from google.colab import files
import pandas as pd
import io

# First remove any existing uploaded files
!rm -f *.csv

# Upload with exact names we want to use
uploaded = files.upload()

# Verify the uploaded filenames
print(uploaded.keys())  # Check what names were actually used

# Load with consistent names
train_df = pd.read_csv(io.BytesIO(uploaded['Training_Dataset.csv']))
test_df = pd.read_csv(io.BytesIO(uploaded['Test_Dataset.csv']))
sample_sub = pd.read_csv(io.BytesIO(uploaded['Sample_Submission.csv']))

Saving Sample_Submission.csv to Sample_Submission.csv
Saving Test_Dataset.csv to Test_Dataset.csv
Saving Training_Dataset.csv to Training_Dataset.csv
dict_keys(['Sample_Submission.csv', 'Test_Dataset.csv', 'Training_Dataset.csv'])


In [16]:
df = train_df.copy()

In [17]:
# Check the training data
print("Training Data Shape:", train_df.shape)
print("\nFirst 5 rows:")
display(train_df.head())

# Check for missing values
print("\nMissing values:")
print(train_df.isnull().sum())

Training Data Shape: (614, 13)

First 5 rows:


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y



Missing values:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [18]:
def load_data():
    # Create knowledge base from training data
    knowledge_base = []

    for _, row in train_df.iterrows():  # Using train_df instead of df
        entry = f"""
        Loan ID: {row['Loan_ID']}
        Gender: {row['Gender']}
        Married: {row['Married']}
        Dependents: {row['Dependents']}
        Education: {row['Education']}
        Self Employed: {row['Self_Employed']}
        Applicant Income: {row['ApplicantIncome']}
        Coapplicant Income: {row['CoapplicantIncome']}
        Loan Amount: {row['LoanAmount']}
        Loan Amount Term: {row['Loan_Amount_Term']}
        Credit History: {row['Credit_History']}
        Property Area: {row['Property_Area']}
        Loan Status: {row['Loan_Status']}
        """
        knowledge_base.append(entry)

    # Add general information about loan approval
    knowledge_base.extend([
        "Loan approval typically depends on credit history, income, and loan amount.",
        "Applicants with higher income and good credit history are more likely to get approved.",
        "Urban property areas often have higher approval rates.",
        "Graduates tend to have better loan approval chances.",
        "Longer loan terms may increase approval chances but result in higher interest."
    ])

    return knowledge_base

In [21]:
def load_data():
    # Create knowledge base from the already-loaded train_df
    knowledge_base = []

    for _, row in train_df.iterrows():
        entry = f"""
        Loan ID: {row['Loan_ID']}
        Gender: {row['Gender']}
        Married: {row['Married']}
        Dependents: {row['Dependents']}
        Education: {row['Education']}
        Self Employed: {row['Self_Employed']}
        Applicant Income: {row['ApplicantIncome']}
        Coapplicant Income: {row['CoapplicantIncome']}
        Loan Amount: {row['LoanAmount']}
        Loan Amount Term: {row['Loan_Amount_Term']}
        Credit History: {row['Credit_History']}
        Property Area: {row['Property_Area']}
        Loan Status: {row['Loan_Status']}
        """
        knowledge_base.append(entry)

    # Add general information
    knowledge_base.extend([
        "Loan approval typically depends on credit history, income, and loan amount.",
        "Applicants with higher income and good credit history are more likely to get approved.",
        "Urban property areas often have higher approval rates.",
        "Graduates tend to have better loan approval chances.",
        "Longer loan terms may increase approval chances but result in higher interest."
    ])

    return knowledge_base

In [22]:
class RAGLoanApprovalChatbot:
    def __init__(self):
        # Use a lightweight embedding model
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Use a lightweight generative model
        self.generator = pipeline('text-generation', model='gpt2')

        # Load and prepare data (using already-loaded train_df)
        self.knowledge_base = load_data()  # No parameter needed now
        self.knowledge_embeddings = self.embedding_model.encode(self.knowledge_base)

    def retrieve_relevant_info(self, query, top_k=3):
        query_embedding = self.embedding_model.encode(query)
        similarities = cosine_similarity(
            [query_embedding],
            self.knowledge_embeddings
        )[0]
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [self.knowledge_base[i] for i in top_indices]

    def generate_response(self, query):
        relevant_info = self.retrieve_relevant_info(query)
        context = "\n".join(relevant_info)

        prompt = f"""
        You are a loan approval assistant. Answer the user's question based on the context below.
        If you don't know the answer, say you don't know. Be concise and helpful.

        Context:
        {context}

        Question: {query}
        Answer:
        """

        response = self.generator(
            prompt,
            max_length=200,
            num_return_sequences=1,
            temperature=0.7,
            truncation=True
        )

        return response[0]['generated_text'].replace(prompt, "").strip()

In [31]:
# Initialize the chatbot
chatbot = RAGLoanApprovalChatbot()

# Example usage
question = "What factors affect loan approval?"
answer = chatbot.generate_response(question)
print(f"Question: {question}")
print(f"Answer: {answer}")

question = "How does income affect loan approval chances?"
answer = chatbot.generate_response(question)
print(f"\nQuestion: {question}")
print(f"Answer: {answer}")

question = "What's the typical loan amount in the data?"
answer = chatbot.generate_response(question)
print(f"\nQuestion: {question}")
print(f"Answer: {answer}")

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Question: What factors affect loan approval?
Answer: x                                    x


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Question: How does income affect loan approval chances?
Answer: 

Question: What's the typical loan amount in the data?
Answer: 


In [30]:
# Add try-except blocks for robust error handling
try:
    answer = chatbot.generate_response(question)
    print(f"Answer: {answer}")
except Exception as e:
    print(f"Error generating response: {str(e)}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: 


In [32]:
# Create an interactive chat loop
print("Loan Approval Chatbot (type 'quit' to exit)")
while True:
    user_input = input("\nYou: ")
    if user_input.lower() in ['quit', 'exit']:
        break
    try:
        response = chatbot.generate_response(user_input)
        print(f"Bot: {response}")
    except Exception as e:
        print(f"Bot: Sorry, I encountered an error. Please try again.")

Loan Approval Chatbot (type 'quit' to exit)

You: quit


In [34]:
# Create test cases
test_questions = [
    "What credit score is needed for loan approval?",
    "How does employment status affect approval?",
    "What's the maximum loan amount available?"
]

for q in test_questions:
    print(f"\nTesting question: {q}")
    print("Response:", chatbot.generate_response(q))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Testing question: What credit score is needed for loan approval?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Response: 

Testing question: How does employment status affect approval?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Response: Urban property area residents are more likely to be approved by the school district.
Ask about eligibility for a loan.
If your property is in an Urban area and you would like to be notified of the approval, post your question to a local business or school board to hear about it.
If you are under the age of 18, the school district will have a program that allows you to transfer to a different school. If you are 18 or over, you can transfer to a different school if you are 18 or over.

Testing question: What's the maximum loan amount available?
Response: Additional Interest:


In [35]:
import gradio as gr

def chat_interface(question):
    return chatbot.generate_response(question)

iface = gr.Interface(fn=chat_interface,
                    inputs="text",
                    outputs="text",
                    title="Loan Approval Chatbot")
iface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ba15468e047b6de8bb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


