#  Step 1: Create Sample Dataset

In [1]:
import pandas as pd

# Create mock loan data
data = {
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
    'Married': ['Yes', 'No', 'Yes', 'No', 'Yes'],
    'Education': ['Graduate', 'Not Graduate', 'Graduate', 'Graduate', 'Not Graduate'],
    'Self_Employed': ['No', 'Yes', 'No', 'No', 'Yes'],
    'ApplicantIncome': [5000, 3000, 4000, 2500, 6000],
    'LoanAmount': [150, 120, 130, 100, 200],
    'Credit_History': [1.0, 0.0, 1.0, 1.0, 0.0],
    'Property_Area': ['Urban', 'Rural', 'Semiurban', 'Urban', 'Rural'],
    'Loan_Status': ['Y', 'N', 'Y', 'Y', 'N']
}

df = pd.DataFrame(data)
df.head()


Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status
0,Male,Yes,Graduate,No,5000,150,1.0,Urban,Y
1,Female,No,Not Graduate,Yes,3000,120,0.0,Rural,N
2,Male,Yes,Graduate,No,4000,130,1.0,Semiurban,Y
3,Female,No,Graduate,No,2500,100,1.0,Urban,Y
4,Male,Yes,Not Graduate,Yes,6000,200,0.0,Rural,N


# Step 2: Convert Each Row to Text Document

In [2]:
docs = []
for _, row in df.iterrows():
    doc = f"{row['Gender']} {row['Married']} applicant with {row['Education']} education, "
    doc += f"{row['Self_Employed']} self-employed, income {row['ApplicantIncome']}, loan {row['LoanAmount']}, "
    doc += f"credit history {row['Credit_History']}, property in {row['Property_Area']}. Loan status: {row['Loan_Status']}."
    docs.append(doc)

# Show sample
docs[:2]


['Male Yes applicant with Graduate education, No self-employed, income 5000, loan 150, credit history 1.0, property in Urban. Loan status: Y.',
 'Female No applicant with Not Graduate education, Yes self-employed, income 3000, loan 120, credit history 0.0, property in Rural. Loan status: N.']

# Step 3: Install Required Libraries



In [4]:
!pip install sentence-transformers transformers scikit-learn


Defaulting to user installation because normal site-packages is not writeable



# Step 4: Create Embeddings Using Sentence Transformer

In [6]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = model.encode(docs)

# Step 5: Define Retrieval Function (Cosine Similarity)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_top_k_docs(query, k=3):
    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
    top_k_indices = np.argsort(similarities)[-k:][::-1]
    return [docs[i] for i in top_k_indices]


# Step 6: Load Generative Model

In [8]:
from transformers import pipeline

generator = pipeline("text2text-generation", model="google/flan-t5-base")


config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


# tep 7: Final Answer Function

In [10]:
def answer_query(query):
    top_docs = get_top_k_docs(query)
    context = " ".join(top_docs)
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    result = generator(prompt, max_length=200)[0]['generated_text']
    return result


# Step 8: Ask a Question to the Chatbot

In [12]:
query = "What is the loan approval status of self-employed applicants?"
answer = answer_query(query)
print("Answer:", answer)


Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: Y
