In [15]:
import getpass
import os
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv 
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

In [16]:
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

In [17]:
import pandas as pd
import numpy as np

df = pd.read_csv("datasets/Loan_default.csv")
df.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [18]:
# Check for missing values and handle them
print(df.isnull().sum())

LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64


In [19]:
# Handle missing values in the column by filling it with the most frequent value (mode) of that column.
for column in df.columns:
  if df[column].isnull().sum() > 0 and not df[column].mode().empty:
    df[column].fillna(df[column].mode().iloc[0], inplace=True)

In [20]:
# Handle missing values in the target variable 'Default'
# Option 1: Drop rows with missing target values (if very few)
dropna_df = df.dropna(subset=['Default'], inplace=True)
display(dropna_df)

None

In [21]:
df = df.iloc[:10,:]
df.shape

(10, 18)

In [51]:
# Define the function to create a prompt
def create_prompt(data_point):
    prompt = f"""Predict the default probability for this user given LoanID: {data_point['LoanID']}.
Details:
    - Age: {data_point['Age']}
    - Income: {data_point['Income']}
    - Loan Amount: {data_point['LoanAmount']}
    - Credit Score: {data_point['CreditScore']}
    - Months Employed: {data_point['MonthsEmployed']}
    - Number of Credit Lines: {data_point['NumCreditLines']}
    - Interest Rate: {data_point['InterestRate']}
    - Loan Term: {data_point['LoanTerm']}
    - Debt-to-Income Ratio: {data_point['DTIRatio']}
    - Education: {data_point['Education']}
    - Employment Type: {data_point['EmploymentType']}
    - Marital Status: {data_point['MaritalStatus']}
    - Has Mortgage: {data_point['HasMortgage']}
    - Has Dependents: {data_point['HasDependents']}
    - Loan Purpose: {data_point['LoanPurpose']}
    - Has Co-Signer: {data_point['HasCoSigner']}
    calculate it when the LoanId is given as the input
    """

    return prompt

# Create a list to store prompts and responses
# data_list = []

# Generate prompts and responses
# for index, row in df.iterrows():
#     print(row)
#     prompt = create_prompt(row)
#     data_list.append(prompt)

In [58]:
def generate_prompt_by_loanid(loan_id, df):
    # Filter the dataframe for the provided LoanID
    data_point = df[df['LoanID'] == loan_id] 
    #print(data_point)
    
    # Check if LoanID exists
    if data_point.empty:
        return f"No record found for LoanID: {loan_id}"
    
    # Generate prompt for the filtered data
    prompt = create_prompt(data_point.iloc[0])  # Convert row to series
    return prompt

In [59]:
Loan_ID='C1OZ6DPJ8Y'
data_input = generate_prompt_by_loanid(Loan_ID,df) 

In [38]:
from langchain.docstore.document import Document

documents = []

# Iterate over rows using .rows() method
for i, row_tuple in df.iterrows():
    document = f"id:{i}\ LoanID: {row_tuple.iloc[0]}\ Credit Score: {row_tuple.iloc[4]}\ Debt-to-Income Ratio:{row_tuple.iloc[9]}"
    documents.append(Document(page_content=document))
# display(documents)

In [25]:
llm = ChatGoogleGenerativeAI(model='gemini-pro', google_api_key=GEMINI_API_KEY, convert_system_message_to_human=True)

In [26]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001",google_api_key = GEMINI_API_KEY)

In [27]:
from langchain.vectorstores import Chroma

persist_directory = 'docs/chroma_rag/'

langchain_chroma = Chroma.from_documents(
    documents=documents,
    collection_name="default_prediction",
    embedding=embeddings,
    persist_directory=persist_directory
)

In [28]:
""""
Financial coefficients required to calculate PD: 
These values may vary according to the FInancial Lending organisation
Intercept = b0
Credit Score = b1
Debt-to-income ratio = b2

PD = 1/(1+e^(-z))
where,The linear combination of borrower characteristics and their corresponding coefficients
z = b0+b1.x1+b2.x2+....+bn.xn 
=>b0+b1*[credit Score]+b2*[Debt-to-income-ratio]
"""
b0 = -1.5
b1 = -0.005
b2 = 0.07

In [60]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from IPython.display import display, Markdown
import os
import warnings
warnings.filterwarnings('ignore')


# Define the prompt template
template = """
You are an Credit Risk Expert in Financial Text Data, Analyse the question and get the context and Answer the following:
1.**Instruction :**
    - Predict if the Given Customer is going to Default or not by calculating Default using formula below and consider the return values as flag.
2. **Analysis Criteria:**
   - Assess overall creditworthiness based on payment patterns and credit utilization.
3. **Output Requirements:**
   - If the customer has no defaults:
     - Respond with: "The loan associated with Loan ID [Insert Loan ID] has no recorded defaults. It is safe to proceed with any further actions related to this loan."
   - If there are potential risks (e.g., late payments, high credit utilization):
     - Respond with: "The loan associated with Loan ID [Insert Loan ID] has recorded defaults. This poses a risk for further lending activities and may require immediate attention to mitigate potential financial repercussions"
   - Provide any additional recommendations or next steps if necessary.
4. **Tone:** 
   - Professional, concise, and informative.
5. *Response*
   - The loan associated with Loan ID and may 
  require immediate attention to mitigate potential financial repercussions. 
  The customer has a (high/low) credit score  and a relatively (high/low) debt-to-income ratio. 
  These factors increase the likelihood of default.


Question: {question}
Context: {context}


z = b0+b1*[Inser Credit Score]+b2*[Insert Debt-to-income-ratio]
Formula for Default = 1/(1+e^(-z))
Answer: 
"""

PROMPT = PromptTemplate(input_variables=["context", "query","b0","b1","b2"], template=template)

# Ensure llm and langchain_chroma are properly initialized
context = langchain_chroma.as_retriever(search_kwargs={"k": 1})

qa_chain = RetrievalQA.from_chain_type(
    llm, retriever=context, chain_type_kwargs={"prompt": PROMPT}
)
# LonaID=input("Enter LoanID")
question = data_input
print(question)

# Run the QA chain

try:
    #result = qa_chain.invoke({"query": question})
    result = qa_chain.invoke(question)
    display(result.get('result'))
except Exception as e:
    print(f"Error encountered: {e}")

Predict the default probability for this user given LoanID: C1OZ6DPJ8Y.
Details:
    - Age: 46
    - Income: 84208
    - Loan Amount: 129188
    - Credit Score: 451
    - Months Employed: 26
    - Number of Credit Lines: 3
    - Interest Rate: 21.17
    - Loan Term: 24
    - Debt-to-Income Ratio: 0.31
    - Education: Master's
    - Employment Type: Unemployed
    - Marital Status: Divorced
    - Has Mortgage: Yes
    - Has Dependents: Yes
    - Loan Purpose: Auto
    - Has Co-Signer: No
    calculate it when the LoanId is given as the input
    


'The loan associated with Loan ID C1OZ6DPJ8Y has recorded defaults. This poses a risk for further lending activities and may require immediate attention to mitigate potential financial repercussions. The customer has a low credit score and a relatively high debt-to-income ratio. These factors increase the likelihood of default.'

In [None]:
#call streamlit through notebbok


In [15]:

# model_id = 'HuggingFaceH4/zephyr-7b-beta'

# device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# # set quantization configuration to load large model with less GPU memory
# # this requires the `bitsandbytes` library
# bnb_config = transformers.BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type='nf4',
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=bfloat16
# )

# print(device)

In [16]:
# from accelerate import Accelerator
# accelerator = Accelerator()
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# model_config = transformers.AutoConfig.from_pretrained(
#    model_id,
#     trust_remote_code=True,
#     max_new_tokens=1024
# )
# model = transformers.AutoModelForCausalLM.from_pretrained(
#     model_id,
#     trust_remote_code=True,
#     config=model_config,
#     quantization_config=bnb_config,
#     device_map='auto',
#     low_cpu_mem_usage=True
# )
# tokenizer = AutoTokenizer.from_pretrained(model_id)

In [17]:
# # Initialize the query pipeline with increased max_length
# query_pipeline = transformers.pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     torch_dtype=torch.float16,
#     max_length=6000,  # Increase max_length
#     max_new_tokens=500,  # Control the number of new tokens generated
#     device_map="auto",
# )

In [18]:
# from IPython.display import display, Markdown
# def colorize_text(text):
#     for word, color in zip(["Reasoning", "Question", "Answer", "Total time"], ["blue", "red", "green", "magenta"]):
#         text = text.replace(f"{word}:", f"\n\n**<font color='{color}'>{word}:</font>**")
#     return text

# llm = HuggingFacePipeline(pipeline=query_pipeline)

# question = "Please explain Default in Credit Risk?"
# response = llm(prompt=question)

# full_response =  f"Question: {question}\nAnswer: {response}"
# display(Markdown(colorize_text(full_response)))

In [19]:
# from langchain.chains import RetrievalQA
# from langchain.prompts import PromptTemplate
# from langchain_community.llms import HuggingFaceHub
# from IPython.display import display, Markdown
# import os
# import warnings
# warnings.filterwarnings('ignore')


# # Define the prompt template
# template = """
# You are an Credit Risk Expert in Financial Text Data, Analyse the question and get the context and Answer the following:
# 1.**Instruction :**
#     - Predict is the Given Customer is going to Default or not by checking the flag 1 or 0.
# 2. **Analysis Criteria:**
#    - Check for any recorded defaults in the past 12 months.
#    - Assess overall creditworthiness based on payment patterns and credit utilization.
# 3. **Output Requirements:**
#    - If the customer has no defaults:
#      - Respond with: "The loan associated with Loan ID [Insert Loan ID] has no recorded defaults. It is safe to proceed with any further actions related to this loan."
#    - If there are potential risks (e.g., late payments, high credit utilization):
#      - Respond with: "The loan associated with Loan ID [Insert Loan ID] has recorded defaults. This poses a risk for further lending activities and may require immediate attention to mitigate potential financial repercussions"
#    - Provide any additional recommendations or next steps if necessary.

# 4. **Tone:** 
#    - Professional, concise, and informative.

# Question: {question}
# Context: {context}
# Answer:
# """
# PROMPT = PromptTemplate(input_variables=["context", "query"], template=template)

# # Ensure llm and langchain_chroma are properly initialized
# context = langchain_chroma.as_retriever(search_kwargs={"k": 1})

# qa_chain = RetrievalQA.from_chain_type(
#     llm, retriever=context, chain_type_kwargs={"prompt": PROMPT}
# )

# # Define your question
# # question = "The company reported inflated revenues by including sales that never occurred."
# # question = "\n    Predict the default probability for the following loan applicant:\n\n    - Age: 56\n    - Income: 85994\n    - Loan Amount: 50587\n    - Credit Score: 520\n    - Months Employed: 80\n    - Number of Credit Lines: 4\n    - Interest Rate: 15.23\n    - Loan Term: 36\n    - Debt-to-Income Ratio: 0.44\n    - Education: Bachelor's\n    - Employment Type: Full-time\n    - Marital Status: Divorced\n    - Has Mortgage: Yes\n    - Has Dependents: Yes\n    - Loan Purpose: Other\n    - Has Co-Signer: Yes\n    "
# # question = "Revenue was recognized prematurely before the actual sales occurred."
# # question = "The balance sheet provides a true and fair view of the company’s financial position."
 
# question = str(data)
# # question = eval(str(data_list[0]))
# # print(type(question))
# # Run the QA chain

# try:
#     result = qa_chain({"query": question})
#     display(result)
# except Exception as e:
#     print(f"Error encountered: {e}")
# # except RuntimeError as e:
# #     print(f"RuntimeError encountered: {e}")