In [2]:
import json
import os
from glob import glob

import joblib
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings
from langchain.prompts import PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, XGBClassifier

load_dotenv()

True

# Initialize all models and connectors

In [4]:
# --- MongoDB connection -----------------------------------------------------
# The connection string comes from the environment (see the load_dotenv() call
# in the imports cell). Fail fast when it is missing: MongoClient(None) does
# not raise, it quietly targets the default local host, which would only
# surface later as confusing empty query / vector-search results.
MONGO_CONN = os.environ.get("MONGO_CONNECTION_STRING")
if not MONGO_CONN:
    raise RuntimeError(
        "MONGO_CONNECTION_STRING is not set; add it to the environment or the .env file"
    )
client = MongoClient(MONGO_CONN)
col = client["bfsi-genai"]["credit_history"]   # records used to train / score the classifier
vcol = client["bfsi-genai"]["cc_products"]     # collection backing the vector store

# --- LLMs ---------------------------------------------------------------------
# `llm` writes the explanation / recommendation text, `llm_large` is handed to
# the multi-query retriever.
llm = ChatOpenAI(temperature=0.2, model_name="gpt-3.5-turbo")
llm_large = ChatOpenAI(temperature=0, model_name="gpt-4")

# --- Embeddings + vector store ------------------------------------------------
repo_id = "hkunlp/instructor-base"
hf = HuggingFaceInstructEmbeddings(model_name=repo_id, cache_folder="tmp/")
hf.embed_instruction = "Represent the document for retrieval of personalized credit cards:"
vectorstore = MongoDBAtlasVectorSearch(vcol, hf)

# Optional aggregation stages: only needed when results should be grouped
# per card title instead of returned chunk by chunk.
# NOTE(review): `pipeline` is not referenced by any other cell visible here.
pipeline = [
    # 1. Attach the search score of each matched chunk.
    {"$addFields": {"score": {"$meta": "searchScore"}}},
    # 2. Collapse chunks that share a title; average their scores.
    {
        "$group": {
            "_id": "$title",
            "source": {"$first": "$source"},
            "title": {"$first": "$title"},
            "text": {"$push": "$text"},
            "score": {"$avg": "$score"},
        }
    },
    # 3. Concatenate the pushed text chunks into a single string.
    {
        "$addFields": {
            "text": {
                "$reduce": {
                    "input": "$text",
                    "initialValue": "",
                    "in": {"$concat": ["$$value", "$$this"]},
                }
            }
        }
    },
    # 4. Best average score first.
    {"$sort": {"score": -1}},
    # 5. Keep the top five titles.
    {"$limit": 5},
]


# LLM-powered retriever for product suggestions: a similarity retriever over
# the vector store (k=3 per query), wrapped in a MultiQueryRetriever that is
# built from `llm_large`.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
recommender_retriever = MultiQueryRetriever.from_llm(
    retriever=retriever,
    llm=llm_large,
)

  from tqdm.autonotebook import trange
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: hkunlp/instructor-base


Downloading (…)62736/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading (…)15e6562736/README.md:   0%|          | 0.00/66.2k [00:00<?, ?B/s]

Downloading (…)e6562736/config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)62736/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.43k [00:00<?, ?B/s]

Downloading (…)6562736/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer


INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


max_seq_length  512


# Train an ML model to predict candidate delinquency and save the model as a checkpoint

In [3]:
# Pull the credit-history records; `_id` and the "Unnamed: 0" column are
# excluded through the projection.
df = pd.DataFrame.from_records(col.find({}, {"_id": 0, "Unnamed: 0": 0}))

# Separate target from predictors
y = df.SeriousDlqin2yrs
X = df.drop(['SeriousDlqin2yrs'], axis=1)

MODEL_PATH = "../model/classifier.jlb"

# Reuse the saved checkpoint when there is one; train (and save) only when it
# is missing. The previous guard was inverted: it retrained whenever the file
# existed and tried to load a non-existent file otherwise. os.path.exists is
# also independent of the path-separator style that glob() returns.
if os.path.exists(MODEL_PATH):
    model = joblib.load(MODEL_PATH)
else:
    print(f"No saved classifier at {MODEL_PATH}; training a new one")
    # Divide data into training and validation subsets (stratified on the target)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # `verbosity` is the estimator-level logging option; the original passed
    # `verbose`, which is not an XGBClassifier constructor parameter.
    model = XGBClassifier(learning_rate=0.1, n_estimators=1000, verbosity=1)

    model.fit(X_train, y_train)

    # Make sure the target directory exists before writing the checkpoint.
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
    joblib.dump(model, MODEL_PATH)

NameError: name 'col' is not defined

# Load saved Classifier model

In [98]:
# No need to re-run this cell every time.
# Loads the saved classifier, ranks its features by importance, fetches one
# stored profile and scores it.
model = joblib.load("../model/classifier.jlb")
imp_idx = np.argsort(-model.feature_importances_)  # indices, most important first

profile_query = {"Unnamed: 0": 241}
profile_projection = {"_id": 0, "Unnamed: 0": 0, "SeriousDlqin2yrs": 0}
df = pd.DataFrame.from_records(col.find(profile_query, profile_projection))

# One line per feature, in descending importance order.
ranked_features = zip(df.columns[imp_idx], model.feature_importances_[imp_idx])
feature_importance = "\n".join(
    [
        f"Columns:{column}  Prob score for decision making:{score}"
        for column, score in ranked_features
    ]
)

user_profile_ip = df.to_dict(orient="records")[0]
# Second column of predict_proba, first (only) row.
pred = model.predict_proba(df)[0, 1]

0.44639033

# Features the model considers most important for classification

In [99]:

# Show the ranked "column / importance score" lines that are later embedded in the prompt.
print(feature_importance)

Columns:NumberOfTimes90DaysLate  Prob score for decision making:0.46804147958755493
Columns:NumberOfTime60-89DaysPastDueNotWorse  Prob score for decision making:0.12263187766075134
Columns:NumberOfTime30-59DaysPastDueNotWorse  Prob score for decision making:0.11812681704759598
Columns:RevolvingUtilizationOfUnsecuredLines  Prob score for decision making:0.09055158495903015
Columns:NumberRealEstateLoansOrLines  Prob score for decision making:0.038781192153692245
Columns:MonthlyIncome  Prob score for decision making:0.03306850045919418
Columns:DebtRatio  Prob score for decision making:0.03270106017589569
Columns:NumberOfOpenCreditLinesAndLoans  Prob score for decision making:0.03246095031499863
Columns:NumberOfDependents  Prob score for decision making:0.03210572153329849
Columns:age  Prob score for decision making:0.031530871987342834


In [86]:
# young 115
# i=8
# ip = X_valid[["RevolvingUtilizationOfUnsecuredLines", "age", "NumberOfTime30-59DaysPastDueNotWorse", "DebtRatio", "MonthlyIncome", "NumberOfOpenCreditLinesAndLoans", "NumberOfTimes90DaysLate", "NumberRealEstateLoansOrLines", "NumberOfTime60-89DaysPastDueNotWorse", "NumberOfDependents"]]
# c = ip.values[i].tolist()
# dlq = res[i]
# c,dlq

In [87]:
# Prompt asking the LLM to explain the credit decision in layman's terms.
# Three values are interpolated into the text:
#   - feature_importance : the ranked feature-importance lines
#   - user_profile_ip    : the candidate's field values as a dict
#   - pred               : the model output reported as SeriousDlqin2yrs
# Because this is an f-string, the values are baked in when the cell runs;
# re-run this cell after any of them changes.
# NOTE(review): the prompt text contains spelling mistakes (e.g. "weather",
# "expericen", "importace"); they are runtime text and are left untouched here.
prompt = f"""
##Instruction: Taking into account the Definitions of various fields and their respective values a model is trained to predict weather a person will expericen delinquency or not in the next 2 years. Below both the values that was input to the model and the result produced by the model are provided. As a bank employee response to the candidate, It is expected to provide a detailed reason in layman language as to why a Credit request was rejected or processed given the profile of the candidate. Also while providing reason do not mention the use of AI model being used for decision making.

##Definitions
RevolvingUtilizationOfUnsecuredLines=Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits DataType=percentage
age=Age of borrower in years DataType=integer
NumberOfTime30-59DaysPastDueNotWorse=Number of times borrower has been 30-59 days past due but no worse in the last 2 years. DataType=integer
DebtRatio=Monthly debt payments, alimony,living costs divided by monthy gross income DataType=percentage
MonthlyIncome=Monthly income in INR DataType=real
NumberOfOpenCreditLinesAndLoans=Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards) DataType=integer
NumberOfTimes90DaysLate=Number of times borrower has been 90 days or more past due. DataType=integer
NumberRealEstateLoansOrLines=Number of mortgage and real estate loans including home equity lines of credit DataType=integer
NumberOfTime60-89DaysPastDueNotWorse=Number of times borrower has been 60-89 days past due but no worse in the last 2 years. DataType=integer
NumberOfDependents=Number of dependents in family excluding themselves (spouse, children etc.) DataType=integer
SeriousDlqin2yrs=Person experienced 90 days past due delinquency or worse  DataType=Percentage

##Feature importace of the model used:
{feature_importance}

##Values for given profile to be use to predict the Result(SeriousDlqin2yrs) with a reason
{user_profile_ip}

## Model Result
SeriousDlqin2yrs={pred}

##Reason in step by step points:
"""
# Echo the fully rendered prompt for inspection.
print(prompt)


##Instruction: Taking into account the Definitions of various fields and their respective values a model is trained to predict weather a person will expericen delinquency or not in the next 2 years. Below both the values that was input to the model and the result produced by the model are provided. As a bank employee response to the candidate, It is expected to provide a detailed reason in layman language as to why a Credit request was rejected or processed given the profile of the candidate. Also while providing reason do not mention the use of AI model being used for decision making.

##Definitions
RevolvingUtilizationOfUnsecuredLines=Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits DataType=percentage
age=Age of borrower in years DataType=integer
NumberOfTime30-59DaysPastDueNotWorse=Number of times borrower has been 30-59 days past due but no worse in the last 2 years. DataType=i

In [88]:
# Send the rendered prompt to `llm`; `response` is reused as the user-profile
# text in the credit-card suggestion prompts.
response = llm.predict(prompt)

In [89]:
# Display the LLM's step-by-step explanation.
print(response)

1. The person's revolving utilization of unsecured lines is 97.33%. This means that they have a high balance on their credit cards and personal lines of credit compared to their credit limits. This could indicate that they are relying heavily on credit and may have difficulty managing their debt.

2. The person is 61 years old. Age can be a factor in determining creditworthiness as older individuals may have a more stable financial situation and a longer credit history.

3. The person has been 30-59 days past due on their payments once in the last 2 years. This suggests that they have had some difficulty in making their payments on time.

4. The person's debt ratio is 20.99%. This means that their monthly debt payments, including alimony and living costs, are almost 21% of their monthly gross income. A high debt ratio indicates that a significant portion of their income is going towards debt repayment, which may make it difficult for them to handle additional credit.

5. The person has

# Credit Card suggestions

In [93]:
# Step 1: retrieval query. The LLM-written profile explanation (`response`)
# is embedded in an instruction and handed to the multi-query retriever.
user_profile_based_card_template=f"""
##Instruction: Given the user profile recommended credit cards that will best fit the user profile. Provide reason as to why the credit card is suggested to the user for each card.

## User profile:
{response}

## Recommendations with reasons point by point:
"""
rec = recommender_retriever.get_relevant_documents(user_profile_based_card_template)

# Step 2: render the retrieved documents as a bullet list for the prompt.
# Hyphens in the stored title are replaced by spaces. The original string
# ended each entry with a literal " +" (left over from a string
# concatenation); it is dropped so it no longer leaks into the prompt.
card_suggestions = "".join(
    f'- Card name:{r.metadata["title"].replace("-", " ")} card \n  Card Features:{r.page_content}\n'
    for r in rec
)

# Step 3: ask the LLM for the final recommendations (JSON expected).
recomendations_template=f"""
##Instruction:
-If the the user is considerd High/Moderate risk of default and suggestion on user profile in rejection of credit request then return "No Credit Card Recomended"
-Given the user profile recommended credit cards that will best fit the user profile. 
-Provide card by card reasons(concise) as to why the credit card is suggested to the user.  

## User profile:
{response}

## Recommended Credit cards if Eligible:
{card_suggestions}

## Recommendations=Output as Json with card name as Key and concise reasons point by point as Value:
"""
resp = llm.predict(recomendations_template)

# Step 4: display. The previous code dropped the first and last line of the
# response to strip the JSON braces, which printed nothing at all for a
# single-line answer such as "No Credit Card Recomended" (an answer the prompt
# explicitly asks for). Parse the JSON instead, and fall back to the raw text
# whenever the response is not a JSON object.
try:
    recommendations = json.loads(resp)
except json.JSONDecodeError:
    recommendations = None

if isinstance(recommendations, dict):
    for card_name, reasons in recommendations.items():
        # Reasons may come back as one string or as a list of points.
        if isinstance(reasons, (list, tuple)):
            reasons = "\n".join(str(point) for point in reasons)
        print(f"{card_name}:\n{reasons}\n")
else:
    print(resp)

INFO:root:Generated queries: ['1. What credit cards would be most suitable for a person with a high revolving utilization of unsecured lines and why?', '2. Which credit cards are recommended for a 61-year-old individual with a history of late payments and why?', '3. What are the best credit card options for someone with a debt ratio of 20.99% and why?', '', '1. What credit cards can be suggested for someone who heavily relies on credit and has a history of late payments, and what are the reasons for these suggestions?', '2. What credit cards are suitable for a 61-year-old with a high income but also a high debt ratio, and why are these cards recommended?', '3. What credit cards are recommended for someone with multiple open credit lines but no real estate loans, and why are these cards suggested?', '', '1. What credit cards are best for someone with a high balance on their credit cards and personal lines of credit, and why are these cards recommended?', '2. What credit cards are suitab

In [94]:
# Debug view: the response split into lines, without the first line.
resp.split("\n")[1:]

['  "titanium edge card": "1. Offers reward points for every ₹150 spent\\n2. Provides dining privileges with 50% more reward points\\n3. Fuel surcharge waiver of up to ₹250 per billing cycle\\n4. EMV Chip technology for fraud protection\\n5. 50 days of interest-free period for purchases\\n6. Revolving credit at a nominal interest rate\\n7. Eligibility criteria for salaried employees aged 21 to 60 or self-employed individuals aged 21 to 65\\n8. Annual fees can be waived by spending ₹50,000 in a']

In [95]:
# Debug view: the raw, unparsed LLM response.
resp

'{\n  "titanium edge card": "1. Offers reward points for every ₹150 spent\\n2. Provides dining privileges with 50% more reward points\\n3. Fuel surcharge waiver of up to ₹250 per billing cycle\\n4. EMV Chip technology for fraud protection\\n5. 50 days of interest-free period for purchases\\n6. Revolving credit at a nominal interest rate\\n7. Eligibility criteria for salaried employees aged 21 to 60 or self-employed individuals aged 21 to 65\\n8. Annual fees can be waived by spending ₹50,000 in a'

In [100]:
# Run the multi-query retrieval for the profile-based template and keep the
# returned documents in `rec` for inspection.
# NOTE(review): the credit-card suggestion cell issues this same call; each
# run triggers new LLM-generated queries (see the INFO log output).
rec = recommender_retriever.get_relevant_documents(user_profile_based_card_template)

INFO:root:Generated queries: ['1. What credit cards would be suitable for a person with a high revolving utilization of unsecured lines and why?', '2. Which credit cards are recommended for a 61-year-old individual with a history of late payments and why?', '3. What are the best credit cards for someone with a debt ratio of 20.99% and a monthly income of 137,500 INR and why?', '', '4. What credit cards are suitable for a person with 11 open credit lines and loans and no real estate loans or lines of credit and why?', '5. Which credit cards are recommended for a person who has been 60-89 days past due on their payments once in the last 2 years and has 1 dependent and why?', '', '6. What credit cards are suitable for a person with a 44.64% probability of experiencing delinquency in the next 2 years and why?']


In [101]:
# Debug view: the retrieved Document objects (page_content + metadata).
rec

[Document(page_content='The credit card mentioned above offers various features and benefits. Users can earn reward points for every ₹150 spent and redeem them for exclusive gifts, air miles, or cash back against their outstanding credit card balance. The card also provides dining privileges with 50% more reward points, a fuel surcharge waiver of up to ₹250 per billing cycle, and EMV Chip technology for fraud protection. Users can enjoy 50 days of interest-free period for purchases and avail of revolving credit at a nominal interest rate. The eligibility criteria for the card are being a salaried employee aged 21 to 60 or a self-employed individual aged 21 to 65. The annual fees can be waived by spending ₹50,000 in a year, and there is a cash advance fee of 2.5% with a minimum of ₹500. Interest at a rate of 3.49% per month will be charged on any outstanding amount carried beyond the bill due date.', metadata={'_id': ObjectId('6517ebd2aba949c4b64d3be2'), 'source': 'https://www.tengenban