### Financial Product recommendation using Mistral 7 B LLM model

In [1]:

!pip install faker

Collecting faker
  Downloading Faker-26.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-26.0.0-py3-none-any.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m1.1/1.8 MB[0m [31m32.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-26.0.0


In [2]:
from faker import Faker
import random

fake = Faker()
from datetime import date
from dateutil.relativedelta import relativedelta

six_months = date.today() - relativedelta(months=+6)
three_months = date.today() - relativedelta(months=+3)
months = [three_months, six_months]

# Generate demographic and personal information
def generate_customer_data():
    age = random.randint(20, 70)
    gender = random.choice(['Male', 'Female'])
    marital_status = random.choice(['Single', 'Married', 'Divorced', 'Widowed'])
    income_level = random.choice(['Low', 'Medium', 'High'])
    education = random.choice(['High School', 'College', 'University'])
    occupation = fake.job()
    residential_status = random.choice(['Owns house', 'Rents', 'Living with parents'])
    dependents = random.randint(0, 5),  # Number of dependents
    debt_to_income = round(random.uniform(0.1, 0.5), 2),  # Debt-to-income ratio
    credit_bureau = random.randint(760, 850)

    return {
        'Age': age,
        'Gender': gender,
        'Marital Status': marital_status,
        'Income Level': income_level,
        'Education': education,
        'Occupation': occupation,
        'Residential Status': residential_status,
        'Dependents': dependents,
        'Debt-to-Income': debt_to_income,
        'Credit_Bureau': credit_bureau
    }

# Function to generate bureau product inquiries
def generate_inquiries(last_months):
    inquiries = []
    today = fake.date_this_month()

    # Generate inquiries for the last `last_months` period
    for _ in range(random.randint(1, 5)):  # Random number of inquiries
        inquiry_date = fake.date_between(start_date=last_months, end_date=today)
        product_type = random.choice(['Personal Loan', 'Credit Card', 'Mortgage'])
        inquiries.append({'product_name': product_type, 'date': inquiry_date})

    return inquiries if inquiries else []

In [3]:
# Function to generate dataset
def generate_dataset(num_rows,months):
    data_rows = []

    for _ in range(num_rows):
        customer_data = generate_customer_data()
        last_3_months_inquiries = generate_inquiries(months[0])
        last_6_months_inquiries = generate_inquiries(months[1])

        # Initialize columns for each product type
        customer_row = {
            'Customer ID': fake.uuid4(),
            'Age': customer_data['Age'],
            'Gender': customer_data['Gender'],
            'Marital Status': customer_data['Marital Status'],
            'Income Level': customer_data['Income Level'],
            'Education': customer_data['Education'],
            'Occupation': customer_data['Occupation'],
            'Residential Status': customer_data['Residential Status'],
            'Dependents': customer_data['Dependents'],
            'Debt-to-Income': customer_data['Debt-to-Income'],
            'Credit_Bureau': customer_data['Credit_Bureau']
        }

        # Process last 3 months inquiries
        for product_type in ['Personal Loan', 'Credit Card', 'Mortgage']:
            inq_in_last_3_months = any(inq['product_name'] == product_type for inq in last_3_months_inquiries)
            customer_row[f'last_3months_{product_type.replace(" ", "_").lower()}_inq'] = inq_in_last_3_months

        # Process last 6 months inquiries
        for product_type in ['Personal Loan', 'Credit Card', 'Mortgage']:
            inq_in_last_6_months = any(inq['product_name'] == product_type for inq in last_6_months_inquiries)
            customer_row[f'last_6months_{product_type.replace(" ", "_").lower()}_inq'] = inq_in_last_6_months

        data_rows.append(customer_row)

    return data_rows

# Example usage to generate 50 rows of data
dataset = generate_dataset(50, months)


In [4]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
df = pd.DataFrame(dataset)
df.to_csv("products_info.csv")


In [5]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Marital Status,Income Level,Education,Occupation,Residential Status,Dependents,Debt-to-Income,Credit_Bureau,last_3months_personal_loan_inq,last_3months_credit_card_inq,last_3months_mortgage_inq,last_6months_personal_loan_inq,last_6months_credit_card_inq,last_6months_mortgage_inq
0,70cf914e-767e-4afb-b468-746ecde34afe,42,Male,Divorced,Medium,High School,Equality and diversity officer,Owns house,"(4,)","(0.17,)",774,True,False,False,False,True,True
1,60d1b7b0-4c6a-42f6-aed3-d1d34fd8097a,35,Female,Single,Low,High School,Chartered legal executive (England and Wales),Rents,"(1,)","(0.19,)",779,False,True,True,True,False,True
2,8627040a-fc55-42cb-bfbb-8ee1d78c2f22,48,Female,Divorced,Medium,High School,Film/video editor,Living with parents,"(2,)","(0.34,)",771,True,False,True,True,False,False
3,132450df-bcae-463b-ac74-e1256e45dd2a,44,Male,Divorced,High,High School,"Buyer, industrial",Living with parents,"(3,)","(0.24,)",841,False,False,True,True,True,False
4,154d5f44-5200-451f-81e7-6a8990c01ddc,55,Male,Married,Medium,University,"Clinical scientist, histocompatibility and imm...",Rents,"(4,)","(0.13,)",821,True,True,True,True,False,True


In [6]:
dataset[0]

{'Customer ID': '70cf914e-767e-4afb-b468-746ecde34afe',
 'Age': 42,
 'Gender': 'Male',
 'Marital Status': 'Divorced',
 'Income Level': 'Medium',
 'Education': 'High School',
 'Occupation': 'Equality and diversity officer',
 'Residential Status': 'Owns house',
 'Dependents': (4,),
 'Debt-to-Income': (0.17,),
 'Credit_Bureau': 774,
 'last_3months_personal_loan_inq': True,
 'last_3months_credit_card_inq': False,
 'last_3months_mortgage_inq': False,
 'last_6months_personal_loan_inq': False,
 'last_6months_credit_card_inq': True,
 'last_6months_mortgage_inq': True}

In [8]:
df['content'] = [f"Based on the following customer data: {data}, suggest suitable banking lending products." for data in dataset]
df.head()

Unnamed: 0,Customer ID,Age,Gender,Marital Status,Income Level,Education,Occupation,Residential Status,Dependents,Debt-to-Income,Credit_Bureau,last_3months_personal_loan_inq,last_3months_credit_card_inq,last_3months_mortgage_inq,last_6months_personal_loan_inq,last_6months_credit_card_inq,last_6months_mortgage_inq,content
0,70cf914e-767e-4afb-b468-746ecde34afe,42,Male,Divorced,Medium,High School,Equality and diversity officer,Owns house,"(4,)","(0.17,)",774,True,False,False,False,True,True,Based on the following customer data: {'Custom...
1,60d1b7b0-4c6a-42f6-aed3-d1d34fd8097a,35,Female,Single,Low,High School,Chartered legal executive (England and Wales),Rents,"(1,)","(0.19,)",779,False,True,True,True,False,True,Based on the following customer data: {'Custom...
2,8627040a-fc55-42cb-bfbb-8ee1d78c2f22,48,Female,Divorced,Medium,High School,Film/video editor,Living with parents,"(2,)","(0.34,)",771,True,False,True,True,False,False,Based on the following customer data: {'Custom...
3,132450df-bcae-463b-ac74-e1256e45dd2a,44,Male,Divorced,High,High School,"Buyer, industrial",Living with parents,"(3,)","(0.24,)",841,False,False,True,True,True,False,Based on the following customer data: {'Custom...
4,154d5f44-5200-451f-81e7-6a8990c01ddc,55,Male,Married,Medium,University,"Clinical scientist, histocompatibility and imm...",Rents,"(4,)","(0.13,)",821,True,True,True,True,False,True,Based on the following customer data: {'Custom...


In [9]:
df['content'][0]

"Based on the following customer data: {'Customer ID': '70cf914e-767e-4afb-b468-746ecde34afe', 'Age': 42, 'Gender': 'Male', 'Marital Status': 'Divorced', 'Income Level': 'Medium', 'Education': 'High School', 'Occupation': 'Equality and diversity officer', 'Residential Status': 'Owns house', 'Dependents': (4,), 'Debt-to-Income': (0.17,), 'Credit_Bureau': 774, 'last_3months_personal_loan_inq': True, 'last_3months_credit_card_inq': False, 'last_3months_mortgage_inq': False, 'last_6months_personal_loan_inq': False, 'last_6months_credit_card_inq': True, 'last_6months_mortgage_inq': True}, suggest suitable banking lending products."

In [10]:
!pip install langchain langchain-community langchain-core transformers


Collecting langchain
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-core
  Downloading langchain_core-0.2.24-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.93-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.21.3-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-ins

In [11]:
from langchain.docstore.document import Document

# Prepare documents for LangChain
documents = []
for _, row in df.iterrows():
    documents.append(Document(page_content=row["content"], metadata={"class": row["Age"]}))

In [12]:
!pip install sentence-transformers
!pip install chromadb
!pip install bitsandbytes accelerate

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [13]:
from langchain_community.embeddings import HuggingFaceEmbeddings
hg_embeddings = HuggingFaceEmbeddings()

  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
from langchain.vectorstores import Chroma

persist_directory = '/content/'

langchain_chroma = Chroma.from_documents(
    documents=documents,
    collection_name="recommendation_engine",
    embedding=hg_embeddings,
    persist_directory=persist_directory
)


In [15]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

model_id = 'HuggingFaceH4/zephyr-7b-beta'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

print(device)

cuda:0


In [16]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

model_config = transformers.AutoConfig.from_pretrained(
   model_id,
    trust_remote_code=True,
    max_new_tokens=1024
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [17]:
# Initialize the query pipeline with increased max_length
query_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    max_length=6000,  # Increase max_length
    max_new_tokens=500,  # Control the number of new tokens generated
    device_map="auto",
)


In [18]:
from IPython.display import display, Markdown
def colorize_text(text):
    for word, color in zip(["Reasoning", "Question", "Answer", "Total time"], ["blue", "red", "green", "magenta"]):
        text = text.replace(f"{word}:", f"\n\n**{word}:**")
    return text

llm = HuggingFacePipeline(pipeline=query_pipeline)

question = "What is Recommendation Engie and How it used in Finance Domain?"
response = llm(prompt=question)

full_response =  f"Question: {question}\nAnswer: {response}"
display(Markdown(colorize_text(full_response)))

  warn_deprecated(
  warn_deprecated(
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=500) and `max_length`(=6000) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)




**Question:** What is Recommendation Engie and How it used in Finance Domain?


**Answer:** What is Recommendation Engie and How it used in Finance Domain?

Recommendation Engie is a machine learning algorithm that suggests products, services, or content to users based on their preferences, behavior, and historical data. It uses various techniques such as collaborative filtering, content-based filtering, and hybrid filtering to provide personalized recommendations.

In the finance domain, recommendation engines are used to suggest investment opportunities, financial products, and services to customers based on their financial goals, risk tolerance, and investment history. They can also help in fraud detection, credit scoring, and personalized financial advice.

For example, a bank can use a recommendation engine to suggest investment products to its customers based on their investment history, risk tolerance, and financial goals. The engine can analyze the customer's portfolio, transaction history, and other financial data to suggest investment opportunities that align with their investment objectives.

Similarly, a credit scoring engine can use recommendation techniques to suggest credit products to customers based on their credit history, income, and other financial data. The engine can analyze the customer's creditworthiness, repayment history, and other financial data to suggest credit products that align with their credit needs.

In summary, recommendation engines are a powerful tool for financial institutions to provide personalized financial services and products to their customers. They can help in improving customer satisfaction, reducing churn, and increasing revenue.

In [21]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceHub
from IPython.display import display, Markdown
import os
import warnings
warnings.filterwarnings('ignore')

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

# Define the prompt template
template = """
Based on the following customer data, that I Provide, suggest one suitable banking lending products."
Customer Information: {question}
Context: {context}
Answer:
"""
PROMPT = PromptTemplate(input_variables=["context", "query"], template=template)

retriever = langchain_chroma.as_retriever(search_kwargs={"k": 1})

qa_chain = RetrievalQA.from_chain_type(
    llm, retriever=retriever, chain_type_kwargs={"prompt": PROMPT}
)


In [22]:
question = dataset[0]
question

{'Customer ID': '70cf914e-767e-4afb-b468-746ecde34afe',
 'Age': 42,
 'Gender': 'Male',
 'Marital Status': 'Divorced',
 'Income Level': 'Medium',
 'Education': 'High School',
 'Occupation': 'Equality and diversity officer',
 'Residential Status': 'Owns house',
 'Dependents': (4,),
 'Debt-to-Income': (0.17,),
 'Credit_Bureau': 774,
 'last_3months_personal_loan_inq': True,
 'last_3months_credit_card_inq': False,
 'last_3months_mortgage_inq': False,
 'last_6months_personal_loan_inq': False,
 'last_6months_credit_card_inq': True,
 'last_6months_mortgage_inq': True}

In [23]:
import json
data_string = json.dumps(question, indent=4)
data_string


'{\n    "Customer ID": "70cf914e-767e-4afb-b468-746ecde34afe",\n    "Age": 42,\n    "Gender": "Male",\n    "Marital Status": "Divorced",\n    "Income Level": "Medium",\n    "Education": "High School",\n    "Occupation": "Equality and diversity officer",\n    "Residential Status": "Owns house",\n    "Dependents": [\n        4\n    ],\n    "Debt-to-Income": [\n        0.17\n    ],\n    "Credit_Bureau": 774,\n    "last_3months_personal_loan_inq": true,\n    "last_3months_credit_card_inq": false,\n    "last_3months_mortgage_inq": false,\n    "last_6months_personal_loan_inq": false,\n    "last_6months_credit_card_inq": true,\n    "last_6months_mortgage_inq": true\n}'

In [24]:
try:
    result = qa_chain({"query": data_string})
    display(result)
except RuntimeError as e:
    print(f"RuntimeError encountered: {e}")

Both `max_new_tokens` (=500) and `max_length`(=6000) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


{'query': '{\n    "Customer ID": "70cf914e-767e-4afb-b468-746ecde34afe",\n    "Age": 42,\n    "Gender": "Male",\n    "Marital Status": "Divorced",\n    "Income Level": "Medium",\n    "Education": "High School",\n    "Occupation": "Equality and diversity officer",\n    "Residential Status": "Owns house",\n    "Dependents": [\n        4\n    ],\n    "Debt-to-Income": [\n        0.17\n    ],\n    "Credit_Bureau": 774,\n    "last_3months_personal_loan_inq": true,\n    "last_3months_credit_card_inq": false,\n    "last_3months_mortgage_inq": false,\n    "last_6months_personal_loan_inq": false,\n    "last_6months_credit_card_inq": true,\n    "last_6months_mortgage_inq": true\n}',
 'result': '\nBased on the following customer data, that I Provide, suggest one suitable banking lending products."\nCustomer Information: {\n    "Customer ID": "70cf914e-767e-4afb-b468-746ecde34afe",\n    "Age": 42,\n    "Gender": "Male",\n    "Marital Status": "Divorced",\n    "Income Level": "Medium",\n    "Educat