SQL RAG AGENT

In [1]:
import os
from dotenv import load_dotenv 

load_dotenv()

True

In [2]:
from langchain_core.documents import Document

def load_txt_and_chunk(path: str) -> list[Document]:
    with open(path, "r") as f:
        raw = f.read()

    chunks = [c.strip() for c in raw.split("###") if c.strip()]
    docs = []

    for chunk in chunks:
        lines = chunk.splitlines()
        title = lines[0].strip() if lines else "unknown"

        docs.append(
            Document(
                page_content=chunk,
                metadata={
                    "type": "schema",
                    "table": title,
                    "source": path
                }
            )
        )

    return docs

schema_docs = load_txt_and_chunk("schema.txt")

In [3]:
from langchain_text_splitters import CharacterTextSplitter

with open("schema.txt") as f:
    schema_doc = f.read()

text_splitter = CharacterTextSplitter(
    separator="###",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

texts = text_splitter.create_documents([schema_doc])
print(texts[0])

  from .autonotebook import tqdm as notebook_tqdm
Created a chunk of size 1011, which is longer than the specified 1000
Created a chunk of size 1187, which is longer than the specified 1000
Created a chunk of size 1012, which is longer than the specified 1000


page_content='Customers Table

Represents individual bank customers.
Primary key: customer_id (uuid)

Important fields
full_name: customer’s legal name
kyc_status: regulatory verification status
Allowed values: PENDING, VERIFIED

Lifecycle
Soft deleted using deleted_at
created_at and updated_at are system-managed

Relationships
One customer can have:
multiple accounts
multiple loans

CREATE TABLE public.customers (
    customer_id uuid DEFAULT gen_random_uuid() NOT NULL,
    full_name text NOT NULL,
    kyc_status text,
    created_at timestamp without time zone DEFAULT now() NOT NULL,
    updated_at timestamp without time zone DEFAULT now() NOT NULL,
    deleted_at timestamp without time zone,
    CONSTRAINT customers_kyc_status_check CHECK ((kyc_status = ANY (ARRAY['PENDING'::text, 'VERIFIED'::text])))
);'


In [4]:
texts

[Document(metadata={}, page_content="Customers Table\n\nRepresents individual bank customers.\nPrimary key: customer_id (uuid)\n\nImportant fields\nfull_name: customer’s legal name\nkyc_status: regulatory verification status\nAllowed values: PENDING, VERIFIED\n\nLifecycle\nSoft deleted using deleted_at\ncreated_at and updated_at are system-managed\n\nRelationships\nOne customer can have:\nmultiple accounts\nmultiple loans\n\nCREATE TABLE public.customers (\n    customer_id uuid DEFAULT gen_random_uuid() NOT NULL,\n    full_name text NOT NULL,\n    kyc_status text,\n    created_at timestamp without time zone DEFAULT now() NOT NULL,\n    updated_at timestamp without time zone DEFAULT now() NOT NULL,\n    deleted_at timestamp without time zone,\n    CONSTRAINT customers_kyc_status_check CHECK ((kyc_status = ANY (ARRAY['PENDING'::text, 'VERIFIED'::text])))\n);"),
 Document(metadata={}, page_content='Branches Table\n\nRepresents physical or logical bank branches.\nPrimary key: branch_id (uu

In [5]:
print(len(schema_docs))
print(schema_docs[0].page_content)
print("Metadata:", schema_docs[0].metadata)

10
Customers Table

Represents individual bank customers.
Primary key: customer_id (uuid)

Important fields
full_name: customer’s legal name
kyc_status: regulatory verification status
Allowed values: PENDING, VERIFIED

Lifecycle
Soft deleted using deleted_at
created_at and updated_at are system-managed

Relationships
One customer can have:
multiple accounts
multiple loans

CREATE TABLE public.customers (
    customer_id uuid DEFAULT gen_random_uuid() NOT NULL,
    full_name text NOT NULL,
    kyc_status text,
    created_at timestamp without time zone DEFAULT now() NOT NULL,
    updated_at timestamp without time zone DEFAULT now() NOT NULL,
    deleted_at timestamp without time zone,
    CONSTRAINT customers_kyc_status_check CHECK ((kyc_status = ANY (ARRAY['PENDING'::text, 'VERIFIED'::text])))
);
Metadata: {'type': 'schema', 'table': 'Customers Table', 'source': 'schema.txt'}


In [6]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

schema_store = Chroma.from_documents(
    schema_docs,
    embedding=embeddings,
    collection_name="schema_rag"
)

schema_retriever = schema_store.as_retriever(k=4)


In [7]:
from langchain_core.tools import tool
from sqlalchemy import create_engine, text
from typing import List, Tuple

engine = create_engine("postgresql+psycopg2://postgres:Nst%401995@localhost:5432/postgres")

@tool
def run_sql_query(query: str) -> str:
    """Execute a read-only SQL query and return rows."""
    
    print("\n--- SQL GENERATED ---")
    print(query)

    with engine.connect() as conn:
        result = conn.execute(text(query))
        rows = result.fetchmany(5)
        return str(rows)

@tool
def get_table_schema(table_name: str) -> List[Tuple]:
    """Return column schema information for a table.
    Argument: table_name, example - customers
    """
    query = """
        SELECT column_name, data_type, is_nullable
        FROM information_schema.columns
        WHERE table_name = :table_name
        ORDER BY ordinal_position
    """
    with engine.connect() as conn:
        result = conn.execute(text(query), {"table_name": table_name})
        return result.fetchall()
    
@tool
def list_table():
    """Return names of table present in the database."""
    query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public';
    """
    with engine.connect() as conn:
        result = conn.execute(text(query))
        return str(result.fetchall())

In [8]:
from langchain_groq import ChatGroq

llm = ChatGroq(
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0
)

SYSTEM_PROMPT = """
You are a database assistant.

You will be given:
- Relevant database schema context
- A user question

Steps:
1. Understand schema
2. Generate correct PostgreSQL SELECT query
3. Use run_sql tool
4. If query fails, fix and retry
5. Answer clearly

Rules:
- Never modify data
- Never guess columns not in schema
"""


In [9]:
from langchain.agents import create_agent

agent = create_agent(
    llm,
    tools=[run_sql_query, get_table_schema],
    system_prompt=SYSTEM_PROMPT,
)

In [10]:
question = "I need info of loans which were disbursed between jan 2023 and june 2023"

In [11]:
from IPython.display import display, Markdown

schema_context = schema_retriever.invoke(question)
context_text = "\n\n".join(d.page_content for d in schema_context)

response = agent.invoke({
    "messages": [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "system", "content": f"Schema:\n{context_text}"},
        {"role": "user", "content": question},
    ],
    # "recursion_limit": 10,
})

# agent.invoke returns a structured dict - extract text
display(Markdown(response["messages"][-1].content))


--- SQL GENERATED ---
SELECT * FROM loans WHERE disbursed_at BETWEEN '2023-01-01' AND '2023-06-30' AND deleted_at IS NULL


The loans disbursed between January 2023 and June 2023 are:

- Loan ID: 1aec7993-2abd-46d4-9ae4-f1952b9ed951, 
  Customer ID: ec5bf63f-2a89-460f-a8ea-5fe2674503b2, 
  Branch ID: dc59f73e-d87a-486f-8f78-9dddff16106d, 
  Loan Type: HOME, 
  Principal: 1945675.00, 
  Interest Rate: 13.46%, 
  Tenure: 36 months, 
  Disbursed At: 2023-04-07, 
  Status: NPA

- Loan ID: 70979651-1dff-46d5-95dc-c55557f6a559, 
  Customer ID: 3024a891-5f3b-45a1-9c1e-e978aeb31601, 
  Branch ID: 5384586d-6e16-46a7-93e2-aecd70687f04, 
  Loan Type: AUTO, 
  Principal: 791717.00, 
  Interest Rate: 16.84%, 
  Tenure: 60 months, 
  Disbursed At: 2023-06-25, 
  Status: NPA

- Loan ID: 84954583-a5b2-45c2-9de0-10a367534495, 
  Customer ID: 33ea8cc9-a813-48fb-b094-a8f37267d65b, 
  Branch ID: 225ba84c-cf19-4ccb-b5f9-a04f9070fa8c, 
  Loan Type: AUTO, 
  Principal: 143085.00, 
  Interest Rate: 12.36%, 
  Tenure: 120 months, 
  Disbursed At: 2023-01-07, 
  Status: NPA

- Loan ID: 1f1b7c72-b5fc-4bf4-86f3-f84934514e47, 
  Customer ID: 60d47bc4-1d9c-49af-990f-834c70e8986e, 
  Branch ID: 225ba84c-cf19-4ccb-b5f9-a04f9070fa8c, 
  Loan Type: PERSONAL, 
  Principal: 1448575.00, 
  Interest Rate: 11.26%, 
  Tenure: 36 months, 
  Disbursed At: 2023-02-09, 
  Status: ACTIVE

- Loan ID: ff4147e9-feea-4f84-bd46-937cbb0bb4bf, 
  Customer ID: 93c15b25-17e6-4f0f-bfe3-99626dff6045, 
  Branch ID: 411285e5-25a5-40f8-bca3-dc28ad9cde62, 
  Loan Type: HOME, 
  Principal: 101223.00, 
  Interest Rate: 10.45%, 
  Tenure: 60 months, 
  Disbursed At: 2023-06-05, 
  Status: ACTIVE

| Loan ID                              | Customer ID                          | Branch ID                            | Loan Type | Principal    | Interest Rate (%) | Tenure (Months) | Disbursed At | Status |
| ------------------------------------ | ------------------------------------ | ------------------------------------ | --------- | ------------ | ----------------- | --------------- | ------------ | ------ |
| 1aec7993-2abd-46d4-9ae4-f1952b9ed951 | ec5bf63f-2a89-460f-a8ea-5fe2674503b2 | dc59f73e-d87a-486f-8f78-9dddff16106d | HOME      | 1,945,675.00 | 13.46             | 36              | 2023-04-07   | NPA    |
| 70979651-1dff-46d5-95dc-c55557f6a559 | 3024a891-5f3b-45a1-9c1e-e978aeb31601 | 5384586d-6e16-46a7-93e2-aecd70687f04 | AUTO      | 791,717.00   | 16.84             | 60              | 2023-06-25   | NPA    |
| 84954583-a5b2-45c2-9de0-10a367534495 | 33ea8cc9-a813-48fb-b094-a8f37267d65b | 225ba84c-cf19-4ccb-b5f9-a04f9070fa8c | AUTO      | 143,085.00   | 12.36             | 120             | 2023-01-07   | NPA    |
| 1f1b7c72-b5fc-4bf4-86f3-f84934514e47 | 60d47bc4-1d9c-49af-990f-834c70e8986e | 225ba84c-cf19-4ccb-b5f9-a04f9070fa8c | PERSONAL  | 1,448,575.00 | 11.26             | 36              | 2023-02-09   | ACTIVE |
| ff4147e9-feea-4f84-bd46-937cbb0bb4bf | 93c15b25-17e6-4f0f-bfe3-99626dff6045 | 411285e5-25a5-40f8-bca3-dc28ad9cde62 | HOME      | 101,223.00   | 10.45             | 60              | 2023-06-05   | ACTIVE |



In [26]:
for step in agent.stream(
    {"messages": [{"role": "user", "content": question}]},
    stream_mode="values",
):
    step["messages"][-1].pretty_print()


I need info of loans which were disbursed between jan 2023 and june 2023

To provide the information about loans disbursed between January 2023 and June 2023, I first need to understand the schema of the relevant table. 

Let's assume there's a table named `loans`. I'll start by retrieving its schema.
Tool Calls:
  get_table_schema (kqrkfnpe0)
 Call ID: kqrkfnpe0
  Args:
    table_name: loans
Name: get_table_schema

[('loan_id', 'uuid', 'NO'), ('customer_id', 'uuid', 'NO'), ('branch_id', 'uuid', 'NO'), ('loan_type', 'text', 'YES'), ('principal', 'numeric', 'YES'), ('interest_rate', 'numeric', 'YES'), ('tenure_months', 'integer', 'YES'), ('disbursed_at', 'date', 'YES'), ('status', 'text', 'YES'), ('created_at', 'timestamp without time zone', 'NO'), ('updated_at', 'timestamp without time zone', 'NO'), ('deleted_at', 'timestamp without time zone', 'YES')]
Tool Calls:
  run_sql_query (22s2vdxmf)
 Call ID: 22s2vdxmf
  Args:
    query: SELECT * FROM loans WHERE disbursed_at BETWEEN '2023-01

In [34]:
for step in agent.stream(
    {"messages": [{"role": "user", "content": "just give me the names of all tables"}]},
    stream_mode="values",
):
    step["messages"][-1].pretty_print()


just give me the names of all tables

To provide the names of all tables, I will first need to understand the schema context. However, since the specific schema details are not provided in your query, I will assume a general approach to list tables in a database.

Given the functions I have access to, I can use the `get_table_schema` function to fetch schema information for tables one by one, but that requires knowing the table names or having a way to list them. Unfortunately, without a direct function to list all tables, I'll guide you through a workaround.

If I had a list of tables or could query the database's information schema, I could provide this directly. Since I can execute SQL queries, I can use a query that fetches table names from the information schema.

Here's how I can proceed:
Tool Calls:
  run_sql_query (nvwxaey91)
 Call ID: nvwxaey91
  Args:
    query: SELECT table_name FROM information_schema.tables WHERE table_schema='public';

--- SQL GENERATED ---
SELECT table_

In [12]:
def streamer_agent(question):
    for step in agent.stream(
        {"messages": [{"role": "user", "content": question}]},
        stream_mode="values",
    ):
        yield step["messages"][-1].pretty_print()

In [19]:
[msg for msg in streamer_agent("How financially exposed is the bank to each customer?")]


How financially exposed is the bank to each customer?

To assess the financial exposure of the bank to each customer, we need to consider the customer's account balance and any other relevant financial information. 

Let's assume that we have a table named `customer_accounts` with columns `customer_id`, `account_balance`, and other relevant details.

To get the financial exposure, we can use a SQL query to retrieve the account balance for each customer.

First, I will get the schema of the `customer_accounts` table to ensure I have the correct column names.
Tool Calls:
  get_table_schema (0p71pjbxk)
 Call ID: 0p71pjbxk
  Args:
    table_name: customer_accounts
Name: get_table_schema

[]


BadRequestError: Error code: 400 - {'error': {'message': "'messages.3' : for 'role:tool' the following must be satisfied[('messages.3.content' : one of the following must be satisfied[('messages.3.content' : value must be a string) OR ('messages.3.content' : minimum number of items is 1)])]", 'type': 'invalid_request_error'}}