## Pattern I: Using an LLM to detect intent and recognize/extract entities, followed by Text-to-SQL generation

#### Imports 

In [1]:
from langchain.prompts.chat import SystemMessagePromptTemplate
from langchain.prompts.chat import HumanMessagePromptTemplate
from langchain.prompts.chat import AIMessagePromptTemplate
from langchain.prompts.chat import ChatPromptTemplate
from langchain.chat_models import ChatVertexAI
from google.cloud import bigquery
import pandas as pd
import logging
import os

##### Set up logging

In [2]:
# Notebook-wide logger that writes INFO-level messages to the cell output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# getLogger() returns the same logger object every time, so re-running this
# cell would otherwise stack another StreamHandler and duplicate every log line.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

#### Set up essentials

In [3]:
# Path (relative to this notebook) of the service-account key file; it is
# exported through GOOGLE_APPLICATION_CREDENTIALS for the rest of the session.
SERVICE_ACCOUNT_CREDENTIALS = './../credentials/vai-key.json'
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=SERVICE_ACCOUNT_CREDENTIALS)

In [4]:
# Settings handed to ChatVertexAI below as project / location / model_name.
PROJECT = 'arun-genai-bb'
LOCATION = 'us-central1'
MODEL_NAME = 'codechat-bison@latest'

In [5]:
bq = bigquery.Client()

In [6]:
# One chat model instance is shared by every step of the pipeline below
# (intent detection, entity extraction, table mapping, SQL generation and the
# final answer), so temperature and max_output_tokens apply to all of them.
# NOTE(review): 256 output tokens may be tight for long SQL or long answers - confirm.
llm = ChatVertexAI(project=PROJECT, 
                   location=LOCATION, 
                   model_name=MODEL_NAME, 
                   temperature=0.0, 
                   max_output_tokens=256)

In [7]:
user_query = "Provide a list of all flight reservations from October 10th to October 15th, 2023"

### Step 1: Identify the `intent` of the user's query

##### Load example prompt and completion pairs needed for intent detection

In [8]:
messages = []

In [9]:
examples = pd.read_csv('./../data/few-shot/prompts_intent.csv')
examples.head()

Unnamed: 0,prompt,intent
0,Need all the bookings from 10th to 15th Octobe...,RETRIEVE_RESERVATIONS
1,Could you retrieve reservations for mid-Octobe...,RETRIEVE_RESERVATIONS
2,Let’s see all the October reservations from 10...,RETRIEVE_RESERVATIONS
3,Any reservations from 10/10/2023 to 15/10/2023?,RETRIEVE_RESERVATIONS
4,I’m looking for bookings between the second an...,RETRIEVE_RESERVATIONS


In [10]:
template = "You are a helpful assistant capable of detecting the intent behind a user's query."
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
messages.append(system_message_prompt)

In [11]:
# Turn every (prompt, intent) example row into a Human/AI message pair.
for _, row in examples.iterrows():
    prompt, completion = row
    # The example text is literal data, not a template. from_template() would
    # treat any '{...}' in it as an input variable, so double the braces; they
    # are rendered back as single braces when the chat prompt is formatted.
    prompt_text = prompt.replace('{', '{{').replace('}', '}}')
    completion_text = completion.replace('{', '{{').replace('}', '}}')
    human_message = HumanMessagePromptTemplate.from_template(prompt_text)
    messages.append(human_message)
    ai_message = AIMessagePromptTemplate.from_template(completion_text)
    messages.append(ai_message)

In [12]:
human_template = "{user_query}"
human_message = HumanMessagePromptTemplate.from_template(human_template)
messages.append(human_message)

In [13]:
chat_prompt = ChatPromptTemplate.from_messages(messages)

In [14]:
request = chat_prompt.format_prompt(user_query=user_query).to_messages()

In [15]:
%%time 

# Send the few-shot conversation plus the user query to the model and keep the
# whitespace-stripped reply text as the detected intent.
response = llm(request)
intent = response.content.strip()
logger.info(intent)

RETRIEVE_RESERVATIONS


CPU times: user 45.2 ms, sys: 6.65 ms, total: 51.8 ms
Wall time: 4.46 s


### Step 2: Extract the entities from the user query

##### Load example prompt and completion pairs needed for entity recognition

In [16]:
messages = []

In [17]:
examples = pd.read_csv('./../data/few-shot/prompts_ner.csv')
examples.head()

Unnamed: 0,prompt,entities
0,Can you show me all the reservations from Octo...,"Start Date:October 10th, 2023|End Date:October..."
1,What bookings do we have from 10/10/2023 to 10...,Start Date:10/10/2023|End Date:10/15/2023
2,Show the reservations occurring between the se...,"Start Date:October 8th, 2023|End Date:October ..."
3,List all bookings that are happening from Octo...,"Start Date:October 10, 2023|End Date:October 1..."
4,Fetch the reservations from the second week of...,"Start Date:October 8th, 2023|End Date:October ..."


In [18]:
template = "You are a helpful assistant capable of performing named entity recognition."
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
messages.append(system_message_prompt)

In [19]:
# Turn every (prompt, entities) example row into a Human/AI message pair.
for _, row in examples.iterrows():
    prompt, completion = row
    # The example text is literal data, not a template. from_template() would
    # treat any '{...}' in it as an input variable, so double the braces; they
    # are rendered back as single braces when the chat prompt is formatted.
    prompt_text = prompt.replace('{', '{{').replace('}', '}}')
    completion_text = completion.replace('{', '{{').replace('}', '}}')
    human_message = HumanMessagePromptTemplate.from_template(prompt_text)
    messages.append(human_message)
    ai_message = AIMessagePromptTemplate.from_template(completion_text)
    messages.append(ai_message)

In [20]:
human_template = "{user_query} Standardize the date format to YYYY-MM-DD."
human_message = HumanMessagePromptTemplate.from_template(human_template)
messages.append(human_message)

In [21]:
chat_prompt = ChatPromptTemplate.from_messages(messages)

In [22]:
request = chat_prompt.format_prompt(user_query=user_query).to_messages()

In [23]:
%%time 

# Ask the model for the entities in the user query; the stripped reply is kept
# verbatim (the few-shot examples use a 'Name:value|Name:value' layout).
response = llm(request)
entities = response.content.strip()
logger.info(entities)

Start Date:2023-10-10|End Date:2023-10-15


CPU times: user 7.63 ms, sys: 3.49 ms, total: 11.1 ms
Wall time: 3.39 s


### Step 3: Map the intent to table names

In [24]:
messages = []

In [25]:
examples = pd.read_csv('./../data/few-shot/intent_to_table_mapping.csv')
examples.head()

Unnamed: 0,intent,mapped_tables
0,RETRIEVE_RESERVATIONS,reservations|flights
1,IDENTIFY_RECENT_CUSTOMERS,reservations|customers
2,CALCULATE_REVENUE,reservations|transactions
3,FIND_PEAK_DEPARTURE_MONTHS,flights
4,GROUP_AND_COUNT_CUSTOMERS_BY_AGE,customers


In [26]:
template = "You are a helpful assistant capable of mapping detected intent to the correct list of BigQuery tables."
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
messages.append(system_message_prompt)

In [27]:
# Turn every (intent, mapped_tables) example row into a Human/AI message pair.
for _, row in examples.iterrows():
    prompt, completion = row
    # The example text is literal data, not a template. from_template() would
    # treat any '{...}' in it as an input variable, so double the braces; they
    # are rendered back as single braces when the chat prompt is formatted.
    prompt_text = prompt.replace('{', '{{').replace('}', '}}')
    completion_text = completion.replace('{', '{{').replace('}', '}}')
    human_message = HumanMessagePromptTemplate.from_template(prompt_text)
    messages.append(human_message)
    ai_message = AIMessagePromptTemplate.from_template(completion_text)
    messages.append(ai_message)

In [28]:
human_template = "{user_intent}"
human_message = HumanMessagePromptTemplate.from_template(human_template)
messages.append(human_message)

In [29]:
chat_prompt = ChatPromptTemplate.from_messages(messages)

In [30]:
request = chat_prompt.format_prompt(user_intent=intent).to_messages()

In [31]:
%%time 

# Ask the model which tables serve the detected intent; the stripped reply is a
# '|'-separated string of table names that the next step splits.
response = llm(request)
tables = response.content.strip()
logger.info(tables)

reservations|flights


CPU times: user 4.22 ms, sys: 2.26 ms, total: 6.48 ms
Wall time: 709 ms


### Step 4: Load and filter table schemas

In [32]:
def read_files_from_dir(directory):
    """Read every regular file directly inside `directory` as UTF-8 text.

    Args:
        directory: Path of the directory to scan (sub-directories are skipped,
            not descended into).

    Returns:
        dict mapping file name -> file content. A trailing '.txt' extension is
        removed from the key; any other file name is used unchanged. An empty
        dict is returned (and a warning logged) when `directory` is missing or
        is not a directory.
    """
    # isdir() rather than exists(): a path that points at a regular file would
    # pass exists() and then make os.listdir() raise.
    if not os.path.isdir(directory):
        logger.warning(f"The directory {directory} does not exist or is not a directory!")
        return {}

    files_dict = {}

    # Sorted so the resulting dict order does not depend on the file system.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)

        # Ensure it's a file and not a sub-directory or other entity
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Strip only a trailing '.txt'. Splitting on the first '.txt' made
        # e.g. 'reservations.txt.bak' collide with 'reservations.txt' and
        # silently overwrite its entry.
        key = filename[:-len('.txt')] if filename.endswith('.txt') else filename
        files_dict[key] = content

    return files_dict

In [33]:
directory_path = './../data/text-schema/'
table_schemas = read_files_from_dir(directory_path)

In [34]:
# `tables` is the model's '|'-separated answer. Tolerate stray whitespace and
# empty segments so e.g. 'reservations | flights' still matches the schema keys.
table_names = [name.strip() for name in tables.split('|') if name.strip()]
filtered_table_schemas = {}

for table_name in table_names:
    if table_name in table_schemas:
        filtered_table_schemas[table_name] = table_schemas[table_name]
    else:
        # Previously dropped silently; surface it, because a missing schema
        # leaves the SQL-generation prompt without that table's columns.
        logger.warning(f"No schema found for table '{table_name}'; it is omitted from the prompt.")

In [35]:
# Concatenate the selected schema descriptions (in dict order, no separator)
# into the single text blob that is inserted into the SQL-generation prompt.
filtered_table_schemas_text = ''.join(filtered_table_schemas.values())
logger.info(filtered_table_schemas_text)

----
Reservations Table:
Description:
The Reservations table keeps track of all flight reservations made by customers. Each record represents a unique reservation, detailing the customer, flight, reservation time, and status.
----
Columns:
--
reservation_id:
Description: A unique identifier for each reservation made on the platform.
Usage: This ID ensures that each reservation is distinct and can be referenced for customer inquiries, modifications, and operational tracking.
Type: INT64
--
customer_id:
Description: A reference to a customer from the Customers table who made the reservation.
Usage: Establishes which customer made a specific reservation, aiding in personalized user experiences, communication, and support.
Type: INT64
--
flight_id:
Description: Refers to a specific flight from the Flights table.
Usage: Ensures that the reservation corresponds to a specific flight, aiding in managing flight capacities and customer communications.
Type: INT64
--
reservation_datetime:
Descrip

### Step 5: Text-to-SQL generation

In [36]:
messages = []

In [37]:
template = "You are a SQL master expert capable of writing complex SQL query in BigQuery."
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
messages.append(system_message_prompt)

In [38]:
human_template = """Please construct a SQL query using the information provided below:

Input Parameters:
-----------------
INTENT: {intent}
EXTRACTED_ENTITIES: {entities}
MAPPED_TABLES: {tables}

User Query:
-----------
{user_query}

Table Schemas:
--------------
{filtered_table_schemas_text}

Note: 
- Please prefix the table names with `flight_reservations`."""

In [39]:
human_message = HumanMessagePromptTemplate.from_template(human_template)
messages.append(human_message)

In [40]:
chat_prompt = ChatPromptTemplate.from_messages(messages)

In [41]:
request = chat_prompt.format_prompt(intent=intent, 
                                    entities=entities, 
                                    tables=tables, 
                                    user_query=user_query, 
                                    filtered_table_schemas_text=filtered_table_schemas_text).to_messages()

In [42]:
%%time 

response = llm(request)
sql = '\n'.join(response.content.strip().split('\n')[1:-1])
logger.info(sql)

SELECT * 
FROM flight_reservations.reservations 
WHERE reservation_datetime BETWEEN '2023-10-10' AND '2023-10-15'


CPU times: user 4.39 ms, sys: 2.8 ms, total: 7.19 ms
Wall time: 1.67 s


### Step 6: Execute the generated SQL query in BigQuery

In [43]:
# `sql` is model-generated text, so check it before running it for real.
# A dry run validates the statement and reports its type without executing
# anything; only a plain SELECT is allowed through.
dry_run_job = bq.query(sql, job_config=bigquery.QueryJobConfig(dry_run=True, use_query_cache=False))
if dry_run_job.statement_type != 'SELECT':
    raise ValueError(
        f"Refusing to execute generated SQL of type {dry_run_job.statement_type!r}; only SELECT is allowed."
    )

df = bq.query(sql).to_dataframe()
df

Unnamed: 0,reservation_id,customer_id,flight_id,reservation_datetime,status
0,6,6,6,2023-10-10 10:00:00,Confirmed
1,7,6,7,2023-10-12 11:30:00,Confirmed


In [44]:
# Render the query result as a Markdown table for the final prompt.
# `df` is rebound from DataFrame to str here, so guard the conversion: without
# it, running this cell a second time fails because str has no to_markdown().
if isinstance(df, pd.DataFrame):
    df = df.to_markdown(index=False)

### Step 7: Transform SQL results into a human-friendly response (Optional)

In [45]:
messages = []

In [46]:
template = "You are a travel assistant chatbot that can help people make flight reservations."
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
messages.append(system_message_prompt)

In [47]:
human_template = """User's Question:
----------------
{user_query}

BigQuery Result:
----------------
{bq_response}

Task:
-----
Please convert the above query result into a human-readable format.

IMPORTANT Notes:
----------------
- The response should be courteous and human-friendly.
- If the answer doesn't require a tabular structure, avoid using it."""


In [48]:
human_message = HumanMessagePromptTemplate.from_template(human_template)
messages.append(human_message)

In [49]:
chat_prompt = ChatPromptTemplate.from_messages(messages)

In [50]:
request = chat_prompt.format_prompt(user_query=user_query,
                                    bq_response=df).to_messages()

In [51]:
%%time 

# Ask the model to phrase the Markdown query result as a courteous answer to
# the original question; the stripped reply is the final output.
response = llm(request)
output = response.content.strip()
logger.info(output)

Here is a list of all flight reservations from October 10th to October 15th, 2023:

* Reservation ID: 6, Customer ID: 6, Flight ID: 6, Reservation Datetime: 2023-10-10 10:00:00, Status: Confirmed
* Reservation ID: 7, Customer ID: 6, Flight ID: 7, Reservation Datetime: 2023-10-12 11:30:00, Status: Confirmed


CPU times: user 5.45 ms, sys: 3.95 ms, total: 9.41 ms
Wall time: 5.97 s
