## BigQuery Agent

#### Imports 

In [1]:
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.chat_models import ChatVertexAI
from langchain.sql_database import SQLDatabase
from langchain.agents import create_sql_agent
from urllib.parse import quote_plus
import pkg_resources
import sqlalchemy
import langchain
import logging
import pyodbc
import os 

##### Setup logging

In [2]:
# Notebook-wide logger that emits INFO-level messages through a stream handler.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# getLogger() returns the same logger object every time this cell is executed, so
# attach the handler only once; otherwise each re-run adds another handler and
# every message is printed multiple times.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

#### Setup essentials

To talk to databases directly through LangChain or PyODBC, make sure the appropriate ODBC driver for each database is installed on your machine. 

For MS SQL:
https://learn.microsoft.com/en-us/sql/connect/odbc/linux-mac/install-microsoft-odbc-driver-sql-server-macos?view=sql-server-ver15 <br><br>

For BigQuery:<br>
-- Landing Page<br>
https://cloud.google.com/bigquery/docs/reference/odbc-jdbc-drivers<br>
-- Install ODBC release 2.5.2.1004<br>
https://cloud.google.com/bigquery/docs/reference/odbc-jdbc-drivers#odbc_release_2521004<br>
-- Configuration Guide <br>
https://storage.googleapis.com/simba-bq-release/odbc/Simba%20Google%20BigQuery%20ODBC%20Connector%20Install%20and%20Configuration%20Guide-2.5.2.1004.pdf<br>



In [3]:
# Google Cloud project, region, model and dataset used throughout the notebook.
PROJECT = 'arun-genai-bb'
LOCATION = 'us-central1'
MODEL_NAME = 'codechat-bison@latest'
DATASET = 'flight_reservations'

# Service-account key file. The absolute form is computed first so that both the
# environment variable and the ODBC settings point at the same location,
# independent of any later change to the working directory.
SERVICE_ACCOUNT_KEY_PATH = './../credentials/vai-key.json'
SERVICE_ACCOUNT_KEY_PATH_ABS = os.path.abspath(SERVICE_ACCOUNT_KEY_PATH)
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = SERVICE_ACCOUNT_KEY_PATH_ABS
SERVICE_ACCOUNT_EMAIL = 'as-service@arun-genai-bb.iam.gserviceaccount.com'  # Can be found in the service account JSON key above
OAUTH_FLAG = 0 # 0 means we are using the service account credentials key for auth and 1 means we are using a refresh token 

# Derived values: defined in terms of the settings above so they cannot drift apart.
CATALOG = PROJECT  # same as project name 
DRIVER_PATH_ABS = '/Library/simba/googlebigqueryodbc/lib/libgooglebigqueryodbc_sbu.dylib'
EMAIL = SERVICE_ACCOUNT_EMAIL

#### Establish ODBC connection 

Option 1: Establishing connection using PyODBC directly

In [4]:
# Assemble the ODBC connection string as semicolon-separated Key=Value attributes.
connection_string = ';'.join([
    f'Driver={DRIVER_PATH_ABS}',
    f'Catalog={CATALOG}',
    f'Dataset={DATASET}',
    f'KeyFilePath={SERVICE_ACCOUNT_KEY_PATH_ABS}',
    f'OAuthMechanism={OAUTH_FLAG}',
    f'Email={EMAIL}',
])
logger.info(connection_string)
# Open the connection through pyodbc using the attributes assembled above.
connection = pyodbc.connect(connection_string)

Driver=/Library/simba/googlebigqueryodbc/lib/libgooglebigqueryodbc_sbu.dylib;Catalog=arun-genai-bb;Dataset=flight_reservations;KeyFilePath=/Users/arunpshankar/Desktop/Projects/LLM-Text-to-SQL-Architectures/credentials/vai-key.json;OAuthMechanism=0;Email=as-service@arun-genai-bb.iam.gserviceaccount.com


Option 2: Using LangChain (preferred)

In [5]:
# Query-string parameters carried on the SQLAlchemy URI. Values are kept raw here
# and percent-encoded exactly once, uniformly, when the URI is assembled below.
params = {
    'KeyFilePath': SERVICE_ACCOUNT_KEY_PATH_ABS,
    'Driver': DRIVER_PATH_ABS,
    'OAuthMechanism': OAUTH_FLAG,
    'Catalog': CATALOG,
    'Dataset': DATASET,
    'Email': EMAIL,
}

# quote_plus() every value (not only the two paths) so that reserved characters
# such as '@', '&', '=' or spaces in any value cannot corrupt the query string.
query_string = '&'.join(f'{key}={quote_plus(str(value))}' for key, value in params.items())
SQLALCHEMY_URI = f"bigquery://{PROJECT}/{DATASET}?{query_string}"
logger.info(f'SQLALCHEMY_URI={SQLALCHEMY_URI}')

SQLALCHEMY_URI=bigquery://arun-genai-bb/flight_reservations?KeyFilePath=%2FUsers%2Farunpshankar%2FDesktop%2FProjects%2FLLM-Text-to-SQL-Architectures%2Fcredentials%2Fvai-key.json&Driver=%2FLibrary%2Fsimba%2Fgooglebigqueryodbc%2Flib%2Flibgooglebigqueryodbc_sbu.dylib&OAuthMechanism=0&Catalog=arun-genai-bb&Dataset=flight_reservations&Email=as-service@arun-genai-bb.iam.gserviceaccount.com


In [6]:
# Create LangChain's SQLDatabase wrapper from the BigQuery SQLAlchemy URI built above.
db = SQLDatabase.from_uri(SQLALCHEMY_URI)

In [7]:
# Dump the wrapper's instance attributes to inspect what was loaded (engine, tables, ...).
# NOTE(review): this reads private, underscore-prefixed state and is for debugging only.
db.__dict__

{'_engine': Engine(bigquery://arun-genai-bb/flight_reservations?Catalog=arun-genai-bb&Dataset=flight_reservations&Driver=%2FLibrary%2Fsimba%2Fgooglebigqueryodbc%2Flib%2Flibgooglebigqueryodbc_sbu.dylib&Email=as-service%40arun-genai-bb.iam.gserviceaccount.com&KeyFilePath=%2FUsers%2Farunpshankar%2FDesktop%2FProjects%2FLLM-Text-to-SQL-Architectures%2Fcredentials%2Fvai-key.json&OAuthMechanism=0),
 '_schema': None,
 '_inspector': <sqlalchemy.engine.reflection.Inspector at 0x130505e50>,
 '_all_tables': {'customers', 'flights', 'reservations', 'transactions'},
 '_include_tables': set(),
 '_ignore_tables': set(),
 '_usable_tables': {'customers', 'flights', 'reservations', 'transactions'},
 '_sample_rows_in_table_info': 3,
 '_indexes_in_table_info': False,
 '_custom_table_info': None,
 '_max_string_length': 300,
 '_metadata': MetaData()}

List all installed drivers 

In [8]:
# Show the ODBC drivers that pyodbc reports as installed on this machine.
pyodbc.drivers()

['ODBC Driver 18 for SQL Server',
 'ODBC Driver 13 for SQL Server',
 'ODBC Drivers',
 'Simba GoogleBigQuery ODBC Driver']

#### Setup LLM and SQL DB Toolkit

In [9]:
# Vertex AI chat model used by the toolkit and the agent.
# NOTE(review): max_output_tokens=256 is a tight cap for agent-style output and may
# truncate long responses - confirm this is intended.
llm = ChatVertexAI(
    project=PROJECT,
    location=LOCATION,
    model_name=MODEL_NAME,
    temperature=0.0,
    max_output_tokens=256,
)

In [10]:
# Bundle the database wrapper and the LLM into the SQL toolkit that is handed to the agent.
toolkit = SQLDatabaseToolkit(db=db, llm=llm)
# Show the toolkit's attributes to confirm which db and llm it holds.
toolkit.__dict__

{'db': <langchain.utilities.sql_database.SQLDatabase at 0x110948cd0>,
 'llm': ChatVertexAI(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, project='arun-genai-bb', location='us-central1', request_parallelism=5, max_retries=6, stop=None, model_name='codechat-bison@latest', client=<vertexai.preview.language_models._PreviewCodeChatModel object at 0x130529430>, temperature=0.0, max_output_tokens=256, top_p=0.95, top_k=40, credentials=None)}

#### Create SQL Agent

In [11]:
# Build the SQL agent from the LLM and the toolkit.
agent_executor = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    top_k=1000,  # row limit written into the agent prompt ("at most 1000 results")
    verbose=True,  # print every step of the agent's run
)
# Inspect the executor's attributes, including the generated prompt template.
agent_executor.__dict__

{'memory': None,
 'callbacks': None,
 'callback_manager': None,
 'verbose': True,
 'tags': None,
 'metadata': None,
 'agent': ZeroShotAgent(llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=PromptTemplate(input_variables=['input', 'agent_scratchpad'], output_parser=None, partial_variables={}, template='You are an agent designed to interact with a SQL database.\nGiven an input question, create a syntactically correct bigquery query to run, then look at the results of the query and return the answer.\nUnless the user specifies a specific number of examples they wish to obtain, always limit your query to at most 1000 results.\nYou can order the results by a relevant column to return the most interesting examples in the database.\nNever query for all the columns from a specific table, only ask for the relevant columns given the question.\nYou have access to tools for interacting with the database.\nOnly use the below tool

#### Test Text-to-SQL Scenarios

#### Scenario 1: Retrieve Active Reservations for a Specific Date Range
For this scenario, you want to find all active reservations within a specific date range.

In [12]:
# Natural-language request for Scenario 1; it is passed to the agent in the next cell.
question = "Provide a list of all flight reservations from October 10th to October 15th, 2023"

In [13]:
# Run the agent on the question. The verbose trace is printed and, because the call
# is the cell's last expression, the returned answer is displayed as the cell output.
agent_executor.run(question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Action: sql_db_list_tables
Action Input: [0m
Observation: [38;5;200m[1;3mcustomers, flights, reservations, transactions[0m
Thought:[32;1m[1;3m Action: sql_db_schema
Action Input: flights[0m
Observation: [33;1m[1;3m
CREATE TABLE `flights` (
	`flight_id` INT64 NOT NULL, 
	`origin` STRING NOT NULL, 
	`destination` STRING NOT NULL, 
	`departure_datetime` DATETIME NOT NULL, 
	`arrival_datetime` DATETIME NOT NULL, 
	`carrier` STRING NOT NULL, 
	`price` FLOAT64 NOT NULL
)

/*
3 rows from flights table:
flight_id	origin	destination	departure_datetime	arrival_datetime	carrier	price
1	JFK	LAX	2023-11-05 09:00:00	2023-11-05 12:30:00	Delta	450.0
2	LAX	JFK	2023-11-10 15:00:00	2023-11-10 23:30:00	Delta	450.0
3	JFK	ATL	2023-11-15 07:00:00	2023-11-15 09:30:00	Southwest	300.0
*/[0m
Thought:[32;1m[1;3m Action: sql_db_schema
Action Input: reservations[0m
Observation: [33;1m[1;3m
CREATE TABLE `reservations` (
	`reservation_id` IN

'2 flight reservations were made between October 10th and October 15th, 2023.'

#### Scenario 2: Identify customers who made reservations in the past N days.

In [14]:
# Natural-language request for Scenario 2; it is passed to the agent in the next cell.
question = "Identify all customers who have made flight reservations within the last 7 days."

In [15]:
# Run the agent on the question. The result is assigned to `_`, so the returned
# answer is not echoed as cell output; only the verbose trace is shown.
_ = agent_executor.run(question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Action: sql_db_list_tables
Action Input: [0m
Observation: [38;5;200m[1;3mcustomers, flights, reservations, transactions[0m
Thought:[32;1m[1;3m Action: sql_db_schema
Action Input: customers[0m
Observation: [33;1m[1;3m
CREATE TABLE `customers` (
	`customer_id` INT64 NOT NULL, 
	`first_name` STRING NOT NULL, 
	`last_name` STRING NOT NULL, 
	`email` STRING NOT NULL, 
	`date_of_birth` DATE NOT NULL, 
	`created_at` DATETIME NOT NULL
)

/*
3 rows from customers table:
customer_id	first_name	last_name	email	date_of_birth	created_at
1	John	Doe	john.doe@example.com	1985-02-15	2021-01-01 08:30:00
2	Jane	Doe	jane.doe@example.com	1987-05-23	2021-01-15 12:45:00
3	Alice	Johnson	alice.j@example.com	1990-08-10	2021-03-20 09:00:00
*/[0m
Thought:[32;1m[1;3m Action: sql_db_schema
Action Input: flights[0m
Observation: [33;1m[1;3m
CREATE TABLE `flights` (
	`flight_id` INT64 NOT NULL, 
	`origin` STRING NOT NULL, 
	`destination` STRI

#### Scenario 3: Calculate Monthly Revenue
Calculate the total revenue generated from transactions for a given month and year.

In [16]:
# Natural-language request for Scenario 3; it is passed to the agent in the next cell.
question = "Calculate the total revenue generated from transactions in October 2023, specifically from all reservations with a Confirmed status."

In [17]:
# Run the agent on the question. The result is assigned to `_`, so the returned
# answer is not echoed as cell output; only the verbose trace is shown.
_ = agent_executor.run(question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Action: sql_db_list_tables
Action Input: [0m
Observation: [38;5;200m[1;3mcustomers, flights, reservations, transactions[0m
Thought:[32;1m[1;3m Action: sql_db_schema
Action Input: reservations[0m
Observation: [33;1m[1;3m
CREATE TABLE `reservations` (
	`reservation_id` INT64 NOT NULL, 
	`customer_id` INT64 NOT NULL, 
	`flight_id` INT64 NOT NULL, 
	`reservation_datetime` DATETIME NOT NULL, 
	`status` STRING NOT NULL
)

/*
3 rows from reservations table:
reservation_id	customer_id	flight_id	reservation_datetime	status
1	1	1	2023-10-01 10:30:00	Confirmed
2	2	2	2023-10-02 11:00:00	Confirmed
3	3	3	2023-10-03 12:15:00	Confirmed
*/[0m
Thought:[32;1m[1;3m Action: sql_db_query
Action Input: SELECT SUM(amount) AS total_revenue
FROM transactions
WHERE reservation_id IN (
  SELECT reservation_id
  FROM reservations
  WHERE status = 'Confirmed'
  AND reservation_datetime >= '2023-10-01'
  AND reservation_datetime < '2023-11-01'

#### Scenario 4: Popular Flight Times
Identify the most popular departure hours or days for a given day or month or year.

In [18]:
# Natural-language request for Scenario 4; it is passed to the agent in the next cell.
question = "Determine the departure months with the highest frequency for the year 2023."

In [19]:
# Run the agent on the question. The result is assigned to `_`, so the returned
# answer is not echoed as cell output; only the verbose trace is shown.
_ = agent_executor.run(question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Action: sql_db_list_tables
Action Input: [0m
Observation: [38;5;200m[1;3mcustomers, flights, reservations, transactions[0m
Thought:[32;1m[1;3m Action: sql_db_schema
Action Input: flights[0m
Observation: [33;1m[1;3m
CREATE TABLE `flights` (
	`flight_id` INT64 NOT NULL, 
	`origin` STRING NOT NULL, 
	`destination` STRING NOT NULL, 
	`departure_datetime` DATETIME NOT NULL, 
	`arrival_datetime` DATETIME NOT NULL, 
	`carrier` STRING NOT NULL, 
	`price` FLOAT64 NOT NULL
)

/*
3 rows from flights table:
flight_id	origin	destination	departure_datetime	arrival_datetime	carrier	price
1	JFK	LAX	2023-11-05 09:00:00	2023-11-05 12:30:00	Delta	450.0
2	LAX	JFK	2023-11-10 15:00:00	2023-11-10 23:30:00	Delta	450.0
3	JFK	ATL	2023-11-15 07:00:00	2023-11-15 09:30:00	Southwest	300.0
*/[0m
Thought:[32;1m[1;3m Action: sql_db_query
Action Input: SELECT DATE_FORMAT(departure_datetime, '%Y-%m') AS departure_month, COUNT(*) AS num_flights
FRO

#### Scenario 5: Customer Age Group
Group customers by age brackets and count the number in each bracket.

In [20]:
# Natural-language request for Scenario 5; it is passed to the agent in the next cell.
question = "Group customers into five distinct age brackets and count the number of customers in each bracket."

In [21]:
# Run the agent on the question. The result is assigned to `_`, so the returned
# answer is not echoed as cell output; only the verbose trace is shown.
_ = agent_executor.run(question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Action: sql_db_list_tables
Action Input: [0m
Observation: [38;5;200m[1;3mcustomers, flights, reservations, transactions[0m
Thought:[32;1m[1;3m Action: sql_db_schema
Action Input: customers[0m
Observation: [33;1m[1;3m
CREATE TABLE `customers` (
	`customer_id` INT64 NOT NULL, 
	`first_name` STRING NOT NULL, 
	`last_name` STRING NOT NULL, 
	`email` STRING NOT NULL, 
	`date_of_birth` DATE NOT NULL, 
	`created_at` DATETIME NOT NULL
)

/*
3 rows from customers table:
customer_id	first_name	last_name	email	date_of_birth	created_at
1	John	Doe	john.doe@example.com	1985-02-15	2021-01-01 08:30:00
2	Jane	Doe	jane.doe@example.com	1987-05-23	2021-01-15 12:45:00
3	Alice	Johnson	alice.j@example.com	1990-08-10	2021-03-20 09:00:00
*/[0m
Thought:[32;1m[1;3m Action: sql_db_query
Action Input: SELECT 
  FLOOR(DATEDIFF(CURRENT_DATE(), date_of_birth) / 365) AS age_bracket,
  COUNT(*) AS num_customers
FROM customers
GROUP BY age_bracket

#### Scenario 6: Age Calculation
Calculate the age of customers based on their date of birth and filter those who are above X years old.


In [22]:
# Natural-language request for Scenario 6; it is passed to the agent in the next cell.
question = "Identify and rank all customers aged 18+ who have `Confirmed` reservations for the current month, ordered by their age. Make sure to display their ages in the result."

In [23]:
# Run the agent on the question. The result is assigned to `_`, so the returned
# answer is not echoed as cell output; only the verbose trace is shown.
_ = agent_executor.run(question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Action: sql_db_list_tables
Action Input: [0m
Observation: [38;5;200m[1;3mcustomers, flights, reservations, transactions[0m
Thought:[32;1m[1;3m Action: sql_db_schema
Action Input: customers[0m
Observation: [33;1m[1;3m
CREATE TABLE `customers` (
	`customer_id` INT64 NOT NULL, 
	`first_name` STRING NOT NULL, 
	`last_name` STRING NOT NULL, 
	`email` STRING NOT NULL, 
	`date_of_birth` DATE NOT NULL, 
	`created_at` DATETIME NOT NULL
)

/*
3 rows from customers table:
customer_id	first_name	last_name	email	date_of_birth	created_at
1	John	Doe	john.doe@example.com	1985-02-15	2021-01-01 08:30:00
2	Jane	Doe	jane.doe@example.com	1987-05-23	2021-01-15 12:45:00
3	Alice	Johnson	alice.j@example.com	1990-08-10	2021-03-20 09:00:00
*/[0m
Thought:[32;1m[1;3m Action: sql_db_schema
Action Input: reservations[0m
Observation: [33;1m[1;3m
CREATE TABLE `reservations` (
	`reservation_id` INT64 NOT NULL, 
	`customer_id` INT64 NOT NULL, 
