## BigQuery Agent

#### Imports 

In [1]:
# Standard library imports
from urllib.parse import quote_plus
import importlib.metadata
import logging
import os

# Langchain specific imports
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.chat_models import ChatVertexAI
from langchain.sql_database import SQLDatabase
from langchain.agents import create_sql_agent
from langchain.agents import AgentExecutor

import langchain

# Other third-party imports
from sqlalchemy.engine import create_engine
from google.cloud import bigquery
import pkg_resources
import sqlalchemy
import pybigquery
import pyodbc

##### Setup logging

In [2]:
# Notebook-level logger that emits INFO (and above) records to the cell output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# logging.getLogger() returns the same logger object on every call, so re-running
# this cell would otherwise attach one more StreamHandler each time and print
# every message multiple times. Only attach a handler when none is present.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

##### Log dependencies 

In [3]:
# Record the versions of the key dependencies so the run can be reproduced.
# Packages that do not expose __version__ are looked up through
# importlib.metadata, the standard-library replacement for the deprecated
# pkg_resources API; the logged text is unchanged.
logger.info(f'Using langchain=={langchain.__version__}')
logger.info(f"Using pyodbc=={importlib.metadata.version('pyodbc')}")
logger.info(f'Using sqlalchemy=={sqlalchemy.__version__}')
logger.info(f"Using pybigquery=={importlib.metadata.version('pybigquery')}")


Using langchain==0.0.283
Using pyodbc==4.0.34
Using sqlalchemy==1.4.22
Using pybigquery==0.5.0


#### Essentials

To use LangChain or PyODBC to talk to databases directly, make sure the appropriate ODBC drivers are installed on your machine.

For MS SQL:
https://learn.microsoft.com/en-us/sql/connect/odbc/linux-mac/install-microsoft-odbc-driver-sql-server-macos?view=sql-server-ver15 <br><br>

For BigQuery:<br>
-- Landing Page<br>
https://cloud.google.com/bigquery/docs/reference/odbc-jdbc-drivers<br>
-- Install ODBC release 2.5.2.1004<br>
https://cloud.google.com/bigquery/docs/reference/odbc-jdbc-drivers#odbc_release_2521004<br>
-- Configuration Guide <br>
https://storage.googleapis.com/simba-bq-release/odbc/Simba%20Google%20BigQuery%20ODBC%20Connector%20Install%20and%20Configuration%20Guide-2.5.2.1004.pdf<br>



In [4]:
# --- Google Cloud / Vertex AI settings ---
PROJECT = 'arun-genai-bb'
LOCATION = 'us-central1'
MODEL_NAME = 'chat-bison'
DATASET = 'hotel_reservations'
CATALOG = PROJECT  # same as project name; derived so the two can never drift apart

# --- Service-account credentials ---
SERVICE_ACCOUNT_KEY_PATH = './credentials/vai-key.json'
SERVICE_ACCOUNT_KEY_PATH_ABS = os.path.abspath(SERVICE_ACCOUNT_KEY_PATH)
# Export the absolute path: a relative path stops resolving as soon as the
# process working directory changes after this cell has run.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = SERVICE_ACCOUNT_KEY_PATH_ABS
SERVICE_ACCOUNT_EMAIL = 'as-service@arun-genai-bb.iam.gserviceaccount.com'  # Can be found in the service account JSON key above
EMAIL = SERVICE_ACCOUNT_EMAIL  # the connection settings use the same service-account address

# --- ODBC driver settings ---
OAUTH_FLAG = 0 # 0 means we are using the service account credentials key for auth and 1 means we are using a refresh token 
# NOTE(review): machine-specific install location of the Simba driver library; adjust per machine.
DRIVER_PATH_ABS = '/Library/simba/googlebigqueryodbc/lib/libgooglebigqueryodbc_sbu.dylib'

#### Establish ODBC connection 

Option 1: Establishing connection using PyODBC directly

In [5]:
# Assemble the ODBC connection string from the settings defined in the configuration cell.
connection_string = f'Driver={DRIVER_PATH_ABS};Catalog={CATALOG};Dataset={DATASET};KeyFilePath={SERVICE_ACCOUNT_KEY_PATH_ABS};OAuthMechanism={OAUTH_FLAG};Email={EMAIL}'
# NOTE(review): this writes the absolute key-file path and the service-account
# email into the saved cell output; consider clearing the output before sharing.
logger.info(connection_string)
# NOTE(review): no close() for this connection is visible in the notebook;
# call connection.close() once it is no longer needed.
connection = pyodbc.connect(connection_string)
# Bare expression so the notebook displays the connection object's repr.
connection

Driver=/Library/simba/googlebigqueryodbc/lib/libgooglebigqueryodbc_sbu.dylib;Catalog=arun-genai-bb;Dataset=hotel_reservations;KeyFilePath=/Users/arunpshankar/Desktop/Projects/bq-sql-agent/credentials/vai-key.json;OAuthMechanism=0;Email=as-service@arun-genai-bb.iam.gserviceaccount.com


<pyodbc.Connection at 0x12e4ddf10>

Option 2: Using LangChain

In [6]:
# Connection attributes appended to the SQLAlchemy URI as a query string.
# Every string value is URL-encoded here, in one place, so reserved characters
# ('@', '&', '=', '/', spaces, ...) in any setting cannot corrupt the query
# string. Previously only KeyFilePath and Driver were encoded.
# NOTE(review): these are ODBC-style attribute names; presumably the SQLAlchemy
# BigQuery dialect does not consume them and authentication comes from
# GOOGLE_APPLICATION_CREDENTIALS instead. Confirm against the dialect docs.
params = {
    'KeyFilePath': quote_plus(SERVICE_ACCOUNT_KEY_PATH_ABS),
    'Driver': quote_plus(DRIVER_PATH_ABS),
    'OAuthMechanism': OAUTH_FLAG,  # integer flag, nothing to escape
    'Catalog': quote_plus(CATALOG),
    'Dataset': quote_plus(DATASET),
    'Email': quote_plus(EMAIL),
}

# Values are already encoded above, so they are joined verbatim.
query_string = '&'.join(f'{k}={v}' for k, v in params.items())
SQLALCHEMY_URI = f"bigquery://{PROJECT}/{DATASET}?{query_string}"
logger.info(f'SQLALCHEMY_URI={SQLALCHEMY_URI}')

SQLALCHEMY_URI=bigquery://arun-genai-bb/hotel_reservations?KeyFilePath=%2FUsers%2Farunpshankar%2FDesktop%2FProjects%2Fbq-sql-agent%2Fcredentials%2Fvai-key.json&Driver=%2FLibrary%2Fsimba%2Fgooglebigqueryodbc%2Flib%2Flibgooglebigqueryodbc_sbu.dylib&OAuthMechanism=0&Catalog=arun-genai-bb&Dataset=hotel_reservations&Email=as-service@arun-genai-bb.iam.gserviceaccount.com


In [7]:
# Build LangChain's SQLDatabase wrapper from the SQLAlchemy URI assembled above.
# NOTE(review): the recorded output of this cell carries a warning that
# pybigquery interferes with sqlalchemy_bigquery and should be uninstalled.
db = SQLDatabase.from_uri(SQLALCHEMY_URI)

interfere with sqlalchemy_bigquery.
pybigquery should be uninstalled.
  return _bootstrap._gcd_import(name[level:], package, level)


In [8]:
# Show the wrapper's instance attributes (engine, discovered tables, sampling settings).
vars(db)

{'_engine': Engine(bigquery://arun-genai-bb/hotel_reservations?Catalog=arun-genai-bb&Dataset=hotel_reservations&Driver=%2FLibrary%2Fsimba%2Fgooglebigqueryodbc%2Flib%2Flibgooglebigqueryodbc_sbu.dylib&Email=as-service%40arun-genai-bb.iam.gserviceaccount.com&KeyFilePath=%2FUsers%2Farunpshankar%2FDesktop%2FProjects%2Fbq-sql-agent%2Fcredentials%2Fvai-key.json&OAuthMechanism=0),
 '_schema': None,
 '_inspector': <sqlalchemy.engine.reflection.Inspector at 0x12e6180d0>,
 '_all_tables': {'hotels', 'reservations', 'rooms', 'users'},
 '_include_tables': set(),
 '_ignore_tables': set(),
 '_usable_tables': {'hotels', 'reservations', 'rooms', 'users'},
 '_sample_rows_in_table_info': 3,
 '_indexes_in_table_info': False,
 '_custom_table_info': None,
 '_max_string_length': 300,
 '_metadata': MetaData()}

List all installed drivers 

In [9]:
# Ask pyodbc for the installed ODBC driver names; the Simba BigQuery driver
# should appear in this list for the pyodbc connection above to work.
pyodbc.drivers()

['ODBC Driver 18 for SQL Server',
 'ODBC Driver 13 for SQL Server',
 'ODBC Drivers',
 'Simba GoogleBigQuery ODBC Driver']

Setup LLM

In [10]:
# Vertex AI chat model used by the SQL toolkit and agent below; project,
# region and model name come from the configuration cell.
llm = ChatVertexAI(
    project=PROJECT,
    location=LOCATION,
    model_name=MODEL_NAME,
    temperature=0.0,
    max_output_tokens=256,
)

Setup toolkit

In [11]:
# Bundle the database wrapper and the chat model into LangChain's SQL toolkit.
toolkit = SQLDatabaseToolkit(
    db=db,
    llm=llm,
)
# Display the toolkit's attributes for inspection.
vars(toolkit)

{'db': <langchain.utilities.sql_database.SQLDatabase at 0x10bbdcca0>,
 'llm': ChatVertexAI(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, project='arun-genai-bb', location='us-central1', request_parallelism=5, max_retries=6, stop=None, model_name='chat-bison', client=<vertexai.preview.language_models._PreviewChatModel object at 0x12e4ccac0>, temperature=0.0, max_output_tokens=256, top_p=0.95, top_k=40, credentials=None)}

#### Create SQL Agent

In [12]:
# Create the SQL agent on top of the toolkit; verbose=True prints the
# intermediate Action/Observation/Thought steps when the agent runs.
agent_executor = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True,
    top_k=1000,  # max number of retrieved rows
)
# Display the executor's attributes, including the generated prompt template.
vars(agent_executor)

{'memory': None,
 'callbacks': None,
 'callback_manager': None,
 'verbose': True,
 'tags': None,
 'metadata': None,
 'agent': ZeroShotAgent(llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=PromptTemplate(input_variables=['input', 'agent_scratchpad'], output_parser=None, partial_variables={}, template='You are an agent designed to interact with a SQL database.\nGiven an input question, create a syntactically correct bigquery query to run, then look at the results of the query and return the answer.\nUnless the user specifies a specific number of examples they wish to obtain, always limit your query to at most 1000 results.\nYou can order the results by a relevant column to return the most interesting examples in the database.\nNever query for all the columns from a specific table, only ask for the relevant columns given the question.\nYou have access to tools for interacting with the database.\nOnly use the below tool

#### Run & Test Query

In [13]:
# Ask the agent a natural-language question; the table names are spelled out
# in the prompt to steer it towards the relevant tables.
# The prompt previously read "vailability"; the typo is fixed so the model
# receives the intended question.
# NOTE(review): in the recorded trace the first generated query failed with
# "Unrecognized name: city"; the `hotels` schema was not inspected beforehand.
agent_executor.run("Find the availability of rooms in Miami. Table names are `hotels`, `rooms` and `reservations`.")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Action: sql_db_list_tables
Action Input: [0m
Observation: [38;5;200m[1;3mhotels, reservations, rooms, users[0m
Thought:[32;1m[1;3m Action: sql_db_schema
Action Input: rooms[0m
Observation: [33;1m[1;3m
CREATE TABLE `rooms` (
	`room_id` INT64 NOT NULL, 
	`hotel_id` INT64 NOT NULL, 
	`room_type` STRING NOT NULL, 
	`price_per_night` FLOAT64 NOT NULL, 
	`availability` INT64 NOT NULL
)

/*
3 rows from rooms table:
room_id	hotel_id	room_type	price_per_night	availability
1	1	Deluxe	150.0	10
2	1	Suite	250.0	5
3	2	Deluxe	180.0	10
*/[0m
Thought:[32;1m[1;3m Action: sql_db_query
Action Input: SELECT room_type, availability FROM rooms WHERE hotel_id IN (SELECT hotel_id FROM hotels WHERE city = 'Miami');[0m
Observation: [36;1m[1;3mError: (google.cloud.bigquery.dbapi.exceptions.DatabaseError) 400 Unrecognized name: city at [1:96]

Location: US
Job ID: f8c940b7-737d-4d95-9438-a44d5163e17c

[SQL: SELECT room_type, availability 

'Deluxe (10), Suite (5)'