## Infer schema directly, self correct and pick optimal query

#### Imports 

In [1]:
from vertexai.language_models import CodeGenerationModel
from google.cloud import bigquery
import pandas as pd
import logging 
import os 

##### Setup logging

In [2]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

#### Setup essentials

In [3]:
# Adjust display settings
pd.set_option('display.max_colwidth', None)  # Set max column width to None to show all content
pd.set_option('display.expand_frame_repr', False)  # Prevent truncation of DataFrame HTML representation


In [4]:
SERVICE_ACCOUNT_CREDENTIALS = './../credentials/vai-key.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = SERVICE_ACCOUNT_CREDENTIALS

In [5]:
PROJECT_ID = 'arun-genai-bb'
CODE_GEN_MODEL_NAME = 'code-bison@latest'
TEMPERATURE = 1 
MAX_OUTPUT_TOKENS = 2048  # length of the output response | overridding the default value which is 128
# TOP_P = 0.95  # default value
# TOP_K = 40  # default value
LOCATION = 'us-central1'

In [6]:
DATASET = 'flight_reservations'
TABLES = ['customers', 'flights', 'reservations', 'transactions', 'loyality_points']

In [7]:
code_gen_model = CodeGenerationModel.from_pretrained(CODE_GEN_MODEL_NAME)
bq_client = bigquery.Client()

In [8]:
query = f"""
    SELECT *
    FROM `{PROJECT_ID}.{DATASET}.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
    WHERE table_name in ({','.join([f'"{table}"' for table in TABLES])})
"""
logger.info(query)


    SELECT *
    FROM `arun-genai-bb.flight_reservations.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
    WHERE table_name in ("customers","flights","reservations","transactions","loyality_points")



In [9]:
schema_columns = bq_client.query(query=query).to_dataframe()
schema_columns

Unnamed: 0,table_catalog,table_schema,table_name,column_name,field_path,data_type,description,collation_name,rounding_mode
0,arun-genai-bb,flight_reservations,transactions,transaction_id,transaction_id,INT64,,,
1,arun-genai-bb,flight_reservations,transactions,reservation_id,reservation_id,INT64,,,
2,arun-genai-bb,flight_reservations,transactions,amount,amount,FLOAT64,,,
3,arun-genai-bb,flight_reservations,transactions,transaction_datetime,transaction_datetime,DATETIME,,,
4,arun-genai-bb,flight_reservations,reservations,reservation_id,reservation_id,INT64,,,
5,arun-genai-bb,flight_reservations,reservations,customer_id,customer_id,INT64,,,
6,arun-genai-bb,flight_reservations,reservations,flight_id,flight_id,INT64,,,
7,arun-genai-bb,flight_reservations,reservations,reservation_datetime,reservation_datetime,DATETIME,,,
8,arun-genai-bb,flight_reservations,reservations,status,status,STRING,,,
9,arun-genai-bb,flight_reservations,flights,flight_id,flight_id,INT64,,,


In [10]:
schema_columns = schema_columns.to_markdown(index=False)
logger.info(schema_columns)

| table_catalog   | table_schema        | table_name   | column_name          | field_path           | data_type   | description   | collation_name   | rounding_mode   |
|:----------------|:--------------------|:-------------|:---------------------|:---------------------|:------------|:--------------|:-----------------|:----------------|
| arun-genai-bb   | flight_reservations | transactions | transaction_id       | transaction_id       | INT64       |               | NULL             |                 |
| arun-genai-bb   | flight_reservations | transactions | reservation_id       | reservation_id       | INT64       |               | NULL             |                 |
| arun-genai-bb   | flight_reservations | transactions | amount               | amount               | FLOAT64     |               | NULL             |                 |
| arun-genai-bb   | flight_reservations | transactions | transaction_datetime | transaction_datetime | DATETIME    |               | NULL             

Utility function to convert text into SQL and automatically correct the SQL query if any execution errors occur. <br>
The function also collects the successful query executions, ranks them by latency of execution, and returns the fastest. <br>
Additionally, it provides an option to retrieve all the successful queries and their latencies as a dataframe

In [11]:
import pandas as pd
import time

def generate_and_execute_sql(prompt, max_tries=5, return_all=False):
    """
    Generate an SQL query using the code_gen_model, execute it using bq_client, and rank successful queries by latency.
    
    Args:
    - prompt (str): Prompt to provide to the model for generating SQL.
    - max_tries (int): Maximum number of attempts to generate and execute SQL.
    - return_all (bool): Flag to determine whether to return all successful queries or only the fastest.
    
    Returns:
    - dict: A dictionary containing the fastest dataframe or all successful dataframes, or error messages and prompt evolution.
    """
    
    tries = 0
    error_messages = []
    prompts = [prompt]
    successful_queries = []
    
    while tries < max_tries:
        logger.info(f'TRIAL: {tries+1}')
        try:
            # Predict SQL using the model
            start_time = time.time()
            response = code_gen_model.predict(prompt, temperature=TEMPERATURE, max_output_tokens=MAX_OUTPUT_TOKENS)
            generated_sql_query = response.text
            generated_sql_query = '\n'.join(generated_sql_query.split('\n')[1:-1])
            logger.info('-' * 50)
            logger.info(generated_sql_query)
            logger.info('-' * 50)
            # Execute SQL using BigQuery client
            df = bq_client.query(generated_sql_query).to_dataframe()
            latency = time.time() - start_time
            successful_queries.append({
                "query": generated_sql_query,
                "dataframe": df,
                "latency": latency
            })
            logger.info('SUCCEEDED')
        except Exception as e:
            logger.error('FAILED')
            # Catch the error, store the message, and try again
            msg = str(e)
            error_messages.append(msg)
            # Evolve the prompt by appending the error message and asking the model to correct it
            prompt = f"""{prompt}
Encountered an error: {msg}. 
To address this, please generate an alternative SQL query response that avoids this specific error. 
Follow the instructions mentioned above to remediate the error. 

Modify the below SQL query to resolve the issue:
{generated_sql_query}
 
Ensure the revised SQL query aligns precisely with the requirements outlined in the initial question."""
            prompts.append(prompt)
        logger.info('=' * 100)
        tries += 1
    # If no successful queries
    if len(successful_queries) == 0:
        return {
            "error": "All attempts exhausted.",
            "prompts": prompts,
            "errors": error_messages
        }
    
    # Sort successful queries by latency
    successful_queries.sort(key=lambda x: x['latency'])
    
    if return_all:
        df = pd.DataFrame([(q["query"], q["dataframe"], q["latency"]) for q in successful_queries], columns=["Query", "Result", "Latency"])
        return {
            "dataframe": df
        }
    else:
        return {
            "fastest_query": successful_queries[0]["query"],
            "result": successful_queries[0]["dataframe"],
            "latency": successful_queries[0]["latency"]
        }

### Test text to SQL scenarios

Construct the SEED prompt

In [12]:
seed_prompt = """
Please craft a SQL query for BigQuery that addresses the following QUESTION provided below. 
Ensure you reference the appropriate BigQuery tables and column names provided in the SCHEMA below. 
When joining tables, employ type coercion to guarantee data type consistency for the join columns. 
Additionally, the output column names should specify units where applicable.\n
QUESTION:
{}\n
SCHEMA:
{}\n
IMPORTANT: 
Use ONLY DATETIME and DO NOT use TIMESTAMP.
--
Ensure your SQL query accurately defines both the start and end of the DATETIME range.
"""
logger.info(seed_prompt)


Please craft a SQL query for BigQuery that addresses the following QUESTION provided below. 
Ensure you reference the appropriate BigQuery tables and column names provided in the SCHEMA below. 
When joining tables, employ type coercion to guarantee data type consistency for the join columns. 
Additionally, the output column names should specify units where applicable.

QUESTION:
{}

SCHEMA:
{}

IMPORTANT: 
Use ONLY DATETIME and DO NOT use TIMESTAMP.
--
Ensure your SQL query accurately defines both the start and end of the DATETIME range.



#### Scenario 1: Retrieve Active Reservations for a Specific Date Range

For this scenario, you want to find all active reservations within a specific date range.

In [13]:
question = "Provide a list of all flight reservations from October 10th to October 15th, 2023"

In [14]:
prompt = seed_prompt.format(question, schema_columns)
logger.info(prompt)


Please craft a SQL query for BigQuery that addresses the following QUESTION provided below. 
Ensure you reference the appropriate BigQuery tables and column names provided in the SCHEMA below. 
When joining tables, employ type coercion to guarantee data type consistency for the join columns. 
Additionally, the output column names should specify units where applicable.

QUESTION:
Provide a list of all flight reservations from October 10th to October 15th, 2023

SCHEMA:
| table_catalog   | table_schema        | table_name   | column_name          | field_path           | data_type   | description   | collation_name   | rounding_mode   |
|:----------------|:--------------------|:-------------|:---------------------|:---------------------|:------------|:--------------|:-----------------|:----------------|
| arun-genai-bb   | flight_reservations | transactions | transaction_id       | transaction_id       | INT64       |               | NULL             |                 |
| arun-genai-bb 

In [15]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
sql_output = response['dataframe']
sql_output


TRIAL: 1
--------------------------------------------------
SELECT
  r.reservation_id,
  r.customer_id,
  r.flight_id,
  r.reservation_datetime,
  r.status,
  f.origin,
  f.destination,
  f.departure_datetime,
  f.arrival_datetime,
  f.carrier,
  f.price
FROM flight_reservations.reservations AS r
JOIN flight_reservations.flights AS f
ON CAST(r.flight_id AS STRING) = CAST(f.flight_id AS STRING)
WHERE r.reservation_datetime BETWEEN '2023-10-10 00:00:00' AND '2023-10-15 23:59:59';
--------------------------------------------------
SUCCEEDED
TRIAL: 2
--------------------------------------------------
SELECT
  r.reservation_id,
  r.customer_id,
  r.flight_id,
  r.reservation_datetime AS reservation_datetime,
  r.status,
  f.origin,
  f.destination,
  f.departure_datetime AS departure_datetime,
  f.arrival_datetime AS arrival_datetime,
  f.carrier,
  f.price
FROM arun-genai-bb.flight_reservations.reservations r
JOIN arun-genai-bb.flight_reservations.flights f
ON CAST(r.flight_id AS STRING) =

CPU times: user 365 ms, sys: 70.2 ms, total: 436 ms
Wall time: 45 s


Unnamed: 0,Query,Result,Latency
0,"SELECT\n r.reservation_id,\n r.customer_id,\n r.flight_id,\n r.reservation_datetime,\n r.status,\n f.origin,\n f.destination,\n f.departure_datetime,\n f.arrival_datetime,\n f.carrier,\n f.price\nFROM arun-genai-bb.flight_reservations.reservations r\nJOIN arun-genai-bb.flight_reservations.transactions t\nON CAST(t.transaction_datetime AS DATE) BETWEEN DATE '2023-10-10' AND DATE '2023-10-15'\nAND r.reservation_id = t.reservation_id\nJOIN arun-genai-bb.flight_reservations.flights f\nON r.flight_id = f.flight_id;",reservation_id customer_id flight_id reservation_datetime status origin destination departure_datetime arrival_datetime carrier price 0 6 6 6 2023-10-10 10:00:00 Confirmed SEA JFK 2023-11-25 06:00:00 2023-11-25 14:30:00 United 550.0 1 7 6 7 2023-10-12 11:30:00 Confirmed JFK MIA 2023-11-27 20:00:00 2023-11-27 23:30:00 American 380.0 2 8 8 8 2023-10-15 13:20:00 Confirmed MIA JFK 2023-11-30 10:00:00 2023-11-30 13:30:00 American 380.0,6.950561
1,"SELECT\n r.reservation_id,\n r.customer_id,\n r.flight_id,\n r.reservation_datetime,\n r.status,\n f.origin,\n f.destination,\n f.departure_datetime,\n f.arrival_datetime,\n f.carrier,\n f.price\nFROM\n arun-genai-bb.flight_reservations.reservations r\nJOIN\n arun-genai-bb.flight_reservations.flights f\nON\n r.flight_id = CAST(f.flight_id AS INT64)\nWHERE\n r.reservation_datetime BETWEEN '2023-10-10 00:00:00' AND '2023-10-15 23:59:59';",reservation_id customer_id flight_id reservation_datetime status origin destination departure_datetime arrival_datetime carrier price 0 6 6 6 2023-10-10 10:00:00 Confirmed SEA JFK 2023-11-25 06:00:00 2023-11-25 14:30:00 United 550.0 1 7 6 7 2023-10-12 11:30:00 Confirmed JFK MIA 2023-11-27 20:00:00 2023-11-27 23:30:00 American 380.0 2 8 8 8 2023-10-15 13:20:00 Confirmed MIA JFK 2023-11-30 10:00:00 2023-11-30 13:30:00 American 380.0,7.831321
2,"SELECT\n r.reservation_id,\n r.customer_id,\n r.flight_id,\n r.reservation_datetime AS reservation_datetime,\n r.status,\n f.origin,\n f.destination,\n f.departure_datetime AS departure_datetime,\n f.arrival_datetime AS arrival_datetime,\n f.carrier,\n f.price\nFROM arun-genai-bb.flight_reservations.reservations r\nJOIN arun-genai-bb.flight_reservations.flights f\nON CAST(r.flight_id AS STRING) = CAST(f.flight_id AS STRING)\nWHERE r.reservation_datetime BETWEEN '2023-10-10 00:00:00' AND '2023-10-15 23:59:59'\nORDER BY r.reservation_datetime;",reservation_id customer_id flight_id reservation_datetime status origin destination departure_datetime arrival_datetime carrier price 0 6 6 6 2023-10-10 10:00:00 Confirmed SEA JFK 2023-11-25 06:00:00 2023-11-25 14:30:00 United 550.0 1 7 6 7 2023-10-12 11:30:00 Confirmed JFK MIA 2023-11-27 20:00:00 2023-11-27 23:30:00 American 380.0 2 8 8 8 2023-10-15 13:20:00 Confirmed MIA JFK 2023-11-30 10:00:00 2023-11-30 13:30:00 American 380.0,7.948229
3,"SELECT\n r.reservation_id,\n r.customer_id,\n r.flight_id,\n r.reservation_datetime,\n r.status,\n f.origin,\n f.destination,\n f.departure_datetime,\n f.arrival_datetime,\n f.carrier,\n f.price\nFROM flight_reservations.reservations AS r\nJOIN flight_reservations.flights AS f\nON CAST(r.flight_id AS STRING) = CAST(f.flight_id AS STRING)\nWHERE r.reservation_datetime BETWEEN '2023-10-10 00:00:00' AND '2023-10-15 23:59:59';",reservation_id customer_id flight_id reservation_datetime status origin destination departure_datetime arrival_datetime carrier price 0 6 6 6 2023-10-10 10:00:00 Confirmed SEA JFK 2023-11-25 06:00:00 2023-11-25 14:30:00 United 550.0 1 7 6 7 2023-10-12 11:30:00 Confirmed JFK MIA 2023-11-27 20:00:00 2023-11-27 23:30:00 American 380.0 2 8 8 8 2023-10-15 13:20:00 Confirmed MIA JFK 2023-11-30 10:00:00 2023-11-30 13:30:00 American 380.0,9.851194
4,"SELECT\n r.reservation_id,\n r.customer_id,\n r.flight_id,\n r.reservation_datetime,\n r.status,\n f.origin,\n f.destination,\n f.departure_datetime,\n f.arrival_datetime,\n f.carrier,\n f.price\nFROM arun-genai-bb.flight_reservations.reservations r\nJOIN arun-genai-bb.flight_reservations.flights f\nON CAST(r.flight_id AS STRING) = CAST(f.flight_id AS STRING)\nWHERE r.reservation_datetime BETWEEN '2023-10-10 00:00:00' AND '2023-10-15 23:59:59'\nORDER BY r.reservation_datetime;",reservation_id customer_id flight_id reservation_datetime status origin destination departure_datetime arrival_datetime carrier price 0 6 6 6 2023-10-10 10:00:00 Confirmed SEA JFK 2023-11-25 06:00:00 2023-11-25 14:30:00 United 550.0 1 7 6 7 2023-10-12 11:30:00 Confirmed JFK MIA 2023-11-27 20:00:00 2023-11-27 23:30:00 American 380.0 2 8 8 8 2023-10-15 13:20:00 Confirmed MIA JFK 2023-11-30 10:00:00 2023-11-30 13:30:00 American 380.0,12.360988


In [16]:
result_df = sql_output.loc[0, 'Result']
result_df


Unnamed: 0,reservation_id,customer_id,flight_id,reservation_datetime,status,origin,destination,departure_datetime,arrival_datetime,carrier,price
0,6,6,6,2023-10-10 10:00:00,Confirmed,SEA,JFK,2023-11-25 06:00:00,2023-11-25 14:30:00,United,550.0
1,7,6,7,2023-10-12 11:30:00,Confirmed,JFK,MIA,2023-11-27 20:00:00,2023-11-27 23:30:00,American,380.0
2,8,8,8,2023-10-15 13:20:00,Confirmed,MIA,JFK,2023-11-30 10:00:00,2023-11-30 13:30:00,American,380.0


#### Scenario 2: Identify customers who made reservations in the past N days.

In [17]:
question = "Identify all customers who have made flight reservations within the last 7 days."

In [18]:
prompt = seed_prompt.format(question, schema_columns)

In [19]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
SELECT c.customer_id,
       c.first_name,
       c.last_name,
       c.email
FROM flight_reservations.customers c
JOIN flight_reservations.reservations r
ON c.customer_id = r.customer_id
WHERE r.reservation_datetime BETWEEN DATE_SUB(CURRENT_DATETIME(), INTERVAL 7 DAY) AND CURRENT_DATETIME();
--------------------------------------------------
SUCCEEDED
TRIAL: 2
--------------------------------------------------
SELECT 
  c.customer_id,
  c.first_name,
  c.last_name,
  c.email,
  r.reservation_id,
  r.flight_id,
  r.reservation_datetime,
  r.status,
  f.origin,
  f.destination,
  f.departure_datetime,
  f.arrival_datetime,
  f.carrier,
  f.price
FROM flight_reservations.customers c
JOIN flight_reservations.reservations r
ON CAST(c.customer_id AS STRING) = CAST(r.customer_id AS STRING)
JOIN flight_reservations.flights f
ON CAST(r.flight_id AS STRING) = CAST(f.flight_id AS STRING)
WHERE r.reservation_datetime BETWEEN DATE_SUB(CUR

CPU times: user 330 ms, sys: 64.8 ms, total: 395 ms
Wall time: 36.5 s


Unnamed: 0,Query,Result,Latency
0,"SELECT c.customer_id, c.first_name, c.last_name, c.email\nFROM flight_reservations.customers c\nJOIN flight_reservations.reservations r\nON CAST(c.customer_id AS STRING) = CAST(r.customer_id AS STRING)\nWHERE r.reservation_datetime BETWEEN DATE_SUB(CURRENT_DATETIME(), INTERVAL 7 DAY) AND CURRENT_DATETIME()",customer_id first_name last_name email 0 11 Ian Somerhalder ian.s@example.com 1 13 Kate Winslet kate.w@example.com 2 13 Kate Winslet kate.w@example.com,6.23467
1,"SELECT c.customer_id,\n c.first_name,\n c.last_name,\n c.email\nFROM flight_reservations.customers c\nJOIN flight_reservations.reservations r\nON c.customer_id = r.customer_id\nWHERE r.reservation_datetime BETWEEN DATE_SUB(CURRENT_DATETIME(), INTERVAL 7 DAY) AND CURRENT_DATETIME();",customer_id first_name last_name email 0 11 Ian Somerhalder ian.s@example.com 1 13 Kate Winslet kate.w@example.com 2 13 Kate Winslet kate.w@example.com,6.505069
2,"SELECT \n c.customer_id,\n c.first_name,\n c.last_name,\n c.email,\n r.reservation_id,\n r.flight_id,\n r.reservation_datetime\nFROM arun-genai-bb.flight_reservations.customers c\nJOIN arun-genai-bb.flight_reservations.reservations r\nON CAST(c.customer_id AS STRING) = CAST(r.customer_id AS STRING)\nWHERE r.reservation_datetime BETWEEN DATE_SUB(CURRENT_DATETIME(), INTERVAL 7 DAY) AND CURRENT_DATETIME()",customer_id first_name last_name email reservation_id flight_id reservation_datetime 0 11 Ian Somerhalder ian.s@example.com 12 12 2023-10-28 17:10:00 1 13 Kate Winslet kate.w@example.com 13 12 2023-10-30 14:50:00 2 13 Kate Winslet kate.w@example.com 14 14 2023-11-02 08:20:00,6.86726
3,"SELECT c.customer_id,\n c.first_name,\n c.last_name,\n c.email,\n t.transaction_datetime AS reservation_date\nFROM flight_reservations.customers c\nJOIN flight_reservations.reservations r\nON c.customer_id = r.customer_id\nJOIN flight_reservations.transactions t\nON r.reservation_id = t.reservation_id\nWHERE t.transaction_datetime >= DATE_SUB(CURRENT_DATETIME(), INTERVAL 7 DAY)",customer_id first_name last_name email reservation_date 0 11 Ian Somerhalder ian.s@example.com 2023-10-28 17:11:00 1 13 Kate Winslet kate.w@example.com 2023-11-02 08:21:00 2 15 Mary Jane mary.j@example.com 2023-11-04 10:46:00 3 16 Nick Fury nick.f@example.com 2023-11-08 15:31:00 4 17 Olivia Newton olivia.n@example.com 2023-11-11 10:16:00 5 18 Peter Parker peter.p@example.com 2023-11-15 12:51:00 6 20 Ryan Reynolds ryan.r@example.com 2023-11-22 09:11:00,8.395221
4,"SELECT \n c.customer_id,\n c.first_name,\n c.last_name,\n c.email,\n r.reservation_id,\n r.flight_id,\n r.reservation_datetime,\n r.status,\n f.origin,\n f.destination,\n f.departure_datetime,\n f.arrival_datetime,\n f.carrier,\n f.price\nFROM flight_reservations.customers c\nJOIN flight_reservations.reservations r\nON CAST(c.customer_id AS STRING) = CAST(r.customer_id AS STRING)\nJOIN flight_reservations.flights f\nON CAST(r.flight_id AS STRING) = CAST(f.flight_id AS STRING)\nWHERE r.reservation_datetime BETWEEN DATE_SUB(CURRENT_DATETIME(), INTERVAL 7 DAY) AND CURRENT_DATETIME()",customer_id first_name last_name email reservation_id flight_id reservation_datetime status origin destination departure_datetime arrival_datetime carrier price 0 11 Ian Somerhalder ian.s@example.com 12 12 2023-10-28 17:10:00 Confirmed ORD LAX 2023-12-15 18:00:00 2023-12-15 20:30:00 Alaska 400.0 1 13 Kate Winslet kate.w@example.com 13 12 2023-10-30 14:50:00 Cancelled ORD LAX 2023-12-15 18:00:00 2023-12-15 20:30:00 Alaska 400.0 2 13 Kate Winslet kate.w@example.com 14 14 2023-11-02 08:20:00 Confirmed ORD SEA 2023-12-24 22:00:00 2023-12-25 02:30:00 JetBlue 450.0,8.450051


In [20]:
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,customer_id,first_name,last_name,email
0,11,Ian,Somerhalder,ian.s@example.com
1,13,Kate,Winslet,kate.w@example.com
2,13,Kate,Winslet,kate.w@example.com


### Scenario 3: Calculate Monthly Revenue
Calculate the total revenue generated from transactions for a given month and year.

In [21]:
question = "Calculate the total revenue generated from transactions in October 2023, specifically from all reservations with a Confirmed status."

In [22]:
prompt = seed_prompt.format(question, schema_columns)

In [23]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
SELECT
  SUM(transactions.amount) AS total_revenue_usd
FROM
  arun-genai-bb.flight_reservations.transactions AS transactions
JOIN
  arun-genai-bb.flight_reservations.reservations AS reservations
ON
  transactions.reservation_id = reservations.reservation_id
WHERE
  reservations.status = 'Confirmed'
  AND transactions.transaction_datetime BETWEEN '2023-10-01 00:00:00' AND '2023-10-31 23:59:59';
--------------------------------------------------
SUCCEEDED
TRIAL: 2
--------------------------------------------------
SELECT
  SUM(transactions.amount) AS total_revenue_usd
FROM
  arun-genai-bb.flight_reservations.transactions AS transactions
JOIN
  arun-genai-bb.flight_reservations.reservations AS reservations
ON
  transactions.reservation_id = reservations.reservation_id
WHERE
  reservations.status = 'Confirmed'
  AND transactions.transaction_datetime BETWEEN '2023-10-01 00:00:00' AND '2023-10-31 23:59:59';
-------------------------

CPU times: user 319 ms, sys: 58.7 ms, total: 378 ms
Wall time: 29.3 s


Unnamed: 0,Query,Result,Latency
0,SELECT\n SUM(transactions.amount) AS total_revenue_usd\nFROM\n arun-genai-bb.flight_reservations.transactions AS transactions\nJOIN\n arun-genai-bb.flight_reservations.reservations AS reservations\nON\n transactions.reservation_id = reservations.reservation_id\nWHERE\n reservations.status = 'Confirmed'\n AND transactions.transaction_datetime BETWEEN '2023-10-01 00:00:00' AND '2023-10-31 23:59:59';,total_revenue_usd 0 3860.0,5.515625
1,SELECT\n SUM(transactions.amount) AS total_revenue_usd\nFROM\n arun-genai-bb.flight_reservations.transactions AS transactions\nJOIN\n arun-genai-bb.flight_reservations.reservations AS reservations\nON\n transactions.reservation_id = reservations.reservation_id\nWHERE\n reservations.status = 'Confirmed'\n AND transactions.transaction_datetime BETWEEN '2023-10-01 00:00:00' AND '2023-10-31 23:59:59';,total_revenue_usd 0 3860.0,5.784518
2,SELECT\n SUM(transactions.amount) AS total_revenue_usd\nFROM\n arun-genai-bb.flight_reservations.transactions AS transactions\nJOIN\n arun-genai-bb.flight_reservations.reservations AS reservations\nON\n transactions.reservation_id = reservations.reservation_id\nWHERE\n reservations.status = 'Confirmed'\n AND transactions.transaction_datetime BETWEEN '2023-10-01 00:00:00' AND '2023-10-31 23:59:59';,total_revenue_usd 0 3860.0,5.807508
3,SELECT\n SUM(transactions.amount) AS total_revenue_usd\nFROM flight_reservations.transactions AS transactions\nJOIN flight_reservations.reservations AS reservations\nON CAST(transactions.reservation_id AS STRING) = CAST(reservations.reservation_id AS STRING)\nWHERE reservations.status = 'Confirmed'\nAND transactions.transaction_datetime BETWEEN '2023-10-01 00:00:00' AND '2023-10-31 23:59:59';,total_revenue_usd 0 3860.0,6.004572
4,SELECT\n SUM(transactions.amount) AS total_revenue_usd\nFROM\n arun-genai-bb.flight_reservations.transactions AS transactions\nJOIN\n arun-genai-bb.flight_reservations.reservations AS reservations\nON\n transactions.reservation_id = reservations.reservation_id\nWHERE\n reservations.status = 'Confirmed'\n AND transactions.transaction_datetime BETWEEN '2023-10-01 00:00:00' AND '2023-10-31 23:59:59';,total_revenue_usd 0 3860.0,6.159729


In [24]:
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,total_revenue_usd
0,3860.0


### Scenario 4: Popular Flight Times
Identify the most popular departure hours or days for a given day or month or year.

In [25]:
question = "Determine the departure months with the highest frequency for the year 2023."

In [26]:
prompt = seed_prompt.format(question, schema_columns)

In [27]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
SELECT DATE_TRUNC(departure_datetime, MONTH) AS departure_month,
       COUNT(*) AS num_departures
FROM flight_reservations.flights
WHERE departure_datetime >= '2023-01-01'
  AND departure_datetime < '2024-01-01'
GROUP BY departure_month
ORDER BY num_departures DESC
LIMIT 10;
--------------------------------------------------
SUCCEEDED
TRIAL: 2
--------------------------------------------------
SELECT DATE_TRUNC(departure_datetime, MONTH) AS departure_month,
       COUNT(*) AS flight_count
FROM flight_reservations.flights
WHERE YEAR(departure_datetime) = 2023
GROUP BY departure_month
ORDER BY flight_count DESC
LIMIT 10;
--------------------------------------------------
FAILED
TRIAL: 3
--------------------------------------------------
SELECT DATE_TRUNC(departure_datetime, MONTH) AS departure_month,
       COUNT(*) AS flight_count
FROM flight_reservations.flights
WHERE EXTRACT(YEAR FROM departure_datetime) = 2023
GROUP BY depa

CPU times: user 266 ms, sys: 58.7 ms, total: 325 ms
Wall time: 28.1 s


Unnamed: 0,Query,Result,Latency
0,"SELECT DATE_TRUNC(departure_datetime, MONTH) AS departure_month,\n COUNT(*) AS flight_count\nFROM flight_reservations.flights\nWHERE EXTRACT(YEAR FROM departure_datetime) = 2023\nGROUP BY departure_month\nORDER BY flight_count DESC\nLIMIT 10;",departure_month flight_count 0 2023-11-01 8 1 2023-12-01 8,4.94495
1,"SELECT DATE_TRUNC(departure_datetime, MONTH) AS departure_month,\n COUNT(*) AS flight_count\nFROM flight_reservations.flights\nWHERE EXTRACT(YEAR FROM departure_datetime) = 2023\nGROUP BY departure_month\nORDER BY flight_count DESC\nLIMIT 10;",departure_month flight_count 0 2023-11-01 8 1 2023-12-01 8,5.269197
2,"SELECT DATE_TRUNC(departure_datetime, MONTH) AS departure_month,\n COUNT(*) AS flight_count\nFROM flight_reservations.flights\nWHERE EXTRACT(YEAR FROM departure_datetime) = 2023\nGROUP BY departure_month\nORDER BY flight_count DESC\nLIMIT 10;",departure_month flight_count 0 2023-11-01 8 1 2023-12-01 8,5.334219
3,"SELECT DATE_TRUNC(departure_datetime, MONTH) AS departure_month,\n COUNT(*) AS num_departures\nFROM flight_reservations.flights\nWHERE departure_datetime >= '2023-01-01'\n AND departure_datetime < '2024-01-01'\nGROUP BY departure_month\nORDER BY num_departures DESC\nLIMIT 10;",departure_month num_departures 0 2023-11-01 8 1 2023-12-01 8,6.642796


In [28]:
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,departure_month,flight_count
0,2023-11-01,8
1,2023-12-01,8


### Scenario 5: Customer Age Group
Group customers by age brackets and count the number in each bracket.

In [29]:
question = "Group customers into five distinct age brackets and count the number of customers in each bracket."

In [30]:
prompt = seed_prompt.format(question, schema_columns)

In [31]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
SELECT
  CASE
    WHEN date_of_birth IS NULL THEN 'Unknown'
    WHEN date_of_birth > DATE_SUB(CURRENT_DATE(), INTERVAL 65 YEAR) THEN '65+'
    WHEN date_of_birth > DATE_SUB(CURRENT_DATE(), INTERVAL 50 YEAR) THEN '50-64'
    WHEN date_of_birth > DATE_SUB(CURRENT_DATE(), INTERVAL 35 YEAR) THEN '35-49'
    WHEN date_of_birth > DATE_SUB(CURRENT_DATE(), INTERVAL 20 YEAR) THEN '20-34'
    ELSE '0-19'
  END AS age_bracket,
  COUNT(customer_id) AS num_customers
FROM flight_reservations.customers
GROUP BY age_bracket;
--------------------------------------------------
SUCCEEDED
TRIAL: 2
--------------------------------------------------
WITH customer_age_brackets AS (
  SELECT 
    customer_id, 
    DATE_DIFF(CURRENT_DATE(), date_of_birth, YEAR) AS age,
    CASE 
      WHEN age < 20 THEN "0-19"
      WHEN age BETWEEN 20 AND 29 THEN "20-29"
      WHEN age BETWEEN 30 AND 39 THEN "30-39"
      WHEN age BETWEEN 40 AND 49 THEN "40-49"
     

CPU times: user 126 ms, sys: 41.2 ms, total: 168 ms
Wall time: 27.5 s


Unnamed: 0,Query,Result,Latency
0,"SELECT\n CASE\n WHEN date_of_birth IS NULL THEN 'Unknown'\n WHEN date_of_birth > DATE_SUB(CURRENT_DATE(), INTERVAL 65 YEAR) THEN '65+'\n WHEN date_of_birth > DATE_SUB(CURRENT_DATE(), INTERVAL 50 YEAR) THEN '50-64'\n WHEN date_of_birth > DATE_SUB(CURRENT_DATE(), INTERVAL 35 YEAR) THEN '35-49'\n WHEN date_of_birth > DATE_SUB(CURRENT_DATE(), INTERVAL 20 YEAR) THEN '20-34'\n ELSE '0-19'\n END AS age_bracket,\n COUNT(customer_id) AS num_customers\nFROM flight_reservations.customers\nGROUP BY age_bracket;",age_bracket num_customers 0 65+ 17 1 0-19 3,9.799164


In [32]:
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,age_bracket,num_customers
0,65+,17
1,0-19,3


### Scenario 6: Age Calculation
Calculate the age of customers based on their date of birth and filter those who are above X years old.

In [33]:
question = "Identify and rank all customers aged 18+ who have `Confirmed` reservations for the current month, ordered by their age. Make sure to display their ages in the result."

In [34]:
prompt = seed_prompt.format(question, schema_columns)

In [35]:
%%time

response = generate_and_execute_sql(prompt=prompt, return_all=True)
sql_output = response['dataframe']
sql_output

TRIAL: 1
--------------------------------------------------
SELECT
  c.customer_id,
  c.first_name,
  c.last_name,
  DATEDIFF(CURRENT_DATE(), c.date_of_birth) AS age_years,
  r.reservation_id
FROM flight_reservations.customers c
JOIN flight_reservations.reservations r
ON CAST(c.customer_id AS STRING) = r.customer_id
WHERE r.status = 'Confirmed'
AND r.reservation_datetime BETWEEN DATE_TRUNC(CURRENT_DATE(), MONTH) AND DATE_ADD(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 MONTH)
AND DATEDIFF(CURRENT_DATE(), c.date_of_birth) >= 18
ORDER BY age_years DESC;
--------------------------------------------------
FAILED
TRIAL: 2
--------------------------------------------------
SELECT
  c.customer_id,
  c.first_name,
  c.last_name,
  DATEDIFF(CURRENT_DATE(), c.date_of_birth) AS age_years,
  r.reservation_id
FROM flight_reservations.customers c
JOIN flight_reservations.reservations r
ON CAST(c.customer_id AS STRING) = CAST(r.customer_id AS STRING)
WHERE r.status = 'Confirmed'
AND r.reservation_da

CPU times: user 221 ms, sys: 51 ms, total: 272 ms
Wall time: 29.3 s


Unnamed: 0,Query,Result,Latency
0,"SELECT\n c.customer_id,\n c.first_name,\n c.last_name,\n DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) AS age_years,\n r.reservation_id\nFROM flight_reservations.customers c\nJOIN flight_reservations.reservations r\nON CAST(c.customer_id AS STRING) = CAST(r.customer_id AS STRING)\nWHERE r.status = 'Confirmed'\nAND r.reservation_datetime BETWEEN DATE_TRUNC(CURRENT_DATE(), MONTH) AND DATE_ADD(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 MONTH)\nAND DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) >= 18\nORDER BY age_years DESC;",customer_id first_name last_name age_years reservation_id 0 17 Olivia Newton 75 17 1 16 Nick Fury 58 16 2 13 Kate Winslet 48 14 3 20 Ryan Reynolds 47 20 4 15 Mary Jane 30 15 5 18 Peter Parker 22 18,6.439828
1,"SELECT\n c.customer_id,\n c.first_name,\n c.last_name,\n DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) AS age_years,\n r.reservation_id\nFROM flight_reservations.customers c\nJOIN flight_reservations.reservations r\nON CAST(c.customer_id AS STRING) = CAST(r.customer_id AS STRING)\nWHERE r.status = 'Confirmed'\nAND r.reservation_datetime BETWEEN DATE_TRUNC(CURRENT_DATE(), MONTH) AND DATE_ADD(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 MONTH)\nAND DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) >= 18\nORDER BY age_years DESC;",customer_id first_name last_name age_years reservation_id 0 17 Olivia Newton 75 17 1 16 Nick Fury 58 16 2 13 Kate Winslet 48 14 3 20 Ryan Reynolds 47 20 4 15 Mary Jane 30 15 5 18 Peter Parker 22 18,6.673772
2,"SELECT\n c.customer_id,\n c.first_name,\n c.last_name,\n DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) AS age_years,\n r.reservation_id\nFROM flight_reservations.customers c\nJOIN flight_reservations.reservations r\nON CAST(c.customer_id AS STRING) = CAST(r.customer_id AS STRING)\nWHERE r.status = 'Confirmed'\nAND r.reservation_datetime BETWEEN DATE_TRUNC(CURRENT_DATE(), MONTH) AND DATE_ADD(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 MONTH)\nAND DATE_DIFF(CURRENT_DATE(), c.date_of_birth, YEAR) >= 18\nORDER BY age_years DESC;",customer_id first_name last_name age_years reservation_id 0 17 Olivia Newton 75 17 1 16 Nick Fury 58 16 2 13 Kate Winslet 48 14 3 20 Ryan Reynolds 47 20 4 15 Mary Jane 30 15 5 18 Peter Parker 22 18,8.561313


In [36]:
result_df = sql_output.loc[0, 'Result']
result_df

Unnamed: 0,customer_id,first_name,last_name,age_years,reservation_id
0,17,Olivia,Newton,75,17
1,16,Nick,Fury,58,16
2,13,Kate,Winslet,48,14
3,20,Ryan,Reynolds,47,20
4,15,Mary,Jane,30,15
5,18,Peter,Parker,22,18
