In [1]:
# MIT License

# Copyright 2024 Google LLC.
# SPDX-License-Identifier: Apache-2.0

# Generating Explore Assistant Training Examples

This notebook is a companion to the [Explore Assistant Looker + GenAI Solution](https://github.com/LukaFontanilla/looker-explore-assistant) and will allow you to more easily add new training examples to your model. The notebook expects a CSV upload (use [this template](https://docs.google.com/spreadsheets/d/1cTenogLjsDekwVcN-wRUgH8cLb0wdPGTNbHfXt89sS4/edit#gid=0)) of prompts and their corresponding expanded URLs. It will then print out the necessary code for you to add your new examples into your model.

In [2]:
# @title Import necessary packages
try:
  import urllib.parse
except:
  %pip install urllib.parse

try:
  import re
except:
  %pip install re

try:
  import pandas as pd
except:
  %pip install pandas

try:
  import numpy as np
except:
  %pip install numpy

try:
  import io
except:
  %pip install io

from urllib.parse import urlsplit, unquote, parse_qs
import json
import re
import pandas as pd
import numpy as np
import io
from google.colab import files


In [3]:
# @title Upload the templated CSV file with your prompts and their corresponding expanded explore URLs
# **Upload the CSV**
uploaded = files.upload()  # Opens a file upload dialog

# **Check file was uploaded**
# Stop here when nothing was uploaded: carrying on would either fail later with
# a NameError on `df` or silently reuse a `df` left over from an earlier run.
if not uploaded:
  raise ValueError("No CSV file was uploaded.")

# **Read the CSV into a Pandas DataFrame**
# When several files are selected, each one replaces `df`, so the last one wins.
for filename in uploaded:
  df = pd.read_csv(io.BytesIO(uploaded[filename]))


# Query parameters that are copied into the training URL unchanged.
_PASSTHROUGH_PARAMS = (
    'fields',
    'limit',
    'column_limit',
    'pivots',
    'fill_fields',
    'dynamic_fields',  # custom fields and table calcs
    'sorts',
)


def _extract_vis_type(vis_config):
    """Return the top-level "type" of a `vis` JSON string, or None if absent."""
    try:
        parsed = json.loads(vis_config)
    except ValueError:
        # Not valid JSON: fall back to the first textual "type" entry.
        match = re.search(r'"type":\s*"([^",}]+)', vis_config)
        return match.group(1) if match else None
    # Only the top-level key counts; nested objects may carry their own
    # unrelated "type" entries.
    if isinstance(parsed, dict):
        vis_type = parsed.get('type')
        if isinstance(vis_type, str) and vis_type:
            return vis_type
    return None


def parse_url(query_data):
    """Reduce an expanded explore URL to the query string used as training data.

    Keeps, in the order they appear in the URL, the parameters `fields`,
    `limit`, `column_limit`, `pivots`, `fill_fields`, `dynamic_fields`,
    `sorts` and every filter (`f[...]`). The `vis` parameter is reduced to
    just its type, e.g. `vis={"type":"looker_line"}`, and is dropped when no
    type is found. All other parameters are discarded. Only the first value of
    a repeated parameter is used.

    Args:
        query_data: The full expanded URL. Anything that is not a string
            (for example the NaN pandas yields for an empty CSV cell) is
            treated as having no parameters.

    Returns:
        The kept parameters as `name=value` pairs joined with `&`, with values
        URL-decoded; an empty string when nothing is kept.
    """
    if not isinstance(query_data, str):
        return ''

    kept_params = []
    for name, values in parse_qs(urlsplit(query_data).query).items():
        value = values[0]
        if name in _PASSTHROUGH_PARAMS or name.startswith('f['):
            kept_params.append(f'{name}={value}')
        elif name == 'vis':
            vis_type = _extract_vis_type(value)
            # if no vis type, don't add
            if vis_type is not None:
                compact = json.dumps({'type': vis_type}, separators=(',', ':'))
                kept_params.append(f'vis={compact}')
        # anything else is not needed and is skipped

    # Joining here (instead of prefixing each piece with '&') keeps the result
    # well-formed even when `fields` is not the first parameter in the URL.
    return '&'.join(kept_params)



# run each expanded URL through the parser and store the formatted URL
# next to it in a new `formatted_url` column
df['formatted_url'] = df['expanded_url'].apply(parse_url)


Saving [Template] Looker Explore Assistant New Example Template - Sheet1 (6).csv to [Template] Looker Explore Assistant New Example Template - Sheet1 (6).csv


In [12]:
# @title Generate examples
# @markdown Copy the resulting code below.

# @markdown If you are using the BQML deployment, paste the code into your BQ console in the project where you deployed the explore assistant and then run all sections of the code.

# @markdown If you are using the cloud functions deployment, add the formatted examples to the jsonl file within your explore assistant cloud function.
def escape_quotes(text):
    """Escape a string so it can sit inside a double-quoted JSON-style literal.

    Backslashes are doubled first, then every double quote is prefixed with a
    backslash. Doing the backslashes first matters: text that already contains
    a backslash-quote pair would otherwise end up with an escaped backslash
    followed by a bare quote, which terminates the surrounding string early.

    Args:
        text: The string to escape.

    Returns:
        The escaped string (without surrounding quotes).
    """
    return text.replace('\\', '\\\\').replace('"', '\\"')

# Render every row as a small JSON-style object. Columns are read by name
# rather than by position so the result does not depend on column order, and
# no assumption is made about the DataFrame's index labels.
entries = [
    '{\n'
    f'"input": "{escape_quotes(prompt)}",\n'
    f'"output": "{escape_quotes(formatted_url)}"\n'
    '}'
    for prompt, formatted_url in zip(df["prompt"], df["formatted_url"])
]

# Objects are separated by ", "; with no rows there is nothing to print
# (rather than a dangling "{").
examples = (', '.join(entries) + '\n') if entries else ''

print(examples)

{
"input": "what is my total gross margin broken out by day for the last 90 days",
"output": "fields=order_items.total_gross_margin,order_items.created_date&fill_fields=order_items.created_date&f[order_items.created_date]=90 days&limit=500&column_limit=50"
}, {
"input": "total inventory by age",
"output": "fields=inventory_items.days_in_inventory,inventory_items.count&sorts=inventory_items.days_in_inventory&limit=500&column_limit=50"
}, {
"input": "total gross sales by date for the last 30 days, pivoted by country as a line chart",
"output": "fields=order_items.created_date,order_items.total_sale_price,users.country&pivots=users.country&fill_fields=order_items.created_date&f[order_items.created_date]=30 days&sorts=users.country,order_items.created_date desc&limit=500&column_limit=50&vis={\"type\":\"looker_line\"}"
}

