# BrainTrust Text2SQL Fine Tune Tutorial

Welcome to [BrainTrust](https://www.braintrustdata.com/)! This tutorial will teach you how to fine-tune `gpt-3.5-turbo` to generate SQL, and how to use BrainTrust to evaluate it against the regular `gpt-3.5-turbo` model.

Before starting, please make sure that you _already_ have a BrainTrust account. If you do not, please [sign up](https://www.braintrustdata.com) or [get in touch](mailto:info@braintrustdata.com).

## 1. Install and set up variables
Let's first set up our API key variables and install some dependencies.

In [None]:
# NOTE: Fill in your OpenAI, Google AI, and BrainTrust API keys after the `=` signs below. Do not put them in quotes.
%env OPENAI_API_KEY=
%env GOOGLE_AI_API_KEY=
%env BRAINTRUST_API_KEY=

In [None]:
%pip install -U /home/ubuntu/braintrust/braintrust/sdk/py duckdb datasets openai pyarrow autoevals google-generativeai
%pip install -U google-auth google-auth-oauthlib google-auth-httplib2

In [None]:
import openai
import json
import braintrust
import google.generativeai as palm
import os

openai.api_key = os.environ["OPENAI_API_KEY"]

## 2. Define helper functions
We'll define some helper functions that help us work with SQL-related data and queries.

In [None]:
# Import libraries + define helper functions
import duckdb
from datasets import load_dataset
import json
from Levenshtein import distance
import openai
import os
import pyarrow as pa
import time

NUM_TEST_EXAMPLES = 30

# Define some helper functions
def get_table(table):
    """Build a pyarrow Table from a record holding "header" and "rows" entries.

    Each row is turned into a dict keyed by the header names (matched by
    position), and the resulting list of dicts is handed to pyarrow.
    """
    column_names = table["header"]
    records = []
    for values in table["rows"]:
        records.append({name: values[pos] for pos, name in enumerate(column_names)})

    return pa.Table.from_pylist(records)

# SQL aggregate functions, looked up by the integer in query["sql"]["agg"];
# the None entry at index 0 means the selected column is not aggregated.
AGG_OPS = [None, "MAX", "MIN", "COUNT", "SUM", "AVG"]
# Comparison operators, looked up by each entry of conds["operator_index"].
COND_OPS = [" ILIKE ", ">", "<"]  # , "OP"]


def esc_fn(s):
    """Return ``s`` wrapped in double quotes, with embedded double quotes doubled."""
    doubled = s.replace('"', '""')
    return '"' + doubled + '"'


def esc_value(s):
    """Double any single quotes in a string value; pass non-strings through unchanged."""
    if not isinstance(s, str):
        return s
    return s.replace("'", "''")

def codegen_query(query):
    """Render a structured query record as a SQL string over a table named "table".

    Reads the column names from ``query["table"]["header"]`` and the selected
    column, aggregate index and conditions from ``query["sql"]``. Every
    condition value is emitted as a single-quoted literal.
    """
    columns = query["table"]["header"]

    selected = esc_fn(columns[query["sql"]["sel"]])

    aggregate = AGG_OPS[query["sql"]["agg"]]
    if aggregate is not None:
        selected = f"{aggregate}({selected})"

    conds = query["sql"]["conds"]

    clauses = []
    for col_idx, op_idx, literal in zip(
        conds["column_index"], conds["operator_index"], conds["condition"]
    ):
        clauses.append(
            f"""{esc_fn(columns[col_idx])}{COND_OPS[op_idx]}'{esc_value(literal)}'"""
        )

    # Only emit a WHERE clause when there is at least one condition.
    where = (" WHERE " + " and ".join(clauses)) if clauses else ""

    return f'SELECT {selected} FROM "table"{where}'

OPENAI_CACHE = None


def openai_req(model, messages, max_tokens):
    """Send a chat completion request to OpenAI, retrying when rate limited.

    Args:
        model: Name of the OpenAI chat model (base or fine-tuned).
        messages: List of chat messages, each a dict with a "content" entry.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        A ``(prompt, response_text)`` tuple, where ``prompt`` is the
        concatenation of every message's content and ``response_text`` is the
        content of the first returned choice.

    Raises:
        openai.error.RateLimitError: If every attempt is rate limited.
    """
    global OPENAI_CACHE
    if OPENAI_CACHE is None:
        # NOTE: the cache database and table are created here, but nothing in
        # this function reads from or writes to the cache yet.
        os.makedirs("data", exist_ok=True)
        OPENAI_CACHE = duckdb.connect(database="data/oai_cache.duckdb")
        OPENAI_CACHE.query(
            "CREATE TABLE IF NOT EXISTS cache (params text, response text)"
        )

    max_attempts = 5
    for attempt in range(max_attempts):
        try:
            resp = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            # Re-raise on the final attempt instead of falling through with
            # `resp` unbound, which used to surface as an UnboundLocalError.
            if attempt == max_attempts - 1:
                raise
            print("Rate limited... Sleeping for 30 seconds")
            time.sleep(30)

    response_text = resp["choices"][0]['message']["content"]

    # messages to string
    prompt = "".join([m["content"] for m in messages])

    return prompt, response_text

def green(s):
    """Wrap ``s`` in the ANSI escape codes for green text."""
    return "".join(("\x1b[32m", s, "\x1b[0m"))

def run_query(sql, table_record):
    """Run ``sql`` against the given table record and return the first value.

    The record is converted with ``get_table`` and exposed to DuckDB under the
    name "table". Returns the first column of the first result row, or None
    when there is no row.
    """
    arrow_table = get_table(table_record)  # noqa
    relation = duckdb.arrow(arrow_table)

    first_row = relation.query("table", sql).fetchone()
    if not first_row:
        return None
    return first_row[0]

def score(r1, r2):
    """Return a similarity score in [0, 1] between two results.

    Two None values score 1 and exactly one None scores 0. Otherwise both
    values are stringified and the score is one minus the Levenshtein distance
    divided by the length of the longer string.
    """
    if r1 is None and r2 is None:
        return 1
    if r1 is None or r2 is None:
        return 0

    r1, r2 = str(r1), str(r2)

    total_len = max(len(r1), len(r2))
    if total_len == 0:
        # Both values are empty strings: identical, and dividing by the
        # length would raise ZeroDivisionError.
        return 1
    return 1 - distance(r1, r2) / total_len

## 3. Prepare a training set

We'll use the [`wikisql` dataset from Hugging Face](https://huggingface.co/datasets/wikisql) to create a training set of data to fine-tune a model on.

In [None]:
# Initialize data from WikiSQL
train_data = list(load_dataset("wikisql")["train"])

def createTrainExample(query):
    """Build one chat-format fine-tuning example from a WikiSQL record.

    The user prompt lists each column name followed by its values from the
    first ten rows, then asks the question. The assistant message is the SQL
    produced by ``codegen_query``. The example is printed and returned.

    Args:
        query: A record with "table" (holding "header" and "rows"),
            "question" and "sql" entries.

    Returns:
        A dict with a "messages" list holding system, user and assistant
        messages.
    """
    table = query["table"]
    rows = [
        {h: row[i] for (i, h) in enumerate(table["header"])}
        for row in table["rows"]
    ]
    meta = "\n".join(f'"{h}": {[row[h] for row in rows[:10]]}' for h in table["header"])
    # The f-string is already fully interpolated. Calling .format() on it, as
    # this used to, re-parses any "{" or "}" in the table data or question and
    # either raises or silently rewrites the prompt.
    prompt = f"""
    Print a SQL query (over a table named "table" quoted with double quotes) that answers the question below.

    You have the following columns:
    {meta}

    The format should be
    Question: the question to ask
    SQL: the SQL to generate

    Question: {query['question']}
    SQL: """
    example = {
        "messages": [
            {
            "role":"system",
            "content":"You are an expert at generating SQL. Respond with just SQL."
            },
            {"role": "user", "content": prompt},
            {"role":"assistant", "content": codegen_query(query)}
        ]
    }
    print(example)
    return example


createTrainExample(train_data[0])


In [None]:
# Create training examples for Google's Text Bison model
def createTrainExampleInstruct(query):
    """Build one instruction-style fine-tuning example from a WikiSQL record.

    The prompt starts with the system instruction, lists each column name
    followed by its values from the first ten rows, then asks the question.
    The target output is the SQL produced by ``codegen_query``. The example is
    printed and returned.

    Args:
        query: A record with "table" (holding "header" and "rows"),
            "question" and "sql" entries.

    Returns:
        A dict with "text_input" (the prompt) and "output" (the SQL).
    """
    table = query["table"]
    rows = [
        {h: row[i] for (i, h) in enumerate(table["header"])}
        for row in table["rows"]
    ]
    meta = "\n".join(f'"{h}": {[row[h] for row in rows[:10]]}' for h in table["header"])
    # The f-string is already fully interpolated. Calling .format() on it, as
    # this used to, re-parses any "{" or "}" in the table data or question and
    # either raises or silently rewrites the prompt.
    prompt = f"""
    You are an expert at generating SQL. Respond with just SQL.

    Print a SQL query (over a table named "table" quoted with double quotes) that answers the question below.

    You have the following columns:
    {meta}

    The format should be
    Question: the question to ask
    SQL: the SQL to generate

    Question: {query['question']}
    SQL: """
    example = {
        "text_input": prompt,
        "output": codegen_query(query),
    }
    print(example)
    return example


createTrainExampleInstruct(train_data[0])


In [None]:
import csv

# Number of WikiSQL training records exported to each fine-tuning file.
NUM_TRAIN_EXAMPLES = 100

# Save our training data to a JSONL file (one chat-format example per line).
# The encoding is explicit so the output does not depend on the platform default.
with open('train-sql.JSONL', mode='w', newline='', encoding='utf-8') as file:
    for query in train_data[:NUM_TRAIN_EXAMPLES]:
        row = createTrainExample(query)
        # add each row
        json.dump(row, file)
        file.write('\n')

# Save our training data to a csv file (text_input/output columns).
# WikiSQL text can contain non-ASCII characters, so write UTF-8 explicitly.
with open('train-sql.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['text_input', 'output'])
    for query in train_data[:NUM_TRAIN_EXAMPLES]:
        row = createTrainExampleInstruct(query)
        # add each row
        writer.writerow([row['text_input'], row['output']])

### Finetune GPT3.5 Turbo for SQL

In [None]:
# Can optionally set if you finetuned through the web UI or in a separate tutorial.
# If you set this, you can skip to the next section.
# FTGPT="ft:..."

In [None]:
# Upload our training data to OpenAI
# Open the file in a context manager so the handle is closed after the upload.
with open("train-sql.JSONL", "rb") as training_fh:
    file = openai.File.create(
        file=training_fh,
        purpose='fine-tune'
    )

Wait 30s to 1 minute for the file to be processed. :)

In [None]:
# Start a fine-tuning job
job = openai.FineTuningJob.create(training_file=file['id'], model="gpt-3.5-turbo")

In [None]:
# Wait for the fine-tune to complete (polls every 30 seconds, up to 100 times)
FTGPT = ""
for _ in range(100):
    check = openai.FineTuningJob.retrieve(job['id'])
    if check.fine_tuned_model:
        FTGPT = check.fine_tuned_model
        break
    # Stop polling as soon as the job can no longer produce a model.
    if check.status in ("failed", "cancelled"):
        raise RuntimeError(f"Fine-tuning job {job['id']} ended with status {check.status!r}")
    time.sleep(30)
else:
    # Surface the timeout here rather than leaving FTGPT empty, which would
    # only fail later when the evaluation is run with an empty model name.
    raise TimeoutError(f"Fine-tuning job {job['id']} did not finish in time")
print(FTGPT)

### Finetune text-bison for SQL

Go to [Google's Makersuite and fine tune a text-bison model](https://makersuite.google.com/app/tuned_models/new_tuned_model) using the `train-sql.csv` file we generated above. Then, follow their [guide to authenticate](https://developers.generativeai.google/tutorials/oauth_quickstart) and install gcloud locally.

In [None]:
# Put the client_secret.json in the same directory as this notebook file
# This will open a browser link to authenticate with Google and create a file called "application_default_credentials.json" somewhere.
!gcloud auth application-default login --client-id-file client_secret.json --scopes='https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/generative-language.tuning'

# Copy the credentials file "application_default_credentials.json" to the same directory as this notebook file

In [None]:
import json
import google.oauth2.credentials

# Read the OAuth client configuration.
with open('client_secret.json', 'r') as secrets_file:
    client_secrets = json.load(secrets_file)

# Read the token data written by the gcloud login step.
with open('application_default_credentials.json', 'r') as token_file:
    token_info = json.load(token_file)

# Build credentials from the refresh token plus the "installed" client settings.
installed_client = client_secrets['installed']
creds = google.oauth2.credentials.Credentials(
    token=None,
    refresh_token=token_info.get('refresh_token'),
    token_uri=installed_client['token_uri'],
    client_id=installed_client['client_id'],
    client_secret=installed_client['client_secret'],
)

palm.configure(credentials=creds)


In [None]:
# Import before use so this cell also works when run on its own.
import google.generativeai as palm

palm.configure(credentials=creds)

print('Available base models:', [m.name for m in palm.list_models()])
print('My tuned models:', [m.name for m in palm.list_tuned_models()])

In [None]:
googleFTModels = [m.name for m in palm.list_tuned_models()]
if not googleFTModels:
    # Same exception type as indexing an empty list, but with guidance.
    raise IndexError(
        "No tuned models found for this account; fine-tune a text-bison model "
        "with train-sql.csv before running this cell."
    )
# Use the first tuned model returned by the API.
FTBISON = googleFTModels[0]

## 4. Evaluate our finetuned model

Finally, we'll load in an evaluation dataset, define an evaluation function, and then compare our results with BrainTrust.

In [None]:
# load in an evaluation dataset
data = list(load_dataset("wikisql")["test"])

In [None]:
def googleai_req(model, messages, max_tokens):
    """Generate text with a Google model from a list of chat-style messages.

    The message contents are joined with newlines into a single prompt.
    ``max_tokens`` is accepted but not forwarded to the API.

    Returns:
        A ``(prompt, result)`` tuple holding the joined prompt and the
        ``result`` attribute of the API response.
    """
    # only use messages for now
    contents = [message["content"] for message in messages]
    prompt = "\n".join(contents)
    completion = palm.generate_text(prompt=prompt, model=model)

    return prompt, completion.result

In [None]:
# Define a generation function
def text2sql(query, modelName, type="openai"):
    """Ask a model to write SQL for the question in a WikiSQL record.

    Args:
        query: A record with "table" (holding "header") and "question" entries.
        modelName: Name of the model to call.
        type: Which provider to use, either "openai" or "google".

    Returns:
        A ``(prompt, resp, sql)`` tuple: the flattened prompt returned by the
        provider helper, the raw response, and the response with trailing
        semicolons removed (None when the response is empty).

    Raises:
        ValueError: If ``type`` is not "openai" or "google".
    """
    # Fail fast on an unknown provider. Previously neither branch ran and the
    # function crashed with a NameError on the unbound `resp`.
    if type not in ("openai", "google"):
        raise ValueError(f'Unknown model type {type!r}; expected "openai" or "google"')

    table = query["table"]
    meta = "\n".join(f'"{h}"' for h in table["header"])

    prompt = f"""
Print a SQL query (over a table named "table" quoted with double quotes) that answers the question below.
USE THE DOUBLE QUOTES ON TABLE!
You have the following columns:
{meta}

The format should be
Question: the question to ask
SQL: the SQL to generate

Question: {query['question']}
SQL: """

    messages = [
        {
            "role":"system",
            "content":"You are an expert at generating SQL. Respond with just SQL."
         },
         {
            "role":"user",
            "content":prompt,
         }
    ]
    print("RUNNING WITH: model:", modelName)

    # Both helpers return the flattened prompt they sent plus the response.
    if type == "openai":
        prompt, resp = openai_req(model=modelName, messages=messages, max_tokens=1024)
    else:
        prompt, resp = googleai_req(model=modelName, messages=messages, max_tokens=1024)
    print(resp)

    return (
        prompt,
        resp,
        resp.rstrip(";")
        if resp
        else None,
    )

In [None]:
# Try the base text-bison model on the first evaluation example.
sample = data[0]
prompt, resp, _ = text2sql(sample, "models/text-bison-001", type="google")
print(prompt + green(resp))

# Run the generated SQL (minus any trailing semicolons) against the example's table.
output_sql = resp.rstrip(";")
table = get_table(sample['table'])
print("Correct answer:", sample["sql"]["human_readable"],)
duckdb.arrow(table).query("table", output_sql)


In [None]:
# Define an evaluation function
def runEvaluation(modelName, type="openai"):
    """Evaluate a model on the first NUM_TEST_EXAMPLES records and log to BrainTrust.

    For each record, the ground-truth SQL from ``codegen_query`` and the
    model's SQL from ``text2sql`` are both run with ``run_query``. The answers
    and the SQL strings are compared with ``score`` and logged to a BrainTrust
    experiment named after the model. The experiment summary is printed at the
    end.

    Args:
        modelName: Name of the model to evaluate; also the experiment name.
        type: Provider passed through to ``text2sql`` ("openai" or "google").
    """
    # Initialize BrainTrust experiment
    with braintrust.init(project="openai-google-battle-sql", experiment=modelName) as experiment:
        for i, query in enumerate(data[:NUM_TEST_EXAMPLES]):
            print(f"{i+1}/{NUM_TEST_EXAMPLES}")
            gt_query = codegen_query(query)
            gt_answer = run_query(gt_query, query["table"])

            # Forward the provider. Without it every model, including the
            # Google ones, was sent through the default OpenAI path.
            prompt, _, sql = text2sql(query, modelName, type=type)
            # text2sql returns None for an empty response, so fall back to "".
            # The "output:" label is removed from the generated SQL; presumably
            # tuned models echo the training file's "output" column name.
            # TODO confirm.
            sql = (sql or "").replace("output:", "")
            try:
                answer = run_query(sql, query["table"])
            except Exception as e:
                # Keep the error text for inspection, but score the failure as
                # 0 rather than fuzzy-matching the message against the answer.
                answer = f"FAILED: {e}"
                answer_score = 0
            else:
                answer_score = score(gt_answer, answer)

            #Log to BrainTrust
            experiment.log(
                input={"question": query["question"]},
                output=answer,
                expected=gt_answer,
                scores={
                    "answer": answer_score,
                    "query": score(gt_query, sql),
                },
                metadata={
                    "prompt": prompt,
                    "gt_sql": gt_query,
                    "output_sql": sql,
                    "id": i,
                },
            )

        # Print experiment results
        print(experiment.summarize())

In [None]:
#Evaluate base text bison
runEvaluation("models/text-bison-001", type="google")

In [None]:
#Evaluate finetuned bison
runEvaluation(FTBISON, type="google")

In [None]:
# Evaluate the base 3.5 turbo model
runEvaluation("gpt-3.5-turbo", type="openai")

In [None]:
# Evaluate the fine tuned GPT 3.5 turbo model
runEvaluation(FTGPT, type="openai")