# BrainTrust Text2SQL Fine Tune Tutorial

Welcome to [BrainTrust](https://www.braintrustdata.com/)! This tutorial will teach you how to fine-tune `gpt-3.5-turbo` to generate SQL, and then use BrainTrust to evaluate the fine-tuned model against regular `gpt-3.5-turbo`.

Before starting, please make sure that you _already_ have a BrainTrust account. If you do not, please [sign up](https://www.braintrustdata.com) or [get in touch](mailto:info@braintrustdata.com).

## 1. Install and set up variables
Let's first set up our API key variables and install some dependencies.

In [None]:
!pip install braintrust duckdb datasets openai pyarrow autoevals

In [None]:
import openai
import json
import braintrust

# API credentials: fill these in before running the rest of the notebook.
# OPENAI_API_KEY is handed to the openai client on the last line of this cell;
# BT_API_KEY is passed to braintrust.init() by the evaluation function later on.
OPENAI_API_KEY=""
BT_API_KEY=""
openai.api_key = OPENAI_API_KEY

## 2. Define helper functions
We'll define some helper functions that help us work with SQL-related data and queries.

In [None]:
# Import libraries + define helper functions

import duckdb
from datasets import load_dataset
import json
from Levenshtein import distance
import openai
import os
import pyarrow as pa
import time

# Number of test-set examples scored per model by runEvaluation().
NUM_TEST_EXAMPLES = 10

# Define some helper functions
def get_table(table):
    """Convert a table record (``header`` + ``rows``) into a PyArrow table.

    Every row becomes a dict keyed by the column names in ``table["header"]``,
    with each value taken from the row at the column's position.
    """
    records = []
    for values in table["rows"]:
        record = {}
        for position, name in enumerate(table["header"]):
            record[name] = values[position]
        records.append(record)

    return pa.Table.from_pylist(records)

# Lookup tables used by codegen_query(): a query's integer "agg" code indexes
# AGG_OPS (None means no aggregate is applied), and each condition's
# "operator_index" indexes COND_OPS.
AGG_OPS = [None, "MAX", "MIN", "COUNT", "SUM", "AVG"]
# The first operator is emitted as an ILIKE match. The "OP" entry is commented
# out, so an operator index of 3 is not supported by this list.
COND_OPS = [" ILIKE ", ">", "<"]  # , "OP"]


def esc_fn(s):
    """Quote ``s`` as a SQL identifier, doubling any embedded double quotes."""
    doubled = s.replace('"', '""')
    return '"' + doubled + '"'


def esc_value(s):
    """Escape single quotes in a string so it can sit inside a SQL literal.

    Non-string values are handed back unchanged.
    """
    if not isinstance(s, str):
        return s
    return s.replace("'", "''")

def codegen_query(query):
    """Render a structured query record as a SQL string over the table "table".

    The selected column comes from ``query["sql"]["sel"]``, optionally wrapped
    in the aggregate picked by ``query["sql"]["agg"]``. Each condition in
    ``query["sql"]["conds"]`` becomes ``<column><operator>'<value>'`` and the
    conditions are joined with `` and `` into a WHERE clause.
    """
    columns = query["table"]["header"]
    sql_spec = query["sql"]

    select_expr = esc_fn(columns[sql_spec["sel"]])
    aggregate = AGG_OPS[sql_spec["agg"]]
    if aggregate is not None:
        select_expr = f"{aggregate}({select_expr})"

    conditions = sql_spec["conds"]
    clauses = []
    for col_idx, op_idx, literal in zip(
        conditions["column_index"],
        conditions["operator_index"],
        conditions["condition"],
    ):
        # Values are always emitted as quoted literals, whatever their type.
        clauses.append(
            f"{esc_fn(columns[col_idx])}{COND_OPS[op_idx]}'{esc_value(literal)}'"
        )

    where_clause = ""
    if clauses:
        where_clause = " WHERE " + " and ".join(clauses)

    return f'SELECT {select_expr} FROM "table"{where_clause}'

# Lazily-opened DuckDB connection backing the on-disk response cache.
OPENAI_CACHE = None


def openai_req(ChatCompletion=openai.ChatCompletion, **kwargs):
    """Call ``ChatCompletion.create(**kwargs)`` with an on-disk response cache.

    Responses are stored in ``data/oai_cache.duckdb`` keyed by the JSON dump of
    ``kwargs``, so repeating a request with the same arguments returns the
    stored response without calling the API again.

    Args:
        ChatCompletion: Object exposing ``create(**kwargs)``; defaults to
            ``openai.ChatCompletion``.
        **kwargs: Request parameters. They must be JSON-serializable because
            they form the cache key.

    Returns:
        The response as a plain dict.

    Raises:
        openai.error.RateLimitError: If the request is still rate limited on
            the fifth attempt.
    """
    global OPENAI_CACHE
    if OPENAI_CACHE is None:
        os.makedirs("data", exist_ok=True)
        OPENAI_CACHE = duckdb.connect(database="data/oai_cache.duckdb")
        OPENAI_CACHE.query(
            "CREATE TABLE IF NOT EXISTS cache (params text, response text)"
        )

    param_key = json.dumps(kwargs)
    cached = OPENAI_CACHE.execute(
        """SELECT response FROM "cache" WHERE params=?""", [param_key]
    ).fetchone()
    if cached:
        return json.loads(cached[0])

    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            resp = ChatCompletion.create(**kwargs).to_dict()
            break
        except openai.error.RateLimitError:
            if attempt == max_attempts:
                # Give up loudly. Falling through here would store a null
                # response in the cache and return it for every later call
                # made with these parameters.
                raise
            print("Rate limited... Sleeping for 30 seconds")
            time.sleep(30)

    OPENAI_CACHE.execute(
        """INSERT INTO "cache" VALUES (?, ?)""", [param_key, json.dumps(resp)]
    )

    return resp

def green(s):
    """Wrap ``s`` in the ANSI escape codes for green text, followed by a reset."""
    ansi_green, ansi_reset = "\x1b[32m", "\x1b[0m"
    return ansi_green + s + ansi_reset

def run_query(sql, table_record):
    """Run ``sql`` against ``table_record`` and return the first value of the first row.

    The record is converted with get_table(), exposed to DuckDB under the
    name "table", and queried. Returns None when no row comes back.
    """
    relation = duckdb.arrow(get_table(table_record))
    first_row = relation.query("table", sql).fetchone()
    if not first_row or len(first_row) == 0:
        return None
    return first_row[0]

def score(r1, r2):
    """Return a similarity score between two results, where 1 means identical.

    Two ``None`` values score 1 and a single ``None`` scores 0. Otherwise both
    values are converted to strings and scored as
    ``1 - distance(r1, r2) / max(len(r1), len(r2))``.

    Args:
        r1: First value (any type, or None).
        r2: Second value (any type, or None).

    Returns:
        1 or 0 for the ``None`` and empty-string cases, otherwise a float.
    """
    if r1 is None and r2 is None:
        return 1
    if r1 is None or r2 is None:
        return 0

    r1, r2 = str(r1), str(r2)

    total_len = max(len(r1), len(r2))
    if total_len == 0:
        # Both values stringify to "", so they are identical; returning here
        # also avoids dividing by zero below.
        return 1
    return 1 - distance(r1, r2) / total_len

## 3. Prepare a training set

We'll use the [`wikisql` dataset from Hugging Face](https://huggingface.co/datasets/wikisql) to create a training set of data to fine-tune a model on.

In [None]:
# Initialize data from WikiSQL: materialize the "train" split of the `wikisql`
# dataset as a list so examples can be picked by position.
train_data = list(load_dataset("wikisql")["train"])

def createTrainExample(query):
    """Build one chat-format fine-tuning example from a query record.

    The example holds three messages: a fixed system instruction, a user
    prompt listing each column with up to 10 sample values plus the question,
    and an assistant message containing the SQL from codegen_query(). The
    example is printed and then returned.

    Args:
        query: Record with ``table`` (``header`` and ``rows``), ``question``
            and the ``sql`` fields that codegen_query() reads.

    Returns:
        A dict of the form ``{"messages": [...]}``.
    """
    table = query["table"]
    rows = [
        {h: row[i] for (i, h) in enumerate(table["header"])}
        for row in table["rows"]
    ]
    # One line per column: the quoted column name, then its first 10 values.
    meta = "\n".join(f'"{h}": {[row[h] for row in rows[:10]]}' for h in table["header"])
    # The f-string does all the interpolation. A further str.format() pass
    # would treat any "{" or "}" in the table data or the question as a format
    # field and raise, so none is applied.
    prompt = f"""
    Print a SQL query (over a table named "table" quoted with double quotes) that answers the question below.

    You have the following columns:
    {meta}

    The format should be
    Question: the question to ask
    SQL: the SQL to generate

    Question: {query['question']}
    SQL: """
    example = {
        "messages": [
            {
            "role":"system",
            "content":"You are an expert at generating SQL. Respond with just SQL."
            },
            {"role": "user", "content": prompt},
            {"role":"assistant", "content": codegen_query(query)}
        ]
    }
    print(example)
    return example


# Preview the fine-tuning example built from the first training record.
createTrainExample(train_data[0])


In [None]:
# Save our training data to a file: one JSON object per line, built from the
# first 100 training records.
with open('train-sql.JSONL', mode='w', newline='') as file:
    for i in range(100):
        row = createTrainExample(train_data[i])
        # Write the example as JSON, then end the line.
        json.dump(row, file)
        file.write('\n')

In [None]:
# Upload our training data to OpenAI. The file is opened in a `with` block so
# the handle is closed once the upload call returns.
with open("train-sql.JSONL", "rb") as training_file:
    file = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )

Wait 30s to 1 minute for the file to be processed. :)

In [None]:
# Start a fine-tuning job on gpt-3.5-turbo, using the id of the file uploaded above.
job = openai.FineTuningJob.create(training_file=file['id'], model="gpt-3.5-turbo")

In [None]:
# Wait for the fine-tune to complete: poll the job every 30 seconds (at most
# 100 times) until it reports a fine-tuned model name.
FTMODELNAME = ""
for i in range(100):
    check = openai.FineTuningJob.retrieve(job['id'])
    if (check.fine_tuned_model):
        FTMODELNAME = check.fine_tuned_model
        break
    # Stop right away on a terminal failure instead of polling until the loop
    # runs out.
    status = check.get("status")
    if status in ("failed", "cancelled"):
        raise RuntimeError(
            f"Fine-tuning job {job['id']} ended with status {status!r}"
        )
    time.sleep(30)
else:
    # No model name after every poll: fail here rather than hand an empty
    # model name to the evaluation cells.
    raise TimeoutError(
        f"Fine-tuning job {job['id']} did not produce a model after 100 polls"
    )
print(FTMODELNAME)

## 4. Evaluate our finetuned model

Finally, we'll load in an evaluation dataset, define an evaluation function, and then compare our results with BrainTrust.

In [None]:
# Load in an evaluation dataset: the "test" split of `wikisql`, materialized
# as a list so examples can be picked by position.
data = list(load_dataset("wikisql")["test"])

In [None]:
# Define a generation function
def text2sql(query, modelName):
    """Ask ``modelName`` to write SQL answering ``query["question"]``.

    The prompt lists the quoted column names of ``query["table"]`` and the
    question. The request goes through openai_req() and the raw response is
    printed.

    Args:
        query: Record with ``table`` (including ``header``) and ``question``.
        modelName: Model identifier passed as ``model`` in the request.

    Returns:
        A ``(prompt, resp, sql)`` tuple. ``sql`` is the first choice's message
        content with surrounding whitespace and trailing semicolons removed,
        or None when the response has no choices.
    """
    table = query["table"]
    meta = "\n".join(f'"{h}"' for h in table["header"])

    prompt = f"""
Print a SQL query (over a table named "table" quoted with double quotes) that answers the question below.

You have the following columns:
{meta}

The format should be
Question: the question to ask
SQL: the SQL to generate

Question: {query['question']}
SQL: """

    messages = [
        {
            "role":"system",
            "content":"You are an expert at generating SQL. Respond with just SQL."
         },
         {
            "role":"user",
            "content":prompt,
         }
    ]
    resp = openai_req(model=modelName, messages=messages, max_tokens=1024)
    print(resp)

    sql = None
    if len(resp["choices"]) > 0:
        content = resp["choices"][0]['message']["content"]
        # Strip whitespace first: a newline or space after the final ";"
        # would otherwise stop rstrip(";") from removing it.
        sql = content.strip().rstrip(";")
    return prompt, resp, sql

# Try the fine-tuned model on the first test example. The cleaned-up SQL is
# taken from text2sql()'s third return value rather than re-derived from the
# raw response.
prompt, resp, output_sql = text2sql(data[0], FTMODELNAME)
print(prompt + green(resp['choices'][0]['message']['content']))

table = get_table(data[0]['table'])
print("Correct answer:", data[0]["sql"]["human_readable"],)
# Run the generated SQL against the example's table, exposed under the name "table".
duckdb.arrow(table).query("table", output_sql)


In [None]:
# Define an evaluation function
def runEvaluation(modelName):
    """Evaluate ``modelName`` on the first NUM_TEST_EXAMPLES test records.

    For each record, the ground-truth SQL from codegen_query() and the SQL
    from text2sql() are both run against the record's table. The answers and
    the two SQL strings are compared with score(), and the results are logged
    to a BrainTrust experiment named after the model. The experiment summary
    is printed at the end.

    Args:
        modelName: Model identifier; also used as the experiment name.
    """
    # Initialize BrainTrust experiment
    bt = braintrust.init(project="text2sql-finetune", experiment=modelName, api_key=BT_API_KEY)
    for i in range(NUM_TEST_EXAMPLES):
        print(f"{i+1}/{NUM_TEST_EXAMPLES}\r")
        query = data[i]
        gt_query = codegen_query(query)
        gt_answer = run_query(gt_query, query["table"])

        prompt, _, sql = text2sql(query, modelName)
        try:
            answer = run_query(sql, query["table"])
        except Exception as e:
            # Best effort: keep evaluating and record the error as the output.
            # A query that failed to run scores 0; comparing the error text
            # with the expected answer would give it partial credit.
            answer = f"FAILED: {e}"
            answer_score = 0
        else:
            answer_score = score(gt_answer, answer)

        #Log to BrainTrust
        bt.log(
            inputs={"question": query["question"]},
            output=answer,
            expected=gt_answer,
            scores={
                "answer": answer_score,
                "query": score(gt_query, sql),
            },
            metadata={
                "prompt": prompt,
                "gt_sql": gt_query,
                "output_sql": sql,
                "id": i,
            },
        )

    # Print experiment results
    print(bt.summarize())

In [None]:
# Evaluate base gpt-3.5-turbo; the experiment is named after the model.
runEvaluation("gpt-3.5-turbo")

In [None]:
# Evaluate our fine-tuned model, using the model name captured by the polling cell above.
runEvaluation(FTMODELNAME)

Once you run the two blocks above, you should get a link to the BrainTrust web UI to compare the results.

![results.png](results.png)

We can see that finetuning significantly improved the ability of GPT-3.5-Turbo to generate SQL queries! Next, you can add more training data or maybe try improving the prompt and then evaluating with BrainTrust to assess your changes.

Now, you are on your journey of building reliable AI apps with BrainTrust.

Learn more on our docs @ [https://www.braintrustdata.com/docs](https://www.braintrustdata.com/docs).
