# Rewriting queries with OpenAI Structured Outputs

In [1]:
# Sample user question; it is sent as the user message in the first rewrite call below.
query = "How do I keep my plants alive while I'm on holiday?"

In [2]:
from pydantic import BaseModel

# Structured-output schema passed as `response_format` to the OpenAI parse() calls.
# Documented with `#` comments on purpose: pydantic copies a model docstring into the
# generated JSON schema, which would change what is sent to the API.
class RewrittenQuery(BaseModel):
  # The query as the user wrote it (the sample output below echoes it back verbatim).
  initial_query: str
  # The model's rewritten query; this is the field rewrite_query() returns.
  new_query: str

In [3]:
from openai import OpenAI
# Single API client shared by every completion call in this notebook.
client = OpenAI()

In [4]:
from utils import view, SYSTEM_REWRITE

# Show the system prompt that tells the model how to rewrite a query.
view(SYSTEM_REWRITE)


You are an expert in query expansion and natural language processing. 
Your task is to generate an optimized search query based on the user's input query. 
Follow these guidelines:

[1;36m1[0m. Analyze the input query for key concepts and intent.
[1;36m2[0m. Identify any ambiguous terms or phrases that could be clarified.
[1;36m3[0m. Consider common synonyms, related terms, and alternative phrasings to improve the search.
[1;36m4[0m. If applicable, expand acronyms or abbreviations.
[1;36m5[0m. Incorporate any relevant context or domain-specific knowledge.
[1;36m6[0m. Ensure the expanded query maintains the original intent of the user's question.
[1;36m7[0m. Prioritize clarity and specificity in the rewritten query.
[1;36m8[0m. If the original query is already optimal, you may return it unchanged.

Your goal is to produce a single, refined query that will return the best search results. 
The rewritten query should be a natural language question or statement, not a list 

In [5]:
# One-off structured-output call: system prompt + sample query in, RewrittenQuery out.
completion = client.beta.chat.completions.parse(
  model="gpt-4o-2024-08-06",
  response_format=RewrittenQuery,
  messages=[
    dict(role="system", content=SYSTEM_REWRITE),
    dict(role="user", content=query),
  ],
)

In [6]:
# Inspect the whole parsed completion, including the raw JSON in message.content.
view(completion)

ParsedChatCompletion[1m[[0mRewrittenQuery[1m][0m[1m([0m
    [33mid[0m=[32m'chatcmpl-A2lrV6Y5T834K0DCmJdXMRTjSUY0F'[0m,
    [33mchoices[0m=[1m[[0m
        ParsedChoice[1m[[0mRewrittenQuery[1m][0m[1m([0m
            [33mfinish_reason[0m=[32m'stop'[0m,
            [33mindex[0m=[1;36m0[0m,
            [33mlogprobs[0m=[3;35mNone[0m,
            [33mmessage[0m=[35mParsedChatCompletionMessage[0m[1m[[0mRewrittenQuery[1m][0m[1m([0m
                [33mcontent[0m=[32m'[0m[32m{[0m[32m"initial_query":"How do I keep my plants alive while I\'m on holiday?","new_query":"What [0m
[32mare the best methods for ensuring my indoor and outdoor plants remain healthy and watered during a vacation?"[0m[32m}[0m[32m'[0m,
                [33mrefusal[0m=[3;35mNone[0m,
                [33mrole[0m=[32m'assistant'[0m,
                [33mfunction_call[0m=[3;35mNone[0m,
                [33mtool_calls[0m=[1m[[0m[1m][0m,
                [33mpars

In [10]:
def rewrite_query(query, system, response_format, model="gpt-4o-2024-08-06"):
  """Rewrite ``query`` with a chat model and return the rewritten text.

  Args:
    query: The user's original query, sent as the user message.
    system: System prompt, sent as the system message.
    response_format: Structured-output schema handed to ``parse``; the parsed
      result must expose a ``new_query`` attribute.
    model: Chat model name. Defaults to the model used throughout this notebook.

  Returns:
    The ``new_query`` attribute of the parsed response.

  Raises:
    ValueError: If the response carries no parsed payload (for example when
      the model refuses), instead of failing later with an AttributeError.
  """
  completion = client.beta.chat.completions.parse(
    model=model,
    messages=[
      # Honour the caller's prompt: the global SYSTEM_REWRITE used to be sent
      # unconditionally, which silently ignored the `system` argument.
      {"role": "system", "content": system},
      {"role": "user", "content": query},
    ],
    response_format=response_format,
  )
  message = completion.choices[0].message
  if message.parsed is None:
    reason = getattr(message, "refusal", None) or "empty result"
    raise ValueError(f"No parsed response for query {query!r}: {reason}")
  return message.parsed.new_query

In [None]:
## Olympics opening ceremony 

In [11]:
import duckdb

In [12]:
# Open the DuckDB database file that holds the olympics table queried below.
con = duckdb.connect("olympics.duckdb")

In [13]:
# Show the column names and types of the olympics table.
con.sql("DESCRIBE olympics")


┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │  null   │   key   │ default │  extra  │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ varchar │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ index       │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ embeddings  │ FLOAT[1024] │ YES     │ NULL    │ NULL    │ NULL    │
│ text        │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ url         │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ title       │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

In [14]:
# Preview ten rows of the table, restricted to the index and text columns.
view(
  con.sql("SELECT index, text FROM olympics LIMIT 10")
)

┌───────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────
────┐
│ index │                                                     text                                                 
│
│ int64 │                                                   varchar                                                
│
├───────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────
────┤
│     [1;36m0[0m │ The [1;36m2024[0m Olympics opened in Paris in spectacular style with thousands of athletes sailing along the River
…  │
│     [1;36m1[0m │ Swapping a stadium for a waterway for the first time to open the [32m"greatest show on Earth"[0m, the near 
four-h…  │
│     [1;36m2[0m │ Blue, white and red fireworks had raised the Tricolore above Austerlitz Bridge before [1;36m6[0m,[1;36m800[0m athletes from
…  │
│     [1;36m3[0m │ There were surprise performances through the ceremony, inclu

In [15]:
from search import Search
# Search helper built on the DuckDB connection; its vector_search() is used for both runs below.
s = Search(con)

In [16]:
from eval import load_questions

In [17]:
# Evaluation questions; later passed to compare() as the reference set.
# NOTE(review): presumably a ranx Qrels object (it exposes to_dict()) - confirm in eval.py.
questions = load_questions("data/questions.json")

In [18]:
%%time
# Rewrite every evaluation question once; keys are the original wording,
# values are the rewritten query text.
new_questions = dict(
  (original, rewrite_query(original, SYSTEM_REWRITE, RewrittenQuery))
  for original in questions.to_dict()
)

CPU times: user 1.5 s, sys: 79.1 ms, total: 1.58 s
Wall time: 31.3 s


In [19]:
# Show each original question next to its rewritten form.
view(new_questions)

[1m{[0m
    [32m'How many competitors are there?'[0m: [32m"How many competitors are there in the specific industry or event I'm [0m
[32minterested in?"[0m,
    [32m'How many medals will be won on Saturday?'[0m: [32m'How many medals are expected to be won during the events scheduled[0m
[32mfor Saturday at the ongoing sports competition?'[0m,
    [32m'How many times has Paris hosted the Olympic games?'[0m: [32m'How many times has the city of Paris, France, been the [0m
[32mhost city for the Olympic Games?'[0m,
    [32m'Was Gaga there?'[0m: [32m"Was Lady Gaga present at the event or location I'm referring to?"[0m,
    [32m'What colors were the fireworks during the ceremony?'[0m: [32m'What were the specific colors of the fireworks [0m
[32mdisplayed during the ceremony?'[0m,
    [32m'What started the day of the opening ceremony?'[0m: [32m'What event marked the beginning of the day on which the [0m
[32mopening ceremony took place?'[0m,
    [32m'What thing

In [20]:
# Baseline run: search with each question exactly as written.
# Shape: {question: {document index as str: score}}.
result1 = {
  q: dict(
    (str(doc_index), doc_score)
    for doc_index, doc_score in s.vector_search(q).select("index, score").fetchall()
  )
  for q in questions.to_dict()
}

In [21]:
# Rewrite run: search with the rewritten query, but keep the ORIGINAL question
# as the key so both runs line up against the same question set.
# Shape: {question: {document index as str: score}}.
result2 = {
  q: dict(
    (str(doc_index), doc_score)
    for doc_index, doc_score in (
      s.vector_search(new_questions[q]).select("index, score").fetchall()
    )
  )
  for q in questions.to_dict()
}

In [22]:
from eval import as_run
from ranx import compare

In [23]:
# Wrap each result dict as a named run, baseline first and rewrite second.
runs = [
  as_run(scores, label)
  for scores, label in ((result1, "Vector"), (result2, "Vector Rewrite"))
]

In [24]:
# Score both runs against the question set using hit rate only.
compare(questions, runs=runs, metrics=["hit_rate"])


#    Model             Hit Rate
---  --------------  ----------
a    Vector               0.773
b    Vector Rewrite       0.818

In [25]:
from eval import compare_table

In [26]:
# Per-question breakdown of which run retrieved a hit.
view(compare_table(questions, runs))

                             Comparing Retrieval Techniques
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┓
┃[1m [0m[1mQuestion                                                  [0m[1m [0m┃[1m [0m[1mVector[0m[1m [0m┃[1m [0m[1mVector Rewrite[0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━┩
│ How many competitors are there?                            │   ✅   │       ✅       │
│ How many medals will be won on Saturday?                   │   ✅   │       ✅       │
│ How many times has Paris hosted the Olympic games?         │   ✅   │       ✅       │
│ Was Gaga there?                                            │   ✅   │       ✅       │
│ What colors were the fireworks during the ceremony?        │   ✅   │       ✅       │
│ What started the day of the opening ceremony?              │   ❌   │       ❌       │
│ What things went wrong?                 