# Query generation

## Imports

In [None]:
import re
from openai import OpenAI 
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably the OpenAI API key picked up by OpenAI() in the cells below
# -- confirm against the project's .env template).
load_dotenv()

import pyterrier as pt
# Only initialise PyTerrier if it has not been started yet, so that
# re-running this cell does not call pt.init() a second time.
if not pt.started():
  pt.init()

## Generate Core17 queries

In [None]:
# Generate GPT-4o query variants for every topic of 'irds:nyt/trec-core-2017'
# and append them to a semicolon-separated file, one variant per line:
#   <rank within topic>;<prompt strategy>;<topic id>;<query text>
data = 'core17'
dataset = pt.get_dataset('irds:nyt/trec-core-2017')
f_out_path = '../queries/gpt-4o/{}.csv'.format(data)

client = OpenAI()

# Matches only the list numbering at the start of a line ("12. "), so that a
# number followed by a period inside the query text itself is left untouched.
pattern = re.compile(r"^\s*\d+\.\s*")

strategy = "P-1"
# strategy = "P-2"
# strategy = "P-3"

# Fail before any API call on a mistyped strategy; otherwise no branch below
# would assign `prompt` and a stale value from an earlier cell could be sent.
if strategy not in ("P-1", "P-2", "P-3"):
    raise ValueError("Unknown prompt strategy: {!r}".format(strategy))

for _, topic in dataset.get_topics().iterrows():
    qid = topic['qid']
    query = topic['title']
    description = topic['description']
    narrative = topic['narrative']

    if strategy == "P-1":
        # Topic title only.
        prompt = "You are a generator of search query variants. Generate one hundred keyword queries about {}. Your reply is a numbered list of search queries.".format(query)
    elif strategy == "P-2":
        # Topic title plus description and narrative.
        prompt = "You are a generator of search query variants. Generate one hundred keyword queries about {}. {} {} Your reply is a numbered list of search queries.".format(query, description, narrative)
    else:
        # "P-3": topic title plus fixed example query variants for other topics.
        examples = 'Example queries for the topic about "recycling lead acid batteries" include "1. battery recycling facilities", "2. car battery", "3. car battery disposal". Other query examples for the topic about "symptoms of heart attack" include "1. Early signs and symptoms of having a heart attack", "2. general heart attack symptoms", "3. Heart and stroke foundation", query examples for the topic about "evidence for evolution" include "1. acceptance of theory of evolution", "2. current evidence about the theory of evolution", and "3. current evidence for evolution", query examples for the topic about "identifying spider bites" include "1. common garden spiders", "2. different insect bites", and "3. examples of spider bites", query examples for the topic about "raspberry pi" include "1. best deal raspberry pi computer", "2. buy raspberry pi", and "3. cost of raspberry pi".'
        prompt = "You are a generator of search query variants. Generate one hundred keyword queries about {}. {} Your reply is a numbered list of search queries.".format(query, examples)

    completion = client.chat.completions.create(
        model="gpt-4o",
        temperature=0.0,
        seed=42,
        messages=[{"role": "user", "content": prompt}]
    )

    # The message content can be None; treat that as an empty reply instead
    # of crashing in the middle of the run.
    content = completion.choices[0].message.content or ""

    # Keep only the numbered list items and strip their leading numbering.
    queries = []
    for line in content.splitlines():
        if not pattern.match(line):
            continue
        text = pattern.sub("", line, count=1).strip()
        if text:
            queries.append(text)

    # Append after every topic so finished topics survive an interrupted run.
    # NOTE(review): fields are joined without quoting, so a query containing
    # ';' adds an extra column -- confirm the downstream reader tolerates it.
    with open(f_out_path, 'a', encoding='utf-8') as f_out:
        for cnt, q in enumerate(queries, start=1):
            f_out.write(';'.join([str(cnt), str(strategy), str(qid), q]) + '\n')

## Generate Core18 queries

In [None]:
# Generate GPT-4o query variants for every topic of
# 'irds:wapo/v2/trec-core-2018' and append them to a semicolon-separated file,
# one variant per line:
#   <rank within topic>;<prompt strategy>;<topic id>;<query text>
data = 'core18'
dataset = pt.get_dataset('irds:wapo/v2/trec-core-2018')
f_out_path = '../queries/gpt-4o/{}.csv'.format(data)

client = OpenAI()

# Matches only the list numbering at the start of a line ("12. "), so that a
# number followed by a period inside the query text itself is left untouched.
pattern = re.compile(r"^\s*\d+\.\s*")

strategy = "P-1"
# strategy = "P-2"
# strategy = "P-3"

# Fail before any API call on a mistyped strategy; otherwise no branch below
# would assign `prompt` and a stale value from an earlier cell could be sent.
if strategy not in ("P-1", "P-2", "P-3"):
    raise ValueError("Unknown prompt strategy: {!r}".format(strategy))

for _, topic in dataset.get_topics().iterrows():
    qid = topic['qid']
    query = topic['title']
    description = topic['description']
    narrative = topic['narrative']

    if strategy == "P-1":
        # Topic title only.
        prompt = "You are a generator of search query variants. Generate one hundred keyword queries about {}. Your reply is a numbered list of search queries.".format(query)
    elif strategy == "P-2":
        # Topic title plus description and narrative.
        prompt = "You are a generator of search query variants. Generate one hundred keyword queries about {}. {} {} Your reply is a numbered list of search queries.".format(query, description, narrative)
    else:
        # "P-3": topic title plus fixed example query variants for other topics.
        examples = 'Example queries for the topic about "recycling lead acid batteries" include "1. battery recycling facilities", "2. car battery", "3. car battery disposal". Other query examples for the topic about "symptoms of heart attack" include "1. Early signs and symptoms of having a heart attack", "2. general heart attack symptoms", "3. Heart and stroke foundation", query examples for the topic about "evidence for evolution" include "1. acceptance of theory of evolution", "2. current evidence about the theory of evolution", and "3. current evidence for evolution", query examples for the topic about "identifying spider bites" include "1. common garden spiders", "2. different insect bites", and "3. examples of spider bites", query examples for the topic about "raspberry pi" include "1. best deal raspberry pi computer", "2. buy raspberry pi", and "3. cost of raspberry pi".'
        prompt = "You are a generator of search query variants. Generate one hundred keyword queries about {}. {} Your reply is a numbered list of search queries.".format(query, examples)

    completion = client.chat.completions.create(
        model="gpt-4o",
        temperature=0.0,
        seed=42,
        messages=[{"role": "user", "content": prompt}]
    )

    # The message content can be None; treat that as an empty reply instead
    # of crashing in the middle of the run.
    content = completion.choices[0].message.content or ""

    # Keep only the numbered list items and strip their leading numbering.
    queries = []
    for line in content.splitlines():
        if not pattern.match(line):
            continue
        text = pattern.sub("", line, count=1).strip()
        if text:
            queries.append(text)

    # Append after every topic so finished topics survive an interrupted run.
    # NOTE(review): fields are joined without quoting, so a query containing
    # ';' adds an extra column -- confirm the downstream reader tolerates it.
    with open(f_out_path, 'a', encoding='utf-8') as f_out:
        for cnt, q in enumerate(queries, start=1):
            f_out.write(';'.join([str(cnt), str(strategy), str(qid), q]) + '\n')

## Generate Robust04 queries

In [None]:
# Generate GPT-4o query variants for every topic of
# 'irds:disks45/nocr/trec-robust-2004' and append them to a
# semicolon-separated file, one variant per line:
#   <rank within topic>;<prompt strategy>;<topic id>;<query text>
data = 'robust04'
dataset = pt.get_dataset('irds:disks45/nocr/trec-robust-2004')
f_out_path = '../queries/gpt-4o/{}.csv'.format(data)

client = OpenAI()

# Matches only the list numbering at the start of a line ("12. "), so that a
# number followed by a period inside the query text itself is left untouched.
pattern = re.compile(r"^\s*\d+\.\s*")

strategy = "P-1"
# strategy = "P-2"
# strategy = "P-3"

# Fail before any API call on a mistyped strategy; otherwise no branch below
# would assign `prompt` and a stale value from an earlier cell could be sent.
if strategy not in ("P-1", "P-2", "P-3"):
    raise ValueError("Unknown prompt strategy: {!r}".format(strategy))

for _, topic in dataset.get_topics().iterrows():
    qid = topic['qid']
    query = topic['title']
    description = topic['description']
    narrative = topic['narrative']

    if strategy == "P-1":
        # Topic title only.
        prompt = "You are a generator of search query variants. Generate one hundred keyword queries about {}. Your reply is a numbered list of search queries.".format(query)
    elif strategy == "P-2":
        # Topic title plus description and narrative.
        prompt = "You are a generator of search query variants. Generate one hundred keyword queries about {}. {} {} Your reply is a numbered list of search queries.".format(query, description, narrative)
    else:
        # "P-3": topic title plus fixed example query variants for other topics.
        examples = 'Example queries for the topic about "recycling lead acid batteries" include "1. battery recycling facilities", "2. car battery", "3. car battery disposal". Other query examples for the topic about "symptoms of heart attack" include "1. Early signs and symptoms of having a heart attack", "2. general heart attack symptoms", "3. Heart and stroke foundation", query examples for the topic about "evidence for evolution" include "1. acceptance of theory of evolution", "2. current evidence about the theory of evolution", and "3. current evidence for evolution", query examples for the topic about "identifying spider bites" include "1. common garden spiders", "2. different insect bites", and "3. examples of spider bites", query examples for the topic about "raspberry pi" include "1. best deal raspberry pi computer", "2. buy raspberry pi", and "3. cost of raspberry pi".'
        prompt = "You are a generator of search query variants. Generate one hundred keyword queries about {}. {} Your reply is a numbered list of search queries.".format(query, examples)

    completion = client.chat.completions.create(
        model="gpt-4o",
        temperature=0.0,
        seed=42,
        messages=[{"role": "user", "content": prompt}]
    )

    # The message content can be None; treat that as an empty reply instead
    # of crashing in the middle of the run.
    content = completion.choices[0].message.content or ""

    # Keep only the numbered list items and strip their leading numbering.
    queries = []
    for line in content.splitlines():
        if not pattern.match(line):
            continue
        text = pattern.sub("", line, count=1).strip()
        if text:
            queries.append(text)

    # Append after every topic so finished topics survive an interrupted run.
    # NOTE(review): fields are joined without quoting, so a query containing
    # ';' adds an extra column -- confirm the downstream reader tolerates it.
    with open(f_out_path, 'a', encoding='utf-8') as f_out:
        for cnt, q in enumerate(queries, start=1):
            f_out.write(';'.join([str(cnt), str(strategy), str(qid), q]) + '\n')

## Generate Robust05 queries

In [None]:
# Generate GPT-4o query variants for every topic of
# 'irds:aquaint/trec-robust-2005' and append them to a semicolon-separated
# file, one variant per line:
#   <rank within topic>;<prompt strategy>;<topic id>;<query text>
data = 'robust05'
dataset = pt.get_dataset('irds:aquaint/trec-robust-2005')
f_out_path = '../queries/gpt-4o/{}.csv'.format(data)

client = OpenAI()

# Matches only the list numbering at the start of a line ("12. "), so that a
# number followed by a period inside the query text itself is left untouched.
pattern = re.compile(r"^\s*\d+\.\s*")

strategy = "P-1"
# strategy = "P-2"
# strategy = "P-3"

# Fail before any API call on a mistyped strategy; otherwise no branch below
# would assign `prompt` and a stale value from an earlier cell could be sent.
if strategy not in ("P-1", "P-2", "P-3"):
    raise ValueError("Unknown prompt strategy: {!r}".format(strategy))

for _, topic in dataset.get_topics().iterrows():
    qid = topic['qid']
    query = topic['title']
    description = topic['description']
    narrative = topic['narrative']

    if strategy == "P-1":
        # Topic title only.
        prompt = "You are a generator of search query variants. Generate one hundred keyword queries about {}. Your reply is a numbered list of search queries.".format(query)
    elif strategy == "P-2":
        # Topic title plus description and narrative.
        prompt = "You are a generator of search query variants. Generate one hundred keyword queries about {}. {} {} Your reply is a numbered list of search queries.".format(query, description, narrative)
    else:
        # "P-3": topic title plus fixed example query variants for other topics.
        examples = 'Example queries for the topic about "recycling lead acid batteries" include "1. battery recycling facilities", "2. car battery", "3. car battery disposal". Other query examples for the topic about "symptoms of heart attack" include "1. Early signs and symptoms of having a heart attack", "2. general heart attack symptoms", "3. Heart and stroke foundation", query examples for the topic about "evidence for evolution" include "1. acceptance of theory of evolution", "2. current evidence about the theory of evolution", and "3. current evidence for evolution", query examples for the topic about "identifying spider bites" include "1. common garden spiders", "2. different insect bites", and "3. examples of spider bites", query examples for the topic about "raspberry pi" include "1. best deal raspberry pi computer", "2. buy raspberry pi", and "3. cost of raspberry pi".'
        prompt = "You are a generator of search query variants. Generate one hundred keyword queries about {}. {} Your reply is a numbered list of search queries.".format(query, examples)

    completion = client.chat.completions.create(
        model="gpt-4o",
        temperature=0.0,
        seed=42,
        messages=[{"role": "user", "content": prompt}]
    )

    # The message content can be None; treat that as an empty reply instead
    # of crashing in the middle of the run.
    content = completion.choices[0].message.content or ""

    # Keep only the numbered list items and strip their leading numbering.
    queries = []
    for line in content.splitlines():
        if not pattern.match(line):
            continue
        text = pattern.sub("", line, count=1).strip()
        if text:
            queries.append(text)

    # Append after every topic so finished topics survive an interrupted run.
    # NOTE(review): fields are joined without quoting, so a query containing
    # ';' adds an extra column -- confirm the downstream reader tolerates it.
    with open(f_out_path, 'a', encoding='utf-8') as f_out:
        for cnt, q in enumerate(queries, start=1):
            f_out.write(';'.join([str(cnt), str(strategy), str(qid), q]) + '\n')