In [1]:
import json
import os
import re

import pandas as pd
from openai import OpenAI

# Read datasets

In [2]:
# def read_dataset(name, split):
#     return json.load(open(f'../data/preprocessed/{name}/{name}_{split}.json'))['dataset']

def read_dataset(name, split):
    """Load the end-to-end validation input dataset for ``name``.

    Args:
        name: Dataset identifier used to build the path
            ``data/e2e_validation_datasets/{name}_input_dataset.json``.
        split: Accepted so existing callers keep working; it does not
            take part in building the path.

    Returns:
        The parsed JSON content of the dataset file.
    """
    path = f'data/e2e_validation_datasets/{name}_input_dataset.json'
    # Context manager closes the handle deterministically; explicit UTF-8 keeps
    # non-ASCII text readable regardless of the platform's default encoding.
    with open(path, encoding='utf-8') as dataset_file:
        return json.load(dataset_file)

# Make tasks

In [3]:
# System prompt with few-shot examples. Every entity/relation line uses the
# "<ID> - <label>" layout that create_task() produces via format_qid(), and
# every example answer is wrapped in a matching pair of triple backticks, which
# is the shape extract_sparql() looks for in the model output.
system_prompt = """You are a text-to-SPARQL converter for Wikidata. Given a natural language question (QUESTION) and a set of extracted entities (QUESTION ENTITIES) - Wikidata IDs and labels, generate an optimized SPARQL query that retrieves relevant data from Wikidata's query service. Ensure the query is efficient, using appropriate properties, filters, and service clauses where necessary.
Do not include any explanations, comments, or additional text before or after the SPARQL query. Output only the SPARQL query enclosed within triple backticks (```).

Examples:
QUESTION: What is the name of the capital of Romania?
ENTITIES:
Q218 - Romania

RELATIONS:
P36 - capital
P1376 - capital of

assistant: ```select ?answer where { wd:Q218 wdt:P36 ?answer }```

QUESTION: Which countries have places with more than two caves?
ENTITIES:
Q35509 - cave

RELATIONS:
P31 - instance of
P17 - country

assistant: ```select distinct ?uri where { ?cave wdt:P31 wd:Q35509 ; wdt:P17 ?uri  } group by ?uri having ( count ( ?cave ) > 2 )```


QUESTION: Who did the current head of the government of Kinmen succeed in office?
ENTITIES:
Q249870 - Kinmen County

RELATIONS:
P6 - head of government
P1365 - replaces

assistant: ```SELECT ?item ?itemLabel (YEAR(?starttime) AS ?yearstarttime) ?endtime WHERE { wd:Q249870 p:P6 ?s  . ?s  ps:P6 ?item . ?s  pq:P580 ?starttime  . FILTER NOT EXISTS{ ?s pq:P582 ?endtime .}. } order by desc(?starttime)```
"""

def question_task(question, entities, lang='en'):
    """Build a user message that lists the question and its entities.

    Args:
        question: Natural-language question text.
        entities: Mapping passed to format_id2alias; presumably
            ``{wikidata_id: {lang: label}}`` - confirm against the caller.
        lang: Language code used to pick each entity label.

    Returns:
        A string of the form
        ``QUESTION: ...\\nQUESTION ENTITIES:\\n<one "id: label" line per entity>``.
    """
    # The previous lambda referenced an undefined name (id2alias) instead of
    # its own `entities` argument, so every call raised NameError.
    return f'''QUESTION: {question}\nQUESTION ENTITIES:\n{format_id2alias(entities, lang)}'''

In [4]:
# response_format = {
#     "type": "json_schema",
#     "json_schema": {
#         "name": "SPARQL_query",
#         "strict": True,
#         "schema": {
#             "type": "object",
#             "properties": {
#                 "sparql_query": {"type": "string"}
#             },
#             "required": ["sparql_query"],
#             "additionalProperties": False
#         }
#     }
# }

def format_gold_qid(id_map):
    """Render gold annotations as ``<id> - <English label>`` lines.

    Args:
        id_map: Mapping from a Wikidata id to a dict of labels keyed by
            language code. May be empty or None.

    Returns:
        One newline-terminated line per id whose label dict is truthy, using
        the 'en' label or 'N/A' when it is absent. An empty or missing mapping
        yields the placeholder line " - None\\n".
    """
    if id_map:
        rendered = []
        for wikidata_id, labels in id_map.items():
            # Ids without any label information are left out entirely.
            if not labels:
                continue
            rendered.append(f"{wikidata_id} - {labels.get('en', 'N/A')}\n")
        return "".join(rendered)
    return " - None\n"

def format_qid(id_map):
    """Render a ``{wikidata_id: label}`` mapping as ``<id> - <label>`` lines.

    Args:
        id_map: Mapping from a Wikidata id to its label. May be empty or None.

    Returns:
        One newline-terminated line per id with a truthy label, or the
        placeholder line " - None\\n" when the mapping is empty or missing.
    """
    if id_map:
        rendered = []
        for wikidata_id, text in id_map.items():
            # Entries with an empty or missing label are skipped.
            if text:
                rendered.append(f"{wikidata_id} - {text}\n")
        return "".join(rendered)
    return " - None\n"

def format_id2alias(id2alias, lang='en'):
    """Render ``{wikidata_id: {lang: label}}`` as ``<id>: <label>`` lines.

    Args:
        id2alias: Mapping from a Wikidata id to a dict of labels keyed by
            language code.
        lang: Language code whose label is shown.

    Returns:
        The lines joined with newlines (no trailing newline). Ids whose label
        dict is empty or None are skipped; an id whose dict lacks ``lang`` is
        rendered with the text "None".
    """
    return '\n'.join(
        f'{wikidata_id}: {labels_by_lang.get(lang)}'
        for wikidata_id, labels_by_lang in id2alias.items()
        if labels_by_lang
    )

def create_task(item, lang='en'):
    """Build the user message for one dataset item.

    Args:
        item: Mapping with 'question', 'entities' and 'relations' keys; the
            entities and relations are rendered with format_qid.
        lang: Not used by the current implementation.

    Returns:
        A string with a QUESTION line followed by ENTITIES and RELATIONS
        sections, each section followed by a blank line.
    """
    question_text = item['question']

    # Variant for gold-annotated items (nested 'question'/'query' dicts), kept for reference:
    # entities = item['entities']['question'] if item['entities']['question'] not in (None, {}) else item['entities']['query']
    # relations = item['relations']['question'] if item['relations']['question'] not in (None, {}) else item['relations']['query']
    # entities_string = format_gold_qid(entities)
    # relations_string = format_gold_qid(relations)

    entity_block = format_qid(item['entities'])
    relation_block = format_qid(item['relations'])

    return (
        f"QUESTION: {question_text}\n"
        f"ENTITIES:\n{entity_block}\n"
        f"RELATIONS:\n{relation_block}\n"
    )


def create_request(request_id, task, metadata, model_name, system_prompt=system_prompt):
    """Assemble one chat-completions request dict.

    Args:
        request_id: Value stored under "custom_id".
        task: Content of the user message.
        metadata: Value stored under the body's "metadata" key.
        model_name: Value stored under the body's "model" key.
        system_prompt: Content of the system message; defaults to the
            module-level prompt as it was when this function was defined.

    Returns:
        A dict with "custom_id", "method", "url" and "body" keys targeting
        the /v1/chat/completions endpoint.
    """
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": task},
    ]
    request_body = {
        "model": model_name,
        "store": True,
        "messages": chat_messages,
        "metadata": metadata,
        # "response_format": response_format,
        "seed": 42,
    }
    return {
        "custom_id": request_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }

def item2request(item, dataset_name, model_name='gpt-4', lang='en'):
    """Convert one dataset item into a chat-completions request dict.

    Args:
        item: Dataset record; its 'id' and 'question' fields are read here
            and the whole record is handed to create_task.
        dataset_name: Stored under the "dataset" metadata key.
        model_name: Model identifier placed in the request body.
        lang: Forwarded to create_task.

    Returns:
        The dict built by create_request, with ``str(item['id'])`` as the
        request id and the dataset name and question as metadata.
    """
    request_id = str(item['id'])
    question = item['question']
    # Forward lang rather than silently dropping it, so the argument is
    # honored wherever create_task makes use of it.
    task = create_task(item, lang=lang)
    metadata = {
        "dataset": dataset_name,
        "question": question
    }

    return create_request(request_id, task, metadata, model_name=model_name)

def create_jsonl_file(requests_list, file_path):
    """Write each request as one JSON document per line (JSONL).

    Args:
        requests_list: Iterable of JSON-serializable objects.
        file_path: Destination path; an existing file is overwritten.

    The file is written as UTF-8 with non-ASCII characters kept verbatim,
    and a confirmation message is printed once writing has finished.
    """
    with open(file_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in requests_list
        )

    print(f"JSONL file created successfully at {file_path}")


def extract_sparql(sparql_string):
    """Pull the SPARQL query out of a model response.

    Args:
        sparql_string: Raw response text.

    Returns:
        The stripped content of the first triple-backtick fenced block, with
        an optional leading ``sparql``/``sql`` language tag removed. When no
        complete fenced block is present, the input is returned unchanged.
    """
    # Models often answer with a tagged fence such as "```sparql"; without the
    # optional tag group the tag would become part of the extracted query.
    match = re.search(
        r'```(?:(?:sparql|sql)\b)?(.*?)```',
        sparql_string,
        re.DOTALL | re.IGNORECASE,
    )
    if match:
        return match.group(1).strip()
    return sparql_string

# Test

In [14]:
# Read the API key from the environment so no secret is stored in the
# notebook; OPENAI_API_KEY must be set before this cell is run.
API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=API_KEY)

# Model and language shared by every dataset section below.
model_name = 'gpt-4'
lang = 'en'

### RuBQ

In [15]:
dataset_name = 'rubq'

dataset = read_dataset(dataset_name, 'test')
# Item ids become request custom_ids and later dictionary keys, so they must be unique.
assert len({d['id'] for d in dataset}) == len(dataset), "Duplicate IDs found!"

requests_list = [item2request(item, dataset_name, model_name=model_name, lang=lang) for item in dataset]
batch_filepath = f"data/gpt_as_kgqa/batches/{dataset_name}_batch.jsonl"

create_jsonl_file(requests_list, batch_filepath)

# Upload inside a context manager so the file handle is closed afterwards.
with open(batch_filepath, "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

batch = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "dataset": dataset_name
    }
)

JSONL file created successfully at data/gpt_as_kgqa/batches/rubq_batch.jsonl


In [19]:
# Keep the identifiers of the submitted batch and of its uploaded input file.
batch_id = batch.id
file_id = batch_input_file.id

# Fetch the current state of the batch and show it.
batch = client.batches.retrieve(batch_id)
print(batch)

Batch(id='batch_67d716338e3c819086de9fe7d8d9e848', completion_window='24h', created_at=1742149171, endpoint='/v1/chat/completions', input_file_id='file-JCu93JW4uzv2JLqm2EbamX', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1742149774, error_file_id=None, errors=None, expired_at=None, expires_at=1742235571, failed_at=None, finalizing_at=1742149739, in_progress_at=1742149173, metadata={'dataset': 'rubq'}, output_file_id='file-QQidiwrXiRYDNCEpyb6g1d', request_counts=BatchRequestCounts(completed=474, failed=0, total=474))


In [21]:
batch = client.batches.retrieve(batch_id)
# The output file is only usable once the batch has finished; fail with a
# clear message instead of an obscure error from the download call.
assert batch.status == 'completed', f"Batch {batch_id} is not completed yet (status: {batch.status})"
output_file_id = batch.output_file_id
assert output_file_id is not None, f"Batch {batch_id} has no output file"

file_response = client.files.content(output_file_id)
# The output file holds one JSON document per line.
responses = [json.loads(line) for line in file_response.text.splitlines()]

# Map each request id to the query extracted from the first choice of its response.
gpt_predicted_sparqls = {response['custom_id']: extract_sparql(response['response']['body']['choices'][0]['message']['content']) for response in responses}
id2question = {str(item['id']): item['question'] for item in dataset}
id2sparql = {str(item['id']): item['query'] for item in dataset}
# Every dataset item must have exactly one prediction, and vice versa.
assert set(gpt_predicted_sparqls.keys()) == set(id2question.keys())

# The three dicts share the same id keys, which become the frame's index.
prediction = pd.DataFrame({
    'question': id2question,
    'sparql': id2sparql,
    'gpt_sparql': gpt_predicted_sparqls
})

prediction.to_csv(f'data/gpt_as_kgqa/refined_results/{dataset_name}.csv')

In [22]:
# Preview the first rows of the gold vs. predicted queries.
prediction.head()

Unnamed: 0,question,sparql,gpt_sparql
4,Which country does the famous Easter island be...,select ?answer where { wd:Q14452 wdt:P17 ?answ...,SELECT ?answerLabel WHERE { wd:Q14452 wdt:P17 ...
7,Which music group is Mick Jagger's name inextr...,select ?answer where { wd:Q128121 wdt:P361 ?an...,SELECT ?group WHERE { wd:Q128121 wdt:P463 ?gro...
14,Where is the Summer garden?,select ?answer where { wd:Q1229234 wdt:P131 ?a...,SELECT ?location WHERE { wd:Q1229234 wdt:P276 ...
22,Which city is the capital of Turkmenistan?,select ?answer where { wd:Q874 wdt:P36 ?answer },select ?answer where { wd:Q874 wdt:P36 ?answer }
25,In which city was the first Russian revolution...,select ?answer where { wd:Q2533402 wdt:P159 ?a...,SELECT ?city ?cityLabel WHERE { wd:Q2533402 wd...


### QALD

In [25]:
dataset_name = 'qald'

dataset = read_dataset(dataset_name, 'test')
# Item ids become request custom_ids and later dictionary keys, so they must be unique.
assert len({d['id'] for d in dataset}) == len(dataset), "Duplicate IDs found!"

requests_list = [item2request(item, dataset_name, model_name=model_name, lang=lang) for item in dataset]
batch_filepath = f"data/gpt_as_kgqa/batches/{dataset_name}_batch.jsonl"

create_jsonl_file(requests_list, batch_filepath)

# Upload inside a context manager so the file handle is closed afterwards.
with open(batch_filepath, "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

batch = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "dataset": dataset_name
    }
)

JSONL file created successfully at data/gpt_as_kgqa/batches/qald_batch.jsonl


In [29]:
# Keep the identifiers of the submitted batch and of its uploaded input file.
batch_id = batch.id
file_id = batch_input_file.id

# Fetch the current state of the batch and show it.
batch = client.batches.retrieve(batch_id)
print(batch)

Batch(id='batch_67d7191b666c81909b2fdd058f4e2d04', completion_window='24h', created_at=1742149915, endpoint='/v1/chat/completions', input_file_id='file-8ggBkNc8KQiob9aWnioi4E', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1742150078, error_file_id=None, errors=None, expired_at=None, expires_at=1742236315, failed_at=None, finalizing_at=1742150051, in_progress_at=1742149916, metadata={'dataset': 'qald'}, output_file_id='file-GfDEbdKeejh7VLfMLFQdeL', request_counts=BatchRequestCounts(completed=384, failed=0, total=384))


In [32]:
batch = client.batches.retrieve(batch_id)
# The output file is only usable once the batch has finished; fail with a
# clear message instead of an obscure error from the download call.
assert batch.status == 'completed', f"Batch {batch_id} is not completed yet (status: {batch.status})"
output_file_id = batch.output_file_id
assert output_file_id is not None, f"Batch {batch_id} has no output file"

file_response = client.files.content(output_file_id)
# The output file holds one JSON document per line.
responses = [json.loads(line) for line in file_response.text.splitlines()]

# Map each request id to the query extracted from the first choice of its response.
gpt_predicted_sparqls = {response['custom_id']: extract_sparql(response['response']['body']['choices'][0]['message']['content']) for response in responses}
id2question = {str(item['id']): item['question'] for item in dataset}
id2sparql = {str(item['id']): item['query'] for item in dataset}
# Every dataset item must have exactly one prediction, and vice versa.
assert set(gpt_predicted_sparqls.keys()) == set(id2question.keys())

# The three dicts share the same id keys, which become the frame's index.
prediction = pd.DataFrame({
    'question': id2question,
    'sparql': id2sparql,
    'gpt_sparql': gpt_predicted_sparqls
})

prediction.to_csv(f'data/gpt_as_kgqa/refined_results/{dataset_name}.csv')

In [33]:
# Preview the first rows of the gold vs. predicted queries.
prediction.head()

Unnamed: 0,question,sparql,gpt_sparql
0,After whom is the Riemannian geometry named?,select distinct ?result where { wd:Q761383 wdt...,SELECT ?answer WHERE { wd:Q761383 wdt:P138 ?an...
1,Which animal participated in a military operat...,select distinct ?result where { ?result wdt:P3...,SELECT ?animal ?animalLabel WHERE {\n ?animal...
2,"among the characters in the witcher, who has t...",select distinct ?result where { wd:Q11835640 w...,SELECT ?character WHERE { ?character wdt:P451 ...
3,"among the founders of tencent company, who has...",select distinct ?result where { wd:Q860580 wdt...,SELECT ?founder ?founderLabel WHERE {\n wd:Q4...
4,among the other representative work of the aut...,select distinct ?result where { wd:Q696071 wdt...,SELECT ?work ?workLabel WHERE { \n wd:Q696071...


### LC-QuAD

In [35]:
dataset_name = 'lcquad'

dataset = read_dataset(dataset_name, 'test')
# Item ids become request custom_ids and later dictionary keys, so they must be unique.
assert len({d['id'] for d in dataset}) == len(dataset), "Duplicate IDs found!"

requests_list = [item2request(item, dataset_name, model_name=model_name, lang=lang) for item in dataset]
batch_filepath = f"data/gpt_as_kgqa/batches/{dataset_name}_batch.jsonl"

create_jsonl_file(requests_list, batch_filepath)

# Upload inside a context manager so the file handle is closed afterwards.
with open(batch_filepath, "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

batch = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "dataset": dataset_name
    }
)

JSONL file created successfully at data/gpt_as_kgqa/batches/lcquad_batch.jsonl


In [42]:
# Keep the identifiers of the submitted batch and of its uploaded input file.
batch_id = batch.id
file_id = batch_input_file.id

# Fetch the current state of the batch and show it.
batch = client.batches.retrieve(batch_id)
print(batch)

Batch(id='batch_67d719f942bc8190ba965409fc0c918c', completion_window='24h', created_at=1742150137, endpoint='/v1/chat/completions', input_file_id='file-JTExccHAfQXAdzJQFNcszf', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1742152679, error_file_id=None, errors=None, expired_at=None, expires_at=1742236537, failed_at=None, finalizing_at=1742152269, in_progress_at=1742150139, metadata={'dataset': 'lcquad'}, output_file_id='file-T2wTvBmP9RUwzK1fBieV2F', request_counts=BatchRequestCounts(completed=4540, failed=0, total=4540))


In [43]:
batch = client.batches.retrieve(batch_id)
# The output file is only usable once the batch has finished; fail with a
# clear message instead of an obscure error from the download call.
assert batch.status == 'completed', f"Batch {batch_id} is not completed yet (status: {batch.status})"
output_file_id = batch.output_file_id
assert output_file_id is not None, f"Batch {batch_id} has no output file"

file_response = client.files.content(output_file_id)
# The output file holds one JSON document per line.
responses = [json.loads(line) for line in file_response.text.splitlines()]

# Map each request id to the query extracted from the first choice of its response.
gpt_predicted_sparqls = {response['custom_id']: extract_sparql(response['response']['body']['choices'][0]['message']['content']) for response in responses}
id2question = {str(item['id']): item['question'] for item in dataset}
id2sparql = {str(item['id']): item['query'] for item in dataset}
# Every dataset item must have exactly one prediction, and vice versa.
assert set(gpt_predicted_sparqls.keys()) == set(id2question.keys())

# The three dicts share the same id keys, which become the frame's index.
prediction = pd.DataFrame({
    'question': id2question,
    'sparql': id2sparql,
    'gpt_sparql': gpt_predicted_sparqls
})

prediction.to_csv(f'data/gpt_as_kgqa/refined_results/{dataset_name}.csv')

In [44]:
# Preview the first rows of the gold vs. predicted queries.
prediction.head()

Unnamed: 0,question,sparql,gpt_sparql
0,What was the population of Somalia in 2009-0-0?,select ?obj where { wd:Q1045 p:P1082 ?s . ?s p...,SELECT ?population WHERE { wd:Q1045 p:P1082 ?s...
1,Which female actress is the voice over on Sout...,select ?answer where { wd:Q16538 wdt:P725 ?ans...,SELECT DISTINCT ?actress ?actressLabel WHERE {...
2,What was the population of Clermont-Ferrand on...,select ?obj where { wd:Q42168 p:P1082 ?s . ?s ...,SELECT ?population WHERE { \n wd:Q42168 p:P10...
3,On Lake Winnipeg what is the lakes on river?,select distinct ?answer where { ?answer wdt:P4...,SELECT ?river ?riverLabel WHERE { wd:Q3272 wdt...
4,What open cluster has the largest radius?,select ?ent where { ?ent wdt:P31 wd:Q11387 . ?...,SELECT ?cluster ?clusterLabel ?radius WHERE { ...


### PAT

In [45]:
dataset_name = 'pat'

dataset = read_dataset(dataset_name, 'test')
# Item ids become request custom_ids and later dictionary keys, so they must be unique.
assert len({d['id'] for d in dataset}) == len(dataset), "Duplicate IDs found!"

requests_list = [item2request(item, dataset_name, model_name=model_name, lang=lang) for item in dataset]
batch_filepath = f"data/gpt_as_kgqa/batches/{dataset_name}_batch.jsonl"

create_jsonl_file(requests_list, batch_filepath)

# Upload inside a context manager so the file handle is closed afterwards.
with open(batch_filepath, "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

batch = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "dataset": dataset_name
    }
)

JSONL file created successfully at data/gpt_as_kgqa/batches/pat_batch.jsonl


In [48]:
# Keep the identifiers of the submitted batch and of its uploaded input file.
batch_id = batch.id
file_id = batch_input_file.id

# Fetch the current state of the batch and show it.
batch = client.batches.retrieve(batch_id)
print(batch)

Batch(id='batch_67d7256860088190abec027056d18120', completion_window='24h', created_at=1742153064, endpoint='/v1/chat/completions', input_file_id='file-TVzumss8MaGNU8TeaNDgoB', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1742154483, error_file_id=None, errors=None, expired_at=None, expires_at=1742239464, failed_at=None, finalizing_at=1742154379, in_progress_at=1742153067, metadata={'dataset': 'pat'}, output_file_id='file-4dUNuYGraoKeAfKiEUb4Tb', request_counts=BatchRequestCounts(completed=1210, failed=0, total=1210))


In [50]:
batch = client.batches.retrieve(batch_id)
# The output file is only usable once the batch has finished; fail with a
# clear message instead of an obscure error from the download call.
assert batch.status == 'completed', f"Batch {batch_id} is not completed yet (status: {batch.status})"
output_file_id = batch.output_file_id
assert output_file_id is not None, f"Batch {batch_id} has no output file"

file_response = client.files.content(output_file_id)
# The output file holds one JSON document per line.
responses = [json.loads(line) for line in file_response.text.splitlines()]

# Map each request id to the query extracted from the first choice of its response.
gpt_predicted_sparqls = {response['custom_id']: extract_sparql(response['response']['body']['choices'][0]['message']['content']) for response in responses}
id2question = {str(item['id']): item['question'] for item in dataset}
id2sparql = {str(item['id']): item['query'] for item in dataset}
# Every dataset item must have exactly one prediction, and vice versa.
assert set(gpt_predicted_sparqls.keys()) == set(id2question.keys())

# The three dicts share the same id keys, which become the frame's index.
prediction = pd.DataFrame({
    'question': id2question,
    'sparql': id2sparql,
    'gpt_sparql': gpt_predicted_sparqls
})

prediction.to_csv(f'data/gpt_as_kgqa/refined_results/{dataset_name}.csv')

In [51]:
# Preview the first rows of the gold vs. predicted queries.
prediction.head()

Unnamed: 0,question,sparql,gpt_sparql
10719,Who is the head coach of the team that Naoki W...,\n SELECT ?item ?itemLabel ?starttime ?endt...,SELECT ?coach ?coachLabel WHERE {\n wd:Q28921...
2502,Who was the previous chair of World Wide Fund ...,\n SELECT ?item ?itemLabel ?starttime ?endt...,SELECT ?chairperson ?chairpersonLabel (YEAR(?e...
11622,What is the home venue of the team that Neil D...,\n SELECT ?item ?itemLabel ?starttime ?endt...,SELECT ?venue ?venueLabel WHERE {\n wd:Q38742...
12496,Where was the previous head coach of Philadelp...,\n SELECT ?item ?itemLabel ?starttime ?endt...,SELECT ?placeOfBirth WHERE {\n wd:Q219714 p:P...
10820,Who is the owner of the team that Zaza Pachuli...,\n SELECT ?item ?itemLabel (YEAR(?starttime...,SELECT ?owner WHERE { wd:Q319614 p:P54 ?member...
