In [1]:
import re
import json
import aiohttp
import asyncio
import pandas as pd
from tqdm import tqdm
from aiohttp import ClientTimeout
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
# Upper bound on simultaneous requests issued through execute_sparql.
_MAX_CONCURRENT_SPARQL = 20
# Single-slot cache: the semaphore together with the event loop it belongs to.
_sparql_semaphore_state = {"loop": None, "semaphore": None}


def _get_sparql_semaphore():
    """Return the semaphore shared by every execute_sparql call on the running loop."""
    loop = asyncio.get_running_loop()
    if _sparql_semaphore_state["loop"] is not loop:
        # First use, or the event loop changed: a semaphore must not be shared
        # between different event loops, so build a fresh one.
        _sparql_semaphore_state["loop"] = loop
        _sparql_semaphore_state["semaphore"] = asyncio.Semaphore(_MAX_CONCURRENT_SPARQL)
    return _sparql_semaphore_state["semaphore"]


async def execute_sparql(session, query, timeout=30, max_retries=3):
    """POST a SPARQL query to the Wikidata endpoint and return its answers.

    Args:
        session: an aiohttp client session (anything exposing a compatible
            ``post`` async context manager).
        query: SPARQL query text.
        timeout: total time budget per attempt, in seconds.
        max_retries: maximum number of attempts.

    Returns:
        * the list built by ``extract_answers_from_response`` on HTTP 200;
        * ``None`` when ``query`` is empty or the endpoint answers 400
          (malformed query);
        * ``[]`` when every attempt failed (client error, timeout or any other
          HTTP status).
    """
    if not query:
        return None

    url = "https://query.wikidata.org/sparql"
    headers = {"Accept": "application/sparql-results+json"}
    data = {"query": query, "format": "json"}

    # The semaphore is shared across calls; creating it inside the function
    # (one per call) would never block anything.
    async with _get_sparql_semaphore():
        for attempt in range(1, max_retries + 1):
            try:
                async with session.post(url, data=data, headers=headers, timeout=ClientTimeout(total=timeout)) as response:
                    if response.status == 200:
                        results = await response.json()
                        return extract_answers_from_response(results)
                    if response.status == 400:  # Query malformed, retrying cannot help
                        return None
                    # Any other status (e.g. 429/5xx) falls through to a retry.
            except (aiohttp.ClientError, asyncio.TimeoutError):
                pass  # transient failure: retry below

            if attempt < max_retries:
                await asyncio.sleep(1)  # brief back-off before the next attempt
        return []

def extract_answers_from_response(response):
    """Flatten a SPARQL JSON response into a flat list of answers.

    For a SELECT-style response (``'results'`` key) every bound term of every
    binding contributes one item: Wikidata entity URIs are reduced to their
    ``Q...`` id, any other value is kept as is. A term without a ``'value'``
    key contributes ``None``. For an ASK-style response (``'boolean'`` key)
    the list holds that single boolean. Anything else yields ``[]``.
    """
    answers = []
    if 'results' in response:
        for binding in response['results']['bindings']:
            for sub_answer in binding.values():
                value = sub_answer.get('value')
                # Only strings can be entity URIs; re.match would raise on None.
                entity = (
                    re.match(r"^https?://www\.wikidata\.org/entity/(Q\d+)$", value)
                    if isinstance(value, str)
                    else None
                )
                answers.append(entity.group(1) if entity else value)
    elif 'boolean' in response:
        answers.append(response['boolean'])
    return answers

def extract_wikidata_id_from_link(link):
    """Return the ``Q...`` id found in a Wikidata entity URL, or None if absent."""
    found = re.search(r"https?://www\.wikidata\.org/entity/(Q\d+)", link)
    if found is None:
        return None
    return found.group(1)

In [3]:
def extract_answers_from_response(response):
    """Flatten a SPARQL JSON response into a flat list of answers.

    * SELECT-style response (``'results'`` key): one item per bound term of
      every binding. Wikidata entity URIs are shortened to their ``Q...`` id,
      every other value (including a missing one, i.e. ``None``) is kept.
      A ``'results'`` object without ``'bindings'`` yields ``[]``.
    * ASK-style response (``'boolean'`` key): a single-element list with the
      boolean.
    * Anything else: ``[]``.
    """
    answers = []

    if 'results' in response:
        # Tolerate a missing/empty 'bindings' entry instead of raising KeyError.
        bindings = (response['results'] or {}).get('bindings') or []
        for binding in bindings:
            for sub_answer in binding.values():
                value = sub_answer.get('value')
                if isinstance(value, str) and re.match(r"^https?://www\.wikidata\.org/entity/Q\d+$", value):
                    answers.append(value.split("/")[-1])
                else:
                    answers.append(value)
    elif 'boolean' in response:
        answers.append(response['boolean'])

    # No separate "head only" case is needed: answers is already empty then.
    return answers

async def execute_sparql_query(query: str):
    """Run one SPARQL query against the Wikidata endpoint via HTTP GET.

    Returns:
        * a list of answers (possibly empty) produced by
          ``extract_answers_from_response`` when the endpoint answers 200;
        * ``None`` when the query could not be executed: empty/non-string
          query, non-200 status, or any exception while sending the request
          or decoding the reply.

    Failures are reported uniformly as ``None`` so that callers never mistake
    an error marker for an answer set.
    """
    # Nothing to send (e.g. an empty extraction or a NaN cell from pandas).
    if not isinstance(query, str) or not query.strip():
        return None

    endpoint = "https://query.wikidata.org/sparql"
    params = {"query": query, "format": "json"}

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(endpoint, params=params) as response:
                if response.status != 200:
                    return None

                data = await response.json()
                return extract_answers_from_response(data)
    except Exception:
        # Best effort: network, timeout and decoding problems all count as
        # "query failed" rather than aborting the whole evaluation loop.
        return None

def extract_sparql(text: str) -> str:
    """Pull the SPARQL query out of a model reply.

    If the reply contains a triple-backtick fenced block, the content of the
    first such block is returned (without an optional language tag such as
    ``sparql`` on the fence line); otherwise the whole text is returned.
    The result is stripped of surrounding whitespace. Non-string input
    (e.g. a NaN cell read by pandas) yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    # The optional group only accepts a bare tag (word characters, '+', '-')
    # followed by a line break, so a query that starts right after the fence
    # ("```SELECT ?x\nWHERE ...") is not mistaken for a language tag.
    pattern = r"```(?:[ \t]*[\w+-]*[ \t]*\r?\n)?(.*?)```"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip()

    return text.strip()

def calculate_metrics(correct, predicted):
    """Compare two answer collections as sets.

    Returns a dict with ``em`` (sets are equal), ``f1``, ``precision`` and
    ``recall``. Precision is 0 for an empty prediction set, recall is 0 for
    an empty gold set, and F1 is 0 whenever precision + recall is 0.
    """
    gold, guessed = set(correct), set(predicted)
    hits = len(gold & guessed)  # answers present in both sets

    precision = hits / len(guessed) if guessed else 0
    recall = hits / len(gold) if gold else 0

    denominator = precision + recall
    if denominator > 0:
        f1_score = (2 * precision * recall) / denominator
    else:
        f1_score = 0

    return dict(em=gold == guessed, f1=f1_score, precision=precision, recall=recall)

## RuBQ

In [8]:
dataset_name = 'rubq'
rubq_results = pd.read_csv(f'data/gpt_as_kgqa/results/{dataset_name}.csv')

metrics_list = []


for index, row in tqdm(rubq_results.iterrows(), total=rubq_results.shape[0]):
    gold_query = row['sparql']
    pred_query = extract_sparql(row['gpt_sparql'])

    gold_entities = await execute_sparql_query(gold_query)
    pred_entities = await execute_sparql_query(pred_query)

    if not gold_entities:
        continue  

    if pred_entities is None:
        metric = {'em': False, 'f1': 0, 'precision': 0.0, 'recall': 0, 'incorrect': True, 'empty': False}
    else:
        metric = calculate_metrics(gold_entities, pred_entities)
        metric.update({'incorrect': False, 'empty': len(pred_entities) == 0})

    metrics_list.append(metric)

metrics_df = pd.DataFrame(metrics_list)
metrics_df[["f1", "precision", "recall", "incorrect", "empty"]].mean().round(3)*100

100%|█████████████████████████████████████████| 480/480 [06:41<00:00,  1.20it/s]


f1           71.5
precision    64.3
recall       86.3
incorrect     4.2
empty         3.7
dtype: float64

In [9]:
# Aggregate scores of the current `metrics_df`, as percentages.
(
    metrics_df[["f1", "precision", "recall", "incorrect", "empty"]]
    .mean()
    .round(3)
    .mul(100)
)

f1           71.5
precision    64.3
recall       86.3
incorrect     4.2
empty         3.7
dtype: float64

## QALD

In [4]:
dataset_name = 'qald'
qald_results = pd.read_csv(f'data/gpt_as_kgqa/results/{dataset_name}.csv')

metrics_list = []


for index, row in tqdm(qald_results.iterrows(), total=qald_results.shape[0]):
    gold_query = row['sparql']
    pred_query = extract_sparql(row['gpt_sparql'])

    gold_entities = await execute_sparql_query(gold_query)
    pred_entities = await execute_sparql_query(pred_query)

    if not gold_entities:
        continue  

    if pred_entities is None:
        metric = {'em': False, 'f1': 0, 'precision': 0.0, 'recall': 0, 'incorrect': True, 'empty': False}
    else:
        metric = calculate_metrics(gold_entities, pred_entities)
        metric.update({'incorrect': False, 'empty': len(pred_entities) == 0})

    metrics_list.append(metric)

metrics_df = pd.DataFrame(metrics_list)
metrics_df[["f1", "precision", "recall", "incorrect", "empty"]].mean().round(3)*100

100%|█████████████████████████████████████████| 386/386 [11:59<00:00,  1.86s/it]


f1           56.6
precision    52.5
recall       66.9
incorrect     4.2
empty         7.6
dtype: float64

In [5]:
# Aggregate scores of the current `metrics_df`, as percentages.
(
    metrics_df[["f1", "precision", "recall", "incorrect", "empty"]]
    .mean()
    .round(3)
    .mul(100)
)

f1           56.6
precision    52.5
recall       66.9
incorrect     4.2
empty         7.6
dtype: float64

## LC-QuAD 2.0

In [6]:
dataset_name = 'lcquad_2.0'
lcqaud_results = pd.read_csv(f'data/gpt_as_kgqa/results/{dataset_name}.csv')

metrics_list = []


for index, row in tqdm(lcqaud_results.iterrows(), total=lcqaud_results.shape[0]):
    gold_query = row['sparql']
    pred_query = extract_sparql(row['gpt_sparql'])

    gold_entities = await execute_sparql_query(gold_query)
    pred_entities = await execute_sparql_query(pred_query)

    if not gold_entities:
        continue  

    if pred_entities is None:
        metric = {'em': False, 'f1': 0, 'precision': 0.0, 'recall': 0, 'incorrect': True, 'empty': False}
    else:
        metric = calculate_metrics(gold_entities, pred_entities)
        metric.update({'incorrect': False, 'empty': len(pred_entities) == 0})

    metrics_list.append(metric)

metrics_df = pd.DataFrame(metrics_list)
metrics_df[["f1", "precision", "recall", "incorrect", "empty"]].mean().round(3)*100

100%|█████████████████████████████████████| 4541/4541 [1:19:52<00:00,  1.06s/it]


f1           46.8
precision    44.0
recall       54.9
incorrect     2.5
empty        25.6
dtype: float64

In [7]:
# Aggregate scores of the current `metrics_df`, as percentages.
(
    metrics_df[["f1", "precision", "recall", "incorrect", "empty"]]
    .mean()
    .round(3)
    .mul(100)
)

f1           46.8
precision    44.0
recall       54.9
incorrect     2.5
empty        25.6
dtype: float64

## PAT

In [10]:
dataset_name = 'pat'
pat_results = pd.read_csv(f'data/gpt_as_kgqa/results/{dataset_name}.csv')

metrics_list = []


for index, row in tqdm(pat_results.iterrows(), total=pat_results.shape[0]):
    gold_query = row['sparql']
    pred_query = extract_sparql(row['gpt_sparql'])

    gold_entities = await execute_sparql_query(gold_query)
    pred_entities = await execute_sparql_query(pred_query)

    if not gold_entities:
        continue  

    if pred_entities is None:
        metric = {'em': False, 'f1': 0, 'precision': 0.0, 'recall': 0, 'incorrect': True, 'empty': False}
    else:
        metric = calculate_metrics(gold_entities, pred_entities)
        metric.update({'incorrect': False, 'empty': len(pred_entities) == 0})

    metrics_list.append(metric)

metrics_df = pd.DataFrame(metrics_list)
metrics_df[["f1", "precision", "recall", "incorrect", "empty"]].mean().round(3)*100

100%|███████████████████████████████████████| 1199/1199 [52:36<00:00,  2.63s/it]


f1           37.3
precision    55.3
recall       33.3
incorrect     4.1
empty        24.2
dtype: float64