# 1. Library


In [1]:
%pip install langchain-community langchain-core

Note: you may need to restart the kernel to use updated packages.


# 2. Import


In [2]:
import os
import json
import rdflib

from typing import List
from langchain_community.graphs import OntotextGraphDBGraph

# 3. Fix Langchain OntotextGraphDBGraph

- OntotextGraphDBGraph does not support ASK query type and always return []
- Fixed to return Yes/No


In [3]:
class CustomOntotextGraphDBGraph(OntotextGraphDBGraph):
    def __init__(self, query_endpoint: str, schema: str) -> None:
        try:
            import rdflib
            from rdflib.plugins.stores import sparqlstore
        except ImportError:
            raise ImportError(
                "Could not import rdflib python package. "
                "Please install it with `pip install rdflib`."
            )

        auth = self._get_auth()
        store = sparqlstore.SPARQLStore(auth=auth)
        store.open(query_endpoint)

        self.graph = rdflib.Graph(store, identifier=None, bind_namespaces="none")
        self._check_connectivity()
        if not os.path.exists(schema):
            raise FileNotFoundError(f"File {schema} does not exist.")
        with open(schema, "r") as file:
            schema_string = file.readlines()
        self.schema = schema_string

    def query(
        self,
        query: str,
    ) -> List[rdflib.query.ResultRow]:
        """
        Query the graph.
        """
        from rdflib.query import ResultRow

        res = self.graph.query(query)
        if res.type == "ASK":
            return [r for r in res if isinstance(r, bool)]
        return [r for r in res if isinstance(r, ResultRow)]

# 4. Loading GraphDB


In [4]:
config = {
    "query_endpoint": "http://localhost:7200/repositories/imkg",
    "schema": "/Users/jerry/Desktop/FYP-working/fine-tune-openai-KGQA/KG/schema.txt",
}
graph = CustomOntotextGraphDBGraph(**config)

# 5. Calculate metrics


In [5]:
# Initialize counts
def calculate_metrics(results):
    query_level = {
        "TP": 0,  # True Positives
        "FN": 0,  # False Positives
        "FP": 0,  # Always 0 because query is always expecting correct results
        "TN": 0,  # Always 0 because query is always expecting correct results
    }
    item_level = {
        "TP": 0,  # True Positives
        "FN": 0,  # In sparql_response, but in sample_query
        "FP": 0,  # In sample_query, but in sparql_response
        "TN": 0,  # Always 0 because query is always expecting correct results
    }

    for data_point in results:
        sparql_response = data_point.get("sparql_response") or data_point.get(
            "generated_sparql"
        )  # Model-generated query
        sample_query = data_point.get("sparql")  # Ground-truth query

        if sparql_response and sample_query:
            try:
                generated_results = set(graph.query(sparql_response))
                sample_results = set(graph.query(sample_query))

                if generated_results == sample_results:
                    query_level["TP"] += 1
                else:
                    query_level["FN"] += 1

            except Exception as e:
                generated_results = set()
                error_message = str(e).lower()
                print(error_message)
                query_level["FN"] += 1
        item_level["TP"] += len(generated_results & sample_results)
        item_level["FN"] += len(sample_results - generated_results)
        item_level["FP"] += len(generated_results - sample_results)
    # Compute accuracy
    query_level_total_samples = sum(query_level.values())
    query_level_accuracy = (
        (query_level["TP"] + query_level["TN"]) / query_level_total_samples
        if query_level_total_samples
        else 0
    )
    query_level_precision = (
        query_level["TP"] / (query_level["TP"] + query_level["FP"])
        if (query_level["TP"] + query_level["FP"])
        else 0
    )
    query_level_recall = (
        query_level["TP"] / (query_level["TP"] + query_level["FN"])
        if (query_level["TP"] + query_level["FN"])
        else 0
    )
    query_level_results = {
        "accuracy": query_level_accuracy,
        "precision": query_level_precision,
        "recall": query_level_recall,
        "f1_score": (
            2
            * (query_level_precision * query_level_recall)
            / (query_level_precision + query_level_recall)
            if (query_level_precision + query_level_recall)
            else 0
        ),
    }
    item_level_total_samples = sum(item_level.values())
    item_level_accuracy = (
        (item_level["TP"] + item_level["TN"]) / item_level_total_samples
        if item_level_total_samples
        else 0
    )
    item_level_precision = (
        item_level["TP"] / (item_level["TP"] + item_level["FP"])
        if (item_level["TP"] + item_level["FP"])
        else 0
    )
    item_level_recall = (
        item_level["TP"] / (item_level["TP"] + item_level["FN"])
        if (item_level["TP"] + item_level["FN"])
        else 0
    )
    item_level_results = {
        "accuracy": item_level_accuracy,
        "precision": item_level_precision,
        "recall": item_level_recall,
        "f1_score": (
            2
            * (item_level_precision * item_level_recall)
            / (item_level_precision + item_level_recall)
            if (item_level_precision + item_level_recall)
            else 0
        ),
    }
    return query_level_results, item_level_results

# 6. Measure metrics


In [6]:
GPT4O_RESULTS_PATH = os.path.join(os.getcwd(), "data", "Gpt4o-qa_test_results.json")
QWEN_RESULTS_PATH = os.path.join(os.getcwd(), "data", "Qwen2.5-3B_qa_test_results.json")
LLAMA3_RESULTS_PATH = os.path.join(
    os.getcwd(), "data", "Llama3.2-3B-qa_test_results.json"
)

test_results_path = {
    "GPT4O": GPT4O_RESULTS_PATH,
    "QWEN": QWEN_RESULTS_PATH,
    "LLAMA3": LLAMA3_RESULTS_PATH,
}
for model, path in test_results_path.items():
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            results = json.load(f)
        query_level_metrics, item_level_metrics = calculate_metrics(results)

        print(
            f"{model} metrics:\nQuery Level Metrics:\n{json.dumps(query_level_metrics, indent=4)}"
        )
        print(f"Item Level Metrics:\n{json.dumps(item_level_metrics, indent=4)}")

    else:
        raise FileNotFoundError(f"{model} results file not found.")

you did something wrong formulating either the uri or your sparql query
GPT4O metrics:
Query Level Metrics:
{
    "accuracy": 0.8962264150943396,
    "precision": 1.0,
    "recall": 0.8962264150943396,
    "f1_score": 0.9452736318407959
}
Item Level Metrics:
{
    "accuracy": 0.8105849582172702,
    "precision": 0.9765100671140939,
    "recall": 0.8267045454545454,
    "f1_score": 0.8953846153846152
}
you did something wrong formulating either the uri or your sparql query
you did something wrong formulating either the uri or your sparql query
you did something wrong formulating either the uri or your sparql query
you did something wrong formulating either the uri or your sparql query
you did something wrong formulating either the uri or your sparql query
you did something wrong formulating either the uri or your sparql query
you did something wrong formulating either the uri or your sparql query
you did something wrong formulating either the uri or your sparql query
you did something w