In [43]:
import os
import json
import rdflib
import random
from typing import List
from openai import OpenAI
from dotenv import load_dotenv
from langchain_community.graphs import OntotextGraphDBGraph
from constants import (
    TRAINED_MODEL_ID_PATH,
    TEST_DATA_PATH,
    SPARQL_FIX_PROMPT,
    QA_PROMPT,
    SPARQL_GENERATION_PROMPT,
)

In [3]:
class CustomOntotextGraphDBGraph(OntotextGraphDBGraph):
    """Graph wrapper that overrides ``query`` to return booleans for ASK
    results and result rows for every other result type."""

    def query(
        self,
        query: str,
    ) -> List[rdflib.query.ResultRow]:
        """
        Run a SPARQL query against the wrapped graph.

        Args:
            query: SPARQL query text handed to ``self.graph.query``.

        Returns:
            For an ASK result, the boolean items yielded by the result;
            for any other result type, only the ``ResultRow`` items.
        """
        from rdflib.query import ResultRow

        outcome = self.graph.query(query)
        # Pick the item type to keep based on the kind of result returned.
        wanted_type = bool if outcome.type == "ASK" else ResultRow
        return [item for item in outcome if isinstance(item, wanted_type)]

## Create the graph object


In [6]:
# Connection settings for the graph wrapper. The ontology file location can be
# overridden through the IMKG_TTL_PATH environment variable so the notebook is
# not tied to one machine; the original absolute path remains the default.
config = {
    "query_endpoint": "http://localhost:7200/repositories/imkg",
    "local_file": os.environ.get(
        "IMKG_TTL_PATH",
        "/Users/jerry/Desktop/FYP-working/fine-tune-openai-KGQA/KG/iMKG.ttl",
    ),
    "local_file_format": "turtle",
}
try:
    graph = CustomOntotextGraphDBGraph(**config)
    print("Graph object created successfully.")
except Exception as e:
    print("Failed to create graph object:", e)
    # Re-raise instead of swallowing: later cells use `graph`, and continuing
    # here would only surface as an unrelated NameError further down.
    raise

Graph object created successfully.


In [10]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is what supplies the OpenAI API key used by the
# client created in a later cell — confirm.
load_dotenv()

True

In [36]:
# OpenAI client shared by the cells that call the chat completion API.
client = OpenAI()

# Read the fine-tuned model identifier from disk. The `with` block closes the
# file on exit, so no explicit close() call is needed.
with open(TRAINED_MODEL_ID_PATH, "r", encoding="utf-8") as f:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise become part of the model identifier sent to the API.
    model_name = f.read().strip()
print(f"Model name: {model_name}")

Model name: ft:gpt-4o-2024-08-06:personal::AKlfhHvk


In [28]:
# NOTE: this rebinds TEST_DATA_PATH, replacing the value imported from
# `constants` with a path relative to the current working directory.
TEST_DATA_PATH = os.path.join(os.getcwd(), "data", "qa_test.json")
with open(TEST_DATA_PATH, "r", encoding="utf-8") as dataset_file:
    test_dataset = json.loads(dataset_file.read())

# Draw a single random example; no seed is set, so the pick varies per run.
[test_data_point] = random.sample(test_dataset, 1)
pretty_point = json.dumps(test_data_point, indent=4)
print(f"Test data point:\n{pretty_point}")

Test data point:
{
    "qid": "mcqa-9276cd07-e67e-11ee-8734-58961d663d9c",
    "question": "which films have the same director as [Leningrad Cowboys Go America] and featured [Blondin Miguel]?",
    "question_type": "movie_to_director_to_movie_constraint_actor",
    "topic_entity": [
        "Leningrad Cowboys Go America",
        "Blondin Miguel"
    ],
    "topic_entity_id": [
        "https://www.wikidata.org/entity/Q1817757",
        "https://www.wikidata.org/entity/Q95796320"
    ],
    "answer": [
        "Le Havre"
    ],
    "answer_id": [
        "https://www.wikidata.org/entity/Q736498"
    ],
    "sparql": "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT DISTINCT ?label ?uri WHERE { ?rel1 rdfs:label \"directed_by\" . ?e1 ?rel1 ?e2 ; rdfs:label \"Leningrad Cowboys Go America\" . ?uri ?rel1 ?e2 ; ?rel4 ?e4 ; rdfs:label ?label . FILTER (?label != \"Leningrad Cowboys Go America\") ?rel4 rdfs:label \"starred_actors\" . ?e4 rdfs:label \"Blondin Miguel\" . }"
}


## Call the fine-tuned OpenAI model


In [29]:
def call_openai(messages: list, model_name, **kwargs) -> str:
    """Send a chat completion request and return the first choice's text.

    Args:
        messages: Chat messages forwarded as-is to the completion call.
        model_name: Model identifier forwarded as ``model``.
        **kwargs: Extra keyword arguments forwarded to
            ``client.chat.completions.create``.

    Returns:
        The ``message.content`` of the first returned choice.
    """
    # Echo the request so the notebook output records what was sent.
    print(f"Messages: {messages}")
    print(f"Model: {model_name}")

    completion = client.chat.completions.create(
        model=model_name, messages=messages, **kwargs
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [39]:
# The system turn carries the SPARQL-generation prompt; the user turn is the
# question text of the sampled data point.
question_text = f"{test_data_point['question']}"
query_generation_messages = [
    dict(role="system", content=SPARQL_GENERATION_PROMPT),
    dict(role="user", content=question_text),
]

# Sampling options passed through to the completion call.
decoding_options = {"temperature": 0, "top_p": 0}
query = call_openai(query_generation_messages, model_name, **decoding_options)
print(f"Generated SparQL:\n{query}")

Messages: [{'role': 'system', 'content': '\nYou are a useful SparQL assistant. You are tasked to review a question and generate a SparQL to answer the question.\nSparQL Database used is WikiData. [<text>] is topic entity in the question.\nOnly use these two prefixes PREFIX wd: <https://www.wikidata.org/entity/> and PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> if needed.\nDo not use wdt syntax to query WikiData\n'}, {'role': 'user', 'content': 'which films have the same director as [Leningrad Cowboys Go America] and featured [Blondin Miguel]?'}]
Model: ft:gpt-4o-2024-08-06:personal::AKlfhHvk
Generated SparQL:
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT DISTINCT ?label ?uri WHERE { ?rel1 rdfs:label "directed_by" . ?e1 ?rel1 ?e2 ; rdfs:label "Leningrad Cowboys Go America" . ?uri ?rel1 ?e2 ; ?rel4 ?e4 ; rdfs:label ?label . FILTER (?label != "Leningrad Cowboys Go America") ?rel4 rdfs:label "starred_actors" . ?e4 rdfs:label "Blondin Miguel" . }


## Handle query errors (if any)


In [44]:
# Run the generated query; if it fails, ask the model to repair it using the
# error message, up to NO_RETRRY times. `query` is rebound to each repaired
# query so later cells see the version that was last attempted.
NO_RETRRY = 3  # maximum number of repair attempts after the first failed query
query_results = None
last_error = None  # most recent failure, kept so the final error can be chained to it
try:
    query_results = graph.query(query)
    print(f"Query results: {query_results}")
except Exception as e:
    last_error = e
    error_message = str(e)
    for attempt in range(1, NO_RETRRY + 1):
        try:
            print(f"Invalid query. Retry count: {attempt}")
            # Give the model the question, the failing query and the error text.
            fix_sparql_query_messages = [
                {"role": "system", "content": SPARQL_FIX_PROMPT},
                {
                    "role": "user",
                    "content": f"""Question: {test_data_point['question']}
Previous SparQL query: {query}
Error message: {error_message}
""",
                },
            ]
            query = call_openai(
                messages=fix_sparql_query_messages,
                model_name=model_name,
                temperature=0,
                top_p=0,
            )
            print(f"New query: {query}")
            query_results = graph.query(query)
            print(f"Query results: {query_results}")
            break
        except Exception as retry_error:
            # Feed the newest error into the next repair attempt.
            last_error = retry_error
            error_message = str(retry_error)

# Identity check: an empty result list is a valid answer, only None means
# that no attempt succeeded.
if query_results is None:
    # Chain to the last failure so the traceback shows why the query was invalid.
    raise Exception("Invalid SparQL query. Stop answering") from last_error

Query results: [(rdflib.term.Literal('Le Havre'), rdflib.term.URIRef('https://www.wikidata.org/entity/Q736498'))]


In [46]:
# Fill the two placeholders of the answer prompt: first the query results,
# then the query text.
qa_answering_prompt = QA_PROMPT.replace("{{context}}", str(query_results)).replace(
    "{{query}}", str(query)
)

user_turn = f"Question: {test_data_point['question']}"
qa_answering_messages = [
    dict(role="system", content=qa_answering_prompt),
    dict(role="user", content=user_turn),
]

# The answer is produced with the "gpt-4o" model rather than `model_name`.
final_response = call_openai(qa_answering_messages, "gpt-4o", temperature=0.5)
print(f"Final Response:\n{final_response}")

Messages: [{'role': 'system', 'content': 'Generate a natural language response from the results of a SPARQL query.\nDon\'t use any internal knowledge to answer the question,\nJust say you don\'t know if no information is available from The results of a SPARQL query.\nIf the question is Yes/No question and there is no information available, answer No.\nSparQL query: PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT DISTINCT ?label ?uri WHERE { ?rel1 rdfs:label "directed_by" . ?e1 ?rel1 ?e2 ; rdfs:label "Leningrad Cowboys Go America" . ?uri ?rel1 ?e2 ; ?rel4 ?e4 ; rdfs:label ?label . FILTER (?label != "Leningrad Cowboys Go America") ?rel4 rdfs:label "starred_actors" . ?e4 rdfs:label "Blondin Miguel" . }\nThe results of a SparQL query: [(rdflib.term.Literal(\'Le Havre\'), rdflib.term.URIRef(\'https://www.wikidata.org/entity/Q736498\'))]'}, {'role': 'user', 'content': 'Question: which films have the same director as [Leningrad Cowboys Go America] and featured [Blondin Miguel]?'}]

In [48]:
# Rows returned by the generated query and by the data point's own "sparql" query.
generated_results = set(graph.query(query))
sample_results = set(graph.query(test_data_point["sparql"]))

# Set equality: row order and duplicate rows do not affect the comparison.
outcome_message = (
    "The queries produce identical results."
    if generated_results == sample_results
    else "The queries produce different results."
)
print(outcome_message)

The queries produce identical results.
