In [1]:
import utils
import argparse
import sys
import os
import json
import re
from typing import List, Dict, Set, Tuple
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
# Base directory of the experiment. The default is the original author's local
# checkout; set the KGC_EXP_BASE_DIR environment variable to run the notebook
# on another machine without editing this cell.
base_dir = os.environ.get(
    "KGC_EXP_BASE_DIR",
    r"c:/Users/yuche/OneDrive - TUM/phd/KGC exp/pilot-exp/exp-round1",
)
data_dir = os.path.join(base_dir, "data")
wikidata_dir = os.path.join(data_dir, "wikidata_tekgen")
wikidata_new_dir = os.path.join(data_dir, "wikidata_new")
# dbpedia_dir = os.path.join(data_dir, "dbpedia")

# Input files for the movie ontology (ontology 1). Every path component is
# passed separately so os.path.join picks the separator consistently.
ontology_file_path = os.path.join(wikidata_dir, "ontologies", "1_movie_ontology.json")
test_file_path = os.path.join(wikidata_dir, "test", "ont_1_movie_test.jsonl")
similarity_file_path = os.path.join(
    wikidata_dir, "baselines", "test_train_sent_similarity", "ont_1_movie_test_train_similarity.json"
)
train_file_path = os.path.join(wikidata_dir, "train", "ont_1_movie_train.jsonl")
ground_truth_file_path = os.path.join(wikidata_dir, "ground_truth", "ont_1_movie_ground_truth.jsonl")

# LLM results file that the evaluation cells read (DeepSeek-V3 run).
file_name = "ont_1_movie_n_rels_0_distractor_0_shot_results.json"
ds_output_path = os.path.join(base_dir, "llm_output", "deepseek-v3", file_name)

In [None]:
def calculate_precision_recall_f1(gold: set, pred: set) -> Tuple[float, float, float]:
    """
    Method to calculate precision, recall and f1:
        Precision is calculated as correct_triples/predicted_triples and
        Recall as correct_triples/gold_triples
        F1 as the harmonic mean of precision and recall.
    If either set is empty there are no correct triples, so all three metrics
    are reported as 0 instead of dividing by zero.
    :param gold: items in the gold standard
    :param pred: items in the system prediction
    :return:
        p: float - precision
        r: float - recall
        f1: float - F1

    Example:
    gold: {
    'newyorklocatedinunitedstates',
    'berlincapitalofgermany',
    'pariscapitaloffrance',
    'machinelearningisaaifield'
    }
    pred: {
    'newyorklocatedinusa',
    'berlincapitalofgermany',
    'pariscapitaloffrance',
    'aiisaatechnology'
    }
    Two of the four items match on each side, so the result is (0.5, 0.5, 0.5).
    """
    # An empty prediction or an empty gold standard makes the intersection
    # empty; returning early also avoids a ZeroDivisionError below.
    if len(pred) == 0 or len(gold) == 0:
        return 0.0, 0.0, 0.0
    # Compute the overlap once and reuse it for both ratios.
    num_correct = len(gold.intersection(pred))
    p = num_correct / len(pred)
    r = num_correct / len(gold)
    if p + r > 0:
        f1 = 2 * ((p * r) / (p + r))
    else:
        f1 = 0
    return p, r, f1

In [None]:
def normalize_triple(sub_label: str, rel_label: str, obj_label: str) -> str:
    """
    Build the comparison key for a triple used by the precision/recall step.

    Each of the three labels has every underscore and whitespace run removed
    and is lower-cased; the results are joined with no separator.

    :param sub_label: subject string
    :param rel_label: relation string
    :param obj_label: object string
    :return: the three cleaned labels concatenated into one string

    Example:
    >>> normalize_triple("New York", "located_in", "United States")
    'newyorklocatedinunitedstates'

    >>> normalize_triple("Machine_Learning", "is_a", "AI Field")
    'machinelearningisaaifield'
    """
    cleaned_parts = []
    for label in (sub_label, rel_label, obj_label):
        # drop underscores and whitespace, then fold case
        cleaned_parts.append(re.sub(r"(_|\s+)", '', label).lower())
    return "".join(cleaned_parts)

In [5]:
def get_ontology_conformance(ontology: Dict, triples: List) -> Tuple[float, float]:
    """
    Measure how many system triples use a relation that the ontology defines.

    Ontology relation labels are compared after replacing spaces with
    underscores; the relation of each triple (index 1) is compared as-is.

    :param ontology: ontology dict; its 'relations' entries each carry a 'label'
    :param triples: triples generated by the system, each indexable as
        [subject, relation, object]
    :return:
        ont_conformance: float - share of triples whose relation is in the ontology
        rel_hallucination: float - relation hallucination metric = 1 - ontology conformance
        For an empty list of triples the result is (1, 0).
    """
    total = len(triples)
    if total == 0:
        return 1, 0

    # ontology relation labels rewritten with underscores instead of spaces
    allowed_relations = []
    for relation in ontology['relations']:
        allowed_relations.append(relation['label'].replace(" ", "_"))

    # number of system triples whose relation appears in the ontology
    conformant_count = sum(1 for triple in triples if triple[1] in allowed_relations)

    conformance = conformant_count / total
    return conformance, 1 - conformance

In [None]:
# Start every running total for this ontology at zero; the per-sentence
# records are collected in eval_metrics_list.
t_p = t_r = t_f1 = t_onto_conf = t_rel_halluc = t_sub_halluc = t_obj_halluc = 0
eval_metrics_list = []

# Read the LLM results, the ground truth and the ontology from disk.
system_output = utils.load_json(ds_output_path)
ground_truth = utils.load_jsonl(ground_truth_file_path)
ontology = utils.load_json(ontology_file_path)
# Quick sanity check of the container types that were loaded.
print(*(type(loaded) for loaded in (system_output, ground_truth, ontology)))

<class 'list'> <class 'list'> <class 'dict'>


In [7]:
def convert_to_dict(data: List[Dict], id_name: str = "id") -> Dict:
    """
    Index a list of records by one of their fields.

    :param data: a list of dictionary objects
    :param id_name: the field whose value becomes the key of each record
    :return: a dictionary mapping each record's id_name value to the record
        itself; when two records share a key, the later one wins
    """
    indexed = {}
    for record in data:
        indexed[record[id_name]] = record
    return indexed

In [32]:

def clean_entity_string(ps, entity: str) -> str:
    """
    Normalize a subject or object string before the hallucination check.

    The entity is tokenized, every token is stemmed, and the stems are glued
    together; underscores and whitespace are then removed and the text is
    lower-cased.

    :param ps: stemmer object exposing a stem(word) method
    :param entity: subject or object string
    :return: the cleaned and normalized string
    """
    # stem token by token so inflected forms compare equal
    stems = []
    for token in word_tokenize(entity):
        stems.append(ps.stem(token))
    glued = "".join(stems)
    # strip underscores / whitespace and fold case
    compact = re.sub(r"(_|\s+)", '', glued).lower()
    # dates are written as "01 January <year>"; drop the stemmed day/month
    # prefix so that only the remaining part (e.g. the year) is compared
    return compact.replace("01januari", "")

In [None]:

def get_subject_object_hallucinations(ps, ontology, test_sentence, triples) -> tuple[float, float]:
    """
    Calculate subject and object hallucinations metrics. As the context for calculating hallucinations, we consider the
    test sentence and the ontology concepts as relevant tokens.

    A subject/object counts as hallucinated when its cleaned form (see clean_entity_string) is not a substring of the
    cleaned context. Both sides have all whitespace removed, so the check is a plain substring test on compact text.

    :param ps: stemmer for stemming words before checking for hallucinations
    :param ontology: ontology to take into account with the concepts and relations
    :param test_sentence: test sentences for which the triples are generated
    :param triples: a set of triples generated by the system, each indexable as [subject, relation, object]
    :return:
        subj_hallucination: float - subject hallucination metric
        obj_hallucination: float - object hallucination metric
    """

    # if the set of triples are empty, we return 0
    if len(triples) == 0:
        return 0, 0

    # Build the context: the test sentence followed by the ontology concept labels.
    # The explicit separator matters: without it the last word of the sentence and the first
    # concept label are fused into a single token and get tokenized/stemmed as one word.
    concept_labels = " ".join(c["label"] for c in ontology['concepts'])
    context_text = f"{test_sentence} {concept_labels}"
    # stem each word in the test sentence concatenated with the ontology concepts
    stemmed_context = "".join(ps.stem(word) for word in word_tokenize(context_text))
    # normalize the text to remove white spaces and underscores
    normalized_context = re.sub(r"(_|\s+)", '', stemmed_context).lower()

    # count the number of subject and object hallucinations
    num_subj_hallucinations, num_obj_hallucinations = 0, 0
    for triple in triples:
        # clean and normalize subject and object noun phrases the same way as the test sentence
        normalized_subject = clean_entity_string(ps, triple[0])
        normalized_object = clean_entity_string(ps, triple[2])

        # an entity that cannot be found in the context text is marked as a hallucination
        if normalized_subject not in normalized_context:
            num_subj_hallucinations += 1
        if normalized_object not in normalized_context:
            num_obj_hallucinations += 1

    # divide the number of hallucinations by the number of triples to calculate the hallucination metrics
    num_triples = len(triples)
    return num_subj_hallucinations / num_triples, num_obj_hallucinations / num_triples

In [9]:
# Trial run of the list -> dict conversion, keyed on each record's "id" field.
test = convert_to_dict(data=system_output, id_name="id")
print(test)

{'ont_1_movie_test_1': {'id': 'ont_1_movie_test_1', 'prompt': 'Extract relational triplets from the sentence based on the provided ontology relations.\nUse only the listed relations and ensure subjects and objects align with their specified restrictions.\nOnly return the triples in the format relation(subject, object), separated by commas. Do not include explanations, extra text, or comments. \n\nCONTEXT:\n\n\nOntology Relations: director(film, human), publication_date(film, ), \n\nTest Sentence: Bleach: Hell Verse (Japanese: BLEACH , Hepburn: BurÄ«chi Jigoku-Hen) is a 2010 Japanese animated film directed by Noriyuki Abe.\nTest Output: ', 'result': 'director(Bleach: Hell Verse, Noriyuki Abe), publication_date(Bleach: Hell Verse, 2010)', 'run_time': 6.019839763641357, 'usage_metadata': 'CompletionUsage(completion_tokens=28, prompt_tokens=132, total_tokens=160, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=128), prompt_cache_hi

In [30]:
# Re-key both collections by sentence id so the evaluation loop can look them up directly.
# The isinstance guards keep this cell idempotent: once a name has been rebound to a dict,
# running the cell again would otherwise iterate over the dict's keys and fail inside
# convert_to_dict.
if isinstance(system_output, list):
    system_output = convert_to_dict(system_output)
if isinstance(ground_truth, list):
    ground_truth = convert_to_dict(ground_truth)
print(system_output)
print(ground_truth)

{'ont_1_movie_test_1': {'id': 'ont_1_movie_test_1', 'prompt': 'Extract relational triplets from the sentence based on the provided ontology relations.\nUse only the listed relations and ensure subjects and objects align with their specified restrictions.\nOnly return the triples in the format relation(subject, object), separated by commas. Do not include explanations, extra text, or comments. \n\nCONTEXT:\n\n\nOntology Relations: director(film, human), publication_date(film, ), \n\nTest Sentence: Bleach: Hell Verse (Japanese: BLEACH , Hepburn: BurÄ«chi Jigoku-Hen) is a 2010 Japanese animated film directed by Noriyuki Abe.\nTest Output: ', 'result': 'director(Bleach: Hell Verse, Noriyuki Abe), publication_date(Bleach: Hell Verse, 2010)', 'run_time': 6.019839763641357, 'usage_metadata': 'CompletionUsage(completion_tokens=28, prompt_tokens=132, total_tokens=160, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=128), prompt_cache_hi

In [None]:
# initialize the local variables for the evaluation metrics for each ontology
t_p, t_r, t_f1, t_onto_conf, t_rel_halluc, t_sub_halluc, t_obj_halluc = 0, 0, 0, 0, 0, 0, 0
eval_metrics_list = list()

eval_output_path = os.path.join(base_dir, "llm_output", "deepseek-v3", "eval_metrics.json")
metric_path = os.path.join(base_dir, "llm_output", "deepseek-v3", "metrics.json")

# Only the first `max_test_cases` sentences are evaluated (pilot run).
# Set it to None to evaluate every sentence in system_output.
max_test_cases = 9

for sent_id in list(system_output.keys())[:max_test_cases]:
    # get system output
    # NOTE(review): the 'triples' key is not part of the raw LLM result records shown above; it is
    # expected to have been written into the results file by the parse_result step beforehand.
    system_triples = system_output[sent_id]['triples']

    # sentences without a ground truth entry cannot be scored and are skipped
    if sent_id not in ground_truth:
        continue

    # collect the ground truth triples
    gt_triples = [[tr['sub'], tr['rel'], tr['obj']] for tr in ground_truth[sent_id]['triples']]
    sentence = ground_truth[sent_id]["sent"]

    # collect the set of relations in ground truth triples, spaces are converted to "_" to make them
    # comparable with system triples
    gt_relations = {tr[1].replace(" ", "_") for tr in gt_triples}

    # filter out any triples in system output that does not match with ground truth relations
    # keep only the triples that have relations in the ground truth
    filtered_system_triples = [tr for tr in system_triples if tr[1] in gt_relations]

    # create a normalized string from subject, relation, object of each triple for comparison
    normalized_system_triples = {normalize_triple(tr[0], tr[1], tr[2]) for tr in filtered_system_triples}
    normalized_gt_triples = {normalize_triple(tr[0], tr[1], tr[2]) for tr in gt_triples}

    # compare the system output triples with ground truth triples and calculate precision, recall, f1
    precision, recall, f1 = calculate_precision_recall_f1(normalized_gt_triples, normalized_system_triples)

    # calculate ontology conformance and relation hallucination
    ont_conformance, rel_hallucination = get_ontology_conformance(ontology, system_triples)

    eval_metrics = {"id": sent_id, "precision": f"{precision:.2f}", "recall": f"{recall:.2f}", "f1": f"{f1:.2f}",
                    "onto_conf": f"{ont_conformance:.2f}", "rel_halluc": f"{rel_hallucination:.2f}",
                    "llm_triples": system_triples, "filtered_llm_triples": filtered_system_triples,
                    "gt_triples": gt_triples, "sent": sentence}
    eval_metrics_list.append(eval_metrics)
    print(f"eval metrics: {eval_metrics}")

    # aggregate precision, recall, f1 for later averaging
    t_p += precision
    t_r += recall
    t_f1 += f1
    t_onto_conf += ont_conformance
    t_rel_halluc += rel_hallucination


with open(eval_output_path, 'w', encoding='utf-8') as f:
    json.dump(eval_metrics_list, f, ensure_ascii=False, indent=4)

# Average over the sentences that were actually scored, so skipped sentences
# (no ground truth entry) do not drag the averages down.
total_test_cases = len(eval_metrics_list)
print(total_test_cases)
# all totals are 0 when nothing was scored; dividing by 1 then yields 0.00 instead of an error
denominator = max(total_test_cases, 1)
# average metrics calculate the average of evaluate metrics for all test cases in a given ontology
average_metrics = {
    "avg_precision": f"{t_p / denominator:.2f}",
    "avg_recall": f"{t_r / denominator:.2f}",
    "avg_f1": f"{t_f1 / denominator:.2f}",
    "avg_onto_conf": f"{t_onto_conf / denominator:.2f}",
    "avg_rel_halluc": f"{t_rel_halluc / denominator:.2f}",
}
with open(metric_path, 'w', encoding='utf-8') as f:
    json.dump(average_metrics, f, ensure_ascii=False, indent=4)

eval metrics: {'id': 'ont_1_movie_test_1', 'precision': '0.50', 'recall': '0.50', 'f1': '0.50', 'onto_conf': '1.00', 'rel_halluc': '0.00', 'llm_triples': [['Bleach: Hell Verse', 'director', 'Noriyuki Abe'], ['Bleach: Hell Verse', 'publication_date', '2010']], 'filtered_llm_triples': [['Bleach: Hell Verse', 'director', 'Noriyuki Abe'], ['Bleach: Hell Verse', 'publication_date', '2010']], 'gt_triples': [['Bleach : Hell Verse', 'director', 'Noriyuki Abe'], ['Bleach : Hell Verse', 'publication date', '01 January 2010']], 'sent': 'Bleach: Hell Verse (Japanese: BLEACH , Hepburn: BurÄ«chi Jigoku-Hen) is a 2010 Japanese animated film directed by Noriyuki Abe.'}
eval metrics: {'id': 'ont_1_movie_test_2', 'precision': '0.00', 'recall': '0.00', 'f1': '0.00', 'onto_conf': '1.00', 'rel_halluc': '0.00', 'llm_triples': [["Keyboard Cat's original form", 'cast_member', 'Charlie Schmidt'], ["Keyboard Cat's original form", 'cast_member', 'Fatso'], ["Keyboard Cat's original form", 'director', 'Charlie Sch

In [15]:
def parse_result(result_str):
    """
    Convert result string from format relation(subject, object) to a list of triples [subject, relation, object].
    If the input does not match the expected format, or is not a string at all (e.g. None),
    return an empty list.

    The subject is everything up to the first comma inside the parentheses and the object is
    everything up to the next closing parenthesis, so a subject containing a comma or an object
    containing ")" is not captured in full.

    :param result_str: raw model output, e.g. "director(Film, Person), publication_date(Film, 2010)"
    :return: list of [subject, relation, object] lists with surrounding whitespace stripped
    """
    # a missing/None or otherwise non-text result cannot contain triples
    if not isinstance(result_str, str):
        return []

    pattern = r'(\w+)\(([^,]+),\s*([^)]+)\)'

    # findall yields (relation, subject, object) tuples; reorder to subject-first
    return [
        [subject.strip(), relation.strip(), obj.strip()]
        for relation, subject, obj in re.findall(pattern, result_str)
    ]

In [23]:
# Per-model output folders under <base_dir>/llm_output.
ds_folder, gemini_folder = (
    os.path.join(base_dir, "llm_output", model_name)
    for model_name in ("deepseek-v3", "gemini-1.5-flash-8b")
)

In [24]:
# Add a parsed "triples" field to every result record in the chosen model folder.
# directory = ds_folder
directory = gemini_folder
for filename in os.listdir(directory):
    if not filename.endswith(".json"):  # Process only JSON files
        continue
    file_path = os.path.join(directory, filename)
    print(f"Processing file: {file_path}")
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)  # Load JSON content

    # The folder can also hold JSON files that are not result files (for example
    # metrics.json, which is a single object); those are left alone.
    if not isinstance(data, list):
        print(f"Skipping file (not a list of result records): {file_path}")
        continue

    # Process each JSON object
    updated = False
    for item in data:
        # records without a "result" field (e.g. evaluation records) have nothing to parse
        if not isinstance(item, dict) or "result" not in item:
            continue
        item["triples"] = parse_result(item["result"])
        print(item["triples"])
        updated = True

    # Write back the updated JSON file, but only when something was actually parsed
    if updated:
        with open(file_path, "w", encoding="utf-8") as file:
            json.dump(data, file, indent=4)


Processing file: c:/Users/yuche/OneDrive - TUM/phd/KGC exp/pilot-exp/exp-round1\llm_output\gemini-1.5-flash-8b\ont_1_movie_n_rels_0_distractor_0_shot_results.json
[['Bleach: Hell Verse', 'director', 'Noriyuki Abe']]
[]
[['series', 'director', 'Mitsuko Kase'], ['series', 'director', 'Takashi Imanishi']]
[['Spirited Away', 'genre', 'animated'], ['Spirited Away', 'director', 'Hayao Miyazaki'], ['Spirited Away', 'screenwriter', 'Hayao Miyazaki'], ['Spirited Away', 'production_company', 'Studio Ghibli']]
[['Looney Tunes: Back in Action', 'director', 'Joe Dante']]
[['The Life on the Earth - The Summer of Dioxin ( Inochi No Chikyuu - Dioxin No Natsu)', 'director', 'Satoshi Dezaki']]
[['The Flame of Love', 'director', 'Richard Eichberg'], ['The Flame of Love', 'director', 'Walter Summers'], ['The Flame of Love', 'cast_member', 'Anna May Wong'], ['The Flame of Love', 'cast_member', 'John Longden']]
[['Metal Skin Panic MADOX-01', 'director', 'Shinji Aramaki']]
[['film', 'director', 'Noriyuki Abe