In [1]:
import utils 
import os

In [2]:
# Base directory of the experiment data. It defaults to the original author's
# location; set the KGC_BASE_DIR environment variable to run the notebook on
# another machine without editing this cell.
base_dir = os.environ.get(
    "KGC_BASE_DIR",
    r"c:/Users/yuche/OneDrive - TUM/phd/KGC exp/pilot-exp/exp-round1",
)
data_dir = os.path.join(base_dir, "data")
dbpedia_dir = os.path.join(data_dir, "dbpedia_webnlg")  # inputs are read from here
dbpedia_new_dir = os.path.join(data_dir, "dbpedia_new")  # generated files are written below here

# Input files for the film ontology (ont_19_film). Every path component is passed
# separately so os.path.join chooses the separator instead of a hard-coded "/".
ontology_file_path = os.path.join(dbpedia_dir, "ontologies", "19_film_ontology.json")
test_file_path = os.path.join(dbpedia_dir, "test", "ont_19_film_test.jsonl")
similarity_file_path = os.path.join(
    dbpedia_dir,
    "baselines",
    "test_train_sent_similarity",
    "19_film_test_train_similarity.json",
)
train_file_path = os.path.join(dbpedia_dir, "train", "ont_19_film_train.jsonl")
ground_truth_file_path = os.path.join(dbpedia_dir, "ground_truth", "ont_19_film_ground_truth.jsonl")

In [3]:
# Read every input file up front (ontology, test/train sentences, the
# test-to-train similarity mapping and the ground-truth triples).
ontology = utils.load_json(ontology_file_path)
tests = utils.load_jsonl(test_file_path)
test_train_similarity = utils.load_json(similarity_file_path)
train = utils.load_jsonl(train_file_path)
ground_truth = utils.load_jsonl(ground_truth_file_path)

# Sanity check: show the ontology and the size of each loaded collection.
print(
    f"ontology: {ontology}",
    f"tests: {len(tests)}",
    f"test_train_similarity: {len(test_train_similarity.keys())}",
    f"train: {len(train)}",
    f"ground_truth: {len(ground_truth)}",
    sep="\n",
)

ontology: {'title': 'Film Ontology', 'id': 'ont_19_film', 'concepts': [{'qid': 'Film', 'label': 'Film'}, {'qid': 'Channel', 'label': 'Channel'}, {'qid': 'Organisation', 'label': 'Organisation'}, {'qid': 'Company', 'label': 'Company'}, {'qid': 'Artist', 'label': 'Artist'}, {'qid': 'Type', 'label': 'Type'}, {'qid': 'Place', 'label': 'Place'}, {'qid': 'City', 'label': 'City'}, {'qid': 'Industry', 'label': 'Industry'}, {'qid': 'Occupation', 'label': 'Occupation'}, {'qid': 'Person', 'label': 'Person'}, {'qid': 'Country', 'label': 'Country'}, {'qid': 'Background', 'label': 'Background'}, {'qid': 'Service', 'label': 'Service'}, {'qid': 'Cinematography', 'label': 'Cinematography'}, {'qid': 'Language', 'label': 'Language'}, {'qid': 'Station', 'label': 'Station'}, {'qid': 'Club', 'label': 'Club'}], 'relations': [{'pid': 'timeshiftChannel', 'label': 'timeshiftChannel', 'domain': 'Film', 'range': 'Channel'}, {'pid': 'director', 'label': 'director', 'domain': 'Film', 'range': 'Person'}, {'pid': 'im

In [16]:
def get_one_example_prompt(train_sent):
    """Render one training sentence as an in-context example.

    :param train_sent: dict with a 'sent' string and a 'triples' list whose
                       items carry string 'rel', 'sub' and 'obj' fields
    :return: the example sentence followed by an "Example Output" line listing
             every triple as rel(sub,obj), each one followed by ", "
    """
    sentence_part = "\n\nExample Sentence: " + train_sent['sent']

    # All triples of the sentence are verbalised; each keeps its trailing ", ".
    verbalised = [
        "".join((triple['rel'], "(", triple['sub'], ",", triple['obj'], ")", ", "))
        for triple in train_sent['triples']
    ]

    return sentence_part + "\nExample Output: " + "".join(verbalised)

In [6]:
# get domain and range of a relation
def get_domain_range(relation: str, ontology: dict):
    """
    Look up the domain and range concept labels of a relation.

    :param relation: Label of the relation to look up (string)
    :param ontology: Ontology dictionary with optional "concepts" and "relations" lists
    :return: A tuple (domain_label, range_label). An element is None when its
             concept id is not among the concepts; (None, None) is returned when
             no relation carries the given label
    """
    # Translate concept ids (qid) into their labels.
    labels_by_qid = {}
    for concept in ontology.get("concepts", []):
        labels_by_qid[concept["qid"]] = concept["label"]

    # Only the first relation whose label matches is considered.
    match = next(
        (rel for rel in ontology.get("relations", []) if rel.get("label") == relation),
        None,
    )
    if match is None:
        return None, None

    return labels_by_qid.get(match.get("domain"), None), labels_by_qid.get(match.get("range"), None)

# get n ground truth ontology relations
def get_n_ontology_relations(gt_list: list, id, ontology: dict):
    """
    Collect the distinct relation labels used in the ground-truth triples of one sentence.

    :param gt_list: Ground-truth records, each a dict with an 'id' and a 'triples' list
    :param id: Id of the sentence whose relations are wanted
    :param ontology: Unused; kept so existing callers keep working
    :return: List of unique relation labels in order of first appearance, or the
             string "No triples found for the given id" when no record matches
             the id or the matching record has no triples
    """
    # Start from None so that an unknown id reaches the "not found" branch
    # instead of raising UnboundLocalError.
    triples = None
    for item in gt_list:
        if item.get('id') == id:
            triples = item.get('triples')
            break
    if triples is None:
        return "No triples found for the given id"

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result (and every prompt built from it) is identical on every run; a set
    # would reorder string labels differently between interpreter sessions.
    return list(dict.fromkeys(triple['rel'] for triple in triples))
    
# get ontology prompt
def get_ontology_prompt(gt_rels: list, ontology: dict):
    """
    Verbalise relations as "rel(Domain, Range), " entries for the prompt.

    :param gt_rels: Relation labels to include
    :param ontology: Ontology dictionary passed on to get_domain_range
    :return: Concatenation of one entry per relation (each ends with ", ");
             spaces in the relation label become underscores and an unknown
             domain or range is rendered as an empty string
    """
    entries = []
    for rel in gt_rels:
        domain_label, range_label = get_domain_range(rel, ontology)
        domain_text = "" if domain_label is None else domain_label
        range_text = "" if range_label is None else range_label
        entries.append(f"{rel.replace(' ', '_')}({domain_text}, {range_text}), ")
    return "".join(entries)


def get_three_example_prompt(simil_sent_id: list, train: list):
    """
    Build the example section from several training sentences.

    :param simil_sent_id: Ids of the training sentences to use as examples
    :param train: Training records searched by get_train_sentence
    :return: The rendered examples in the given order, each followed by a newline
    """
    example_blocks = []
    for train_id in simil_sent_id:
        rendered = get_one_example_prompt(get_train_sentence(train_id, train))
        example_blocks.append(rendered + "\n")
    return "".join(example_blocks)
        

# get test prompt
def get_test_prompt(test_sentence):
    """Return the closing part of the prompt: the test sentence and an empty "Test Output: " slot."""
    return "".join(("\n\nTest Sentence: ", test_sentence, "\nTest Output: "))


def get_similar_sentences(test_sentence_id, test_similar):
    """Return the value stored in ``test_similar`` under ``test_sentence_id``, or None when no key matches."""
    matching_values = (test_similar[key] for key in test_similar if key == test_sentence_id)
    return next(matching_values, None)


def get_train_sentence(simil_sent_id, train_sentences):
    """Return the first training record whose 'id' equals ``simil_sent_id``, or None when there is none."""
    candidates = (record for record in train_sentences if record['id'] == simil_sent_id)
    return next(candidates, None)

In [7]:
# prepare prompt
def prepare_prompt_zero_shot(test_sentence: str, gt_relations:list, ontology:dict) -> str:
    """
    Assemble the zero-shot prompt.

    :param test_sentence: Sentence the triples should be extracted from
    :param gt_relations: Relation labels to list in the prompt
    :param ontology: Ontology dictionary used to resolve domain and range labels
    :return: Instructions, the "CONTEXT:" header, the ontology relations and
             the test sentence section, concatenated in that order
    """
    instructions = '''Extract relational triplets from the sentence based on the provided ontology relations.
Use only the listed relations and ensure subjects and objects align with their specified restrictions.
Only return the triples in the format relation(subject, object), separated by commas. Do not include explanations, extra text, or comments. \n
'''

    sections = [
        instructions,
        'CONTEXT:\n\n',
        '\nOntology Relations: ',
        get_ontology_prompt(gt_relations, ontology),
        get_test_prompt(test_sentence),
    ]
    return "".join(sections)


def prepare_prompt_one_shot(test_sentence: str, gt_relations:list, ontology:dict, train_sent:dict) -> str:
    """
    Assemble the one-shot prompt.

    :param test_sentence: Sentence the triples should be extracted from
    :param gt_relations: Relation labels to list in the prompt
    :param ontology: Ontology dictionary used to resolve domain and range labels
    :param train_sent: Training record rendered as the single example
    :return: Instructions, the "CONTEXT:" header, the ontology relations, the
             example and the test sentence section, concatenated in that order
    """
    instructions = '''Extract relational triplets from the sentence based on the provided ontology relations and examples.
Use only the listed relations and ensure subjects and objects align with their specified restrictions.
Only return the triples in the format relation(subject, object), separated by commas. Do not include explanations, extra text, or comments. \n
'''

    sections = [
        instructions,
        'CONTEXT:\n\n',
        '\nOntology Relations: ',
        get_ontology_prompt(gt_relations, ontology),
        get_one_example_prompt(train_sent),
        get_test_prompt(test_sentence),
    ]
    return "".join(sections)


def prepare_prompt_three_shot(test_sentence: str, gt_relations:list, ontology:dict, simil_sent_id: list, train: list) -> str:
    """
    Assemble the few-shot prompt from several similar training sentences.

    :param test_sentence: Sentence the triples should be extracted from
    :param gt_relations: Relation labels to list in the prompt
    :param ontology: Ontology dictionary used to resolve domain and range labels
    :param simil_sent_id: Ids of the training sentences used as examples
    :param train: Training records the example ids are looked up in
    :return: Instructions, the "CONTEXT:" header, the ontology relations, the
             examples and the test sentence section, concatenated in that order
    """
    instructions = '''Extract relational triplets from the sentence based on the provided ontology relations and examples.
Use only the listed relations and ensure subjects and objects align with their specified restrictions.
Only return the triples in the format relation(subject, object), separated by commas. Do not include explanations, extra text, or comments. \n
'''

    sections = [
        instructions,
        'CONTEXT:\n\n',
        '\nOntology Relations: ',
        get_ontology_prompt(gt_relations, ontology),
        get_three_example_prompt(simil_sent_id, train),
        get_test_prompt(test_sentence),
    ]
    return "".join(sections)


# Generate the n ground-truth relations and the prompts, and save them to the output files

In [8]:
# Destination folder for the "n relations, 0 distractors" files.
output_folder_path = os.path.join(
    dbpedia_new_dir,
    "n_rels_0_distractors",
)
# exist_ok=True makes re-running this cell harmless when the folder is already there.
os.makedirs(name=output_folder_path, exist_ok=True)

In [10]:
# Attach the ground-truth relation labels to every test item. This mutates the
# dicts in `tests` in place by adding a "gt_relations" key.
for item in tests:
    # The id is read straight from the item; assigning it to a top-level name
    # `id` would shadow the builtin id() for the rest of the kernel session.
    item["gt_relations"] = get_n_ontology_relations(ground_truth, item["id"], ontology)
    print(item["gt_relations"])

['starring', 'writer', 'musicComposer']
['director', 'writer', 'runtime', 'starring', 'budget']
['starring']
['director']
['type', 'distributor']
['starring', 'gross']
['starring', 'writer', 'runtime']
['starring', 'cinematography']
['director', 'starring', 'runtime']
['birthYear', 'writer', 'starring']
['starring', 'runtime', 'gross', 'cinematography']
['writer']
['starring', 'activeYearsStartYear']
['starring', 'editing', 'language']
['starring']
['starring', 'birthPlace']
['starring', 'editing', 'musicComposer', 'runtime']
['starring', 'musicComposer', 'director', 'runtime']
['director', 'runtime', 'gross', 'cinematography']
['director', 'runtime', 'distributor', 'starring', 'gross']
['starring', 'gross', 'distributor']
['starring', 'runtime']
['starring', 'birthDate', 'birthPlace']
['starring', 'writer']
['starring', 'writer', 'budget', 'director']
['starring', 'runtime', 'gross', 'cinematography']
['director', 'runtime', 'starring', 'gross', 'cinematography', 'musicComposer']
['ci

In [11]:
# Write the test items (now carrying "gt_relations") into the output folder.
new_test_file_path = os.path.join(
    output_folder_path,
    "ont_19_film_test.jsonl",
)
utils.save_jsonl(tests, new_test_file_path)

Saved JSONL data to c:/Users/yuche/OneDrive - TUM/phd/KGC exp/pilot-exp/exp-round1\data\dbpedia_new\n_rels_0_distractors\ont_19_film_test.jsonl


In [12]:
# Build one zero-shot prompt per test item, echoing each for inspection.
zero_shot_prompts = []
for item in tests:
    prompt_item = {
        "id": item["id"],
        "prompt": prepare_prompt_zero_shot(item["sent"], item["gt_relations"], ontology),
    }
    print(prompt_item["id"], prompt_item["prompt"], "-------------------", sep="\n")
    zero_shot_prompts.append(prompt_item)

ont_19_film_test_1
Extract relational triplets from the sentence based on the provided ontology relations.
Use only the listed relations and ensure subjects and objects align with their specified restrictions.
Only return the triples in the format relation(subject, object), separated by commas. Do not include explanations, extra text, or comments. 

CONTEXT:


Ontology Relations: starring(Film, Artist), writer(, Person), musicComposer(Film, Artist), 

Test Sentence: It's Great to be Young,a 1956 film starring Cecil Parker was composed by Louis Levy and written by Ted Willis.
Test Output: 
-------------------
ont_19_film_test_2
Extract relational triplets from the sentence based on the provided ontology relations.
Use only the listed relations and ensure subjects and objects align with their specified restrictions.
Only return the triples in the format relation(subject, object), separated by commas. Do not include explanations, extra text, or comments. 

CONTEXT:


Ontology Relations: d

In [13]:
# Persist the zero-shot prompts next to the enriched test file.
output_file_path = os.path.join(
    output_folder_path,
    "ont_19_film_n_rels_0_distractor_0_shot_prompts.jsonl",
)
utils.save_jsonl(zero_shot_prompts, output_file_path)

Saved JSONL data to c:/Users/yuche/OneDrive - TUM/phd/KGC exp/pilot-exp/exp-round1\data\dbpedia_new\n_rels_0_distractors\ont_19_film_n_rels_0_distractor_0_shot_prompts.jsonl


# generate one shot example prompt

In [14]:
# Reload the saved test file so that `tests` carries "gt_relations" even when
# the cells that compute them were not run in this session.
new_test_file_path = os.path.join(
    output_folder_path,
    "ont_19_film_test.jsonl",
)
tests = utils.load_jsonl(new_test_file_path)
# Show the first record as a quick check of the loaded structure.
print(tests[0])

{'id': 'ont_19_film_test_1', 'sent': "It's Great to be Young,a 1956 film starring Cecil Parker was composed by Louis Levy and written by Ted Willis.", 'gt_relations': ['starring', 'writer', 'musicComposer']}


In [17]:
# Build one one-shot prompt per test item, echoing each for inspection.
one_shot_prompts = []
for item in tests:
    # Training sentence ids recorded as similar to this test sentence.
    similar_sents = get_similar_sentences(item["id"], test_train_similarity)
    # The first listed id supplies the single example.
    train_sent = get_train_sentence(similar_sents[0], train)

    prompt_item = {
        "id": item["id"],
        "prompt": prepare_prompt_one_shot(item["sent"], item["gt_relations"], ontology, train_sent),
    }
    print(prompt_item["id"], prompt_item["prompt"], "-------------------", sep="\n")
    one_shot_prompts.append(prompt_item)

ont_19_film_test_1
Extract relational triplets from the sentence based on the provided ontology relations and examples.
Use only the listed relations and ensure subjects and objects align with their specified restrictions.
Only return the triples in the format relation(subject, object), separated by commas. Do not include explanations, extra text, or comments. 

CONTEXT:


Ontology Relations: starring(Film, Artist), writer(, Person), musicComposer(Film, Artist), 

Example Sentence: Released on 28th July 1944 the 89 minutes long film, "English Without Tears, " was directed by Harold French, produced by Anatole de Grunwald, while the songs in the film were composed by Nicholas Brodszky.
Example Output: runtime(English_Without_Tears,89.0), musicComposer(English_Without_Tears,Nicholas_Brodszky), releaseDate(English_Without_Tears,1944-07-28), director(English_Without_Tears,Harold_French), producer(English_Without_Tears,Anatole_de_Grunwald), 

Test Sentence: It's Great to be Young,a 1956 fil

In [18]:
# Persist the one-shot prompts in the current output folder.
output_file_path = os.path.join(
    output_folder_path,
    "ont_19_film_n_rels_0_distractor_1_shot_prompts.jsonl",
)
utils.save_jsonl(one_shot_prompts, output_file_path)

Saved JSONL data to c:/Users/yuche/OneDrive - TUM/phd/KGC exp/pilot-exp/exp-round1\data\dbpedia_new\n_rels_0_distractors\ont_19_film_n_rels_0_distractor_1_shot_prompts.jsonl


# generate three-shot examples prompt and save

In [19]:
# Build one three-shot prompt per test item, echoing each for inspection.
three_shot_prompts = []
for item in tests:
    # Training sentence ids recorded as similar to this test sentence.
    similar_sents = get_similar_sentences(item["id"], test_train_similarity)
    # The first three listed ids supply the examples (fewer if the list is shorter).
    simil_sent_id = similar_sents[:3]

    prompt_item = {
        "id": item["id"],
        "prompt": prepare_prompt_three_shot(item["sent"], item["gt_relations"], ontology, simil_sent_id, train),
    }
    print(prompt_item["id"], prompt_item["prompt"], "-------------------", sep="\n")
    three_shot_prompts.append(prompt_item)

ont_19_film_test_1
Extract relational triplets from the sentence based on the provided ontology relations and examples.
Use only the listed relations and ensure subjects and objects align with their specified restrictions.
Only return the triples in the format relation(subject, object), separated by commas. Do not include explanations, extra text, or comments. 

CONTEXT:


Ontology Relations: starring(Film, Artist), writer(, Person), musicComposer(Film, Artist), 

Example Sentence: Released on 28th July 1944 the 89 minutes long film, "English Without Tears, " was directed by Harold French, produced by Anatole de Grunwald, while the songs in the film were composed by Nicholas Brodszky.
Example Output: runtime(English_Without_Tears,89.0), musicComposer(English_Without_Tears,Nicholas_Brodszky), releaseDate(English_Without_Tears,1944-07-28), director(English_Without_Tears,Harold_French), producer(English_Without_Tears,Anatole_de_Grunwald), 


Example Sentence: English Without Tears is an 8

In [20]:
# Persist the three-shot prompts in the current output folder.
output_file_path = os.path.join(
    output_folder_path,
    "ont_19_film_n_rels_0_distractor_3_shot_prompts.jsonl",
)
utils.save_jsonl(three_shot_prompts, output_file_path)

Saved JSONL data to c:/Users/yuche/OneDrive - TUM/phd/KGC exp/pilot-exp/exp-round1\data\dbpedia_new\n_rels_0_distractors\ont_19_film_n_rels_0_distractor_3_shot_prompts.jsonl


# Generate the baseline-relations one-shot prompts
### The instruction is changed and the ontology class is removed; only the baseline ontology relations and the example are kept

In [21]:
# From here on the output folder is the baseline one; this rebinds
# `output_folder_path`, so later saves go to "baseline_rels".
output_folder_path = os.path.join(
    dbpedia_new_dir,
    "baseline_rels",
)
# exist_ok=True makes re-running this cell harmless when the folder is already there.
os.makedirs(name=output_folder_path, exist_ok=True)

In [22]:
def get_concept_label(ontology, concept):
    """
    Resolve a concept id to its label.

    :param ontology: ontology dictionary whose 'concepts' entries carry 'qid' and 'label'
    :param concept: id of the concept to look up
    :return: the label of the first concept whose 'qid' equals the id, or None when none matches
    """
    matching_labels = (entry['label'] for entry in ontology['concepts'] if entry['qid'] == concept)
    return next(matching_labels, None)
        
        
def get_ontology_relations(ontology):
    """
    Generate a verbalized list of all relations in the given ontology to be included in the prompt
    :param ontology: an ontology as a dictionary with 'concepts' (qid, label) and
                     'relations' (label, domain, range) lists
    :return: A string of comma-separated relations, with spaces in relation labels
             replaced by underscores and an unresolved domain or range left empty.
            e.g. cast_member(film,human), director(film,human), screenwriter(film,human), ...
            An empty string is returned when there are no relations.
    """
    # Resolve concept ids to labels once instead of re-scanning the concept
    # list for every relation. setdefault keeps the first label seen for an id.
    concept_labels = {}
    for concept in ontology.get('concepts', []):
        concept_labels.setdefault(concept['qid'], concept['label'])

    verbalized = []
    for relation in ontology.get('relations', []):
        label = relation.get('label')
        # Skip unlabeled relations before touching the label; calling
        # .replace on None would raise AttributeError.
        if label is None:
            continue

        domain_label = concept_labels.get(relation.get('domain'))
        range_label = concept_labels.get(relation.get('range'))
        if domain_label is None:
            domain_label = ""
        if range_label is None:
            range_label = ""

        verbalized.append(label.replace(" ", "_") + "(" + domain_label + "," + range_label + ")")

    return ", ".join(verbalized)


def get_similar_sentences(test_sentence_id, test_similar):
    """Look up ``test_sentence_id`` among the keys of ``test_similar``.

    Returns the value stored under the first key equal to the id; falls
    through and returns None when no key matches.
    """
    for candidate_id in test_similar:
        if candidate_id != test_sentence_id:
            continue
        return test_similar[candidate_id]
    return None


def get_train_sentence(simil_sent_id, train_sentences):
    """Find a training record by id.

    Returns the first record whose 'id' equals ``simil_sent_id``; falls
    through and returns None when no record matches.
    """
    for record in train_sentences:
        if record['id'] != simil_sent_id:
            continue
        return record
    return None


def get_example_prompt(train_sent):
    """Render one training sentence as an in-context example.

    :param train_sent: dict with a 'sent' string and a 'triples' list whose
                       items carry string 'rel', 'sub' and 'obj' fields
    :return: the example sentence followed by an "Example Output" line listing
             every triple as rel(sub,obj), each one followed by ", "
    """
    header = "\n\nExample Sentence: " + train_sent['sent']

    # Every triple of the sentence is verbalised, not just the first one.
    pieces = []
    for triple in train_sent['triples']:
        pieces.append("".join((triple['rel'], "(", triple['sub'], ",", triple['obj'], ")", ", ")))

    return header + "\nExample Output: " + "".join(pieces)


def get_test_prompt(test_sentence):
    """Return the test sentence section followed by an empty "Test Output: " slot for the model to fill."""
    sentence_line = "\n\nTest Sentence: " + test_sentence
    return sentence_line + "\nTest Output: "


def prepare_prompt(ontology: dict, test_sentence: str, train_sent: dict) -> str:
    """
    Assemble the baseline one-shot prompt, which lists every relation of the ontology.

    :param ontology: ontology dictionary verbalized by get_ontology_relations
    :param test_sentence: sentence the triples should be extracted from
    :param train_sent: training record (a dict, indexed by 'sent' and 'triples'
                       in get_example_prompt) rendered as the single example
    :return: instructions, the "CONTEXT:" header, the ontology relations, the
             example and the test sentence section, concatenated in that order
    """

    prompt_fixed = '''Extract relational triplets from the sentence based on the provided ontology relations and examples.
Use only the listed relations and ensure subjects and objects align with their specified restrictions.
Only return the triples in the format relation(subject, object), separated by commas. Do not include explanations, extra text, or comments. \n
'''

    prompt = prompt_fixed
    prompt += 'CONTEXT:\n\n'
    prompt += '\nOntology Relations: '
    prompt += get_ontology_relations(ontology)
    prompt += get_example_prompt(train_sent)
    prompt += get_test_prompt(test_sentence)

    return prompt

In [23]:
# Build one baseline one-shot prompt per test item, echoing each record.
baseline_one_shot_prompts = []
for test_item in tests:
    test_sentence_id = test_item['id']
    # Text of the sentence the prompt is generated for.
    test_sentence = test_item['sent']

    # Training sentence ids recorded as similar to this test sentence;
    # the first listed id supplies the single example.
    similar_sents = get_similar_sentences(test_sentence_id, test_train_similarity)
    train_sent = get_train_sentence(similar_sents[0], train)

    prompt_data = {
        'id': test_sentence_id,
        'prompt': prepare_prompt(ontology, test_sentence, train_sent),
    }
    baseline_one_shot_prompts.append(prompt_data)
    print(prompt_data)

{'id': 'ont_19_film_test_1', 'prompt': 'Extract relational triplets from the sentence based on the provided ontology relations and examples.\nUse only the listed relations and ensure subjects and objects align with their specified restrictions.\nOnly return the triples in the format relation(subject, object), separated by commas. Do not include explanations, extra text, or comments. \n\nCONTEXT:\n\n\nOntology Relations: timeshiftChannel(Film,Channel), director(Film,Person), imdbId(Film,), distributor(Film,Company), musicComposer(Film,Artist), type(Film,Type), deathDate(Artist,), club(Film,Club), editing(Film,Artist), releaseDate(Film,), child(Artist,Person), producer(Film,Person), birthPlace(Artist,Place), headquarter(Company,City), deathPlace(Artist,Place), location(Company,Place), budget(Film,), runtime(Film,), owner(Company,Person), starring(Film,Artist), iso6392Code(City,), writer(,Person), industry(Company,Industry), activeYearsStartYear(Artist,), occupation(Artist,Occupation), si

In [24]:
# Persist the baseline one-shot prompts in the current output folder.
output_file_path = os.path.join(
    output_folder_path,
    "ont_19_film_baseline_rels_1_shot_prompts.jsonl",
)
utils.save_jsonl(baseline_one_shot_prompts, output_file_path)

Saved JSONL data to c:/Users/yuche/OneDrive - TUM/phd/KGC exp/pilot-exp/exp-round1\data\dbpedia_new\baseline_rels\ont_19_film_baseline_rels_1_shot_prompts.jsonl
