# Dataset utils for Knowledge Graph Construction

In [None]:
#| default_exp ml.kg.dataset

In [None]:
#| hide
from fastcore.test import *
from nbdev.showdoc import *

In [None]:
#|export
from bellek.utils import split_camel_case

In [None]:
#|export

def _transform_triplet(triplet_string: str):
    delimiter = " | "
    entity1,rel,entity2 = triplet_string.split(delimiter)
    rel = ' '.join([word.lower() for word in split_camel_case(rel)])
    return delimiter.join([entity1, rel, entity2])

def _batch_transform_webnlg(examples):
    for lex, mts in zip(examples['lex'], examples['modified_triple_sets']):
        for text in lex['text']:
            triplets = [_transform_triplet(triplet_string) for triplet_string in mts['mtriple_set'][0]]
            yield dict(text=text, triplets=triplets)

def batch_transform_webnlg(examples):
    """Turn a batch into parallel ``'text'`` and ``'triplets'`` columns.

    Consumes the records produced by ``_batch_transform_webnlg(examples)``
    and returns a dict with two equally long lists: the texts and, at the
    same index, the triplet list belonging to each text. The output can hold
    a different number of rows than the input batch.
    """
    texts = []
    triplet_lists = []
    for row in _batch_transform_webnlg(examples):
        texts.append(row['text'])
        triplet_lists.append(row['triplets'])
    return {'text': texts, 'triplets': triplet_lists}

In [None]:
#|hide
from datasets import load_dataset

# Smoke test: map the first ten training rows and check the output columns.
ds = load_dataset("web_nlg", "release_v3.0_en", split="train[:10]")
erx_ds = ds.map(
    batch_transform_webnlg,
    batched=True,
    remove_columns=ds.column_names,
)

for column in ('text', 'triplets'):
    assert column in erx_ds.features
first_record = erx_ds[0]
assert isinstance(first_record['triplets'], list)
print(first_record)

{'text': 'The Aarhus is the airport of Aarhus, Denmark.', 'triplets': ['Aarhus_Airport | city served | "Aarhus, Denmark"']}


In [None]:
#| hide
# Run nbdev's export step from inside the notebook.
import nbdev; nbdev.nbdev_export()