# Dataset utils for joint entity and relation extraction

In [None]:
#|default_exp jerx.dataset.docred

In [None]:
#|hide
from fastcore.test import *
from nbdev.showdoc import *

In [None]:
#|export

import string

In [None]:
#|export

puncts = set(string.punctuation)
# Opening brackets belong to the token that follows them, not the one before.
_opening_brackets = set("([{")

def join_tokens(tokens):
    """Detokenize `tokens` into a single string.

    Tokens are separated by a single space, except that:

    * a token that is exactly one `string.punctuation` character is attached
      to the preceding token (``["Hello", ","]`` -> ``"Hello,"``);
    * an opening bracket (``(``, ``[``, ``{``) is instead preceded by a space
      and attached to the *following* token
      (``["a", "(", "b", ")"]`` -> ``"a (b)"``).

    Leading and trailing whitespace is stripped from the result; an empty
    token sequence yields ``""``.
    """
    pieces = []
    glued = False  # True when the previous token was an opening bracket
    for token in tokens:
        is_opener = token in _opening_brackets
        if glued or (token in puncts and not is_opener):
            pieces.append(token)
        else:
            pieces.append(" " + token)
        glued = is_opener
    return "".join(pieces).strip()

def extract_sentences(example):
    """Yield one detokenized string per token list in `example['sents']`.

    Each token list is joined with `join_tokens`, then every ``"- "`` in the
    result is collapsed to ``"-"`` so that hyphenated words are not split.
    """
    yield from (
        join_tokens(tokens).replace("- ", "-")
        for tokens in example['sents']
    )

def extract_text(example):
    """Return the sentences of `example` joined into one space-separated string."""
    sentences = extract_sentences(example)
    return ' '.join(sentences)

def extract_triplets(example):
    """Yield a `[head_name, relation_text, tail_name]` list per labelled relation.

    `example['labels']` supplies parallel `head`, `relation_text` and `tail`
    sequences; `head` and `tail` entries index into `example['vertexSet']`,
    and the `name` of the first item of the indexed entry is used.
    """
    def first_name(index):
        # Looked up lazily so nothing is read from `vertexSet` when there are no labels.
        return example['vertexSet'][index][0]['name']

    labels = example['labels']
    rows = zip(labels['head'], labels['relation_text'], labels['tail'])
    for head_index, relation, tail_index in rows:
        yield [first_name(head_index), relation, first_name(tail_index)]

def transform_docred(example, delimiter="|"):
    """Map one example to a `{'text', 'triplets'}` record.

    `triplets` is a list of strings, each the head name, relation text and
    tail name from `extract_triplets` joined by `delimiter`; `text` is the
    output of `extract_text` for the same example.
    """
    joined_triplets = list(map(delimiter.join, extract_triplets(example)))
    document_text = extract_text(example)
    return dict(text=document_text, triplets=joined_triplets)

In [None]:
#|hide
from datasets import load_dataset

# Smoke test: transform a small slice of the "docred" validation split.
ds = load_dataset("docred", split="validation[:10]")
# Drop every original column so only the fields returned by `transform_docred` remain.
jerx_ds = ds.map(transform_docred, remove_columns=ds.column_names)

# The transformed dataset must expose exactly the two fields we produce ...
assert 'text' in jerx_ds.features
assert 'triplets' in jerx_ds.features
# ... and `triplets` must be a list of delimiter-joined strings.
# NOTE(review): the `[0]` lookup assumes the first example has at least one triplet.
assert isinstance(jerx_ds[0]['triplets'], list)
assert isinstance(jerx_ds[0]['triplets'][0], str)
# Display the first transformed example as the cell output.
jerx_ds[0]

{'text': "Skai TV is a Greek free-to-air television network based in Piraeus. It is part of the Skai Group, one of the largest media groups in the country. It was relaunched in its present form on 1st of April 2006 in the Athens metropolitan area, and gradually spread its coverage nationwide. Besides digital terrestrial transmission, it is available on the subscription-based encrypted services of Nova and Cosmote TV. Skai TV is also a member of Digea, a consortium of private television networks introducing digital terrestrial transmission in Greece. At launch, Skai TV opted for dubbing all foreign language content into Greek, instead of using subtitles. This is very uncommon in Greece for anything except documentaries( using voiceover dubbing) and children 's programmes( using lip-synced dubbing), so after intense criticism the station switched to using subtitles for almost all foreign shows.",
 'triplets': ['Piraeus|country|Greece',
  'Skai Group|country|Greece',
  'Athens|country|Gre

In [None]:
#|hide
# Run nbdev's export step (presumably writes the `#|export` cells to the
# module named by `#|default_exp` — confirm against the nbdev docs).
import nbdev; nbdev.nbdev_export()