# Utilities for joint entity and relation extraction

In [None]:
#|default_exp jerx.utils

In [None]:
#|hide
from fastcore.test import *
from nbdev.showdoc import *

In [None]:
#|export
import re
from typing import TypeAlias, Iterable, Callable, Any, Generator

from bellem.logging import get_logger

log = get_logger(__name__)

In [None]:
#|export

Entity: TypeAlias = str|tuple[str, str]
Relation: TypeAlias = str
Triplet: TypeAlias = tuple[Entity, Relation, Entity]

In [None]:
#|export

# Optional indentation, then digits, a dot and one whitespace character
# (e.g. "1. ", "  12.\t"). Anchored at the start of the string only.
_ENUMERATION_PREFIX = re.compile(r'^\s*\d+\.\s')


def remove_enumeration(s: str) -> str:
    """Strip a leading list marker such as ``"1. "`` from `s`.

    The marker may be preceded by whitespace, so indented list items
    (``"  2. foo"``) are handled the same way as unindented ones. Strings
    that do not start with a marker are returned unchanged, including any
    leading whitespace they carry.

    Args:
        s: A single line of text.

    Returns:
        `s` without its leading enumeration marker, if it had one.
    """
    # The dot must be followed by whitespace, so values like "1.5" are left alone.
    return _ENUMERATION_PREFIX.sub('', s)

def parse_triplet_strings(text: str, delimiter: str="|") -> list[str]:
    """Collect the lines of `text` that look like delimiter-separated triplets.

    A line qualifies when it is non-empty and contains exactly two occurrences
    of `delimiter`. Each qualifying line is passed through `remove_enumeration`
    before being returned; whitespace around the parts is left untouched.

    Args:
        text: Free-form text, e.g. a model response, possibly spanning many lines.
        delimiter: Separator between the three parts of a triplet.

    Returns:
        The qualifying lines, in their original order.
    """
    triplet_strings = []
    for line in text.splitlines():
        # Anything that is blank or does not have exactly two delimiters is chatter.
        if not line or line.count(delimiter) != 2:
            continue
        triplet_strings.append(remove_enumeration(line))
    return triplet_strings

def parse_triplets(text: str, delimiter: str="|") -> list[Triplet]:
    """Parse the triplet lines found in `text` into tuples of stripped strings.

    Lines are selected by `parse_triplet_strings`; each selected line is split
    on `delimiter` and every resulting part has its surrounding whitespace
    removed.

    Args:
        text: Free-form text, e.g. a model response, possibly spanning many lines.
        delimiter: Separator between the parts of a triplet.

    Returns:
        One tuple per triplet line, in the order the lines appear in `text`.
    """
    triplets = []
    for triplet_string in parse_triplet_strings(text, delimiter=delimiter):
        parts = (part.strip() for part in triplet_string.split(delimiter))
        triplets.append(tuple(parts))
    return triplets

In [None]:
#|hide
# Sample response: two triplet lines surrounded by conversational text and
# blank lines, none of which contain exactly two "|" delimiters.
text = """
  Sure! Here are the entity-relation-entity triplets for the given text:

Aleksandre_Guruli|club|US_Lesquin
Paris | capitalOf | France

Please provide the next text for extraction.
"""
# Only the two triplet lines are kept; spacing around the delimiters is preserved.
assert sorted(parse_triplet_strings(text)) == ["Aleksandre_Guruli|club|US_Lesquin", "Paris | capitalOf | France"]
# Parsing splits on the delimiter and strips whitespace around each part.
assert sorted(parse_triplets(text)) == [("Aleksandre_Guruli", "club", "US_Lesquin"), ('Paris', 'capitalOf', 'France')]

In [None]:
#|hide
import nbdev; nbdev.nbdev_export()