Skip to content

Commit

Permalink
Add incremental converter builder (#34)
Browse files Browse the repository at this point in the history
Closes #33
  • Loading branch information
cthoyt committed Feb 27, 2023
1 parent 3bb5fe2 commit b5aaa1a
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 17 deletions.
23 changes: 23 additions & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,26 @@ CURIEs using a combination of :meth:`curies.Converter.expand` and :class:`rdflib
converter = curies.get_obo_converter()
uri_ref = rdflib.URIRef(converter.expand("CHEBI:138488"))
Incremental Converters
----------------------
As suggested in `#33 <https://github.com/cthoyt/curies/issues/33>`_, new prefixes
can be added to an existing converter as in the following example:

.. code-block::
import curies
converter = curies.get_obo_converter()
converter.add_prefix("hgnc", "https://bioregistry.io/hgnc:")
Similarly, an empty converter can be instantiated by passing an empty list
for the ``records`` argument, and prefixes can then be added one at a time
(note that this currently does not allow synonyms to be added separately):

.. code-block::
import curies
converter = curies.Converter(records=[])
converter.add_prefix("hgnc", "https://bioregistry.io/hgnc:")
83 changes: 78 additions & 5 deletions src/curies/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
TYPE_CHECKING,
Any,
Callable,
Collection,
DefaultDict,
Dict,
Iterable,
Expand Down Expand Up @@ -122,7 +123,7 @@ def _get_duplicate_prefixes(records: List[Record]) -> List[Tuple[Record, Record,
]


def _get_prefix_map(records: List[Record]) -> Mapping[str, str]:
def _get_prefix_map(records: List[Record]) -> Dict[str, str]:
rv = {}
for record in records:
rv[record.prefix] = record.uri_prefix
Expand All @@ -131,7 +132,7 @@ def _get_prefix_map(records: List[Record]) -> Mapping[str, str]:
return rv


def _get_reverse_prefix_map(records: List[Record]) -> Mapping[str, str]:
def _get_reverse_prefix_map(records: List[Record]) -> Dict[str, str]:
rv = {}
for record in records:
rv[record.uri_prefix] = record.prefix
Expand Down Expand Up @@ -182,17 +183,17 @@ class Converter:
"""

#: The expansion dictionary with prefixes as keys and priority URI prefixes as values
prefix_map: Mapping[str, str]
prefix_map: Dict[str, str]
#: The mapping from URI prefixes to prefixes
reverse_prefix_map: Mapping[str, str]
reverse_prefix_map: Dict[str, str]
#: A prefix trie for efficient parsing of URIs
trie: StringTrie

def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool = True) -> None:
"""Instantiate a converter.
:param records:
A list of records
A list of records. If you plan to build a converter incrementally, pass an empty list.
:param strict:
If true, raises issues on duplicate URI prefixes
:param delimiter:
Expand All @@ -214,6 +215,78 @@ def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool
self.reverse_prefix_map = _get_reverse_prefix_map(records)
self.trie = StringTrie(self.reverse_prefix_map)

def _check_record(self, record: Record) -> None:
    """Verify that a record does not collide with anything already registered.

    :param record:
        The candidate record. Its prefix and prefix synonyms are compared against
        ``self.prefix_map``; its URI prefix and URI prefix synonyms are compared
        against ``self.reverse_prefix_map``.
    :raises ValueError:
        If the prefix, the URI prefix, or any synonym of either is already present.
    """
    known_prefixes = self.prefix_map
    known_uri_prefixes = self.reverse_prefix_map

    # The primary prefix and URI prefix are checked before any synonym.
    if record.prefix in known_prefixes:
        raise ValueError(f"new record has duplicate prefix: {record.prefix}")
    if record.uri_prefix in known_uri_prefixes:
        raise ValueError(f"new record has duplicate URI prefix: {record.uri_prefix}")

    clashing_prefixes = [
        synonym for synonym in record.prefix_synonyms if synonym in known_prefixes
    ]
    if clashing_prefixes:
        raise ValueError(f"new record has duplicate prefix: {clashing_prefixes[0]}")

    clashing_uri_prefixes = [
        synonym for synonym in record.uri_prefix_synonyms if synonym in known_uri_prefixes
    ]
    if clashing_uri_prefixes:
        raise ValueError(f"new record has duplicate URI prefix: {clashing_uri_prefixes[0]}")

def add_record(self, record: Record) -> None:
    """Register a record in the converter's lookup structures.

    The record is validated with :meth:`_check_record` first, so nothing is
    modified if that call raises.

    :param record:
        The record to add. Its prefix and prefix synonyms are mapped to its URI
        prefix in ``self.prefix_map``; its URI prefix and URI prefix synonyms are
        mapped to its prefix in both ``self.reverse_prefix_map`` and ``self.trie``.
    :raises ValueError:
        Propagated from :meth:`_check_record` when any part of the record is
        already registered.
    """
    self._check_record(record)

    canonical_prefix = record.prefix
    canonical_uri_prefix = record.uri_prefix

    # Every spelling of the prefix expands to the canonical URI prefix.
    for prefix in (canonical_prefix, *record.prefix_synonyms):
        self.prefix_map[prefix] = canonical_uri_prefix

    # Every URI prefix compresses to the canonical prefix; the trie is kept in
    # step with the reverse map entry by entry.
    for uri_prefix in (canonical_uri_prefix, *record.uri_prefix_synonyms):
        self.reverse_prefix_map[uri_prefix] = canonical_prefix
        self.trie[uri_prefix] = canonical_prefix

def add_prefix(
    self,
    prefix: str,
    uri_prefix: str,
    prefix_synonyms: Optional[Collection[str]] = None,
    uri_prefix_synonyms: Optional[Collection[str]] = None,
) -> None:
    """Append a prefix to the converter.

    :param prefix:
        The prefix to append, e.g., ``go``
    :param uri_prefix:
        The URI prefix to append, e.g., ``http://purl.obolibrary.org/obo/GO_``
    :param prefix_synonyms:
        An optional collection of synonyms for the prefix such as ``gomf``, ``gocc``, etc.
    :param uri_prefix_synonyms:
        An optional collection of synonyms for the URI prefix such as
        ``https://bioregistry.io/go:``, ``http://www.informatics.jax.org/searches/GO.cgi?id=GO:``, etc.
    :raises ValueError:
        If the prefix, the URI prefix, or any of the synonyms is already
        registered in the converter (raised while adding the record).

    This can be used to add missing namespaces on-the-fly to an existing converter:

    >>> import curies
    >>> converter = curies.get_obo_converter()
    >>> converter.add_prefix("hgnc", "https://bioregistry.io/hgnc:")
    >>> converter.expand("hgnc:1234")
    'https://bioregistry.io/hgnc:1234'
    >>> converter.expand("GO:0032571")
    'http://purl.obolibrary.org/obo/GO_0032571'

    This can also be used to incrementally build up a converter from scratch:

    >>> import curies
    >>> converter = curies.Converter(records=[])
    >>> converter.add_prefix("hgnc", "https://bioregistry.io/hgnc:")
    >>> converter.expand("hgnc:1234")
    'https://bioregistry.io/hgnc:1234'
    """
    # Synonyms are sorted so the resulting record is deterministic regardless
    # of the kind of collection (e.g., a set) the caller passed in.
    record = Record(
        prefix=prefix,
        uri_prefix=uri_prefix,
        prefix_synonyms=sorted(prefix_synonyms or []),
        uri_prefix_synonyms=sorted(uri_prefix_synonyms or []),
    )
    self.add_record(record)

@classmethod
def from_extended_prefix_map(
cls, records: LocationOr[Iterable[Union[Record, Dict[str, Any]]]], **kwargs: Any
Expand Down
61 changes: 49 additions & 12 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,13 @@ class TestConverter(unittest.TestCase):

def setUp(self) -> None:
    """Set up the converter test case.

    Stores the small OBO prefix map on the test case so that other tests can
    reuse it, then builds the shared converter from it exactly once.
    """
    self.simple_obo_prefix_map = {
        "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
        "MONDO": "http://purl.obolibrary.org/obo/MONDO_",
        "GO": "http://purl.obolibrary.org/obo/GO_",
        "OBO": "http://purl.obolibrary.org/obo/",
    }
    self.converter = Converter.from_prefix_map(self.simple_obo_prefix_map)

def test_invalid_record(self):
"""Test throwing an error for invalid records."""
Expand Down Expand Up @@ -82,16 +81,21 @@ def test_invalid_records(self):
def test_convert(self):
    """Test that the fixture converter has the expected prefixes and converts correctly."""
    expected_prefixes = {"CHEBI", "MONDO", "GO", "OBO"}
    self.assertEqual(expected_prefixes, self.converter.get_prefixes())
    self._assert_convert(self.converter)

def _assert_convert(self, converter: Converter):
    """Assert that ``converter`` handles the simple OBO prefixes correctly.

    Every assertion is made against the ``converter`` argument (not the
    ``self.converter`` fixture) so the helper can validate any converter,
    including one that was built incrementally.

    :param converter: The converter under test.
    """
    go_uri_prefix = "http://purl.obolibrary.org/obo/GO_"
    self.assertIn("GO", converter.prefix_map)
    self.assertIn(go_uri_prefix, converter.reverse_prefix_map)
    self.assertIn(go_uri_prefix, converter.trie)

    for curie, uri in [
        ("CHEBI:1", "http://purl.obolibrary.org/obo/CHEBI_1"),
        ("OBO:unnamespaced", "http://purl.obolibrary.org/obo/unnamespaced"),
    ]:
        self.assertEqual(curie, converter.compress(uri))
        self.assertEqual(uri, converter.expand(curie))

    # Unknown URI prefixes and unknown CURIE prefixes both yield None.
    self.assertIsNone(converter.compress("http://example.org/missing:00000"))
    self.assertIsNone(converter.expand("missing:00000"))

def test_remote(self):
"""Test loading a remote JSON-LD context."""
Expand Down Expand Up @@ -306,6 +310,39 @@ def test_file_bulk(self):
lines = [line.strip().split("\t") for line in path.read_text().splitlines()]
self.assertEqual("CHEBI:1", lines[idx][0])

def test_incremental(self):
    """Test that a converter built one prefix at a time behaves like a prebuilt one."""
    converter = Converter([])
    for prefix, uri_prefix in self.simple_obo_prefix_map.items():
        converter.add_prefix(prefix, uri_prefix)
    converter.add_prefix(
        "hgnc",
        "https://bioregistry.io/hgnc:",
        prefix_synonyms=["HGNC"],
        uri_prefix_synonyms=["https://identifiers.org/hgnc:"],
    )
    self._assert_convert(converter)

    # Both the canonical URI prefix and its synonym compress to the canonical prefix.
    for uri in (
        "https://bioregistry.io/hgnc:1234",
        "https://identifiers.org/hgnc:1234",
    ):
        self.assertEqual("hgnc:1234", converter.compress(uri))
    # The prefix synonym expands with the canonical URI prefix.
    self.assertEqual("https://bioregistry.io/hgnc:1234", converter.expand("HGNC:1234"))

    # Each case re-uses something already registered and must be rejected.
    go_uri_prefix = "http://purl.obolibrary.org/obo/GO_"
    duplicate_cases = [
        (("GO", "..."), {}),
        (("...", go_uri_prefix), {}),
        (("...", "..."), {"uri_prefix_synonyms": [go_uri_prefix]}),
        (("...", "..."), {"prefix_synonyms": ["GO"]}),
    ]
    for args, kwargs in duplicate_cases:
        with self.assertRaises(ValueError):
            converter.add_prefix(*args, **kwargs)


class TestVersion(unittest.TestCase):
"""Trivially test a version."""
Expand Down

0 comments on commit b5aaa1a

Please sign in to comment.