list delimiters (issue 345) #370

Merged

33 changes: 5 additions & 28 deletions kgx/sink/tsv_sink.py
@@ -8,8 +8,7 @@
extension_types,
archive_write_mode,
archive_format,
remove_null,
_sanitize_export,
build_export_row
)


@@ -23,6 +22,7 @@
"category",
"provided_by",
}
DEFAULT_LIST_DELIMITER = "|"


class TsvSink(Sink):
@@ -60,6 +60,7 @@ def __init__(
if compression in archive_write_mode
else None
)
self.list_delimiter = kwargs["list_delimiter"] if "list_delimiter" in kwargs else DEFAULT_LIST_DELIMITER
self.nodes_file_basename = f"{self.basename}_nodes.{self.extension}"
self.edges_file_basename = f"{self.basename}_edges.{self.extension}"
if self.dirname:
@@ -96,7 +97,7 @@ def write_node(self, record: Dict) -> None:
A node record

"""
row = self._build_export_row(record)
row = build_export_row(record, list_delimiter=self.list_delimiter)
row["id"] = record["id"]
values = []
for c in self.ordered_node_columns:
@@ -116,7 +117,7 @@ def write_edge(self, record: Dict) -> None:
An edge record

"""
row = self._build_export_row(record)
row = build_export_row(record, list_delimiter=self.list_delimiter)
values = []
for c in self.ordered_edge_columns:
if c in row:
@@ -144,30 +145,6 @@ def finalize(self) -> None:
if os.path.isfile(self.edges_file_name):
os.remove(self.edges_file_name)

@staticmethod
def _build_export_row(data: Dict) -> Dict:
"""
Casts all values to primitive types like str or bool according to the
specified type in ``_column_types``. Lists become pipe delimited strings.

Parameters
----------
data: Dict
A dictionary containing key-value pairs

Returns
-------
Dict
A dictionary containing processed key-value pairs

"""
tidy_data = {}
for key, value in data.items():
new_value = remove_null(value)
if new_value:
tidy_data[key] = _sanitize_export(key, new_value)
return tidy_data

@staticmethod
def _order_node_columns(cols: Set) -> OrderedSet:
"""
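A minimal end-to-end sketch (not from this PR) of how the new sink option could be exercised. It assumes extra keys in Transformer's output_args are forwarded to the sink constructor as keyword arguments, which is how list_delimiter would reach TsvSink; the file names are made up.

from kgx.transformer import Transformer

t = Transformer()
t.transform(
    input_args={"filename": ["graph_nodes.tsv", "graph_edges.tsv"], "format": "tsv"},
    output_args={
        "filename": "export",
        "format": "tsv",
        # hypothetical override; TsvSink falls back to DEFAULT_LIST_DELIMITER ("|") when omitted
        "list_delimiter": ";",
    },
)
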
1 change: 1 addition & 0 deletions kgx/source/json_source.py
@@ -16,6 +16,7 @@ class JsonSource(TsvSource):
def __init__(self):
super().__init__()
self.compression = None
self.list_delimiter = None

def parse(
self,
1 change: 1 addition & 0 deletions kgx/source/jsonl_source.py
@@ -18,6 +18,7 @@ class JsonlSource(JsonSource):

def __init__(self):
super().__init__()
self.list_delimiter = None

def parse(
self,
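The JSON-based sources set list_delimiter to None because JSON already carries lists as native arrays, so strings that happen to contain the delimiter character must not be split on import. A small sketch against the shared helper changed below, using a record from the new test data:

from kgx.utils.kgx_utils import sanitize_import

record = {"id": "PUBCHEM.COMPOUND:10429502", "name": "16|A-Methyl Prednisolone"}
clean = sanitize_import(record.copy(), list_delimiter=None)
print(clean["name"])  # "16|A-Methyl Prednisolone" -- the literal "|" in the name is preserved
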
9 changes: 7 additions & 2 deletions kgx/source/tsv_source.py
@@ -17,6 +17,8 @@

log = get_logger()

DEFAULT_LIST_DELIMITER = "|"


class TsvSource(Source):
"""
@@ -26,6 +28,7 @@ class TsvSource(Source):

def __init__(self):
super().__init__()
self.list_delimiter = DEFAULT_LIST_DELIMITER

def set_prefix_map(self, m: Dict) -> None:
"""
@@ -85,6 +88,8 @@ def parse(
# set '\n' to be the default line terminator to prevent
# truncation of lines due to hidden/escaped carriage returns
kwargs["lineterminator"] = "\n" # type: ignore
if "list_delimeter" in kwargs:
self.list_delimiter = kwargs["list_delimiter"]

mode = (
archive_read_mode[compression] if compression in archive_read_mode else None
@@ -219,7 +224,7 @@ def read_node(self, node: Dict) -> Optional[Tuple[str, Dict]]:
A tuple that contains node id and node data
"""
node = validate_node(node)
node_data = sanitize_import(node.copy())
node_data = sanitize_import(node.copy(), list_delimiter=self.list_delimiter)
if "id" in node_data:

n = node_data["id"]
Expand Down Expand Up @@ -267,7 +272,7 @@ def read_edge(self, edge: Dict) -> Optional[Tuple]:

"""
edge = validate_edge(edge)
edge_data = sanitize_import(edge.copy())
edge_data = sanitize_import(edge.copy(), list_delimiter=self.list_delimiter)
if "id" not in edge_data:
edge_data["id"] = generate_uuid()
s = edge_data["subject"]
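On the read side, parse() now accepts a list_delimiter keyword and hands it to sanitize_import, so multi-valued columns are split on that character instead of a hard-coded "|". A sketch, assuming TsvSource takes no constructor arguments at this version and that filename and format are the first parameters of parse(); the input file is hypothetical:

from kgx.source.tsv_source import TsvSource

source = TsvSource()
for record in source.parse(
    filename="semicolon_nodes.tsv",  # hypothetical node file whose category column uses ";"
    format="tsv",
    list_delimiter=";",
):
    print(record)
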
101 changes: 60 additions & 41 deletions kgx/utils/kgx_utils.py
@@ -33,8 +33,6 @@
CORE_NODE_PROPERTIES = {"id", "name"}
CORE_EDGE_PROPERTIES = {"id", "subject", "predicate", "object", "type"}

LIST_DELIMITER = "|"


class GraphEntityType(Enum):
GRAPH = "graph"
@@ -843,14 +841,17 @@ def generate_edge_identifiers(graph: BaseGraph):
data["id"] = generate_uuid()


def sanitize_import(data: Dict) -> Dict:
def sanitize_import(data: Dict, list_delimiter: str=None) -> Dict:
"""
Sanitize key-value pairs in dictionary.
This should be used to ensure proper syntax and types for node and edge data as it is imported.

Parameters
----------
data: Dict
A dictionary containing key-value pairs
list_delimiter: str
Optionally provide a delimiter character or string to be used to split strings into lists.

Returns
-------
@@ -862,14 +863,19 @@ def sanitize_import(data: Dict) -> Dict:
for key, value in data.items():
new_value = remove_null(value)
if new_value is not None:
tidy_data[key] = _sanitize_import(key, new_value)
tidy_data[key] = _sanitize_import_property(key, new_value, list_delimiter)
return tidy_data


def _sanitize_import(key: str, value: Any) -> Any:
def _sanitize_import_property(key: str, value: Any, list_delimiter: str) -> Any:
"""
Sanitize value for a key for the purpose of import.

Casts all values to primitive types like str or bool according to the
specified type in ``column_types``.

If a list_delimiter is provided, strings will be split into lists using the delimiter.

Parameters
----------
key: str
@@ -883,7 +889,6 @@ def _sanitize_import(key: str, value: Any) -> Any:
Sanitized value

"""
new_value: Any
if key in column_types:
if column_types[key] == list:
if isinstance(value, (list, set, tuple)):
@@ -894,17 +899,20 @@ def _sanitize_import(key: str, value: Any) -> Any:
new_value = list(value)
elif isinstance(value, str):
value = value.replace("\n", " ").replace("\t", " ")
new_value = [x for x in value.split(LIST_DELIMITER) if x]
new_value = [x for x in value.split(list_delimiter) if x] if list_delimiter else value
else:
new_value = [str(value).replace("\n", " ").replace("\t", " ")]
elif column_types[key] == bool:
try:
new_value = bool(value)
except:
new_value = False
# the rest of this if/else block doesn't seem right:
# it's not checking the type against the expected type even though one exists
elif isinstance(value, (str, float)):
new_value = value
else:
# we might want to raise an exception or somehow indicate a type mismatch in the input data
new_value = str(value).replace("\n", " ").replace("\t", " ")
else:
if isinstance(value, (list, set, tuple)):
@@ -914,9 +922,9 @@ def _sanitize_import(key: str, value: Any) -> Any:
]
new_value = list(value)
elif isinstance(value, str):
if LIST_DELIMITER in value:
if list_delimiter and list_delimiter in value:
value = value.replace("\n", " ").replace("\t", " ")
new_value = [x for x in value.split(LIST_DELIMITER) if x]
new_value = [x for x in value.split(list_delimiter) if x]
else:
new_value = value.replace("\n", " ").replace("\t", " ")
elif isinstance(value, bool):
@@ -931,24 +939,56 @@ def _sanitize_import(key: str, value: Any) -> Any:
return new_value


def _sanitize_export(key: str, value: Any) -> Any:
def build_export_row(data: Dict, list_delimiter: str=None) -> Dict:
"""
Sanitize key-value pairs in dictionary.
This should be used to ensure proper syntax and types for node and edge data as it is exported.

Parameters
----------
data: Dict
A dictionary containing key-value pairs
list_delimiter: str
Optionally provide a delimiter character or string to be used to convert lists into strings.

Returns
-------
Dict
A dictionary containing processed key-value pairs

"""
tidy_data = {}
for key, value in data.items():
new_value = remove_null(value)
if new_value:
tidy_data[key] = _sanitize_export_property(key, new_value, list_delimiter)
return tidy_data


def _sanitize_export_property(key: str, value: Any, list_delimiter: str=None) -> Any:
"""
Sanitize value for a key for the purpose of export.

Casts all values to primitive types like str or bool according to the
specified type in ``column_types``.

If a list_delimiter is provided, lists will be converted into strings using the delimiter.

Parameters
----------
key: str
Key corresponding to a node/edge property
value: Any
Value corresponding to the key
list_delimiter: str
Optionally provide a delimiter character or string to be used to convert lists into strings.

Returns
-------
value: Any
Sanitized value

"""
new_value: Any
if key in column_types:
if column_types[key] == list:
if isinstance(value, (list, set, tuple)):
@@ -958,7 +998,7 @@ def _sanitize_export(key: str, value: Any) -> Any:
else v
for v in value
]
new_value = LIST_DELIMITER.join([str(x) for x in value])
new_value = list_delimiter.join([str(x) for x in value]) if list_delimiter else value
else:
new_value = (
str(value).replace("\n", " ").replace('\\"', "").replace("\t", " ")
@@ -974,15 +1014,18 @@ def _sanitize_export(key: str, value: Any) -> Any:
)
else:
if type(value) == list:
new_value = LIST_DELIMITER.join([str(x) for x in value])
new_value = (
new_value.replace("\n", " ").replace('\\"', "").replace("\t", " ")
)
value = [
v.replace("\n", " ").replace('\\"', "").replace("\t", " ")
if isinstance(v, str)
else v
for v in value
]
new_value = list_delimiter.join([str(x) for x in value]) if list_delimiter else value
column_types[key] = list
elif type(value) == bool:
try:
new_value = bool(value)
column_types[key] = bool
column_types[key] = bool # this doesn't seem right, shouldn't column_types come from the biolink model?
except:
new_value = False
else:
@@ -992,30 +1035,6 @@ def _sanitize_export(key: str, value: Any) -> Any:
return new_value


def _build_export_row(data: Dict) -> Dict:
"""
Casts all values to primitive types like str or bool according to the
specified type in ``_column_types``. Lists become pipe delimited strings.

Parameters
----------
data: Dict
A dictionary containing key-value pairs

Returns
-------
Dict
A dictionary containing processed key-value pairs

"""
tidy_data = {}
for key, value in data.items():
new_value = remove_null(value)
if new_value:
tidy_data[key] = _sanitize_export(key, new_value)
return tidy_data


def remove_null(input: Any) -> Any:
"""
Remove any null values from input.
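The two shared helpers now make the delimiter explicit at both ends. A round-trip sketch, assuming "category" is typed as a list in KGX's column_types (the record content is made up):

from kgx.utils.kgx_utils import build_export_row, sanitize_import

node = {"id": "CURIE:123", "category": "biolink:NamedThing|biolink:Gene"}

# import: a delimiter splits multi-valued strings into lists
imported = sanitize_import(node.copy(), list_delimiter="|")
# imported["category"] == ["biolink:NamedThing", "biolink:Gene"]

# export: a delimiter joins lists back into a single string
row = build_export_row(imported, list_delimiter="|")
# row["category"] == "biolink:NamedThing|biolink:Gene"
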
4 changes: 2 additions & 2 deletions tests/resources/test_nodes.csv
@@ -1,4 +1,4 @@
id,category,name,description,provided_by
CURIE:123,biolink:Gene,Gene 123,"Node of type Gene, CURIE:123",test_nodes.tsv
CURIE:456,biolink:Disease,Disease 456,"Node of type Disease, CURIE:456",test_nodes.tsv
CURIE:123,biolink:NamedThing|biolink:Gene,Gene 123,"Node of type Gene, CURIE:123",test_nodes.tsv
CURIE:456,biolink:NamedThing|biolink:Disease,Disease 456,"Node of type Disease, CURIE:456",test_nodes.tsv
CURIE:000,biolink:NamedThing,NamedThing 000,"Node of type NamedThing, CURIE:000",test_nodes.tsv
4 changes: 2 additions & 2 deletions tests/resources/test_nodes.tsv
@@ -1,4 +1,4 @@
id name category description
CURIE:123 Gene 123 biolink:Gene "Node of type Gene, CURIE:123"
CURIE:456 Disease 456 biolink:Disease "Node of type Disease, CURIE:456"
CURIE:123 Gene 123 biolink:NamedThing|biolink:Gene "Node of type Gene, CURIE:123"
CURIE:456 Disease 456 biolink:NamedThing|biolink:Disease "Node of type Disease, CURIE:456"
CURIE:000 NamedThing 000 biolink:NamedThing "Node of type NamedThing, CURIE:000"
12 changes: 8 additions & 4 deletions tests/resources/valid.json
@@ -15,10 +15,14 @@
]
},
{
"id": "MONDO:0005002",
"name": "chronic obstructive pulmonary disease",
"category": [
"biolink:Disease"
"id": "PUBCHEM.COMPOUND:10429502",
"name": "16|A-Methyl Prednisolone",
"category": [
"biolink:ChemicalEntity",
"biolink:NamedThing"
],
"equivalent_identifiers": [
"CHEMBL.COMPOUND:CHEMBL1940557"
]
},
{
Binary file modified tests/resources/valid.json.gz
1 change: 1 addition & 0 deletions tests/resources/valid_nodes.jsonl
@@ -4,3 +4,4 @@
{"id": "MONDO:0013329", "name": "familial clubfoot due to 17q23.1q23.2 microduplication", "category": ["biolink:Disease"]}
{"id": "MONDO:0017148", "name": "heritable pulmonary arterial hypertension", "category": ["biolink:Disease"]}
{"id": "MONDO:0007841", "name": "coxopodopatellar syndrome", "category": ["biolink:Disease"]}
{"id": "PUBCHEM.COMPOUND:10429502", "name": "16|A-Methyl Prednisolone", "category": ["biolink:ChemicalEntity", "biolink:NamedThing"]}