list delimiters (issue 345) #370

Merged

33 changes: 5 additions & 28 deletions kgx/sink/tsv_sink.py
@@ -8,8 +8,7 @@
extension_types,
archive_write_mode,
archive_format,
remove_null,
_sanitize_export,
build_export_row
)


@@ -23,6 +22,7 @@
"category",
"provided_by",
}
DEFAULT_LIST_DELIMITER = "|"


class TsvSink(Sink):
@@ -60,6 +60,7 @@ def __init__(
if compression in archive_write_mode
else None
)
self.list_delimiter = kwargs["list_delimiter"] if "list_delimiter" in kwargs else DEFAULT_LIST_DELIMITER
self.nodes_file_basename = f"{self.basename}_nodes.{self.extension}"
self.edges_file_basename = f"{self.basename}_edges.{self.extension}"
if self.dirname:
@@ -96,7 +97,7 @@ def write_node(self, record: Dict) -> None:
A node record

"""
row = self._build_export_row(record)
row = build_export_row(record, list_delimiter=self.list_delimiter)
row["id"] = record["id"]
values = []
for c in self.ordered_node_columns:
@@ -116,7 +117,7 @@ def write_edge(self, record: Dict) -> None:
An edge record

"""
row = self._build_export_row(record)
row = build_export_row(record, list_delimiter=self.list_delimiter)
values = []
for c in self.ordered_edge_columns:
if c in row:
@@ -144,30 +145,6 @@ def finalize(self) -> None:
if os.path.isfile(self.edges_file_name):
os.remove(self.edges_file_name)

@staticmethod
def _build_export_row(data: Dict) -> Dict:
"""
Casts all values to primitive types like str or bool according to the
specified type in ``_column_types``. Lists become pipe delimited strings.

Parameters
----------
data: Dict
A dictionary containing key-value pairs

Returns
-------
Dict
A dictionary containing processed key-value pairs

"""
tidy_data = {}
for key, value in data.items():
new_value = remove_null(value)
if new_value:
tidy_data[key] = _sanitize_export(key, new_value)
return tidy_data

@staticmethod
def _order_node_columns(cols: Set) -> OrderedSet:
"""
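A minimal end-to-end sketch (not from this PR) of how the new sink option could be exercised. It assumes extra keys in Transformer's output_args are forwarded to the sink constructor as keyword arguments, which is how list_delimiter would reach TsvSink; the file names are made up.

from kgx.transformer import Transformer

t = Transformer()
t.transform(
    input_args={"filename": ["graph_nodes.tsv", "graph_edges.tsv"], "format": "tsv"},
    output_args={
        "filename": "export",
        "format": "tsv",
        # hypothetical override; TsvSink falls back to DEFAULT_LIST_DELIMITER ("|") when omitted
        "list_delimiter": ";",
    },
)
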
1 change: 1 addition & 0 deletions kgx/source/json_source.py
@@ -16,6 +16,7 @@ class JsonSource(TsvSource):
def __init__(self):
super().__init__()
self.compression = None
self.list_delimiter = None

def parse(
self,
1 change: 1 addition & 0 deletions kgx/source/jsonl_source.py
@@ -18,6 +18,7 @@ class JsonlSource(JsonSource):

def __init__(self):
super().__init__()
self.list_delimiter = None

def parse(
self,
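The JSON-based sources set list_delimiter to None because JSON already carries lists as native arrays, so strings that happen to contain the delimiter character must not be split on import. A small sketch against the shared helper changed below, using a record from the new test data:

from kgx.utils.kgx_utils import sanitize_import

record = {"id": "PUBCHEM.COMPOUND:10429502", "name": "16|A-Methyl Prednisolone"}
clean = sanitize_import(record.copy(), list_delimiter=None)
print(clean["name"])  # "16|A-Methyl Prednisolone" -- the literal "|" in the name is preserved
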
9 changes: 7 additions & 2 deletions kgx/source/tsv_source.py
@@ -17,6 +17,8 @@

log = get_logger()

DEFAULT_LIST_DELIMITER = "|"


class TsvSource(Source):
"""
@@ -26,6 +28,7 @@ class TsvSource(Source):

def __init__(self):
super().__init__()
self.list_delimiter = DEFAULT_LIST_DELIMITER

def set_prefix_map(self, m: Dict) -> None:
"""
@@ -85,6 +88,8 @@ def parse(
# set '\n' to be the default line terminator to prevent
# truncation of lines due to hidden/escaped carriage returns
kwargs["lineterminator"] = "\n" # type: ignore
if "list_delimeter" in kwargs:
self.list_delimiter = kwargs["list_delimiter"]

mode = (
archive_read_mode[compression] if compression in archive_read_mode else None
@@ -219,7 +224,7 @@ def read_node(self, node: Dict) -> Optional[Tuple[str, Dict]]:
A tuple that contains node id and node data
"""
node = validate_node(node)
node_data = sanitize_import(node.copy())
node_data = sanitize_import(node.copy(), list_delimiter=self.list_delimiter)
if "id" in node_data:

n = node_data["id"]
Expand Down Expand Up @@ -267,7 +272,7 @@ def read_edge(self, edge: Dict) -> Optional[Tuple]:

"""
edge = validate_edge(edge)
edge_data = sanitize_import(edge.copy())
edge_data = sanitize_import(edge.copy(), list_delimiter=self.list_delimiter)
if "id" not in edge_data:
edge_data["id"] = generate_uuid()
s = edge_data["subject"]
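On the read side, parse() now accepts a list_delimiter keyword and hands it to sanitize_import, so multi-valued columns are split on that character instead of a hard-coded "|". A sketch, assuming TsvSource takes no constructor arguments at this version and that filename and format are the first parameters of parse(); the input file is hypothetical:

from kgx.source.tsv_source import TsvSource

source = TsvSource()
for record in source.parse(
    filename="semicolon_nodes.tsv",  # hypothetical node file whose category column uses ";"
    format="tsv",
    list_delimiter=";",
):
    print(record)
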
101 changes: 60 additions & 41 deletions kgx/utils/kgx_utils.py
@@ -33,8 +33,6 @@
CORE_NODE_PROPERTIES = {"id", "name"}
CORE_EDGE_PROPERTIES = {"id", "subject", "predicate", "object", "type"}

LIST_DELIMITER = "|"


class GraphEntityType(Enum):
GRAPH = "graph"
@@ -843,14 +841,17 @@ def generate_edge_identifiers(graph: BaseGraph):
data["id"] = generate_uuid()


def sanitize_import(data: Dict) -> Dict:
def sanitize_import(data: Dict, list_delimiter: str=None) -> Dict:
"""
Sanitize key-value pairs in dictionary.
This should be used to ensure proper syntax and types for node and edge data as it is imported.

Parameters
----------
data: Dict
A dictionary containing key-value pairs
list_delimiter: str
Optionally provide a delimiter character or string to be used to split strings into lists.

Returns
-------
@@ -862,14 +863,19 @@ def sanitize_import(data: Dict) -> Dict:
for key, value in data.items():
new_value = remove_null(value)
if new_value is not None:
tidy_data[key] = _sanitize_import(key, new_value)
tidy_data[key] = _sanitize_import_property(key, new_value, list_delimiter)
return tidy_data


def _sanitize_import(key: str, value: Any) -> Any:
def _sanitize_import_property(key: str, value: Any, list_delimiter: str) -> Any:
"""
Sanitize value for a key for the purpose of import.

Casts all values to primitive types like str or bool according to the
specified type in ``column_types``.

If a list_delimiter is provided, strings will be split into lists using the delimiter.

Parameters
----------
key: str
@@ -883,7 +889,6 @@ def _sanitize_import(key: str, value: Any) -> Any:
Sanitized value

"""
new_value: Any
if key in column_types:
if column_types[key] == list:
if isinstance(value, (list, set, tuple)):
@@ -894,17 +899,20 @@ def _sanitize_import(key: str, value: Any) -> Any:
new_value = list(value)
elif isinstance(value, str):
value = value.replace("\n", " ").replace("\t", " ")
new_value = [x for x in value.split(LIST_DELIMITER) if x]
new_value = [x for x in value.split(list_delimiter) if x] if list_delimiter else value
else:
new_value = [str(value).replace("\n", " ").replace("\t", " ")]
elif column_types[key] == bool:
try:
new_value = bool(value)
except:
new_value = False
# the rest of this if/else block doesn't seem right:
# it's not checking the type against the expected type even though one exists
elif isinstance(value, (str, float)):
new_value = value
else:
# we might want to raise an exception or somehow indicate a type mismatch in the input data
new_value = str(value).replace("\n", " ").replace("\t", " ")
else:
if isinstance(value, (list, set, tuple)):
@@ -914,9 +922,9 @@ def _sanitize_import(key: str, value: Any) -> Any:
]
new_value = list(value)
elif isinstance(value, str):
if LIST_DELIMITER in value:
if list_delimiter and list_delimiter in value:
value = value.replace("\n", " ").replace("\t", " ")
new_value = [x for x in value.split(LIST_DELIMITER) if x]
new_value = [x for x in value.split(list_delimiter) if x]
else:
new_value = value.replace("\n", " ").replace("\t", " ")
elif isinstance(value, bool):
@@ -931,24 +939,56 @@ def _sanitize_import(key: str, value: Any) -> Any:
return new_value


def _sanitize_export(key: str, value: Any) -> Any:
def build_export_row(data: Dict, list_delimiter: str=None) -> Dict:
"""
Sanitize key-value pairs in dictionary.
This should be used to ensure proper syntax and types for node and edge data as it is exported.

Parameters
----------
data: Dict
A dictionary containing key-value pairs
list_delimiter: str
Optionally provide a delimiter character or string to be used to convert lists into strings.

Returns
-------
Dict
A dictionary containing processed key-value pairs

"""
tidy_data = {}
for key, value in data.items():
new_value = remove_null(value)
if new_value:
tidy_data[key] = _sanitize_export_property(key, new_value, list_delimiter)
return tidy_data


def _sanitize_export_property(key: str, value: Any, list_delimiter: str=None) -> Any:
"""
Sanitize value for a key for the purpose of export.

Casts all values to primitive types like str or bool according to the
specified type in ``column_types``.

If a list_delimiter is provided, lists will be converted into strings using the delimiter.

Parameters
----------
key: str
Key corresponding to a node/edge property
value: Any
Value corresponding to the key
list_delimiter: str
Optionally provide a delimiter character or string to be used to convert lists into strings.

Returns
-------
value: Any
Sanitized value

"""
new_value: Any
if key in column_types:
if column_types[key] == list:
if isinstance(value, (list, set, tuple)):
@@ -958,7 +998,7 @@ def _sanitize_export(key: str, value: Any) -> Any:
else v
for v in value
]
new_value = LIST_DELIMITER.join([str(x) for x in value])
new_value = list_delimiter.join([str(x) for x in value]) if list_delimiter else value
else:
new_value = (
str(value).replace("\n", " ").replace('\\"', "").replace("\t", " ")
@@ -974,15 +1014,18 @@ def _sanitize_export(key: str, value: Any) -> Any:
)
else:
if type(value) == list:
new_value = LIST_DELIMITER.join([str(x) for x in value])
new_value = (
new_value.replace("\n", " ").replace('\\"', "").replace("\t", " ")
)
value = [
v.replace("\n", " ").replace('\\"', "").replace("\t", " ")
if isinstance(v, str)
else v
for v in value
]
new_value = list_delimiter.join([str(x) for x in value]) if list_delimiter else value
column_types[key] = list
elif type(value) == bool:
try:
new_value = bool(value)
column_types[key] = bool
column_types[key] = bool # this doesn't seem right, shouldn't column_types come from the biolink model?
except:
new_value = False
else:
@@ -992,30 +1035,6 @@ def _sanitize_export(key: str, value: Any) -> Any:
return new_value


def _build_export_row(data: Dict) -> Dict:
"""
Casts all values to primitive types like str or bool according to the
specified type in ``_column_types``. Lists become pipe delimited strings.

Parameters
----------
data: Dict
A dictionary containing key-value pairs

Returns
-------
Dict
A dictionary containing processed key-value pairs

"""
tidy_data = {}
for key, value in data.items():
new_value = remove_null(value)
if new_value:
tidy_data[key] = _sanitize_export(key, new_value)
return tidy_data


def remove_null(input: Any) -> Any:
"""
Remove any null values from input.
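The two shared helpers now make the delimiter explicit at both ends. A round-trip sketch, assuming "category" is typed as a list in KGX's column_types (the record content is made up):

from kgx.utils.kgx_utils import build_export_row, sanitize_import

node = {"id": "CURIE:123", "category": "biolink:NamedThing|biolink:Gene"}

# import: a delimiter splits multi-valued strings into lists
imported = sanitize_import(node.copy(), list_delimiter="|")
# imported["category"] == ["biolink:NamedThing", "biolink:Gene"]

# export: a delimiter joins lists back into a single string
row = build_export_row(imported, list_delimiter="|")
# row["category"] == "biolink:NamedThing|biolink:Gene"
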
4 changes: 2 additions & 2 deletions tests/resources/test_nodes.csv
@@ -1,4 +1,4 @@
id,category,name,description,provided_by
CURIE:123,biolink:Gene,Gene 123,"Node of type Gene, CURIE:123",test_nodes.tsv
CURIE:456,biolink:Disease,Disease 456,"Node of type Disease, CURIE:456",test_nodes.tsv
CURIE:123,biolink:NamedThing|biolink:Gene,Gene 123,"Node of type Gene, CURIE:123",test_nodes.tsv
CURIE:456,biolink:NamedThing|biolink:Disease,Disease 456,"Node of type Disease, CURIE:456",test_nodes.tsv
CURIE:000,biolink:NamedThing,NamedThing 000,"Node of type NamedThing, CURIE:000",test_nodes.tsv
4 changes: 2 additions & 2 deletions tests/resources/test_nodes.tsv
@@ -1,4 +1,4 @@
id name category description
CURIE:123 Gene 123 biolink:Gene "Node of type Gene, CURIE:123"
CURIE:456 Disease 456 biolink:Disease "Node of type Disease, CURIE:456"
CURIE:123 Gene 123 biolink:NamedThing|biolink:Gene "Node of type Gene, CURIE:123"
CURIE:456 Disease 456 biolink:NamedThing|biolink:Disease "Node of type Disease, CURIE:456"
CURIE:000 NamedThing 000 biolink:NamedThing "Node of type NamedThing, CURIE:000"
12 changes: 8 additions & 4 deletions tests/resources/valid.json
@@ -15,10 +15,14 @@
]
},
{
"id": "MONDO:0005002",
"name": "chronic obstructive pulmonary disease",
"category": [
"biolink:Disease"
"id": "PUBCHEM.COMPOUND:10429502",
"name": "16|A-Methyl Prednisolone",
"category": [
"biolink:ChemicalEntity",
"biolink:NamedThing"
],
"equivalent_identifiers": [
"CHEMBL.COMPOUND:CHEMBL1940557"
]
},
{
Binary file modified tests/resources/valid.json.gz
1 change: 1 addition & 0 deletions tests/resources/valid_nodes.jsonl
@@ -4,3 +4,4 @@
{"id": "MONDO:0013329", "name": "familial clubfoot due to 17q23.1q23.2 microduplication", "category": ["biolink:Disease"]}
{"id": "MONDO:0017148", "name": "heritable pulmonary arterial hypertension", "category": ["biolink:Disease"]}
{"id": "MONDO:0007841", "name": "coxopodopatellar syndrome", "category": ["biolink:Disease"]}
{"id": "PUBCHEM.COMPOUND:10429502", "name": "16|A-Methyl Prednisolone", "category": ["biolink:ChemicalEntity", "biolink:NamedThing"]}