Skip to content

Commit

Permalink
Improve logging when skipping values or entities in mappings (#1425)
Browse files Browse the repository at this point in the history
* Return the (cleaned) value added to a proxy

This is a small change that allows callers of the `unsafe_add` method to determine if a value has been removed as part of value cleaning. This is helpful to be able to log/warn when values are removed.

* Log warning when discarding unclean values

Right now, mappings silently discard unclean values. This makes it difficult to debug mapping issues during development as well as to monitor mappings running in production.

* Log warning when entities are skipped during mappings
  • Loading branch information
tillprochaska committed May 8, 2024
1 parent c37c07b commit e3fb599
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 21 deletions.
16 changes: 14 additions & 2 deletions followthemoney/mapping/entity.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from hashlib import sha1
from warnings import warn
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
Expand All @@ -15,6 +16,8 @@
from followthemoney.model import Model
from followthemoney.mapping.query import QueryMapping

log = logging.getLogger(__name__)


class EntityMapping(object):

Expand Down Expand Up @@ -112,16 +115,24 @@ def map(
# from that accessible to phone and address parsers.
for prop in self.properties:
if prop.prop.type == registry.country:
prop.map(proxy, record, entities)
discarded_values = prop.map(proxy, record, entities)
for value in discarded_values:
log.warn(f"[{self.name}] Discarded unclean value \"{value}\" for property \"{prop.prop.qname}\".")

for prop in self.properties:
if prop.prop.type != registry.country:
prop.map(proxy, record, entities)
discarded_values = prop.map(proxy, record, entities)
for value in discarded_values:
log.warn(f"[{self.name}] Discarding unclean value \"{value}\" for property \"{prop.prop.qname}\".")

# Generate the ID at the end to avoid self-reference checks on empty
# keys.
proxy.id = self.compute_key(record)
if proxy.id is None:
if self.id_column:
log.warn(f"[{self.name}] Skipping entity because no ID could be computed. Make sure that there are no empty values in the \"{self.id_column}\" column.")
if self.keys:
log.warn(f"[{self.name}] Skipping entity because no ID could be computed. Make sure that there are no empty values in key columns.")
return None

for prop in self.properties:
Expand All @@ -130,6 +141,7 @@ def map(
# the mapping, not in the model. Basically it means: if
# this row of source data doesn't have that field, then do
# not map it again.
log.warn(f"[{self.name}] Skipping entity because required property \"{prop.prop.name}\" is empty.")
return None
return proxy

Expand Down
18 changes: 15 additions & 3 deletions followthemoney/mapping/property.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,13 +112,13 @@ def record_values(self, record: Record) -> List[str]:

def map(
self, proxy: EntityProxy, record: Record, entities: Dict[str, EntityProxy]
) -> None:
) -> List[str]:
if self.entity is not None:
entity = entities.get(self.entity)
if entity is not None:
proxy.unsafe_add(self.prop, entity.id, cleaned=True)
inline_names(proxy, entity)
return None
return []

# clean the values returned by the query, or by using literals, or
# formats.
Expand All @@ -133,5 +133,17 @@ def map(
splote.extend(value.split(self.split))
values = splote

discarded_values: List[str] = []

for value in values:
proxy.unsafe_add(self.prop, value, fuzzy=self.fuzzy, format=self.format)
added_value = proxy.unsafe_add(
prop=self.prop,
value=value,
fuzzy=self.fuzzy,
format=self.format,
)

if value is not None and added_value is None:
discarded_values.append(value)

return discarded_values
37 changes: 21 additions & 16 deletions followthemoney/proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,27 +206,32 @@ def unsafe_add(
cleaned: bool = False,
fuzzy: bool = False,
format: Optional[str] = None,
) -> None:
) -> Optional[str]:
"""A version of `add()` to be used only in type-checking code. This accepts
only a single value, and performs input cleaning on the premise that the
value is already valid unicode."""
value is already valid unicode. Returns the value that has been added."""
if not cleaned and value is not None:
format = format or prop.format
value = prop.type.clean_text(value, fuzzy=fuzzy, format=format, proxy=self)
if value is not None:
# Somewhat hacky: limit the maximum size of any particular
# field to avoid overloading upstream aleph/elasticsearch.
value_size = len(value)
if prop.type.max_size is not None:
if self._size + value_size > prop.type.max_size:
# msg = "[%s] too large. Rejecting additional values."
# log.warning(msg, prop.name)
return None
self._size += value_size
self._properties.setdefault(prop.name, list())
if value not in self._properties[prop.name]:
self._properties[prop.name].append(value)
return None

if value is None:
return None

# Somewhat hacky: limit the maximum size of any particular
# field to avoid overloading upstream aleph/elasticsearch.
value_size = len(value)
if prop.type.max_size is not None:
if self._size + value_size > prop.type.max_size:
# msg = "[%s] too large. Rejecting additional values."
# log.warning(msg, prop.name)
return None
self._size += value_size
self._properties.setdefault(prop.name, list())

if value not in self._properties[prop.name]:
self._properties[prop.name].append(value)

return value

def set(
self,
Expand Down
19 changes: 19 additions & 0 deletions tests/test_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,25 @@ def test_base_functions(self):
with raises(InvalidData):
EntityProxy.from_dict(model, {})

def test_unsafe_add(self):
schema = model.schemata["Person"]
prop = schema.properties["phone"]

proxy = model.make_entity("Person")
value = proxy.unsafe_add(prop, "+12025557612")
assert value == "+12025557612"
assert proxy.get("phone") == ["+12025557612"]

proxy = model.make_entity("Person")
value = proxy.unsafe_add(prop, "+1 (202) 555-7612")
assert value == "+12025557612"
assert proxy.get("phone") == ["+12025557612"]

proxy = model.make_entity("Person")
value = proxy.unsafe_add(prop, "(202) 555-7612")
assert value == None
assert proxy.get("phone") == []

def test_pop(self):
proxy = EntityProxy.from_dict(model, ENTITY)
get_ids = proxy.get("idNumber")
Expand Down

0 comments on commit e3fb599

Please sign in to comment.