Skip to content

Commit

Permalink
remove redacted policies
Browse files Browse the repository at this point in the history
  • Loading branch information
Zsailer committed Jul 28, 2022
1 parent 5339800 commit b90ce6e
Show file tree
Hide file tree
Showing 18 changed files with 10 additions and 439 deletions.
14 changes: 3 additions & 11 deletions jupyter_events/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from datetime import datetime

from pythonjsonlogger import jsonlogger
from traitlets import Instance, List, default
from traitlets import Instance, default
from traitlets.config import Config, Configurable

from . import EVENTS_METADATA_VERSION
Expand All @@ -33,14 +33,6 @@ class EventLogger(Configurable):
""",
).tag(config=True)

redacted_policies = List(
default_value=None,
allow_none=True,
help="""A list of the redactionPolicies that will be redacted
from emitted events.
""",
).tag(config=True)

schemas = Instance(
SchemaRegistry,
help="""The SchemaRegistry for caching validated schemas
Expand All @@ -50,7 +42,7 @@ class EventLogger(Configurable):

@default("schemas")
def _default_schemas(self) -> SchemaRegistry:
return SchemaRegistry(redacted_policies=self.redacted_policies)
return SchemaRegistry()

def __init__(self, *args, **kwargs):
# We need to initialize the configurable before
Expand Down Expand Up @@ -164,7 +156,7 @@ def emit(self, id: str, version: int, event: dict, timestamp_override=None):
"__metadata_version__": EVENTS_METADATA_VERSION,
}
# Process this event, i.e. validate and redact (in place)
self.schemas.process_event(id, version, event)
self.schemas.validate_event(id, version, event)
capsule.update(event)
self.log.info(capsule)
return capsule
120 changes: 0 additions & 120 deletions jupyter_events/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,65 +6,6 @@
from .validators import validate_schema


def _pop_nested_redacted_fields(
schema_data: dict, policy_location: Sequence[Hashable]
) -> Any:
"""Pop a item nested anywhere in a dwictionary using the
list of (hashable) keys to locate the item.
"""
# Begin walking the sequence of keys to the policy
# location given.
nested_data = schema_data
for i, el in enumerate(policy_location[:-1]):
# Handle arrays of objects.
if el == "__array__":
for j, _ in enumerate(nested_data):
branch = policy_location[i + 1 :]
_pop_nested_redacted_fields(nested_data[j], branch)
return
# Try moving into nested child schema.
try:
nested_data = nested_data[el]
except KeyError:
return
# If we made it this far, we ended on a policy that needs to be popped.
return nested_data.pop(policy_location[-1])


def _find_redaction_policies(schema: dict) -> Dict[str, list]:
"""A recursive function that iterates an event schema
and returns a mapping of redaction policies to
(nested) properties (identified by a sequence of keys).
"""
redaction_policies: Dict[str, List[str]] = {}

def _extract_policies(subschema, key_sequence=()):
props = subschema["properties"]
for key, obj in props.items():
updated_key_sequence = key_sequence + (key,)

def _nested_extract_policies(obj, updated_key_sequence):
if isinstance(obj, dict):
if "properties" in obj:
_extract_policies(obj, updated_key_sequence)
if "items" in obj and "properties" in obj["items"]:
_nested_extract_policies(
obj["items"], updated_key_sequence + ("__array__",)
)

_nested_extract_policies(obj, updated_key_sequence)

# Update the list in place.
for policy in obj["redactionPolicies"]:
policies_list = redaction_policies.get(policy, [])
policies_list.append(updated_key_sequence)
redaction_policies[policy] = policies_list

# Start the recursion
_extract_policies(schema)
return redaction_policies


class EventSchema:
"""A validated schema that can be used.
Expand All @@ -85,48 +26,20 @@ class EventSchema:
resolver:
RefResolver for nested JSON schema references.
allowed_policies: set
A set of redaction policies allowed by this event schema.
Each property in the schema must have a `redactionPolicy`
annotation representing the level of sensitivity of the
data collected by this event. In order for that data
to be emitted by Jupyter Events, the matching redaction
policy must be listed here.
"""

def __init__(
self,
schema,
validator_class=validators.Draft7Validator,
resolver=None,
redacted_policies: Union[str, list, None] = None,
):
# Validate the schema against Jupyter Events metaschema.
validate_schema(schema)
# Build a mapping of all property redaction policies.
self._redaction_policies_locations = _find_redaction_policies(schema)
self._redacted_policies = self._validate_redacted_policies(redacted_policies)
# Create a validator for this schema
self._validator = validator_class(schema, resolver=resolver)
self._schema = schema

def _validate_redacted_policies(
self, redacted_policies: Union[None, List, str, set]
) -> set:
if redacted_policies is None:
return set()
value_type = type(redacted_policies)
if value_type == str and redacted_policies == "all":
return set(self.redaction_policies_locations.keys())
if value_type == list or value_type == set:
return set(redacted_policies)
raise TypeError(
"redacted_policies must be the literal string, 'all', or a list of "
"redaction polices"
)

@property
def id(self) -> str:
"""Schema $id field."""
Expand All @@ -141,53 +54,20 @@ def version(self) -> int:
def registry_key(self) -> Tuple[str, int]:
return (self.id, self.version)

@property
def redacted_policies(self) -> set:
"""The redaction policies that will be redacted when an
incoming event is processed.
"""
return self._redacted_policies

@classmethod
def from_file(
cls,
filepath,
validator_class=validators.Draft7Validator,
resolver=None,
redacted_policies=None,
):
schema = yaml.load(filepath)
return cls(
schema=schema,
validator_class=validator_class,
resolver=resolver,
redacted_policies=redacted_policies,
)

@property
def redaction_policies_locations(self) -> Dict[str, List[str]]:
"""Mapping of the redaction policies in this schema to
the (nested) properties where they are defined.
"""
return self._redaction_policies_locations

def validate(self, data: dict) -> None:
"""Validate an incoming instance of this event schema."""
self._validator.validate(data)

def enforce_redaction_policies(self, data: dict) -> None:
"""Redact fields from the event data (in place)."""
# # Find all policies not explicitly allowed.
# named_policies = set(self.redaction_policies_locations.keys())
# redacted_policies = named_policies - self.unredacted_policies
for policy_type in self.redacted_policies:
policy_locations = self._redaction_policies_locations[policy_type]
for item in policy_locations:
_pop_nested_redacted_fields(data, item)

def process(self, data: dict) -> None:
"""Validate event data and enforce any redaction policies (in place).
Nothing is returned by this method, because the data is redacted in place.
"""
self.validate(data)
self.enforce_redaction_policies(data)
23 changes: 3 additions & 20 deletions jupyter_events/schema_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,8 @@ class SchemaRegistryException(Exception):
class SchemaRegistry:
"""A convenient API for storing and searching a group of schemas."""

def __init__(self, schemas: dict = None, redacted_policies: list = None):
def __init__(self, schemas: dict = None):
self._schemas = schemas or {}
self._redacted_policies = redacted_policies

@property
def redacted_policies(self) -> Optional[List[Any]]:
"""A list of policies that will be redacted from
all events validated against this registry.
"""
return self._redacted_policies

def __contains__(self, registry_key: Tuple[str, int]):
"""Syntax sugar to check if a schema is found in the registry"""
Expand All @@ -40,14 +32,12 @@ def register(self, data: dict):
All schemas are validated against the Jupyter Events meta-schema
found here:
"""
schema = EventSchema(data, redacted_policies=self.redacted_policies)
schema = EventSchema(data)
self._add(schema)

def register_from_file(self, schema_filepath: str):
"""Register a schema from a file."""
schema = EventSchema.from_file(
schema_filepath, redacted_policies=self.redacted_policies
)
schema = EventSchema.from_file(schema_filepath)
self._add(schema)

def get(self, id: str, version: int) -> EventSchema:
Expand Down Expand Up @@ -82,10 +72,3 @@ def validate_event(self, id: str, version: int, data: dict) -> None:
"""
schema = self.get(id, version)
schema.validate(data)

def process_event(self, id: str, version: int, data: dict) -> None:
"""Validate an event and enforce any redaction policies (in place).
Nothing is returned by this method, because the data is redacted in place.
"""
schema = self.get(id, version)
schema.process(data)
5 changes: 0 additions & 5 deletions jupyter_events/schemas/event-metaschema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,6 @@ properties:
type: string
description:
type: string
redactionPolicies:
type: array
items:
type: string
properties:
type: object
additionalProperties:
Expand All @@ -27,5 +23,4 @@ properties:
required:
- $id
- version
- redactionPolicies
- properties
5 changes: 0 additions & 5 deletions jupyter_events/schemas/property-metaschema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ properties:
type: string
description:
type: string
redactionPolicies:
type: array
items:
type: string
properties:
type: object
additionalProperties:
Expand All @@ -27,7 +23,6 @@ properties:

required:
- title
- redactionPolicies

additionalProperties:
$ref: http://event.jupyter.org/property-metaschema
Expand Down
34 changes: 0 additions & 34 deletions jupyter_events/traits.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,37 +42,3 @@ def validate(self, obj, value):
return value
else:
self.error(obj, value)


class SchemaOptions(TraitType):
    """Trait holding per-schema recording options.

    Accepts either a dict mapping schema names to option dicts, or a list
    of schema names (kept for backwards compatibility), which is
    normalized to a dict with empty options for each name.
    """

    info_text = "either a dictionary with schema options or a list with schema names."

    def validate(self, obj, val):
        """Check ``val`` and return it in dictionary form.

        Raises ``TraitError`` when ``val`` is neither a dict nor a list, or
        when any schema's options contain a key other than
        ``allowed_categories`` / ``allowed_properties``.
        """
        # List form (backwards compatibility): every name gets empty options.
        if type(val) is list:
            return {schema_name: {} for schema_name in val}

        if type(val) is not dict:
            raise TraitError("SchemaOptions must be of type dict or list.")

        # Dict form: reject any option key that is not explicitly permitted.
        permitted = {"allowed_categories", "allowed_properties"}
        for schema_name, options in val.items():
            extras = set(options.keys()).difference(permitted)
            if extras:
                raise TraitError(
                    "The schema option, {schema_name}, includes "
                    "unknown key(s): {unknown_keys}".format(
                        schema_name=schema_name, unknown_keys=",".join(extras)
                    )
                )
        return val
17 changes: 0 additions & 17 deletions tests/schemas/bad/missing-parent-policies.yaml

This file was deleted.

30 changes: 0 additions & 30 deletions tests/schemas/bad/missing-policy-array.yaml

This file was deleted.

Loading

0 comments on commit b90ce6e

Please sign in to comment.