Skip to content

Commit

Permalink
Updated nanopub validation
Browse files Browse the repository at this point in the history
  • Loading branch information
wshayes committed Jun 24, 2018
1 parent 5587910 commit 25cd9d4
Show file tree
Hide file tree
Showing 13 changed files with 781 additions and 73 deletions.
36 changes: 36 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
cityhash = "*"
click = "*"
coverage = "*"
jsonschema = "*"
python-arango = "*"
python-dateutil = "*"
python-json-logger = "*"
structlog = "*"
timy = "*"
Jinja2 = "*"

[dev-packages]
mypy = "*"
pytest = "*"
pytest-cache = "*"
pytest-cov = "*"
pytest-flakes = "*"
pytest-mypy = "*"
pytest-pep8 = "*"
pytest-sugar = "*"
releases = "*"
Sphinx = "*"
sphinx-autobuild = "*"
sphinx-autodoc-typehints = "*"
sphinx-click = "*"
sphinx-rtd-theme = "*"
twine = "*"

[requires]
python_version = "3.6"
643 changes: 643 additions & 0 deletions Pipfile.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions bel/lang/belobj.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def parse(self, statement: str, strict: bool = False, parseinfo: bool = False, r

# Check to see if empty string for bel statement
if len(self.bel_stmt) == 0:
self.validation_messages.append(('ERROR', 'Please include a valid BEL statement.'))
self.validation_messages.append(('ERROR', 'Please include a valid BEL statement - found empty string.'))
return self

try:
Expand All @@ -139,7 +139,7 @@ def parse(self, statement: str, strict: bool = False, parseinfo: bool = False, r
# if an error is returned, send to handle_syntax, error
error, visualize_error = bel_utils.handle_parser_syntax_error(e)
self.parse_visualize_error = visualize_error
self.validation_messages.append(('ERROR', error + " BEL: " + self.original_bel_stmt))
self.validation_messages.append(('ERROR', f'{error} BEL: {self.original_bel_stmt}\n{visualize_error}'))
self.ast = None

except Exception as e:
Expand Down
6 changes: 3 additions & 3 deletions bel/lang/partialparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ def parse_relations(belstr: str, char_locs: CharLocs, parsed: Parsed, errors: Er

for match in relations_pattern_middle.finditer(belstr):
(start, end) = match.span(1)
log.debug(f'Relation-middle {match}')
# log.debug(f'Relation-middle {match}')
end = end - 1 # adjust end to match actual end character index
if start != end:
test_range = set(range(start, end))
Expand Down Expand Up @@ -513,7 +513,7 @@ def print_spans(spans, max_idx: int) -> None:
for i in range(span[0], span[1] + 1):
bel_spans[i] = val[0]

print(''.join(bel_spans))
# print(''.join(bel_spans))

# Add second layer for Nested Objects if available
bel_spans = [' '] * (max_idx + 3)
Expand All @@ -523,7 +523,7 @@ def print_spans(spans, max_idx: int) -> None:
for i in range(span[0], span[1] + 1):
bel_spans[i] = val[0]

print(''.join(bel_spans))
# print(''.join(bel_spans))


def parsed_function_to_ast(parsed: Parsed, parsed_key):
Expand Down
58 changes: 46 additions & 12 deletions bel/nanopub/nanopubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import bel.lang.belobj
import jsonschema
import requests
import mmh3
from cityhash import CityHash64

import bel.edge.edges
from bel.Config import config
Expand Down Expand Up @@ -144,38 +144,72 @@ def validate_to_schema(nanopub, schema) -> Tuple[bool, List[Tuple[str, str]]]:

# Following is used in nanopub-tools codebase
def hash_nanopub(nanopub: Mapping[str, Any]) -> str:
"""Create hash from nanopub for duplicate check"""
"""Create CityHash64 from nanopub for duplicate check
TODO - check that this hash value is consistent between C# and Python running on
laptop and server
Build string to hash
Collect flat array of (all values.strip()):
nanopub.type.name
nanopub.type.version
One of:
nanopub.citation.database.name
nanopub.citation.database.id
OR
nanopub.citation.database.uri
OR
nanopub.citation.database.reference
Extend with sorted list of assertions (SRO as single string with space between S, R and O)
Extend with sorted list of annotations (nanopub.annotations.type + ' ' + nanopub.annotations.id)
Convert array to string by joining array elements separated by a space
Create CityHash64(str) and return
"""

hash_list = []

# Type
hash_list.append(nanopub['nanopub']['type']['name'].strip())
hash_list.append(nanopub['nanopub']['type']['version'].strip())
hash_list.append(nanopub['nanopub']['type'].get('name', '').strip())
hash_list.append(nanopub['nanopub']['type'].get('version', '').strip())

# Citation
if nanopub['nanopub']['citation'].get('database', False):
hash_list.append(nanopub['nanopub']['citation']['database'].get('name').strip())
hash_list.append(nanopub['nanopub']['citation']['database'].get('id').strip())
hash_list.append(nanopub['nanopub']['citation']['database'].get('name', '').strip())
hash_list.append(nanopub['nanopub']['citation']['database'].get('id', '').strip())
elif nanopub['nanopub']['citation'].get('uri', False):
hash_list.append(nanopub['nanopub']['citation']['uri'].strip())
hash_list.append(nanopub['nanopub']['citation'].get('uri', '').strip())
elif nanopub['nanopub']['citation'].get('reference', False):
hash_list.append(nanopub['nanopub']['citation']['reference'].strip())
hash_list.append(nanopub['nanopub']['citation'].get('reference', '').strip())

# Assertions
assertions = []
for assertion in nanopub['nanopub']['assertions']:
print(assertion)
assertions.append(' '.join((assertion['subject'], assertion.get('relation', ''), assertion.get('object', ''))))
if assertion.get('relation') is None:
assertion['relation'] = ''
if assertion.get('object') is None:
assertion['object'] = ''
assertions.append(' '.join((assertion['subject'].strip(), assertion.get('relation', '').strip(), assertion.get('object', '').strip())).strip())
assertions = sorted(assertions)
hash_list.extend(assertions)

# Annotations
annotations = []

for anno in nanopub['nanopub']['annotations']:
annotations.append(' '.join((anno['type'], anno['id'])))
annotations.append(' '.join((anno.get('type', '').strip(), anno.get('id', '').strip())).strip())

annotations = sorted(annotations)
hash_list.extend(annotations)

return str(mmh3.hash128(' '.join(hash_list)))
return str(CityHash64(' '.join(hash_list)))
47 changes: 27 additions & 20 deletions bel/nanopub/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,34 +16,37 @@ def validate(nanopub: dict, error_level: str = 'WARNING') -> Tuple[str, str, str
Error Levels are similar to log levels - selecting WARNING includes both
WARNING and ERROR, selecting ERROR just includes ERROR
The validation result is a list of tuples containing what level of validation
issue this is, followed by what aspect of the nanopub "STRUCTURE" or
"ASSERTION" or "ANNOTATION", followed by the validation warning or error message.
The validation result is a list of objects containing
{
'level': 'Warning|Error',
'section': 'Assertion|Annotation|Structure',
'label': '{Error|Warning}-{Assertion|Annotation|Structure}', # to be used for faceting in Elasticsearch
'index': idx, # Index of Assertion or Annotation in Nanopub - starts at 0
'msg': msg, # Error or Warning message
}
Args:
nanopub: nanopub record starting with nanopub...
error_level: return WARNING or just ERROR? defaults to warnings and errors
Returns:
list(tuples): [("ERROR|WARNING", "STRUCTURE|ASSERTION|ANNOTATION", <msg>]
list(dict): [{'level': 'Warning', 'section': 'Assertion', 'label': 'Warning-Assertion', 'index': 0, 'msg': <msg>}]
"""

if 'nanopub' in nanopub:
nanopub = nanopub['nanopub']

# Validation results
v = {
'ERROR': {"STRUCTURE": [], 'ASSERTION': [], 'ANNOTATION': []},
'WARNING': {"STRUCTURE": [], 'ASSERTION': [], 'ANNOTATION': []},
}
v = []

bel_version = config['bel']['lang']['default_bel_version']

# Structural checks
try:
if not isinstance(nanopub['assertions'], list):
v['ERROR']['STRUCTURE'].append("Assertions must be a list/array")
v.append({'level': 'Error', 'section': 'Structure', 'label': 'Error-Structure', 'msg': "Assertions must be a list/array"})
except Exception as e:
v['ERROR']['STRUCTURE'].append('Missing nanopub["assertions"]')
v.append({'level': 'Error', 'section': 'Structure', 'label': 'Error-Structure', 'msg': 'Missing nanopub["assertions"]'})

try:
if 'name' in nanopub['type'] and 'version' in nanopub['type']:
Expand All @@ -52,36 +55,40 @@ def validate(nanopub: dict, error_level: str = 'WARNING') -> Tuple[str, str, str
bel_version = nanopub['type']['version']

except Exception as e:
v['ERROR']['STRUCTURE'].append('Missing or badly formed type - must have nanopub["type"] = {"name": <name>, "version": <version}')
v.append({'level': 'Error', 'section': 'Structure', 'label': 'Error-Structure', 'msg': 'Missing or badly formed type - must have nanopub["type"] = {"name": <name>, "version": <version}'})

try:
for key in ['uri', 'database', 'reference']:
if key in nanopub['citation']:
break
else:
v['ERROR']['STRUCTURE'].append('nanopub["citation"] must have either a uri, database or reference key.')
v.append({'level': 'Error', 'section': 'Structure', 'label': 'Error-Structure', 'msg': 'nanopub["citation"] must have either a uri, database or reference key.'})
except Exception as e:
v['ERROR']['STRUCTURE'].append('nanopub must have a "citation" key with either a uri, database or reference key.')
v.append({'level': 'Error', 'section': 'Structure', 'label': 'Error-Structure', 'msg': 'nanopub must have a "citation" key with either a uri, database or reference key.'})

# Assertion checks

if 'assertions' in nanopub:
for assertion in nanopub['assertions']:
for idx, assertion in enumerate(nanopub['assertions']):
bo = bel.lang.belobj.BEL(bel_version, config['bel_api']['servers']['api_url'])
belstr = f'{assertion.get("subject")} {assertion.get("relation", "")} {assertion.get("object", "")}'
belstr = belstr.replace('None', '')
try:
messages = bo.parse(belstr, error_level=error_level).validation_messages
for message in messages:
(level, msg) = message
v[level]['ASSERTION'].append(msg)
if error_level == 'ERROR':
if level == 'ERROR':
v.append({'level': f'{level.title()}', 'section': 'Assertion', 'label': f'{level.title()}-Assertion', 'index': idx, 'msg': msg})
else:
v.append({'level': f'{level.title()}', 'section': 'Assertion', 'label': f'{level.title()}-Assertion', 'index': idx, 'msg': msg})

except Exception as e:
v['ERROR']['ASSERTION'].append(f'Could not parse: {belstr}')
v.append({'level': 'Error', 'section': 'Assertion', 'label': 'Error-Assertion', 'index': idx, 'msg': f'Could not parse: {belstr}'})
log.exception(f'Could not parse: {belstr}')

# Annotation checks
if error_level == 'WARNING':
for annotation in nanopub.get('annotations', []):
for idx, annotation in enumerate(nanopub.get('annotations', [])):
term_type = annotation['type']
term_id = annotation['id']
# term_label = annotation['label']
Expand All @@ -96,8 +103,8 @@ def validate(nanopub: dict, error_level: str = 'WARNING') -> Tuple[str, str, str
if len(results['hits']['hits']) > 0:
result = results['hits']['hits'][0]['_source']
if term_type not in result['annotation_types']:
v['WARNING']['ANNOTATION'].append(f'Annotation type: {term_type} for {term_id} does not match annotation types in database: {result["annotation_types"]}')
v.append({'level': 'Warning', 'section': 'Annotation', 'index': idx, 'label': 'Warning-Annotation', 'msg': f'Annotation type: {term_type} for {term_id} does not match annotation types in database: {result["annotation_types"]}'})
else:
v['WARNING']['ANNOTATION'].append(f'Annotation term: {term_id} not found in database')
v.append({'level': 'Warning', 'section': 'Annotation', 'index': idx, 'label': 'Warning-Annotation', 'msg': f'Annotation term: {term_id} not found in database'})

return v
10 changes: 5 additions & 5 deletions bel/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import urllib
import ulid
import tempfile
import mmh3
from cityhash import CityHash64
import json
from typing import Mapping, Any
import datetime
Expand Down Expand Up @@ -119,16 +119,16 @@ def _create_hash_from_doc(doc: Mapping[str, Any]) -> str:


def _create_hash(string: str) -> str:
"""Create Murmur3 128 bit hash of string
"""Create CityHash64 bit hash of string
Args:
string (str): string to create Murmur3 128bit hash from
string (str): string to create CityHash64 from
Returns:
str: Murmur3 128 bit hash
str: CityHash64 hash value converted to a string
"""

return str(mmh3.hash128(string))
return str(CityHash64(string))


def _generate_id() -> str:
Expand Down
8 changes: 0 additions & 8 deletions requirements-docs.txt

This file was deleted.

20 changes: 0 additions & 20 deletions requirements.txt

This file was deleted.

4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,20 @@

# What packages are required for this module to be executed?
REQUIRED = [
'cityhash',
'click',
'elasticsearch',
'fastcache',
'CacheControl',
'jsonschema',
'mmh3',
'python-arango',
'python-dateutil',
'pyyaml',
'requests',
'TatSu',
'ulid-py',
'lxml',
'structlog',
'typing',
'ulid-py',
]
Expand Down
2 changes: 1 addition & 1 deletion tests/lang/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def test_empty_string():
bo.parse(statement)

assert bo.ast is None
assert 'Please include a valid BEL statement.' in bo.validation_messages[0][1]
assert 'Please include a valid BEL statement - found empty string.' in bo.validation_messages[0][1]


def test_bad_string_start():
Expand Down
14 changes: 14 additions & 0 deletions tests/lang/test_partialparse.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import bel.lang.partialparse
import pytest


def test_parse():
Expand All @@ -11,3 +12,16 @@ def test_parse():

assert ast.to_string() == 'sec(a(CHEBI:3-hydroxybutyrate))'


@pytest.mark.skip(reason="Not finished with this test")
def test_parse_bad_tloc():

# Mis-matched parenthesis and missing colon after first GOCC
belstr = 'tloc(HGNC:CTNNB1),GOCC"cytoplasm",GOCC:"nucleus")'

bel_version = '2.0.0'
ast = bel.lang.partialparse.get_ast_obj(belstr, bel_version)

print(ast.to_string())

assert False

0 comments on commit 25cd9d4

Please sign in to comment.