Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Travis CI configuration: build an isolated conda environment and run the
# unit test suite under Python 2.7.
language: python
python:
  - "2.7"
install:
  # Install Miniconda non-interactively into $HOME/miniconda and put its
  # bin directory first on PATH so 'conda' resolves below.
  - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
  - bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/miniconda
  - export PATH="$HOME/miniconda/bin:$PATH"

  # Channels are searched in reverse order of addition: bioconda first,
  # then conda-forge, then defaults. Keep conda non-interactive and quiet.
  - conda config --add channels conda-forge
  - conda config --add channels bioconda
  - conda config --set always_yes yes --set changeps1 no
  - conda config --set show_channel_urls True

  # Create and activate a dedicated test environment matching the Travis
  # Python version, then install the pinned conda requirements.
  - ENV_NAME='testing'
  - conda create --quiet -n $ENV_NAME python=$TRAVIS_PYTHON_VERSION
  - source activate $ENV_NAME
  - conda install --quiet --file conda-requirements.txt
  # Echo the environment contents for debuggability of CI failures.
  - conda list
  - conda info -a
  - python setup.py --quiet install

script:
  python -m unittest discover -s bald.tests -v
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# bald
# Binary Array Linked Data: bald

[![Build Status](https://api.travis-ci.org/repositories/binary-array-ld/bald.svg?branch=master)](http://travis-ci.org/binary-array-ld/bald/branches)
Python library for validating and managing binary array linked data files.

A Python library for validating and managing binary array linked data files.
5 changes: 5 additions & 0 deletions conda-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
numpy
h5py
netCDF4
requests
rdflib
44 changes: 36 additions & 8 deletions lib/bald/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import bald.validation as bv

__version__ = '0.2'

class HttpCache(object):
"""
Expand All @@ -15,9 +16,12 @@ class HttpCache(object):
def __init__(self):
self.cache = {}

def is_http_uri(self, item):
return item.startswith('http://') or item.startswith('https://')

def __getitem__(self, item):

if not item.startswith('http://') or item.startswith('https://'):
if not self.is_http_uri(item):
raise ValueError('{} is not a HTTP URI.'.format(item))
if item not in self.cache:
headers = {'Accept': 'text/turtle'}
Expand All @@ -33,23 +37,29 @@ def check_uri(self, uri):


class Subject(object):
def __init__(self, attrs=None):
def __init__(self, attrs=None, prefixes=None, aliases=None):
"""
A subject of metadata statements.

attrs: an dictionary of key value pair attributes
"""
if attrs is None:
attrs = []
attrs = {}
if prefixes is None:
prefixes = {}
if aliases is None:
aliases = {}
self.attrs = attrs
self.aliases = aliases
self._prefixes = prefixes
self._prefix_suffix = re.compile('(^(?:(?!__).)*)__((?!.*__).*$)')
_http_p = 'http[s]?://.*'
self._http_uri = re.compile('{}'.format(_http_p))
self._http_uri_prefix = re.compile('{}/|#'.format(_http_p))

def prefixes(self):
prefixes = {}
for key, value in self.attrs.iteritems():
for key, value in self._prefixes.iteritems():
if key.endswith('__') and self._http_uri_prefix.match(value):
pref = key.rstrip('__')
if pref in prefixes:
Expand All @@ -66,6 +76,8 @@ def unpack_uri(self, astring):
if self._http_uri.match(self.prefixes()[prefix]):
result = astring.replace('{}__'.format(prefix),
self.prefixes()[prefix])
elif astring in self.aliases:
result = self.aliases[astring]
return result


Expand Down Expand Up @@ -105,17 +117,25 @@ def validate_netcdf(afilepath):

with load(afilepath) as fhandle:
sval = bv.StoredValidation()
prefix_group = fhandle[fhandle.bald__isPrefixedBy] if hasattr(fhandle, 'bald__isPrefixedBy') else {}
prefixes = {}
if prefix_group:
prefixes = dict([(prefix, getattr(prefix_group, prefix)) for prefix in prefix_group.ncattrs()])
else:
for k in fhandle.ncattrs():
if k.endswith('__'):
prefixes[k] = getattr(fhandle, k)
attrs = {}
for k in fhandle.ncattrs():
attrs[k] = getattr(fhandle, k)
root_container = Subject(attrs)
root_container = Subject(attrs, prefixes=prefixes)
root_val = bv.ContainerValidation(subject=root_container,
fhandle=fhandle)
sval.stored_exceptions += root_val.exceptions()
for name in fhandle.variables:
sattrs = fhandle.__dict__.copy()
sattrs.update(fhandle.variables[name].__dict__.copy())
var = Subject(sattrs)
var = Subject(sattrs, prefixes=prefixes)
var_val = bv.ArrayValidation(name, fhandle.variables[name], fhandle=fhandle,
subject=var)
sval.stored_exceptions += var_val.exceptions()
Expand All @@ -132,7 +152,15 @@ def validate_hdf5(afilepath):
with load(afilepath) as fhandle:
sval = bv.StoredValidation()
cache = {}
root_container = Subject(fhandle.attrs)
prefix_group = fhandle.attrs.get('bald__isPrefixedBy')
prefixes = {}
if prefix_group:
prefixes = fhandle[prefix_group].attrs
alias_group = fhandle.attrs.get('bald__isAliasedBy')
aliases = {}
if alias_group:
aliases = dict(fhandle[alias_group].attrs.iteritems())
root_container = Subject(fhandle.attrs, prefixes=prefixes, aliases=aliases)
root_val = bv.ContainerValidation(subject=root_container,
fhandle=fhandle)
sval.stored_exceptions += root_val.exceptions()
Expand All @@ -144,7 +172,7 @@ def validate_hdf5(afilepath):
# #
sattrs = dict(fhandle.attrs).copy()
sattrs.update(dataset.attrs)
dset = Subject(sattrs)
dset = Subject(sattrs, prefixes, aliases)
dset_val = bv.ArrayValidation(name, dataset, fhandle=fhandle,
subject=dset)
sval.stored_exceptions += dset_val.exceptions()
Expand Down
59 changes: 59 additions & 0 deletions lib/bald/tests/integration/test_aliases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import unittest

import h5py
import numpy as np

import bald
from bald.tests import BaldTestCase

def _fattrs(f):
    """Attach the standard bald/rdf prefix machinery to an open h5py file.

    Marks the root as a bald__Container, creates a 'bald__prefix_list'
    group whose attributes map prefix names to namespace URIs, and points
    the root 'bald__isPrefixedBy' attribute at that group via an HDF5
    object reference.  Returns the same file handle for chaining.
    """
    f.attrs['rdf__type'] = 'bald__Container'
    group_pref = f.create_group('bald__prefix_list')
    group_pref.attrs['bald__'] = 'http://binary-array-ld.net/latest/'
    group_pref.attrs['rdf__'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
    # Object reference to the prefix group, not a string path.
    f.attrs['bald__isPrefixedBy'] = group_pref.ref
    return f

def _create_parent_child(f, pshape, cshape):
    """Create a parent/child dataset pair wired with bald reference attrs.

    f: an open, writable h5py file.
    pshape, cshape: shape tuples for the parent and child datasets.
    Returns the same file handle for chaining.

    NOTE(review): this helper is not used by the tests in this module;
    it mirrors the fixture used elsewhere in the suite.
    """
    dsetp = f.create_dataset("parent_dataset", pshape, dtype='i')
    dsetc = f.create_dataset("child_dataset", cshape, dtype='i')
    dsetp.attrs['rdf__type'] = 'bald__Array'
    dsetp.attrs['bald__references'] = dsetc.ref
    # The original assigned 'bald__Array' here and immediately overwrote it
    # with 'bald__Reference'; only the final value ever persisted, so the
    # dead first assignment is dropped.
    dsetc.attrs['rdf__type'] = 'bald__Reference'
    # NOTE(review): bald__array points the child at itself; presumably it
    # should reference the array this Reference describes — confirm intent.
    dsetc.attrs['bald__array'] = dsetc.ref
    return f


class Test(BaldTestCase):
    """Integration tests for alias resolution during HDF5 validation."""

    def test_valid_uri(self):
        # The alias group maps 'skosPrefLabel' to a real SKOS URI, so a
        # dataset attribute using that alias should validate cleanly.
        with self.temp_filename('.hdf') as tfile:
            f = h5py.File(tfile, "w")
            f = _fattrs(f)
            group_alias = f.create_group('bald__alias_list')
            # bald__isAliasedBy holds an object reference to the alias group.
            f.attrs['bald__isAliasedBy'] = group_alias.ref
            group_alias.attrs['skosPrefLabel'] = 'http://www.w3.org/2004/02/skos/core#prefLabel'
            dsetp = f.create_dataset("parent_dataset", (11, 17), dtype='i')
            dsetp.attrs['skosPrefLabel'] = 'alabel'
            f.close()
            validation = bald.validate_hdf5(tfile)
            self.assertTrue(validation.is_valid())

    def test_invalid_uri(self):
        # The alias resolves to a non-existent SKOS term (and 'bald__turtle'
        # carries a bogus value), so validation must report failure.
        with self.temp_filename('.hdf') as tfile:
            f = h5py.File(tfile, "w")
            f = _fattrs(f)
            f.attrs['bald__turtle'] = 'bald__walnut'
            group_alias = f.create_group('bald__alias_list')
            f.attrs['bald__isAliasedBy'] = group_alias.ref
            group_alias.attrs['skosPrefLabel'] = 'http://www.w3.org/2004/02/skos/core#notThisPrefLabel'
            dsetp = f.create_dataset("parent_dataset", (11, 17), dtype='i')
            dsetp.attrs['skosPrefLabel'] = 'alabel'
            f.close()
            validation = bald.validate_hdf5(tfile)
            self.assertFalse(validation.is_valid())


# Allow running this test module directly, outside a discovery runner.
if __name__ == '__main__':
    unittest.main()
6 changes: 4 additions & 2 deletions lib/bald/tests/integration/test_netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
from bald.tests import BaldTestCase

def _fattrs(f):
f.bald__ = 'http://binary-array-ld.net/latest/'
f.rdf__ = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
f.rdf__type = 'bald__Container'
group_pref = f.createGroup('bald__prefix_list')
group_pref.bald__ = 'http://binary-array-ld.net/latest/'
group_pref.rdf__ = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
f.bald__isPrefixedBy = 'bald__prefix_list'
return f

def _create_parent_child(f, pshape, cshape):
Expand Down
78 changes: 78 additions & 0 deletions lib/bald/tests/integration/test_netcdf4_classic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import unittest

import h5py
import netCDF4
import numpy as np

import bald
from bald.tests import BaldTestCase

def _fattrs(f):
    """Stamp the root of an open netCDF4 dataset with the standard
    bald/rdf prefix attributes and mark it as a bald__Container.

    Returns the same dataset handle for chaining.
    """
    root_attributes = [
        ('bald__', 'http://binary-array-ld.net/latest/'),
        ('rdf__', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'),
        ('rdf__type', 'bald__Container'),
    ]
    # Assign in a fixed order so attribute creation order matches the
    # original straight-line assignments.
    for attr_name, attr_value in root_attributes:
        setattr(f, attr_name, attr_value)
    return f

def _create_parent_child(f, pshape, cshape):
    """Create parent/child variables linked by bald reference attributes.

    f: an open, writable netCDF4 Dataset.
    pshape, cshape: dimension-size tuples for the parent and child
    variables; dimensions are created as pdim<i> / cdim<i>.
    Returns the same dataset handle for chaining.
    """
    for i, pdimsize in enumerate(pshape):
        f.createDimension("pdim{}".format(i), pdimsize)
    for i, cdimsize in enumerate(cshape):
        f.createDimension("cdim{}".format(i), cdimsize)
    # str(i) was redundant inside format(); {} stringifies the int itself.
    varp = f.createVariable(
        "parent_variable", 'i4',
        tuple("pdim{}".format(i) for i, _ in enumerate(pshape)))
    varc = f.createVariable(
        "child_variable", 'i4',
        tuple("cdim{}".format(i) for i, _ in enumerate(cshape)))
    varp.rdf__type = 'bald__Array'
    varp.bald__references = "child_variable"
    # The original set rdf__type to 'bald__Array' and immediately overwrote
    # it with 'bald__Reference'; only the final value ever persisted, so the
    # dead first assignment is dropped.
    varc.rdf__type = 'bald__Reference'
    # NOTE(review): bald__array names the child variable itself; presumably
    # it should name the array this Reference describes — confirm intent.
    varc.bald__array = "child_variable"
    return f


class Test(BaldTestCase):
    """Validation of root attributes on NETCDF4_CLASSIC format files."""

    def test_valid_uri(self):
        # A file carrying only well-formed prefix attributes validates.
        with self.temp_filename('.nc') as tfile:
            f = netCDF4.Dataset(tfile, "w", format="NETCDF4_CLASSIC")

            f = _fattrs(f)
            f.close()
            validation = bald.validate_netcdf(tfile)
            self.assertTrue(validation.is_valid())

    def test_invalid_uri(self):
        # 'bald__walnut' does not resolve to a URI, so validation fails.
        with self.temp_filename('.nc') as tfile:
            f = netCDF4.Dataset(tfile, "w", format="NETCDF4_CLASSIC")

            f = _fattrs(f)
            setattr(f, 'bald__turtle', 'bald__walnut')
            f.close()
            validation = bald.validate_netcdf(tfile)
            self.assertFalse(validation.is_valid())


class TestArrayReference(BaldTestCase):
    """Shape agreement checks between referencing parent/child arrays."""

    def test_match(self):
        # Identical parent and child shapes: the reference is consistent.
        with self.temp_filename('.nc') as tfile:
            f = netCDF4.Dataset(tfile, "w", format="NETCDF4_CLASSIC")
            f = _fattrs(f)
            f = _create_parent_child(f, (11, 17), (11, 17))
            f.close()
            validation = bald.validate_netcdf(tfile)
            self.assertTrue(validation.is_valid())

    def test_mismatch_zeroth(self):
        # Shapes differ in the last dimension (17 vs 13), so the reference
        # is inconsistent and validation must fail.
        with self.temp_filename('.nc') as tfile:
            f = netCDF4.Dataset(tfile, "w", format="NETCDF4_CLASSIC")
            f = _fattrs(f)
            f = _create_parent_child(f, (11, 17), (11, 13))
            f.close()
            validation = bald.validate_netcdf(tfile)
            self.assertFalse(validation.is_valid())


# Allow running this test module directly, outside a discovery runner.
if __name__ == '__main__':
    unittest.main()




6 changes: 4 additions & 2 deletions lib/bald/tests/integration/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
from bald.tests import BaldTestCase

def _fattrs(f):
f.attrs['bald__'] = 'http://binary-array-ld.net/latest/'
f.attrs['rdf__'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
f.attrs['rdf__type'] = 'bald__Container'
group_pref = f.create_group('bald__prefix_list')
group_pref.attrs['bald__'] = 'http://binary-array-ld.net/latest/'
group_pref.attrs['rdf__'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
f.attrs['bald__isPrefixedBy'] = group_pref.ref
return f

def _create_parent_child(f, pshape, cshape):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@
import numpy as np

from bald.tests import BaldTestCase
from bald import validation
import bald

class TestHttpCache(unittest.TestCase):
def setUp(self):
self.cache = bald.HttpCache()

class Test(unittest.TestCase):
def test_check_uri_200(self):
auri = 'http://binary-array-ld.net/experimental'
self.assertTrue(validation.check_uri(auri))
self.assertTrue(self.cache.check_uri(auri))

def test_check_uri_404(self):
notauri = 'http://binary-array-ld.net/experimentalish'
self.assertFalse(validation.check_uri(notauri))
self.assertFalse(self.cache.check_uri(notauri))


if __name__ == '__main__':
Expand Down
14 changes: 11 additions & 3 deletions lib/bald/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,16 @@ def _check_uri(uri, exceptions):
for pref, uri in self.subject.prefixes().iteritems():
exceptions = _check_uri(self.subject.unpack_uri(uri),
exceptions)
for alias, uri in self.subject.aliases.iteritems():
exceptions = _check_uri(self.subject.unpack_uri(uri),
exceptions)
for attr, value in self.subject.attrs.iteritems():
exceptions = _check_uri(self.subject.unpack_uri(attr),
exceptions)
if isinstance(value, str):
exceptions = _check_uri(self.subject.unpack_uri(value),
exceptions)
val = self.subject.unpack_uri(value)
if self.cache.is_http_uri(val):
exceptions = _check_uri(val, exceptions)
return exceptions

def check_attr_domain_range(self, exceptions):
Expand All @@ -106,7 +110,11 @@ def check_attr_domain_range(self, exceptions):
# thus we have a payload
# go rdf
g = rdflib.Graph()
g.parse(data=self.cache[uri].text, format="n3")
data=self.cache[uri].text
try:
g.parse(data=self.cache[uri].text, format="n3")
except Exception:
g.parse(data=self.cache[uri].text, format="xml")
query = ('SELECT ?s \n'
'(GROUP_CONCAT(?domain; SEPARATOR=" | ") AS ?domains)'
' \n'
Expand Down
Loading