Skip to content

Commit

Permalink
Preserve str type, set fixed width on load, and set fixed width on upd…
Browse files Browse the repository at this point in the history
…ate_ids (#872)

* MAINT: preserve str type, and set type on update_ids

* Fixed width on load
  • Loading branch information
wasade committed Mar 25, 2022
1 parent eadab73 commit 3876827
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 19 deletions.
3 changes: 2 additions & 1 deletion biom/_filter.pyx
Expand Up @@ -140,7 +140,8 @@ def _filter(arr, ids, metadata, index, ids_to_keep, axis, invert):
_remove_rows_csr(arr, bools)
arr = arr.T # Back to CSC

ids = np.asarray(list(compress(ids, bools)), dtype=object)
ids_dtype = ids.dtype
ids = np.asarray(list(compress(ids, bools)), dtype=ids_dtype)
metadata = tuple(compress(metadata, bools))

if metadata_is_None:
Expand Down
20 changes: 13 additions & 7 deletions biom/table.py
Expand Up @@ -467,9 +467,8 @@ def __init__(self, data, observation_ids, sample_ids,

self._data = self._data.astype(float)

# using object to allow for variable length strings
self._sample_ids = np.asarray(sample_ids, dtype=object)
self._observation_ids = np.asarray(observation_ids, dtype=object)
self._sample_ids = np.asarray(sample_ids)
self._observation_ids = np.asarray(observation_ids)

if sample_metadata is not None:
# not m will evaluate True if the object tested is None or
Expand Down Expand Up @@ -1398,7 +1397,8 @@ def update_ids(self, id_map, axis='sample', strict=True, inplace=True):
>>> print(updated_table.ids(axis='sample'))
['s1.1' 's2.2' 's3.3']
"""
updated_ids = zeros(self.ids(axis=axis).size, dtype=object)
str_dtype = 'U%d' % max([len(v) for v in id_map.values()])
updated_ids = zeros(self.ids(axis=axis).size, dtype=str_dtype)
for idx, old_id in enumerate(self.ids(axis=axis)):
if strict and old_id not in id_map:
raise TableException(
Expand Down Expand Up @@ -2340,7 +2340,6 @@ def filter(self, ids_to_keep, axis='sample', invert=False, inplace=True):
ids = table.ids(axis=axis)
index = self._index(axis=axis)
axis = table._axis_to_num(axis=axis)

arr = table._data
arr, ids, metadata = _filter(arr,
ids,
Expand Down Expand Up @@ -4071,6 +4070,12 @@ def from_hdf5(cls, h5grp, ids=None, axis='sample', parse_fs=None,
shape = (len(to_keep), len(samp_ids))
mat = csr_matrix((data, indices, indptr), shape=shape)

# use a fixed width dtype
obs_ids_dtype = 'U%d' % max([len(v) for v in obs_ids])
samp_ids_dtype = 'U%d' % max([len(v) for v in samp_ids])
obs_ids = np.asarray(obs_ids, dtype=obs_ids_dtype)
samp_ids = np.asarray(samp_ids, dtype=samp_ids_dtype)

return Table(mat, obs_ids, samp_ids)

id_ = h5grp.attrs['id']
Expand All @@ -4091,8 +4096,9 @@ def axis_load(grp):
# fetch all of the IDs
ids = grp['ids'][:]

if ids.size > 0 and isinstance(ids[0], bytes):
ids = np.array([i.decode('utf8') for i in ids])
if ids.size > 0:
ids_dtype = 'U%d' % max([len(v) for v in ids])
ids = np.asarray(ids, dtype=ids_dtype)

parser = defaultdict(lambda: general_parser)
parser['taxonomy'] = vlen_list_of_str_parser
Expand Down
20 changes: 10 additions & 10 deletions biom/tests/test_table.py
Expand Up @@ -779,10 +779,10 @@ def test_from_hdf5_sample_subset_no_metadata(self):
subset_with_metadata=False)
os.chdir(cwd)

npt.assert_equal(t.ids(), [b'Sample2', b'Sample4', b'Sample6'])
npt.assert_equal(t.ids(), ['Sample2', 'Sample4', 'Sample6'])
npt.assert_equal(t.ids(axis='observation'),
[b'GG_OTU_1', b'GG_OTU_2', b'GG_OTU_3', b'GG_OTU_4',
b'GG_OTU_5'])
['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4',
'GG_OTU_5'])
exp_obs_md = None
self.assertEqual(t._observation_metadata, exp_obs_md)
exp_samp_md = None
Expand Down Expand Up @@ -881,10 +881,10 @@ def test_from_hdf5_observation_subset_no_metadata(self):
subset_with_metadata=False)
os.chdir(cwd)

npt.assert_equal(t.ids(), [b'Sample1', b'Sample2', b'Sample3',
b'Sample4', b'Sample5', b'Sample6'])
npt.assert_equal(t.ids(), ['Sample1', 'Sample2', 'Sample3',
'Sample4', 'Sample5', 'Sample6'])
npt.assert_equal(t.ids(axis='observation'),
[b'GG_OTU_1', b'GG_OTU_3', b'GG_OTU_5'])
['GG_OTU_1', 'GG_OTU_3', 'GG_OTU_5'])
exp_obs_md = None
self.assertEqual(t._observation_metadata, exp_obs_md)

Expand Down Expand Up @@ -2348,8 +2348,8 @@ def test_update_ids(self):
"""ids are updated as expected"""
# update observation ids
exp = self.st1.copy()
exp._observation_ids = np.array(['41', '42'])
id_map = {'2': '42', '1': '41'}
exp._observation_ids = np.array(['41', '42long'])
id_map = {'2': '42long', '1': '41'}
obs = self.st1.update_ids(id_map, axis='observation', inplace=False)
self.assertEqual(obs, exp)

Expand Down Expand Up @@ -2932,10 +2932,10 @@ def test_copy_metadata(self):

def test_copy_ids(self):
copied_table = self.st_rich.copy()
self.st_rich._sample_ids[0] = 'a different id'
self.st_rich._sample_ids[0] = 'X'
self.assertNotEqual(copied_table, self.st_rich)
copied_table = self.st_rich.copy()
self.st_rich._observation_ids[0] = 'a different id'
self.st_rich._observation_ids[0] = 'X'
self.assertNotEqual(copied_table, self.st_rich)

def test_copy_data(self):
Expand Down
2 changes: 1 addition & 1 deletion pytest.ini
@@ -1,4 +1,4 @@
[pytest]
addopts = --ignore=biom/assets/exercise_api.py --cov=biom
addopts = --ignore=biom/assets/exercise_api.py
doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL
testpaths = biom

0 comments on commit 3876827

Please sign in to comment.