add filter_sample_categories and allow slice in __getitem__ (#49)
* update moving picture example

* update the simple demo example

* delete old files and reorganize

* add amphibian example for manuscript

* add examples for manuscript

* remove manuscript data set from this repo

* add filter_sample_categories and empower Experiment.__getitem__ to
take slice as input

* update notebooks

* fix a typo causing travis failure

* oh pep8
RNAer authored and amnona committed Sep 15, 2017
1 parent b33d09c commit e072bda
Showing 29 changed files with 9,421 additions and 47,019 deletions.
9 changes: 5 additions & 4 deletions calour/analysis.py
@@ -65,13 +65,14 @@ def correlation(exp, field, method='spearman', nonzero=False, transform=None, nu
the desired FDR control level
numperm : int
number of permutations to perform
fdr_method : str
the method used to compute the FDR. Allowed methods include "", ""
Returns
-------
newexp : calour.Experiment
The experiment with only significant (FDR<=maxfval) correlated features, sorted according to correlation size
The experiment with only significant (FDR<=maxfval) correlated features, sorted according to correlation coefficient
'''
# remove features not present in both groups
cexp = exp.filter_min_abundance(0, strict=True)

data = cexp.get_data(copy=True, sparse=False).transpose()
@@ -112,7 +113,7 @@ def diff_abundance(exp, field, val1, val2=None, method='meandiff', transform='ra
The field to test by
val1: str or list of str
The values for the first group.
val1: str or list of str or None (optional)
val2: str or list of str or None (optional)
None (default) to compare to all other samples (not in val1)
method : str or function
the method to use for the t-statistic test. options:
@@ -198,7 +199,7 @@ def diff_abundance_kw(exp, field, transform='rankdata', numperm=1000, alpha=0.1,
labels[exp.sample_metadata[field].values == clabel] = idx
logger.debug('Found %d unique sample labels' % (idx+1))
keep, odif, pvals = dsfdr.dsfdr(data, labels, method='kruwallis', transform_type=transform, alpha=alpha, numperm=numperm, fdr_method=fdr_method)
print(keep)

logger.info('Found %d significant features' % (np.sum(keep)))
return _new_experiment_from_pvals(cexp, exp, keep, odif, pvals)

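As a point of reference, a minimal usage sketch of the diff_abundance call whose docstring is corrected above; the experiment object exp, the 'group' metadata column, and its values '1' and '2' are hypothetical placeholders, not part of this commit:

from calour.analysis import diff_abundance

# compare feature abundances between samples with group == '1' and group == '2';
# only features passing the FDR test are kept in the returned Experiment
newexp = diff_abundance(exp, 'group', '1', '2', method='meandiff')
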
33 changes: 24 additions & 9 deletions calour/experiment.py
@@ -165,16 +165,29 @@ def __getitem__(self, pos):
The abundance of feature ID in sample ID
'''
if not isinstance(pos, tuple) or len(pos) != 2:
raise ValueError('Must supply sample ID, feature ID')
raise SyntaxError('Must supply sample ID, feature ID')

sample = pos[0]
feature = pos[1]
if sample not in self.sample_metadata.index:
raise ValueError('SampleID %s not in experiment samples' % sample)
if feature not in self.feature_metadata.index:
raise ValueError('FeatureID %s not in experiment features' % feature)
sample_pos = self.sample_metadata.index.get_loc(sample)
feature_pos = self.feature_metadata.index.get_loc(feature)
return self.data[sample_pos, feature_pos]
if isinstance(sample, slice):
sample_pos = sample
else:
try:
sample_pos = self.sample_metadata.index.get_loc(sample)
except KeyError:
raise KeyError('SampleID %s not in experiment samples' % sample)
if isinstance(feature, slice):
feature_pos = feature
else:
try:
feature_pos = self.feature_metadata.index.get_loc(feature)
except KeyError:
raise KeyError('FeatureID %s not in experiment features' % feature)
if self.sparse:
dat = self.get_data(sparse=False)
else:
dat = self.get_data()
return dat[sample_pos, feature_pos]

def copy(self):
'''Copy the object.
@@ -220,12 +233,14 @@ def inner(*args, **kwargs):
exp = args[0]
log = exp._log
try:
logger.debug('Run func {}'.format(fn))
new_exp = func(*args, **kwargs)
if exp._log is True:
param = ['%r' % i for i in args[1:]] + ['%s=%r' % (k, v) for k, v in kwargs.items()]
param = ', '.join(param)
new_exp._call_history.append('{0}({1})'.format(fn, param))
exp._log = False
logger.debug('Current object: {}'.format(new_exp))
finally:
# set log status back
exp._log = log
@@ -275,7 +290,7 @@ def get_data(self, sparse=None, copy=False):

@property
def shape(self):
return self.get_data().shape
return self.data.shape

def reorder(self, new_order, axis=0, inplace=False):
'''Reorder according to indices in the new order.
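A minimal sketch of the extended Experiment.__getitem__ shown above, which now accepts a slice in either position; 'S1' and the shortened feature ID are hypothetical placeholders, and the values come back as dense numpy data per the conversion in the diff:

# a single abundance value, as before
val = exp['S1', 'TACGTAGGGTGCAAGCGTT']
# all feature abundances of sample 'S1' (a 1-D numpy array)
row = exp['S1', :]
# abundances of one feature across all samples
col = exp[:, 'TACGTAGGGTGCAAGCGTT']
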
44 changes: 40 additions & 4 deletions calour/filtering.py
Expand Up @@ -16,6 +16,7 @@
filter_mean
filter_prevalence
filter_min_abundance
filter_sample_categories
'''

# ----------------------------------------------------------------------------
@@ -96,6 +97,44 @@ def downsample(exp, field, axis=0, num_keep=None, inplace=False):
return exp.reorder(np.concatenate(indices), axis=axis, inplace=inplace)


@Experiment._record_sig
def filter_sample_categories(exp, field, min_samples=5, inplace=False):
'''Filter sample categories that have too few samples.
This is useful to get rid of categories with few samples for
supervised classification training. It also drops the samples
that don't have any value in the field.
Examples
--------
Parameters
----------
field : str
The name of the column in the sample metadata table. This column
should have categorical values.
min_samples : int (optional)
Drop the samples whose value in the given column is shared by fewer
than min_samples samples.
inplace : bool (optional)
False (default) to create a copy of the experiment, True to filter inplace
Returns
-------
``Experiment``
the filtered experiment, keeping only samples whose category in the given field has at least min_samples samples
'''
exp = exp.reorder(exp.sample_metadata[field].notnull(), inplace=inplace)
unique, counts = np.unique(exp.sample_metadata[field].values, return_counts=True)
drop_values = [i for i, j in zip(unique, counts) if j < min_samples]
if drop_values:
logger.debug('Drop samples with {0} values in column {1}'.format(drop_values, field))
return exp.filter_samples(field, drop_values, negate=True, inplace=inplace)
else:
return exp


@Experiment._record_sig
def filter_by_metadata(exp, field, select, axis=0, negate=False, inplace=False):
'''Filter samples or features by metadata.
@@ -120,8 +159,6 @@ def filter_by_metadata(exp, field, select, axis=0, negate=False, inplace=False):
``Experiment``
the filtered object
'''
logger.debug('filter_by_metadata')

if axis == 0:
x = exp.sample_metadata
elif axis == 1:
@@ -324,7 +361,7 @@ def _unique_cut(x, unique=0.05):


def _freq_ratio(x, ratio=2):
'''the ratio of the most common value to the second most common value
'''the ratio of the counts of the most common value to the second most common value
Return True if the ratio is not greater than "ratio".
@@ -425,7 +462,6 @@ def filter_ids(exp, ids, axis=1, negate=False, inplace=False):
``Experiment``
filtered so contains only features/samples present in exp and in ids
'''
logger.debug('filter_ids')
okpos = []
tot_ids = 0
if axis == 0:
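A minimal usage sketch of the new filter_sample_categories; it is called as an Experiment method here, matching the test added below, while exp and the 'host_species' column are hypothetical placeholders:

# drop samples that have no value in 'host_species', then drop samples whose
# 'host_species' value is shared by fewer than 10 samples
filtered = exp.filter_sample_categories('host_species', min_samples=10)
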
16 changes: 13 additions & 3 deletions calour/tests/test_experiment.py
@@ -218,13 +218,23 @@ def test_from_pandas_round_trip(self):
def test_getitem(self):
self.assertEqual(self.test1['S5', 'TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTTTGTAAGTCTGATGTGAAATCCCCGGGCTCAACCTGGGAATTGCATTGGAGACTGCAAGGCTAGAATCTGGCAGAGGGGGGTAGAATTCCACG'], 5)
self.assertEqual(self.test1['S4', 'TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGGGTGCGTAGGCGGCCTGTTAAGTAAGTGGTTAAATTGTTGGGCTCAACCCAATCCGGCCACTTAAACTGGCAGGCTAGAGTATTGGAGAGGCAAGTGGAATTCCATGT'], 0)
with self.assertRaises(ValueError):
with self.assertRaises(KeyError):
self.test1['Pita', 'TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGGGTGCGTAGGCGGCCTGTTAAGTAAGTGGTTAAATTGTTGGGCTCAACCCAATCCGGCCACTTAAACTGGCAGGCTAGAGTATTGGAGAGGCAAGTGGAATTCCATGT']
with self.assertRaises(ValueError):
with self.assertRaises(KeyError):
self.test1['S5', 'Pita']
with self.assertRaises(ValueError):
with self.assertRaises(SyntaxError):
self.test1['S5']

def test_shape(self):
self.assertEqual(self.test1.shape, (21, 12))

def test_getitem_slice(self):
# 1st sample
npt.assert_array_equal(self.test1['S1', :], self.test1.data.toarray()[0, :])
# 2nd feature
npt.assert_array_equal(self.test1[:, 'TACATAGGTCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGTTCGTAGGCTGTTTATTAAGTCTGGAGTCAAATCCCAGGGCTCAACCCTGGCTCGCTTTGGATACTGGTAAACTAGAGTTAGATAGAGGTAAGCAGAATTCCATGT'],
self.test1.data.toarray()[:, 1])


if __name__ == "__main__":
main()
8 changes: 8 additions & 0 deletions calour/tests/test_filtering.py
Expand Up @@ -211,6 +211,14 @@ def test_filter_ids_samples_inplace_negate(self):
self.assertCountEqual(list(exp.sample_metadata.index.values), oksamples)
self.assertIs(exp, self.test1)

def test_filter_sample_categories(self):
test = self.test1.filter_ids(['badsample'], axis=0, negate=True)
# does not filter anything
assert_experiment_equal(test.filter_sample_categories('group', 9), test)
# filter group of 2
assert_experiment_equal(test.filter_sample_categories('group', 10),
test.filter_samples('group', '1'))


if __name__ == '__main__':
main()
