add filter_sample_categories and allow slice in __getitem__ (#49)
* update moving picture example

* update the simple demo example

* delete old files and reorganize

* add amphibian example for manuscript

* add examples for manuscript

* remove manuscript data set from this repo

* add filter_sample_categories and empower Experiment.__getitem__ to
take slice as input

* update notebooks

* fix a typo causing travis failure

* oh pep8
RNAer authored and amnona committed Sep 15, 2017
1 parent b33d09c commit e072bda
Showing 29 changed files with 9,421 additions and 47,019 deletions.
9 changes: 5 additions & 4 deletions calour/analysis.py
@@ -65,13 +65,14 @@ def correlation(exp, field, method='spearman', nonzero=False, transform=None, nu
the desired FDR control level
numperm : int
number of permutations to perform
fdr_method : str
the method used to compute the FDR. Allowed methods include "", ""
Returns
-------
newexp : calour.Experiment
The experiment with only significant (FDR<=maxfval) correlated features, sorted according to correlation size
The experiment with only significant (FDR<=maxfval) correlated features, sorted according to correlation coefficient
'''
# remove features not present in both groups
cexp = exp.filter_min_abundance(0, strict=True)

data = cexp.get_data(copy=True, sparse=False).transpose()
@@ -112,7 +113,7 @@ def diff_abundance(exp, field, val1, val2=None, method='meandiff', transform='ra
The field to test by
val1: str or list of str
The values for the first group.
val1: str or list of str or None (optional)
val2: str or list of str or None (optional)
None (default) to compare to all other samples (not in val1)
method : str or function
the method to use for the t-statistic test. options:
@@ -198,7 +199,7 @@ def diff_abundance_kw(exp, field, transform='rankdata', numperm=1000, alpha=0.1,
labels[exp.sample_metadata[field].values == clabel] = idx
logger.debug('Found %d unique sample labels' % (idx+1))
keep, odif, pvals = dsfdr.dsfdr(data, labels, method='kruwallis', transform_type=transform, alpha=alpha, numperm=numperm, fdr_method=fdr_method)
print(keep)

logger.info('Found %d significant features' % (np.sum(keep)))
return _new_experiment_from_pvals(cexp, exp, keep, odif, pvals)

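As a point of reference, a minimal usage sketch of the diff_abundance call whose docstring is corrected above; the experiment object exp, the 'group' metadata column, and its values '1' and '2' are hypothetical placeholders, not part of this commit:

from calour.analysis import diff_abundance

# compare feature abundances between samples with group == '1' and group == '2';
# only features passing the FDR test are kept in the returned Experiment
newexp = diff_abundance(exp, 'group', '1', '2', method='meandiff')
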
33 changes: 24 additions & 9 deletions calour/experiment.py
@@ -165,16 +165,29 @@ def __getitem__(self, pos):
The abundance of feature ID in sample ID
'''
if not isinstance(pos, tuple) or len(pos) != 2:
raise ValueError('Must supply sample ID, feature ID')
raise SyntaxError('Must supply sample ID, feature ID')

sample = pos[0]
feature = pos[1]
if sample not in self.sample_metadata.index:
raise ValueError('SampleID %s not in experiment samples' % sample)
if feature not in self.feature_metadata.index:
raise ValueError('FeatureID %s not in experiment features' % feature)
sample_pos = self.sample_metadata.index.get_loc(sample)
feature_pos = self.feature_metadata.index.get_loc(feature)
return self.data[sample_pos, feature_pos]
if isinstance(sample, slice):
sample_pos = sample
else:
try:
sample_pos = self.sample_metadata.index.get_loc(sample)
except KeyError:
raise KeyError('SampleID %s not in experiment samples' % sample)
if isinstance(feature, slice):
feature_pos = feature
else:
try:
feature_pos = self.feature_metadata.index.get_loc(feature)
except KeyError:
raise KeyError('FeatureID %s not in experiment features' % feature)
if self.sparse:
dat = self.get_data(sparse=False)
else:
dat = self.get_data()
return dat[sample_pos, feature_pos]

def copy(self):
'''Copy the object.
@@ -220,12 +233,14 @@ def inner(*args, **kwargs):
exp = args[0]
log = exp._log
try:
logger.debug('Run func {}'.format(fn))
new_exp = func(*args, **kwargs)
if exp._log is True:
param = ['%r' % i for i in args[1:]] + ['%s=%r' % (k, v) for k, v in kwargs.items()]
param = ', '.join(param)
new_exp._call_history.append('{0}({1})'.format(fn, param))
exp._log = False
logger.debug('Current object: {}'.format(new_exp))
finally:
# set log status back
exp._log = log
@@ -275,7 +290,7 @@ def get_data(self, sparse=None, copy=False):

@property
def shape(self):
return self.get_data().shape
return self.data.shape

def reorder(self, new_order, axis=0, inplace=False):
'''Reorder according to indices in the new order.
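A minimal sketch of the extended Experiment.__getitem__ shown above, which now accepts a slice in either position; 'S1' and the shortened feature ID are hypothetical placeholders, and the values come back as dense numpy data per the conversion in the diff:

# a single abundance value, as before
val = exp['S1', 'TACGTAGGGTGCAAGCGTT']
# all feature abundances of sample 'S1' (a 1-D numpy array)
row = exp['S1', :]
# abundances of one feature across all samples
col = exp[:, 'TACGTAGGGTGCAAGCGTT']
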
44 changes: 40 additions & 4 deletions calour/filtering.py
Expand Up @@ -16,6 +16,7 @@
filter_mean
filter_prevalence
filter_min_abundance
filter_sample_categories
'''

# ----------------------------------------------------------------------------
@@ -96,6 +97,44 @@ def downsample(exp, field, axis=0, num_keep=None, inplace=False):
return exp.reorder(np.concatenate(indices), axis=axis, inplace=inplace)


@Experiment._record_sig
def filter_sample_categories(exp, field, min_samples=5, inplace=False):
'''Filter sample categories that have too few samples.
This is useful to get rid of categories with few samples for
supervised classification training. It also drops the samples
that don't have any value in the field.
Examples
--------
Parameters
----------
field : str
The name of the column in the sample metadata table. This column
should have categorical values.
min_samples : int (optional)
Drop the samples whose value in the given column is shared by fewer
than min_samples samples.
inplace : bool (optional)
False (default) to create a copy of the experiment, True to filter inplace
Returns
-------
``Experiment``
the filtered experiment, keeping only samples whose category in the given field has at least min_samples samples
'''
exp = exp.reorder(exp.sample_metadata[field].notnull(), inplace=inplace)
unique, counts = np.unique(exp.sample_metadata[field].values, return_counts=True)
drop_values = [i for i, j in zip(unique, counts) if j < min_samples]
if drop_values:
logger.debug('Drop samples with {0} values in column {1}'.format(drop_values, field))
return exp.filter_samples(field, drop_values, negate=True, inplace=inplace)
else:
return exp


@Experiment._record_sig
def filter_by_metadata(exp, field, select, axis=0, negate=False, inplace=False):
'''Filter samples or features by metadata.
@@ -120,8 +159,6 @@ def filter_by_metadata(exp, field, select, axis=0, negate=False, inplace=False):
``Experiment``
the filtered object
'''
logger.debug('filter_by_metadata')

if axis == 0:
x = exp.sample_metadata
elif axis == 1:
@@ -324,7 +361,7 @@ def _unique_cut(x, unique=0.05):


def _freq_ratio(x, ratio=2):
'''the ratio of the most common value to the second most common value
'''the ratio of the counts of the most common value to the second most common value
Return True if the ratio is not greater than "ratio".
@@ -425,7 +462,6 @@ def filter_ids(exp, ids, axis=1, negate=False, inplace=False):
``Experiment``
filtered so contains only features/samples present in exp and in ids
'''
logger.debug('filter_ids')
okpos = []
tot_ids = 0
if axis == 0:
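A minimal usage sketch of the new filter_sample_categories; it is called as an Experiment method here, matching the test added below, while exp and the 'host_species' column are hypothetical placeholders:

# drop samples that have no value in 'host_species', then drop samples whose
# 'host_species' value is shared by fewer than 10 samples
filtered = exp.filter_sample_categories('host_species', min_samples=10)
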
16 changes: 13 additions & 3 deletions calour/tests/test_experiment.py
@@ -218,13 +218,23 @@ def test_from_pandas_round_trip(self):
def test_getitem(self):
self.assertEqual(self.test1['S5', 'TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTTTGTAAGTCTGATGTGAAATCCCCGGGCTCAACCTGGGAATTGCATTGGAGACTGCAAGGCTAGAATCTGGCAGAGGGGGGTAGAATTCCACG'], 5)
self.assertEqual(self.test1['S4', 'TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGGGTGCGTAGGCGGCCTGTTAAGTAAGTGGTTAAATTGTTGGGCTCAACCCAATCCGGCCACTTAAACTGGCAGGCTAGAGTATTGGAGAGGCAAGTGGAATTCCATGT'], 0)
with self.assertRaises(ValueError):
with self.assertRaises(KeyError):
self.test1['Pita', 'TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGGGTGCGTAGGCGGCCTGTTAAGTAAGTGGTTAAATTGTTGGGCTCAACCCAATCCGGCCACTTAAACTGGCAGGCTAGAGTATTGGAGAGGCAAGTGGAATTCCATGT']
with self.assertRaises(ValueError):
with self.assertRaises(KeyError):
self.test1['S5', 'Pita']
with self.assertRaises(ValueError):
with self.assertRaises(SyntaxError):
self.test1['S5']

def test_shape(self):
self.assertEqual(self.test1.shape, (21, 12))

def test_getitem_slice(self):
# 1st sample
npt.assert_array_equal(self.test1['S1', :], self.test1.data.toarray()[0, :])
# 2nd feature
npt.assert_array_equal(self.test1[:, 'TACATAGGTCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGTTCGTAGGCTGTTTATTAAGTCTGGAGTCAAATCCCAGGGCTCAACCCTGGCTCGCTTTGGATACTGGTAAACTAGAGTTAGATAGAGGTAAGCAGAATTCCATGT'],
self.test1.data.toarray()[:, 1])


if __name__ == "__main__":
main()
8 changes: 8 additions & 0 deletions calour/tests/test_filtering.py
Expand Up @@ -211,6 +211,14 @@ def test_filter_ids_samples_inplace_negate(self):
self.assertCountEqual(list(exp.sample_metadata.index.values), oksamples)
self.assertIs(exp, self.test1)

def test_filter_sample_categories(self):
test = self.test1.filter_ids(['badsample'], axis=0, negate=True)
# does not filter anything
assert_experiment_equal(test.filter_sample_categories('group', 9), test)
# filter group of 2
assert_experiment_equal(test.filter_sample_categories('group', 10),
test.filter_samples('group', '1'))


if __name__ == '__main__':
main()
