Update reporting to diff abundance and filtering (#279)

* update changelog and remove old parameter doc * add checks for no samples with the requested value, and info logging, in the various differential abundance functions * report the filtered axis in filter_by_data() * bugfix in reporting missing samples in _read_metadata()
biocore · Jul 28, 2022 · 8e6b16c · 8e6b16c
1 parent 0e2e5f9
commit 8e6b16c
Show file tree

Hide file tree

Showing 5 changed files with 24 additions and 13 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@
 ## Version 2022.7.1
 Incompatible changes:
 * Change default join_metadata_fields() inplace parameter to False
+* In plot_diff_abundance_enrichment() and plot_enrichment(), replaced the enriched_exp_color parameter with labels_kwargs and numbers_kwargs to enable better control of the barplot labels
 
 Bug Fixes:
 * Fix join_metadata_fields() to use axis='s' by default

diff --git a/calour/analysis.py b/calour/analysis.py
@@ -117,9 +117,11 @@ def correlation(exp: Experiment, field, method='spearman', nonzero=False, transf
     # remove the nans
     nanpos = np.where(np.isnan(labels))[0]
     if len(nanpos) > 0:
-        logger.warning('NaN values encountered in labels for correlation. Ignoring these samples')
         labels = np.delete(labels, nanpos)
         data = np.delete(data, nanpos, axis=1)
+        logger.warning('NaN values encountered in labels for correlation. Ignoring these samples (%d). %d samples left' % (len(nanpos), len(labels)))
+        if len(labels) == 0:
+            raise ValueError('Field %s does not seem to contain any samples with numeric value' % field)
     # change the method if we have nonzero
     if nonzero:
         if method == 'spearman':
@@ -233,7 +235,15 @@ def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', t
     # prepare the labels.
     labels = np.zeros(len(cexp.sample_metadata))
     labels[cexp.sample_metadata[field].isin(val1).values] = 1
-    logger.info('%d samples with value 1 (%s)' % (np.sum(labels), val1))
+
+    # check if we have samples left in val1 and val2
+    if np.sum(labels) == len(cexp.sample_metadata):
+        raise ValueError('No samples in field: %s found with val2: %s' % (field, grp2))
+    if np.sum(labels) == 0:
+        raise ValueError('No samples in field: %s found with val1: %s' % (field, grp1))
+
+    logger.info('%d samples with value 1 (%s), %d samples with value2 (%s)' % (np.sum(labels), grp1, len(cexp.sample_metadata) - np.sum(labels), grp2))
+
     keep, odif, pvals, qvals = dsfdr.dsfdr(data, labels, method=method, transform_type=transform, alpha=alpha, numperm=numperm, fdr_method=fdr_method, shuffler=shuffler, random_seed=random_seed)
     logger.info('number of higher in {}: {}. number of higher in {} : {}. total {}'.format(
         grp1, np.sum(odif[keep] > 0), grp2, np.sum(odif[keep] < 0), np.sum(keep)))
@@ -362,6 +372,10 @@ def diff_abundance_paired(exp: Experiment, pair_field, field, val1, val2=None, t
         logger.info('Dropping %d values with < 2 samples' % len(drop_values))
         exp = exp.filter_samples(pair_field, drop_values, negate=True)
 
+    if len(exp.sample_metadata) == 0:
+        raise ValueError('No samples with >1 value in pair field left')
+    logger.info('%d samples left after removing group value singletons' % len(exp.sample_metadata))
+
     # create the groups list for the shuffle function
     groups = defaultdict(list)
     for pos, (idx, crow) in enumerate(exp.sample_metadata.iterrows()):

diff --git a/calour/filtering.py b/calour/filtering.py
@@ -281,7 +281,11 @@ def filter_by_data(exp: Experiment, predicate, axis=1, field=None,
     if negate is True:
         select = ~ select
 
-    logger.info('After filtering, %s remain.' % np.sum(select))
+    if axis == 0:
+        grp_str = 'samples'
+    else:
+        grp_str = 'features'
+    logger.info('After filtering, %s %s remaining.' % (np.sum(select), grp_str))
     return exp.reorder(select, axis=axis, inplace=inplace)
 
 

diff --git a/calour/io.py b/calour/io.py
@@ -315,12 +315,12 @@ def _read_metadata(ids, f, kwargs):
         diff = mid - ids2
         if diff:
             logger.warning('Found %d samples that have metadata but do not have data. These samples have been dropped.' % len(diff))
-            logger.info('First 5 samples without data: %r' % diff[:5])
+            logger.info('First 5 samples without data: %r' % list(diff)[:5])
             logger.debug('These have metadata but do not have data - dropped (%d): %r' % (len(diff), diff))
         diff = ids2 - mid
         if diff:
             logger.warning('Found %d samples that have data but do not have metadata.' % len(diff))
-            logger.info('First 5 samples without metadata: %r' % diff[:5])
+            logger.info('First 5 samples without metadata: %r' % list(diff)[:5])
             logger.debug('These have data but do not have metadata: %r' % diff)
         # reorder the id in metadata to align with biom
         # metadata = metadata.loc[ids, ]

diff --git a/calour/plotting.py b/calour/plotting.py
@@ -95,9 +95,6 @@ def plot_enrichment(exp: Experiment, enriched, max_show=10, max_len=40, ax=None,
         name for terms enriched in group1 or group2 respectively, or None to not show legend
     colors: tuple of (str, str) or None (optional)
         Colors for terms enriched in group1 or group2 respectively
-    enriched_exp_color: str or None, optional
-        If not None, the color to show the number of enriched experiments for each term in the bar. Default is white since the background is the bar color (green/red).
-        None to not show the enriched experiments count
     labels_kwargs: dict, optional
         Additional parameters for the axis tick label fonts. See matplotlib.axes.Axes.set_yticklabels()
     numbers_kwargs: dict or None, optional
@@ -198,11 +195,6 @@ def plot_diff_abundance_enrichment(exp: Experiment, max_show=10, max_len=40, ax=
         Colors for terms enriched in group1 or group2 respectively
     show_legend: bool (optional)
         True to show the color legend, False to hide it
-    enriched_exp_color: str or None, optional
-        If not None, the color to show the number of enriched
-        experiments for each term in the bar. Default is white since
-        the background is the bar color (green/red).  None to not show
-        the enriched experiments count
     labels_kwargs: dict, optional
         Additional parameters to pass to the bar labels rendering. See matplotlib.axes.Axes.set_yticklabels()
     numbers_kwargs: dict or None, optional