add a couple functions (#88)

* add dropna function * merge dropna into filter_by_metadata keep only needed kwargs using docrep allow set in _to_list() * fix plot_shareness() * add join experiment featurewise and fix plot_core_features() fix other minor things * fix an edge-case bug when the function param is a tuple * rename doc_init to _doc and fix pep8 * add docstring
biocore · Mar 24, 2018 · 422648d · 422648d
1 parent c01c3ba
commit 422648d
Show file tree

Hide file tree

Showing 20 changed files with 246 additions and 59 deletions.
diff --git a/calour/__init__.py b/calour/__init__.py
@@ -34,7 +34,5 @@
 for fn, f in inspect.getmembers(Experiment, predicate=inspect.isfunction):
     setattr(Experiment, fn, _convert_axis_name(f))
 
-log = resource_filename(__package__, 'log.cfg')
-
 # setting False allows other logger to print log.
-fileConfig(log, disable_existing_loggers=False)
+fileConfig(resource_filename(__package__, 'log.cfg'), disable_existing_loggers=False)
diff --git a/calour/doc_init.py → calour/_doc.py b/calour/doc_init.py → calour/_doc.py
diff --git a/calour/amplicon_experiment.py b/calour/amplicon_experiment.py
@@ -254,3 +254,31 @@ def split_taxonomy(self, field='taxonomy', sep=';',
             the column names for the new columns split from ``field``
         '''
         self.feature_metadata[names] = self.feature_metadata[field].str.split(sep, expand=True)
+        # return so you can chain the functions
+        return self
+
+    def find_lowest_taxonomy(self, field='taxonomy', new_field='taxa'):
+        '''Create a new column that contains the taxonomy of lowest possible level.
+
+        For example, 'k__Bacteria;p__Firmicutes;c__Bacilli;
+        o__Lactobacillales;f__Enterococcaceae;g__Enterococcus;
+        s__' will return 'g__Enterococcus'.
+
+        Parameters
+        ----------
+        field : str
+            column name that contains all levels of taxonomy
+        new_field : str
+            new column name
+
+        Returns
+        -------
+        AmpliconExperiment
+
+        '''
+        def find_highest(s):
+            l = s.split(';')
+            b = [len(i) > 3 for i in l]
+            return np.array(l)[b][-1]
+        self.feature_metadata[new_field] = self.feature_metadata[field].apply(find_highest)
+        return self
diff --git a/calour/experiment.py b/calour/experiment.py
@@ -227,7 +227,8 @@ def inner(*args, **kwargs):
                 logger.debug('Run func {}'.format(fn))
                 new_exp = func(*args, **kwargs)
                 if exp._log is True:
-                    param = ['%r' % i for i in args[1:]] + ['%s=%r' % (k, v) for k, v in kwargs.items()]
+                    # do not use `'%r' % i` because it causes error when i is a tuple
+                    param = ['{!r}'.format(i) for i in args[1:]] + ['{0!s}={1!r}'.format(k, v) for k, v in kwargs.items()]
                     param = ', '.join(param)
                     new_exp._call_history.append('{0}({1})'.format(fn, param))
                     exp._log = False

diff --git a/calour/filtering.py b/calour/filtering.py
@@ -36,9 +36,9 @@
 import numpy as np
 from scipy.sparse import issparse
 
-from .experiment import Experiment
+from . import Experiment
+from ._doc import ds
 from .util import _to_list
-from .doc_init import ds
 
 
 logger = getLogger(__name__)
@@ -514,7 +514,9 @@ def filter_mean(exp: Experiment, cutoff=0.01, **kwargs):
 
 @Experiment._record_sig
 def filter_ids(exp: Experiment, ids, axis=1, negate=False, inplace=False):
-    '''Filter samples or features based on a list index values
+    '''Filter samples or features based on a list of IDs.
+
+    .. note:: the order of samples or features is updated as the order given in ``ids``.
 
     Parameters
     ----------

diff --git a/calour/heatmap/heatmap.py b/calour/heatmap/heatmap.py
@@ -16,9 +16,9 @@
 
 from .. import Experiment
 from ..database import _get_database_class
-from .._dendrogram import plot_tree
 from ..util import _to_list, _transition_index
-from ..doc_init import ds
+from .._doc import ds
+from .._dendrogram import plot_tree
 
 
 logger = getLogger(__name__)

diff --git a/calour/heatmap/plotgui.py b/calour/heatmap/plotgui.py
@@ -12,7 +12,7 @@
 import numpy as np
 from matplotlib.gridspec import GridSpec
 
-from ..doc_init import ds
+from .._doc import ds
 
 logger = getLogger(__name__)
 

diff --git a/calour/heatmap/plotgui_cli.py b/calour/heatmap/plotgui_cli.py
@@ -7,7 +7,7 @@
 # ----------------------------------------------------------------------------
 
 from .plotgui import PlotGUI
-from ..doc_init import ds
+from .._doc import ds
 
 
 class PlotGUI_CLI(PlotGUI):

diff --git a/calour/heatmap/plotgui_jupyter.py b/calour/heatmap/plotgui_jupyter.py
@@ -5,12 +5,13 @@
 import matplotlib
 
 from .plotgui import PlotGUI
-from ..doc_init import ds
+from .._doc import ds
 
 
 logger = getLogger(__name__)
 
 
+@ds.with_indent(4)
 class PlotGUI_Jupyter(PlotGUI):
     '''Jupyter GUI of plotting.
 
@@ -19,6 +20,7 @@ class PlotGUI_Jupyter(PlotGUI):
 
     Parameters
     ----------
+    %(PlotGUI.parameters)s
     '''
 
     @ds.with_indent(8)

diff --git a/calour/heatmap/plotgui_qt5.py b/calour/heatmap/plotgui_qt5.py
@@ -11,7 +11,7 @@
 from PyQt5.QtCore import Qt
 
 from .plotgui import PlotGUI
-from ..doc_init import ds
+from .._doc import ds
 
 logger = getLogger(__name__)
 

diff --git a/calour/io.py b/calour/io.py
@@ -32,12 +32,9 @@
 import pandas as pd
 import biom
 
-from .experiment import Experiment
-from .amplicon_experiment import AmpliconExperiment
-from .ms1_experiment import MS1Experiment
+from . import Experiment, AmpliconExperiment, MS1Experiment
 from .util import get_file_md5, get_data_md5, _get_taxonomy_string
-
-from .doc_init import ds
+from ._doc import ds
 
 
 logger = getLogger(__name__)

diff --git a/calour/manipulation.py b/calour/manipulation.py
@@ -267,3 +267,50 @@ def join_experiments(exp: Experiment, other, field_name='experiments', prefixes=
     newexp.data = all_data
 
     return newexp
+
+
+@Experiment._record_sig
+def join_experiments_featurewise(exp: Experiment, other,
+                                 field_name='_feature_origin_', origin_labels=('exp1', 'exp2')):
+    '''Combine two :class:`.Experiment` objects into one.
+
+    An example use case is combining 16S and ITS amplicon data.
+
+    .. warning:: If a sample is present in only one of the two Experiment
+       objects, it will be dropped from the joined result.
+
+    Parameters
+    ----------
+    other : :class:`.Experiment`
+        The ``Experiment`` object to combine with the current one.  If
+        both experiments contain the same feature metadata column and
+        there is a conflict between the two, the value will be taken
+        from exp and not from other.
+    field_name : ``None`` or str (optional)
+        Name of the new ``feature_metadata`` field containing the experiment each feature is coming from.
+        If it is None, don't add such column.
+    origin_labels : tuple of (str, str) (optional)
+        The labels identifying which experiment each feature originated from.
+
+    Returns
+    -------
+    :class:`.Experiment`
+        A new experiment containing only the samples shared by both experiments, with the
+        features of both experiments concatenated.
+
+    '''
+    logger.debug('Join 2 experiments featurewise:\n{!r}\n{!r}'.format(exp, other))
+    # create an empty object
+    newexp = exp.__class__(np.empty(shape=[0, 0]), pd.DataFrame(),
+                           description='join %s & %s' % (exp.description, other.description))
+    sid = exp.sample_metadata.index.intersection(other.sample_metadata.index)
+    exp = exp.filter_ids(sid, axis=0)
+    other = other.filter_ids(sid, axis=0)
+    fmd = pd.concat([exp.feature_metadata, other.feature_metadata], join='outer')
+    fmd[field_name] = [origin_labels[0]] * exp.shape[1] + [origin_labels[1]] * other.shape[1]
+    newexp.sample_metadata = exp.sample_metadata
+    newexp.feature_metadata = fmd
+    # merge data table
+    newexp.data = np.c_[exp.data, other.data]
+
+    return newexp
diff --git a/calour/plotting.py b/calour/plotting.py
@@ -13,8 +13,9 @@
    plot_enrichment
    plot_diff_abundance_enrichment
    plot_stacked_bar
-   plot_shareness
+   plot_core_features
    plot_abund_prevalence
+   plot_scatter_matrix
 '''
 
 # ----------------------------------------------------------------------------
@@ -217,8 +218,8 @@ def plot_diff_abundance_enrichment(exp: Experiment, term_type='term', max_show=1
     return ax2, newexp
 
 
-def plot_shareness(exp: Experiment, field=None, steps=None, iterations=10, alpha=0.5, linewidth=0.7, ax=None):
-    '''Plot the number of shared features against the number of samples.
+def plot_core_features(exp: Experiment, field=None, steps=None, cutoff=2, frac=0.9, iterations=10, alpha=0.5, linewidth=0.7, ax=None):
+    '''Plot the percentage of core features shared in increasing number of samples.
 
     To see if there is a core feature set shared across most of the samples.
 
@@ -237,7 +238,12 @@ def plot_shareness(exp: Experiment, field=None, steps=None, iterations=10, alpha
     field : str
         sample metadata field to group samples
     steps : iterable of int
-        the sizes of subsamples to compute the shareness.
+        the sizes of subsamples to compute the fraction of core features.
+    cutoff : numeric
+        the feature is considered present in a sample only if its abundance is >= cutoff.
+    frac : numeric
+        Must be between 0 and 1. A feature is considered a core feature
+        if it is present in at least a ``frac`` fraction of the samples.
     iterations : int
         repeat the subsampling multiple times and plot all the iterations
     alpha : float
@@ -251,7 +257,6 @@ def plot_shareness(exp: Experiment, field=None, steps=None, iterations=10, alpha
     -------
     matplotlib.axes.Axes
         The Axes object containing the plot.
-
     '''
     if ax is None:
         from matplotlib import pyplot as plt
@@ -264,15 +269,16 @@ def plot_shareness(exp: Experiment, field=None, steps=None, iterations=10, alpha
     def plot_lines(data, steps, label):
         y_sum = np.zeros(len(steps))
         for i in range(iterations):
-            x, y = _compute_frac_nonzero(data, steps)
+            y = _compute_frac_nonzero(data, steps, cutoff, frac, i)
+            y = y * 100
             y_sum += y
             if i == 0:
-                line, = ax.plot(x, y, alpha=alpha, linewidth=linewidth)
+                line, = ax.plot(steps, y, alpha=alpha, linewidth=linewidth)
             else:
-                ax.plot(x, y, alpha=alpha, linewidth=linewidth, color=line.get_color())
+                ax.plot(steps, y, alpha=alpha, linewidth=linewidth, color=line.get_color())
         y_ave = y_sum / iterations
         # plot average of the iterations
-        ax.plot(x, y_ave, linewidth=linewidth * 3, label=label, color=line.get_color())
+        ax.plot(steps, y_ave, linewidth=linewidth * 3, label=label, color=line.get_color())
 
     if field is None:
         plot_lines(exp.data, steps, label='all samples')
@@ -286,39 +292,55 @@ def plot_lines(data, steps, label):
     # because the shareness drops quickly, we plot it in log scale
     ax.set_xscale('log')
     ax.set_xlabel('sample number')
-    ax.set_ylabel('fraction of shared features')
+    ax.set_ylabel('shared features (%)')
     return ax
 
 
-def _compute_frac_nonzero(data, steps):
+def _compute_frac_nonzero(data, steps, cutoff=2, frac=0.9, random_state=None):
     '''iteratively compute the fraction of non-zeros in each column after subsampling rows.
 
     Parameters
     ----------
     data : 2-d array of numeric
         sample in row and feature in column
     steps : iterable of int
-        the sizes of subsamples
+        the subsample sizes (should be in descending order)
+    cutoff : numeric
+        the feature is considered present in a sample only if its abundance is >= cutoff.
+    frac : numeric
+        Must be in the range (0, 1]. A feature is considered a core feature
+        if it is present in at least a ``frac`` fraction of the samples.
+    random_state : int, RandomState instance or None, optional, default=None
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
 
     Return
     ------
-    tuple of 2 lists
-        steps and fractions
+    numpy.array
+        fractions of core features for each subsample size
     '''
     n_samples, n_features = data.shape
-
-    shared = []
-    for i in steps:
-        data = data[np.random.choice(n_samples, i, replace=False), :]
-        x = data > 0
+    shared = np.zeros(len(steps))
+    rand = np.random.RandomState(random_state)
+    if cutoff <= 0:
+        raise ValueError('You need to provide a positive value for `cutoff`: %r' % cutoff)
+    if frac <= 0 or frac > 1:
+        raise ValueError('You need to provide a value among (0, 1] for `frac`: %r' % frac)
+    for n, i in enumerate(steps):
+        data = data[rand.choice(n_samples, i, replace=False), :]
+        print(data)
+        x = data >= cutoff
         # the count of samples that have the given feature
         counts = x.sum(axis=0)
-        all_presence = np.sum(counts == i)
+        all_presence = np.sum(counts >= np.ceil(i * frac))
         all_absence = np.sum(counts == 0)
         # important: remove the features that are all zeros across the subset of samples
-        shared.append(all_presence / (n_features - all_absence))
+        shared[n] = all_presence / (n_features - all_absence)
+        # don't forget to update sample count
         n_samples = data.shape[0]
-    return steps, shared
+    return shared
 
 
 def plot_abund_prevalence(exp: Experiment, field, log=True, min_abund=0.01, alpha=0.5, linewidth=0.7, ax=None):

diff --git a/calour/sorting.py b/calour/sorting.py
@@ -37,7 +37,7 @@
 from . import Experiment
 from .transforming import log_n, transform, scale
 from .util import _argsort
-from .doc_init import ds
+from ._doc import ds
 
 
 logger = getLogger(__name__)

diff --git a/calour/tests/test_manipulation.py b/calour/tests/test_manipulation.py
@@ -10,9 +10,9 @@
 from copy import deepcopy
 
 import numpy as np
+import pandas as pd
 
 import calour as ca
-
 from calour._testing import Tests, assert_experiment_equal
 
 
@@ -50,6 +50,29 @@ def test_join_experiments(self):
         fexp = newexp.filter_samples('experiments', ['t2'])
         assert_experiment_equal(fexp, texp, ignore_md_fields=['experiments'])
 
+    def test_join_experiments_featurewise(self):
+        otu1 = ca.Experiment(np.array([[0, 9], [7, 4]]), sparse=False,
+                             sample_metadata=pd.DataFrame({'category': ['B', 'A'],
+                                                           'ph': [7.7, 6.6]},
+                                                          index=['s2', 's1']),
+                             feature_metadata=pd.DataFrame({'motile': ['y', 'n']}, index=['16S1', '16S2']))
+        otu2 = ca.Experiment(np.array([[6], [8], [10]]), sparse=False,
+                             sample_metadata=pd.DataFrame({'category': ['A', 'B', 'C'],
+                                                           'ph': [6.6, 7.7, 8.8]},
+                                                          index=['s1', 's2', 's3']),
+                             feature_metadata=pd.DataFrame({'motile': [None]}, index=['ITS1']))
+        combined_obs = otu1.join_experiments_featurewise(otu2, 'origin', ('16S', 'ITS'))
+        combined_exp = ca.Experiment(np.array([[7, 4, 6], [0, 9, 8]]), sparse=False,
+                                     sample_metadata=pd.DataFrame({'category': ['A', 'B'],
+                                                                   'ph': [6.6, 7.7]},
+                                                                  index=['s1', 's2']),
+                                     feature_metadata=pd.DataFrame({'motile': ['y', 'n', None],
+                                                                    'origin': ['16S', '16S', 'ITS']},
+                                                                   index=['16S1', '16S2', 'ITS1']))
+        # reorder the samples
+        combined_obs = combined_obs.filter_ids(combined_exp.sample_metadata.index, axis=0)
+        assert_experiment_equal(combined_obs, combined_exp)
+
     def test_agg_by_metadata(self):
         # test default conditions - on samples, not inplace, mean method
         newexp = self.test1.aggregate_by_metadata('group')