From 422648d0771c4dc62a93c7994e23693c8ee02b0b Mon Sep 17 00:00:00 2001
From: Zech Xu
Date: Fri, 23 Mar 2018 22:47:19 -0700
Subject: [PATCH] add a couple of functions (#88)

* add dropna function

* merge dropna into filter_by_metadata;
  keep only the needed kwargs using docrep;
  allow set in _to_list()

* fix plot_shareness()

* add join_experiments_featurewise() and fix plot_core_features();
  fix other minor things

* fix an edge-case bug when the function param is a tuple

* rename doc_init to _doc and fix pep8

* add docstrings
---
 calour/__init__.py                |  4 +-
 calour/{doc_init.py => _doc.py}   |  0
 calour/amplicon_experiment.py     | 28 +++++++++++++
 calour/experiment.py              |  3 +-
 calour/filtering.py               |  8 ++--
 calour/heatmap/heatmap.py         |  4 +-
 calour/heatmap/plotgui.py         |  2 +-
 calour/heatmap/plotgui_cli.py     |  2 +-
 calour/heatmap/plotgui_jupyter.py |  4 +-
 calour/heatmap/plotgui_qt5.py     |  2 +-
 calour/io.py                      |  7 +---
 calour/manipulation.py            | 47 ++++++++++++++++++++++
 calour/plotting.py                | 66 ++++++++++++++++++++-----------
 calour/sorting.py                 |  2 +-
 calour/tests/test_manipulation.py | 25 +++++++++++-
 calour/tests/test_plotting.py     | 15 ++++---
 calour/tests/test_training.py     | 16 ++++++--
 calour/training.py                | 62 ++++++++++++++++++++++++++---
 calour/transforming.py            |  5 ++-
 calour/util.py                    |  3 ++
 20 files changed, 246 insertions(+), 59 deletions(-)
 rename calour/{doc_init.py => _doc.py} (100%)

diff --git a/calour/__init__.py b/calour/__init__.py
index 302adb18..e3db53f7 100644
--- a/calour/__init__.py
+++ b/calour/__init__.py
@@ -34,7 +34,5 @@
 for fn, f in inspect.getmembers(Experiment, predicate=inspect.isfunction):
     setattr(Experiment, fn, _convert_axis_name(f))
 
-log = resource_filename(__package__, 'log.cfg')
-
 # setting False allows other loggers to print logs.
-fileConfig(log, disable_existing_loggers=False)
+fileConfig(resource_filename(__package__, 'log.cfg'), disable_existing_loggers=False)
diff --git a/calour/doc_init.py b/calour/_doc.py
similarity index 100%
rename from calour/doc_init.py
rename to calour/_doc.py
diff --git a/calour/amplicon_experiment.py b/calour/amplicon_experiment.py
index d43c8201..05ec00d6 100644
--- a/calour/amplicon_experiment.py
+++ b/calour/amplicon_experiment.py
@@ -254,3 +254,31 @@ def split_taxonomy(self, field='taxonomy', sep=';',
             the column names for the new columns split from ``field``
         '''
         self.feature_metadata[names] = self.feature_metadata[field].str.split(sep, expand=True)
+        # return so you can chain the functions
+        return self
+
+    def find_lowest_taxonomy(self, field='taxonomy', new_field='taxa'):
+        '''Create a new column that contains the taxonomy of the lowest annotated level.
+
+        For example, 'k__Bacteria; p__Firmicutes; c__Bacilli;
+        o__Lactobacillales; f__Enterococcaceae; g__Enterococcus;
+        s__' will return 'g__Enterococcus'
+
+        Parameters
+        ----------
+        field : str
+            column name that contains all levels of taxonomy
+        new_field : str
+            new column name
+
+        Returns
+        -------
+        AmpliconExperiment
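+
+        Examples
+        --------
+        A minimal sketch of the expected behavior (``exp`` here is a
+        hypothetical experiment whose 'taxonomy' column holds
+        Greengenes-style strings):
+
+        >>> exp.feature_metadata['taxonomy'].iloc[0]  # doctest: +SKIP
+        'k__Bacteria; p__Firmicutes; g__Enterococcus; s__'
+        >>> exp.find_lowest_taxonomy().feature_metadata['taxa'].iloc[0]  # doctest: +SKIP
+        'g__Enterococcus'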
+
+        '''
+        def find_lowest(s):
+            # strip whitespace so empty levels like ' s__' are excluded
+            levels = [i.strip() for i in s.split(';')]
+            # keep only the levels annotated beyond the bare 'x__' prefix
+            kept = [i for i in levels if len(i) > 3]
+            return kept[-1]
+        self.feature_metadata[new_field] = self.feature_metadata[field].apply(find_lowest)
+        return self
diff --git a/calour/experiment.py b/calour/experiment.py
index 52980b83..42bdb09c 100644
--- a/calour/experiment.py
+++ b/calour/experiment.py
@@ -227,7 +227,8 @@ def inner(*args, **kwargs):
             logger.debug('Run func {}'.format(fn))
             new_exp = func(*args, **kwargs)
             if exp._log is True:
-                param = ['%r' % i for i in args[1:]] + ['%s=%r' % (k, v) for k, v in kwargs.items()]
+                # do not use `'%r' % i` because it raises an error when i is a tuple
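+                # for example: '%r' % (1, 2) raises "TypeError: not all
+                # arguments converted during string formatting", while
+                # '{!r}'.format((1, 2)) gives "(1, 2)" as intended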
+                param = ['{!r}'.format(i) for i in args[1:]] + ['{0!s}={1!r}'.format(k, v) for k, v in kwargs.items()]
                 param = ', '.join(param)
                 new_exp._call_history.append('{0}({1})'.format(fn, param))
                 exp._log = False
diff --git a/calour/filtering.py b/calour/filtering.py
index bb439380..045d65a7 100644
--- a/calour/filtering.py
+++ b/calour/filtering.py
@@ -36,9 +36,9 @@
 import numpy as np
 from scipy.sparse import issparse
 
-from .experiment import Experiment
+from . import Experiment
+from ._doc import ds
 from .util import _to_list
-from .doc_init import ds
 
 logger = getLogger(__name__)
 
@@ -514,7 +514,9 @@ def filter_mean(exp: Experiment, cutoff=0.01, **kwargs):
 
 @Experiment._record_sig
 def filter_ids(exp: Experiment, ids, axis=1, negate=False, inplace=False):
-    '''Filter samples or features based on a list index values
+    '''Filter samples or features based on a list of IDs.
+
+    .. note:: the samples or features are reordered to match the order given in ``ids``.
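+
+    For example (with hypothetical feature IDs), ``exp.filter_ids(['b', 'a'])``
+    keeps only the features 'b' and 'a', in that order, even if 'a' appears
+    before 'b' in ``exp``.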
 
     Parameters
     ----------
diff --git a/calour/heatmap/heatmap.py b/calour/heatmap/heatmap.py
index d0ffcca4..31979524 100644
--- a/calour/heatmap/heatmap.py
+++ b/calour/heatmap/heatmap.py
@@ -16,9 +16,9 @@
 
 from .. import Experiment
 from ..database import _get_database_class
-from .._dendrogram import plot_tree
 from ..util import _to_list, _transition_index
-from ..doc_init import ds
+from .._doc import ds
+from .._dendrogram import plot_tree
 
 logger = getLogger(__name__)
diff --git a/calour/heatmap/plotgui.py b/calour/heatmap/plotgui.py
index fc2462c7..31d2340e 100644
--- a/calour/heatmap/plotgui.py
+++ b/calour/heatmap/plotgui.py
@@ -12,7 +12,7 @@
 import numpy as np
 from matplotlib.gridspec import GridSpec
 
-from ..doc_init import ds
+from .._doc import ds
 
 logger = getLogger(__name__)
diff --git a/calour/heatmap/plotgui_cli.py b/calour/heatmap/plotgui_cli.py
index 089866ee..d345a0ef 100644
--- a/calour/heatmap/plotgui_cli.py
+++ b/calour/heatmap/plotgui_cli.py
@@ -7,7 +7,7 @@
 # ----------------------------------------------------------------------------
 
 from .plotgui import PlotGUI
-from ..doc_init import ds
+from .._doc import ds
 
 
 class PlotGUI_CLI(PlotGUI):
diff --git a/calour/heatmap/plotgui_jupyter.py b/calour/heatmap/plotgui_jupyter.py
index d0dad7aa..211843f8 100644
--- a/calour/heatmap/plotgui_jupyter.py
+++ b/calour/heatmap/plotgui_jupyter.py
@@ -5,12 +5,13 @@
 import matplotlib
 
 from .plotgui import PlotGUI
-from ..doc_init import ds
+from .._doc import ds
 
 logger = getLogger(__name__)
 
 
+@ds.with_indent(4)
 class PlotGUI_Jupyter(PlotGUI):
     '''Jupyter GUI of plotting.
 
@@ -19,6 +20,7 @@ class PlotGUI_Jupyter(PlotGUI):
 
     Parameters
     ----------
+    %(PlotGUI.parameters)s
     '''
 
     @ds.with_indent(8)
diff --git a/calour/heatmap/plotgui_qt5.py b/calour/heatmap/plotgui_qt5.py
index dbbe0425..a426b34e 100644
--- a/calour/heatmap/plotgui_qt5.py
+++ b/calour/heatmap/plotgui_qt5.py
@@ -11,7 +11,7 @@
 from PyQt5.QtCore import Qt
 
 from .plotgui import PlotGUI
-from ..doc_init import ds
+from .._doc import ds
 
 logger = getLogger(__name__)
diff --git a/calour/io.py b/calour/io.py
index 05ead062..0e79b9df 100644
--- a/calour/io.py
+++ b/calour/io.py
@@ -32,12 +32,9 @@
 import pandas as pd
 import biom
 
-from .experiment import Experiment
-from .amplicon_experiment import AmpliconExperiment
-from .ms1_experiment import MS1Experiment
+from . import Experiment, AmpliconExperiment, MS1Experiment
 from .util import get_file_md5, get_data_md5, _get_taxonomy_string
-
-from .doc_init import ds
+from ._doc import ds
 
 logger = getLogger(__name__)
diff --git a/calour/manipulation.py b/calour/manipulation.py
index 018c98b8..4bfbacb2 100644
--- a/calour/manipulation.py
+++ b/calour/manipulation.py
@@ -267,3 +267,50 @@ def join_experiments(exp: Experiment, other, field_name='experiments', prefixes=
     newexp.data = all_data
 
     return newexp
+
+
+@Experiment._record_sig
+def join_experiments_featurewise(exp: Experiment, other,
+                                 field_name='_feature_origin_', origin_labels=('exp1', 'exp2')):
+    '''Combine two :class:`.Experiment` objects into one.
+
+    A typical use case is to combine the 16S and ITS amplicon data measured on the same samples.
+
+    .. warning:: If a sample has features in only one of the two
+       Experiment objects, it will be dropped from the join.
+
+    Parameters
+    ----------
+    other : :class:`.Experiment`
+        The ``Experiment`` object to combine with the current one. If
+        both experiments contain the same feature metadata column and
+        there is a conflict between the two, the value will be taken
+        from ``exp`` and not from ``other``.
+    field_name : ``None`` or str (optional)
+        Name of the new ``feature_metadata`` field indicating which experiment
+        each feature comes from. If it is None, no such column is added.
+    origin_labels : tuple of (str, str) (optional)
+        The labels marking which experiment each feature originated from.
+
+    Returns
+    -------
+    :class:`.Experiment`
+        A new experiment with the features from both experiments concatenated
+        over the samples shared between the two.
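+
+    Examples
+    --------
+    A minimal sketch (``exp_16s`` and ``exp_its`` are hypothetical
+    experiments that share sample IDs):
+
+    >>> combined = exp_16s.join_experiments_featurewise(
+    ...     exp_its, field_name='origin', origin_labels=('16S', 'ITS'))  # doctest: +SKIP
+    >>> combined.shape[1] == exp_16s.shape[1] + exp_its.shape[1]  # doctest: +SKIP
+    True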
+
+    '''
+    logger.debug('Join 2 experiments featurewise:\n{!r}\n{!r}'.format(exp, other))
+    # create an empty object
+    newexp = exp.__class__(np.empty(shape=[0, 0]), pd.DataFrame(),
+                           description='join %s & %s' % (exp.description, other.description))
+    # keep only the samples present in both experiments
+    sid = exp.sample_metadata.index.intersection(other.sample_metadata.index)
+    exp = exp.filter_ids(sid, axis=0)
+    other = other.filter_ids(sid, axis=0)
+    fmd = pd.concat([exp.feature_metadata, other.feature_metadata], join='outer')
+    if field_name is not None:
+        fmd[field_name] = [origin_labels[0]] * exp.shape[1] + [origin_labels[1]] * other.shape[1]
+    newexp.sample_metadata = exp.sample_metadata
+    newexp.feature_metadata = fmd
+    # merge data table
+    newexp.data = np.c_[exp.data, other.data]
+
+    return newexp
diff --git a/calour/plotting.py b/calour/plotting.py
index 9b19f305..691573b2 100644
--- a/calour/plotting.py
+++ b/calour/plotting.py
@@ -13,8 +13,9 @@
    plot_enrichment
    plot_diff_abundance_enrichment
    plot_stacked_bar
-   plot_shareness
+   plot_core_features
    plot_abund_prevalence
+   plot_scatter_matrix
 '''
 
 # ----------------------------------------------------------------------------
@@ -217,8 +218,8 @@ def plot_diff_abundance_enrichment(exp: Experiment, term_type='term', max_show=1
     return ax2, newexp
 
 
-def plot_shareness(exp: Experiment, field=None, steps=None, iterations=10, alpha=0.5, linewidth=0.7, ax=None):
-    '''Plot the number of shared features against the number of samples.
+def plot_core_features(exp: Experiment, field=None, steps=None, cutoff=2, frac=0.9, iterations=10, alpha=0.5, linewidth=0.7, ax=None):
+    '''Plot the percentage of core features shared across increasing numbers of samples.
 
     To see if there is a core feature set shared across most of the samples.
 
@@ -237,7 +238,12 @@
     field : str
         sample metadata field to group samples
     steps : iterable of int
-        the sizes of subsamples to compute the shareness.
+        the sizes of subsamples to compute the fraction of core features.
+    cutoff : numeric
+        a feature is considered present in a sample only if its abundance is >= ``cutoff``.
+    frac : numeric
+        Must be between 0 and 1. A feature is considered a core feature
+        if it is present in at least ``frac`` fraction of the samples.
     iterations : int
         repeat the subsampling multiple times and plot all the iterations
     alpha : float
@@ -251,7 +257,6 @@
     Returns
     -------
     matplotlib.axes.Axes
         The Axes object containing the plot.
-
     '''
     if ax is None:
         from matplotlib import pyplot as plt
         fig, ax = plt.subplots()
@@ -264,15 +269,16 @@
     def plot_lines(data, steps, label):
         y_sum = np.zeros(len(steps))
         for i in range(iterations):
-            x, y = _compute_frac_nonzero(data, steps)
+            y = _compute_frac_nonzero(data, steps, cutoff, frac, i)
+            y = y * 100
             y_sum += y
             if i == 0:
-                line, = ax.plot(x, y, alpha=alpha, linewidth=linewidth)
+                line, = ax.plot(steps, y, alpha=alpha, linewidth=linewidth)
             else:
-                ax.plot(x, y, alpha=alpha, linewidth=linewidth, color=line.get_color())
+                ax.plot(steps, y, alpha=alpha, linewidth=linewidth, color=line.get_color())
         y_ave = y_sum / iterations
         # plot average of the iterations
-        ax.plot(x, y_ave, linewidth=linewidth * 3, label=label, color=line.get_color())
+        ax.plot(steps, y_ave, linewidth=linewidth * 3, label=label, color=line.get_color())
 
     if field is None:
         plot_lines(exp.data, steps, label='all samples')
@@ -286,11 +292,11 @@
-    # because the shareness drops quickly, we plot it in log scale
+    # because the fraction of core features drops quickly, we plot it in log scale
     ax.set_xscale('log')
     ax.set_xlabel('sample number')
-    ax.set_ylabel('fraction of shared features')
+    ax.set_ylabel('shared features (%)')
     return ax
 
 
-def _compute_frac_nonzero(data, steps):
+def _compute_frac_nonzero(data, steps, cutoff=2, frac=0.9, random_state=None):
     '''Iteratively compute the fraction of non-zeros in each column after subsampling rows.
 
@@ -298,27 +304,43 @@ def _compute_frac_nonzero(data, steps):
     Parameters
     ----------
     data : 2-d array of numeric
         sample in row and feature in column
     steps : iterable of int
-        the sizes of subsamples
+        the subsample sizes (should be in descending order)
+    cutoff : numeric
+        a feature is considered present in a sample only if its abundance is >= ``cutoff``.
+    frac : numeric
+        Must be between 0 and 1. A feature is considered a core feature
+        if it is present in at least ``frac`` fraction of the samples.
+    random_state : int, RandomState instance or None, optional, default=None
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
 
     Returns
    -------
-    tuple of 2 lists
-        steps and fractions
+    numpy.array
+        fractions of core features for each subsample size
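+
+    Examples
+    --------
+    A small sanity check (with a recent numpy): three samples and two
+    features, all abundances above the cutoff, so every feature is a
+    core feature at every subsample size:
+
+    >>> import numpy as np
+    >>> _compute_frac_nonzero(np.full((3, 2), 5), [3, 2], cutoff=2, frac=0.9, random_state=0)
+    array([1., 1.])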
     '''
     n_samples, n_features = data.shape
-
-    shared = []
-    for i in steps:
-        data = data[np.random.choice(n_samples, i, replace=False), :]
-        x = data > 0
+    shared = np.zeros(len(steps))
+    rand = np.random.RandomState(random_state)
+    if cutoff <= 0:
+        raise ValueError('You need to provide a positive value for `cutoff`: %r' % cutoff)
+    if frac <= 0 or frac > 1:
+        raise ValueError('You need to provide a value in the interval (0, 1] for `frac`: %r' % frac)
+    for n, i in enumerate(steps):
+        data = data[rand.choice(n_samples, i, replace=False), :]
+        x = data >= cutoff
         # the count of samples that have the given feature
         counts = x.sum(axis=0)
-        all_presence = np.sum(counts == i)
+        all_presence = np.sum(counts >= np.ceil(i * frac))
         all_absence = np.sum(counts == 0)
         # important: remove the features that are all zeros across the subset of samples
-        shared.append(all_presence / (n_features - all_absence))
+        shared[n] = all_presence / (n_features - all_absence)
+        # don't forget to update the sample count after subsampling
         n_samples = data.shape[0]
-    return steps, shared
+    return shared
 
 
 def plot_abund_prevalence(exp: Experiment, field, log=True, min_abund=0.01, alpha=0.5, linewidth=0.7, ax=None):
diff --git a/calour/sorting.py b/calour/sorting.py
index 0f5ccf3b..9b48b83f 100644
--- a/calour/sorting.py
+++ b/calour/sorting.py
@@ -37,7 +37,7 @@
 from . import Experiment
 from .transforming import log_n, transform, scale
 from .util import _argsort
-from .doc_init import ds
+from ._doc import ds
 
 logger = getLogger(__name__)
diff --git a/calour/tests/test_manipulation.py b/calour/tests/test_manipulation.py
index cd8ef8ce..b94e23e7 100644
--- a/calour/tests/test_manipulation.py
+++ b/calour/tests/test_manipulation.py
@@ -10,9 +10,9 @@
 from copy import deepcopy
 
 import numpy as np
+import pandas as pd
 
 import calour as ca
-
 from calour._testing import Tests, assert_experiment_equal
 
 
@@ -50,6 +50,29 @@ def test_join_experiments(self):
         fexp = newexp.filter_samples('experiments', ['t2'])
         assert_experiment_equal(fexp, texp, ignore_md_fields=['experiments'])
 
+    def test_join_experiments_featurewise(self):
+        otu1 = ca.Experiment(np.array([[0, 9], [7, 4]]), sparse=False,
+                             sample_metadata=pd.DataFrame({'category': ['B', 'A'],
+                                                           'ph': [7.7, 6.6]},
+                                                          index=['s2', 's1']),
+                             feature_metadata=pd.DataFrame({'motile': ['y', 'n']}, index=['16S1', '16S2']))
+        otu2 = ca.Experiment(np.array([[6], [8], [10]]), sparse=False,
+                             sample_metadata=pd.DataFrame({'category': ['A', 'B', 'C'],
+                                                           'ph': [6.6, 7.7, 8.8]},
+                                                          index=['s1', 's2', 's3']),
+                             feature_metadata=pd.DataFrame({'motile': [None]}, index=['ITS1']))
+        combined_obs = otu1.join_experiments_featurewise(otu2, 'origin', ('16S', 'ITS'))
+        combined_exp = ca.Experiment(np.array([[7, 4, 6], [0, 9, 8]]), sparse=False,
+                                     sample_metadata=pd.DataFrame({'category': ['A', 'B'],
+                                                                   'ph': [6.6, 7.7]},
+                                                                  index=['s1', 's2']),
+                                     feature_metadata=pd.DataFrame({'motile': ['y', 'n', None],
+                                                                    'origin': ['16S', '16S', 'ITS']},
+                                                                   index=['16S1', '16S2', 'ITS1']))
+        # reorder the samples before comparing
+        combined_obs = combined_obs.filter_ids(combined_exp.sample_metadata.index, axis=0)
+        assert_experiment_equal(combined_obs, combined_exp)
+
     def test_agg_by_metadata(self):
         # test default conditions - on samples, not inplace, mean method
         newexp = self.test1.aggregate_by_metadata('group')
diff --git a/calour/tests/test_plotting.py b/calour/tests/test_plotting.py
index dc1775c7..41d897f4 100644
--- a/calour/tests/test_plotting.py
+++ b/calour/tests/test_plotting.py
@@ -81,12 +81,12 @@ def test_plot_abund_prevalence(self):
             assert_array_almost_equal(np.array([[i, j] for i, j in zip(x, y)]),
                                       lines[1].get_xydata())
 
-    def test_plot_shareness(self):
+    def test_plot_core_features(self):
        np.random.seed(12345)
        self.test1 = ca.read(self.test1_biom, self.test1_samp, self.test1_feat, normalize=100)
        self.test1.sparse = False
        ax = self.test1.filter_samples(
-            'group', ['1', '2']).plot_shareness(
+            'group', ['1', '2']).plot_core_features(
                field='group', steps=(2, 12), iterations=2)
        lines = ax.get_lines()
        self.assertEqual(len(lines), 6)
@@ -97,11 +97,16 @@ def test_compute_frac_nonzero(self):
                         [4, 0, 4, 3, 0, 2, 0, 5],
                         [2, 4, 0, 4, 2, 0, 1, 0],
                         [3, 3, 5, 3, 1, 0, 0, 1]])
-        np.random.seed(1)
-        steps, frac = _compute_frac_nonzero(data, [5, 3, 2])
-        self.assertListEqual(steps, [5, 3, 2])
+
+        frac = _compute_frac_nonzero(data, [5, 3, 2], cutoff=0.1, frac=1, random_state=1)
        assert_array_almost_equal(frac, np.array([0, 0.25, 4/7]))
 
+        frac = _compute_frac_nonzero(data, [5, 3, 2], cutoff=0.1, frac=0.00001, random_state=1)
+        assert_array_almost_equal(frac, np.array([1, 1, 1]))
+
+        frac = _compute_frac_nonzero(data, [5, 3, 2], cutoff=5, frac=1, random_state=1)
+        assert_array_almost_equal(frac, np.array([0, 0, 0]))
+
     def test_plot_scatter_matrix(self):
        self.test2 = ca.read(self.test2_biom, self.test2_samp, self.test2_feat, normalize=100)
        fids = ['AA', 'AT', 'AG', 'AC']
diff --git a/calour/tests/test_training.py b/calour/tests/test_training.py
index f4ada4f1..32ecae58 100644
--- a/calour/tests/test_training.py
+++ b/calour/tests/test_training.py
@@ -20,21 +20,29 @@ def setUp(self):
         self.test2_sparse = ca.read(self.test2_biom, self.test2_samp, self.test2_feat, normalize=None)
         self.test2_dense = ca.read(self.test2_biom, self.test2_samp, self.test2_feat, sparse=False, normalize=None)
 
-    def test_onehot_encode_features(self):
-        new = self.test2_sparse.onehot_encode_features(['categorical'])
+    def test_add_sample_metadata_as_features(self):
+        new = self.test2_sparse.add_sample_metadata_as_features(['categorical'])
         dat = new.data.toarray()
         assert_array_equal(dat[:, 0:3], [[1, 0, 0], [0, 1, 0], [0, 0, 1]] * 3)
         self.assertListEqual(new.feature_metadata.index[:3].tolist(),
                              ['categorical=A', 'categorical=B', 'categorical=C'])
 
-    def test_onehot_encode_features_dense(self):
-        new = self.test2_dense.onehot_encode_features(['categorical'])
+    def test_add_sample_metadata_as_features_dense(self):
+        new = self.test2_dense.add_sample_metadata_as_features(['categorical'])
         assert_array_equal(new.data[:, 0:3], [[1, 0, 0], [0, 1, 0], [0, 0, 1]] * 3)
         self.assertListEqual(new.feature_metadata.index[:3].tolist(),
                              ['categorical=A', 'categorical=B', 'categorical=C'])
 
+    def test_split_train_test(self):
+        train_X, test_X, train_y, test_y = self.test2_dense.split_train_test(
+            test_size=3, field='group', stratify='categorical', random_state=7)
+        self.assertListEqual(test_y.tolist(), [1, 2, 1])
+        self.assertListEqual(test_y.index.tolist(), ['S3', 'S8', 'S1'])
+        self.assertListEqual(train_y.tolist(), [2, 1, 1, 1, 1, 1])
+        self.assertListEqual(train_y.index.tolist(), ['S9', 'S6', 'S5', 'S2', 'S4', 'S7'])
+
 
 if __name__ == "__main__":
     main()
diff --git a/calour/training.py b/calour/training.py
index f26775e8..c6ee9742 100644
--- a/calour/training.py
+++ b/calour/training.py
@@ -11,29 +11,36 @@
 .. autosummary::
    :toctree: generated
 
-   onehot_encode_features
+   add_sample_metadata_as_features
 '''
 
 from logging import getLogger
 
 from sklearn.feature_extraction import DictVectorizer
+from sklearn.model_selection import train_test_split
 from scipy.sparse import hstack
 import pandas as pd
 import numpy as np
 
 from .experiment import Experiment
+from .amplicon_experiment import AmpliconExperiment
 
 logger = getLogger(__name__)
 
 
 @Experiment._record_sig
-def onehot_encode_features(exp: Experiment, fields, sparse=None, inplace=False):
+def add_sample_metadata_as_features(exp: Experiment, fields, sparse=None, inplace=False):
     '''Add covariates from sample metadata to the data table as features for machine learning.
 
-    This will convert the columns of categorical strings using one-hot encoding scheme and add them
-    into the data table as new features.
+    This will convert the columns of categorical strings using a one-hot
+    encoding scheme and add them into the data table as new features.
+
+    .. note:: This is only for numeric and/or nominal covariates in
+       sample metadata. If you want to add an ordinal column as a feature,
+       use `pandas.Series.map` to convert it to a numeric column first.
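+
+       For example (with a hypothetical ordinal column 'size'),
+       ``exp.sample_metadata['size'].map({'small': 1, 'medium': 2, 'large': 3})``
+       gives a numeric column that can then be added as a feature.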
 
     Examples
     --------
@@ -47,7 +54,7 @@ def onehot_encode_features(exp: Experiment, fields, sparse=None, inplace=False):
 
     Let's add the columns of `category` and `ph` as features into the data table:
 
-    >>> new = exp.onehot_encode_features(['category', 'ph'])
+    >>> new = exp.add_sample_metadata_as_features(['category', 'ph'])
     >>> new
     Experiment with 2 samples, 5 features
     >>> new.feature_metadata
@@ -89,12 +96,55 @@
     md = new.sample_metadata[fields]
     if sparse is None:
         sparse = new.sparse
+
     vec = DictVectorizer(sparse=sparse)
     encoded = vec.fit_transform(md.to_dict(orient='records'))
+
     if sparse:
-        new.data = hstack((encoded, new.data))
+        new.data = hstack((encoded, new.data), format='csr')
     else:
         new.data = np.concatenate([encoded, new.data], axis=1)
     # the order in the concatenation should be consistent with the data table
     new.feature_metadata = pd.concat([pd.DataFrame(index=vec.get_feature_names()), new.feature_metadata])
 
     return new
+
+
+def split_train_test(exp: Experiment, field, test_size, train_size=None, stratify=None, random_state=None):
+    '''Split the experiment into train and test sets.
+
+    Parameters
+    ----------
+    field : str
+        the sample metadata column to use as the prediction target ``y``
+    test_size, train_size, stratify, random_state :
+        passed through to :func:`sklearn.model_selection.train_test_split`.
+        ``stratify`` may also be a sample metadata column name, in which
+        case that column is used for stratification.
+
+    Returns
+    -------
+    tuple of (train_X, test_X, train_y, test_y)
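+
+    Examples
+    --------
+    A minimal sketch (assuming ``exp`` has a sample metadata column
+    'group' to predict and a 'categorical' column to stratify on):
+
+    >>> train_X, test_X, train_y, test_y = exp.split_train_test(
+    ...     field='group', test_size=3, stratify='categorical',
+    ...     random_state=7)  # doctest: +SKIP
+    >>> test_X.shape[0], len(test_y)  # doctest: +SKIP
+    (3, 3)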
+    '''
+    if isinstance(stratify, str):
+        # a column name was given; stratify on that sample metadata column
+        stratify = exp.sample_metadata[stratify]
+    y = exp.sample_metadata[field]
+    train_X, test_X, train_y, test_y = train_test_split(
+        exp.data, y, test_size=test_size, train_size=train_size, stratify=stratify, random_state=random_state)
+    return train_X, test_X, train_y, test_y
+
+
+@Experiment._record_sig
+def collect_cv_prediction(exp: Experiment, field, estimator, cv, scoring, inplace=False):
+    '''Collect predictions across cross-validation folds.
+
+    .. note:: Not implemented yet; the commented-out code below is only a sketch.
+
+    '''
+    # from sklearn.model_selection._split import check_cv
+    # from sklearn.base import is_classifier
+    # if inplace:
+    #     new = exp
+    # else:
+    #     new = exp.copy()
+    # cv = check_cv(cv, y, classifier=is_classifier(estimator))
+    # for params in paramgrid:
+    #     for train_x, train_y in cv:
+    #         estimator.fit(train_x, train_y)
+    # return yobs, yhat, model
+
+
+@Experiment._record_sig
+def learning_curve_depths(exp: AmpliconExperiment, field, groups=None,
+                          train_depths=np.array([0.1, 0.325, 0.55, 0.775, 1.]),
+                          cv=None, scoring=None, exploit_incremental_learning=False,
+                          n_jobs=1, pre_dispatch='all', verbose=0, shuffle=False,
+                          random_state=None):
+    '''Compute the learning curve with regard to sequencing depths (not implemented yet).'''
diff --git a/calour/transforming.py b/calour/transforming.py
index 9ac96741..6455a885 100644
--- a/calour/transforming.py
+++ b/calour/transforming.py
@@ -40,8 +40,8 @@
 from skbio.stats.composition import clr, centralize as skbio_centralize
 from skbio.stats import subsample_counts
 
-from .experiment import Experiment
-from .doc_init import ds
+from . import Experiment
+from ._doc import ds
 
 logger = getLogger(__name__)
 
@@ -305,6 +305,7 @@ def random_permute_data(exp: Experiment, normalize=True):
     -------
     Experiment
         With each feature shuffled independently
+
     '''
     newexp = exp.copy()
     newexp.sparse = False
diff --git a/calour/util.py b/calour/util.py
index f8758664..431c14a7 100644
--- a/calour/util.py
+++ b/calour/util.py
@@ -467,4 +467,7 @@ def register_functions(cls, modules=None):
                    '\n    exp : {0}'
                    '\n        Input experiment object.')
 
+        # some functions have no docstring; give them an empty one so the
+        # substitution below does not fail on None
+        if not f.__doc__:
+            f.__doc__ = ''
+
         f.__doc__ = p.sub(updated.format(cls.__name__, fn), f.__doc__)