Skip to content

Commit

Permalink
add a couple functions (#88)
Browse files Browse the repository at this point in the history
* add dropna function

* merge dropna into filter_by_metadata

keep only needed kwargs using docrep
allow set in _to_list()

* fix plot_shareness()

* add join experiment featurewise and fix plot_core_features()

fix other minor things

* fix an edge-case bug when the function param is a tuple

* rename doc_init to _doc and fix pep8

* add docstring
  • Loading branch information
RNAer authored and amnona committed Mar 24, 2018
1 parent c01c3ba commit 422648d
Show file tree
Hide file tree
Showing 20 changed files with 246 additions and 59 deletions.
4 changes: 1 addition & 3 deletions calour/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,5 @@
for fn, f in inspect.getmembers(Experiment, predicate=inspect.isfunction):
setattr(Experiment, fn, _convert_axis_name(f))

log = resource_filename(__package__, 'log.cfg')

# setting False allows other logger to print log.
fileConfig(log, disable_existing_loggers=False)
fileConfig(resource_filename(__package__, 'log.cfg'), disable_existing_loggers=False)
File renamed without changes.
28 changes: 28 additions & 0 deletions calour/amplicon_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,3 +254,31 @@ def split_taxonomy(self, field='taxonomy', sep=';',
the column names for the new columns split from ``field``
'''
self.feature_metadata[names] = self.feature_metadata[field].str.split(sep, expand=True)
# return so you can chain the functions
return self

def find_lowest_taxonomy(self, field='taxonomy', new_field='taxa'):
    '''Create a new column that contains the taxonomy of lowest possible level.

    For example, 'k__Bacteria; p__Firmicutes; c__Bacilli;
    o__Lactobacillales; f__Enterococcaceae; g__Enterococcus;
    s__' will return 'g__Enterococcus'.

    Parameters
    ----------
    field : str
        column name that contains all levels of taxonomy
    new_field : str
        new column name

    Returns
    -------
    AmpliconExperiment
        ``self``, with the column ``new_field`` added to its feature
        metadata, so the calls can be chained.
    '''
    def find_lowest(taxonomy):
        # Strip every level: levels are commonly separated by '; ', and the
        # leading space would otherwise make an empty level such as ' s__'
        # look longer than a bare rank prefix.
        levels = [level.strip() for level in taxonomy.split(';')]
        # A bare rank prefix such as 's__' is 3 characters long and names
        # nothing, so only longer levels are informative.
        informative = [level for level in levels if len(level) > 3]
        # Fall back to an empty string instead of failing with IndexError
        # when no level carries a name.
        return informative[-1] if informative else ''

    self.feature_metadata[new_field] = self.feature_metadata[field].apply(find_lowest)
    return self
3 changes: 2 additions & 1 deletion calour/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,8 @@ def inner(*args, **kwargs):
logger.debug('Run func {}'.format(fn))
new_exp = func(*args, **kwargs)
if exp._log is True:
param = ['%r' % i for i in args[1:]] + ['%s=%r' % (k, v) for k, v in kwargs.items()]
# do not use `'%r' % i` because it causes error when i is a tuple
param = ['{!r}'.format(i) for i in args[1:]] + ['{0!s}={1!r}'.format(k, v) for k, v in kwargs.items()]
param = ', '.join(param)
new_exp._call_history.append('{0}({1})'.format(fn, param))
exp._log = False
Expand Down
8 changes: 5 additions & 3 deletions calour/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@
import numpy as np
from scipy.sparse import issparse

from .experiment import Experiment
from . import Experiment
from ._doc import ds
from .util import _to_list
from .doc_init import ds


logger = getLogger(__name__)
Expand Down Expand Up @@ -514,7 +514,9 @@ def filter_mean(exp: Experiment, cutoff=0.01, **kwargs):

@Experiment._record_sig
def filter_ids(exp: Experiment, ids, axis=1, negate=False, inplace=False):
'''Filter samples or features based on a list index values
'''Filter samples or features based on a list of IDs.
.. note:: the order of samples or features is updated as the order given in ``ids``.
Parameters
----------
Expand Down
4 changes: 2 additions & 2 deletions calour/heatmap/heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

from .. import Experiment
from ..database import _get_database_class
from .._dendrogram import plot_tree
from ..util import _to_list, _transition_index
from ..doc_init import ds
from .._doc import ds
from .._dendrogram import plot_tree


logger = getLogger(__name__)
Expand Down
2 changes: 1 addition & 1 deletion calour/heatmap/plotgui.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import numpy as np
from matplotlib.gridspec import GridSpec

from ..doc_init import ds
from .._doc import ds

logger = getLogger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion calour/heatmap/plotgui_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# ----------------------------------------------------------------------------

from .plotgui import PlotGUI
from ..doc_init import ds
from .._doc import ds


class PlotGUI_CLI(PlotGUI):
Expand Down
4 changes: 3 additions & 1 deletion calour/heatmap/plotgui_jupyter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
import matplotlib

from .plotgui import PlotGUI
from ..doc_init import ds
from .._doc import ds


logger = getLogger(__name__)


@ds.with_indent(4)
class PlotGUI_Jupyter(PlotGUI):
'''Jupyter GUI of plotting.
Expand All @@ -19,6 +20,7 @@ class PlotGUI_Jupyter(PlotGUI):
Parameters
----------
%(PlotGUI.parameters)s
'''

@ds.with_indent(8)
Expand Down
2 changes: 1 addition & 1 deletion calour/heatmap/plotgui_qt5.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from PyQt5.QtCore import Qt

from .plotgui import PlotGUI
from ..doc_init import ds
from .._doc import ds

logger = getLogger(__name__)

Expand Down
7 changes: 2 additions & 5 deletions calour/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,9 @@
import pandas as pd
import biom

from .experiment import Experiment
from .amplicon_experiment import AmpliconExperiment
from .ms1_experiment import MS1Experiment
from . import Experiment, AmpliconExperiment, MS1Experiment
from .util import get_file_md5, get_data_md5, _get_taxonomy_string

from .doc_init import ds
from ._doc import ds


logger = getLogger(__name__)
Expand Down
47 changes: 47 additions & 0 deletions calour/manipulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,3 +267,50 @@ def join_experiments(exp: Experiment, other, field_name='experiments', prefixes=
newexp.data = all_data

return newexp


@Experiment._record_sig
def join_experiments_featurewise(exp: Experiment, other,
                                 field_name='_feature_origin_', origin_labels=('exp1', 'exp2')):
    '''Combine the features of two :class:`.Experiment` objects into one.

    An example of use cases is to combine the 16S and ITS amplicon data together.

    .. warning:: If a sample has only features in one Experiment
       object and not the other, the sample will be dropped from joining.

    Parameters
    ----------
    other : :class:`.Experiment`
        The ``Experiment`` object to combine with the current one. The
        sample metadata of the result is taken from ``exp`` and not
        from ``other``.
    field_name : ``None`` or str (optional)
        Name of the new ``feature_metadata`` field containing the experiment each feature is coming from.
        If it is None, don't add such column.
    origin_labels : tuple of (str, str) (optional)
        The text to label which experiment the feature is originated from:
        the first label is for ``exp`` and the second is for ``other``.

    Returns
    -------
    :class:`.Experiment`
        A new experiment with the samples shared by both experiments and
        the features from both experiments concatenated.
    '''
    logger.debug('Join 2 experiments featurewise:\n{!r}\n{!r}'.format(exp, other))
    # create an empty object
    newexp = exp.__class__(np.empty(shape=[0, 0]), pd.DataFrame(),
                           description='join %s & %s' % (exp.description, other.description))
    # keep only the samples present in both experiments; both experiments
    # are filtered with the same ID list so their data rows can be put side
    # by side (this relies on filter_ids returning the samples in the order
    # of the given IDs)
    sid = exp.sample_metadata.index.intersection(other.sample_metadata.index)
    exp = exp.filter_ids(sid, axis=0)
    other = other.filter_ids(sid, axis=0)
    fmd = pd.concat([exp.feature_metadata, other.feature_metadata], join='outer')
    # honor the documented contract: no origin column when field_name is None
    if field_name is not None:
        fmd[field_name] = [origin_labels[0]] * exp.shape[1] + [origin_labels[1]] * other.shape[1]
    newexp.sample_metadata = exp.sample_metadata
    newexp.feature_metadata = fmd
    # merge data table: features of ``other`` are appended as extra columns
    # NOTE(review): np.c_ concatenates dense arrays; confirm that both data
    # tables are dense when this function is called.
    newexp.data = np.c_[exp.data, other.data]

    return newexp
66 changes: 44 additions & 22 deletions calour/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
plot_enrichment
plot_diff_abundance_enrichment
plot_stacked_bar
plot_shareness
plot_core_features
plot_abund_prevalence
plot_scatter_matrix
'''

# ----------------------------------------------------------------------------
Expand Down Expand Up @@ -217,8 +218,8 @@ def plot_diff_abundance_enrichment(exp: Experiment, term_type='term', max_show=1
return ax2, newexp


def plot_shareness(exp: Experiment, field=None, steps=None, iterations=10, alpha=0.5, linewidth=0.7, ax=None):
'''Plot the number of shared features against the number of samples.
def plot_core_features(exp: Experiment, field=None, steps=None, cutoff=2, frac=0.9, iterations=10, alpha=0.5, linewidth=0.7, ax=None):
'''Plot the percentage of core features shared in increasing number of samples.
To see if there is a core feature set shared across most of the samples.
Expand All @@ -237,7 +238,12 @@ def plot_shareness(exp: Experiment, field=None, steps=None, iterations=10, alpha
field : str
sample metadata field to group samples
steps : iterable of int
the sizes of subsamples to compute the shareness.
the sizes of subsamples to compute the fraction of core features.
cutoff : numeric
the feature is considered present in a sample only if its abundance is >= cutoff.
frac : numeric
Must be between 0 and 1. The feature would be considered as a core feature
if it is present in ``frac`` fraction of samples.
iterations : int
repeat the subsampling multiple times and plot all the iterations
alpha : float
Expand All @@ -251,7 +257,6 @@ def plot_shareness(exp: Experiment, field=None, steps=None, iterations=10, alpha
-------
matplotlib.axes.Axes
The Axes object containing the plot.
'''
if ax is None:
from matplotlib import pyplot as plt
Expand All @@ -264,15 +269,16 @@ def plot_shareness(exp: Experiment, field=None, steps=None, iterations=10, alpha
def plot_lines(data, steps, label):
y_sum = np.zeros(len(steps))
for i in range(iterations):
x, y = _compute_frac_nonzero(data, steps)
y = _compute_frac_nonzero(data, steps, cutoff, frac, i)
y = y * 100
y_sum += y
if i == 0:
line, = ax.plot(x, y, alpha=alpha, linewidth=linewidth)
line, = ax.plot(steps, y, alpha=alpha, linewidth=linewidth)
else:
ax.plot(x, y, alpha=alpha, linewidth=linewidth, color=line.get_color())
ax.plot(steps, y, alpha=alpha, linewidth=linewidth, color=line.get_color())
y_ave = y_sum / iterations
# plot average of the iterations
ax.plot(x, y_ave, linewidth=linewidth * 3, label=label, color=line.get_color())
ax.plot(steps, y_ave, linewidth=linewidth * 3, label=label, color=line.get_color())

if field is None:
plot_lines(exp.data, steps, label='all samples')
Expand All @@ -286,39 +292,55 @@ def plot_lines(data, steps, label):
# because the shareness drops quickly, we plot it in log scale
ax.set_xscale('log')
ax.set_xlabel('sample number')
ax.set_ylabel('fraction of shared features')
ax.set_ylabel('shared features (%)')
return ax


def _compute_frac_nonzero(data, steps):
def _compute_frac_nonzero(data, steps, cutoff=2, frac=0.9, random_state=None):
'''iteratively compute the fraction of non-zeros in each column after subsampling rows.
Parameters
----------
data : 2-d array of numeric
sample in row and feature in column
steps : iterable of int
the sizes of subsamples
the subsample sizes (should be in descending order)
cutoff : numeric
the feature is considered present in a sample only if its abundance is >= cutoff.
frac : numeric
Must between 0 and 1. The feature would be considered as a core feature
if it is present in ``fac`` faction of samples.
random_state : int, RandomState instance or None, optional, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Return
------
tuple of 2 lists
steps and fractions
numpy.array
fractions of core features for each subsample size
'''
n_samples, n_features = data.shape

shared = []
for i in steps:
data = data[np.random.choice(n_samples, i, replace=False), :]
x = data > 0
shared = np.zeros(len(steps))
rand = np.random.RandomState(random_state)
if cutoff <= 0:
raise ValueError('You need to provide a positive value for `cutoff`: %r' % cutoff)
if frac <= 0 or frac > 1:
raise ValueError('You need to provide a value among (0, 1] for `frac`: %r' % frac)
for n, i in enumerate(steps):
data = data[rand.choice(n_samples, i, replace=False), :]
print(data)
x = data >= cutoff
# the count of samples that have the given feature
counts = x.sum(axis=0)
all_presence = np.sum(counts == i)
all_presence = np.sum(counts >= np.ceil(i * frac))
all_absence = np.sum(counts == 0)
# important: remove the features that are all zeros across the subset of samples
shared.append(all_presence / (n_features - all_absence))
shared[n] = all_presence / (n_features - all_absence)
# don't forget to update sample count
n_samples = data.shape[0]
return steps, shared
return shared


def plot_abund_prevalence(exp: Experiment, field, log=True, min_abund=0.01, alpha=0.5, linewidth=0.7, ax=None):
Expand Down
2 changes: 1 addition & 1 deletion calour/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from . import Experiment
from .transforming import log_n, transform, scale
from .util import _argsort
from .doc_init import ds
from ._doc import ds


logger = getLogger(__name__)
Expand Down
25 changes: 24 additions & 1 deletion calour/tests/test_manipulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
from copy import deepcopy

import numpy as np
import pandas as pd

import calour as ca

from calour._testing import Tests, assert_experiment_equal


Expand Down Expand Up @@ -50,6 +50,29 @@ def test_join_experiments(self):
fexp = newexp.filter_samples('experiments', ['t2'])
assert_experiment_equal(fexp, texp, ignore_md_fields=['experiments'])

def test_join_experiments_featurewise(self):
    '''Joining featurewise keeps the shared samples and concatenates the features.'''
    # first experiment: 2 samples (s2, s1) x 2 features
    smd_16s = pd.DataFrame({'category': ['B', 'A'], 'ph': [7.7, 6.6]},
                           index=['s2', 's1'])
    fmd_16s = pd.DataFrame({'motile': ['y', 'n']}, index=['16S1', '16S2'])
    otu1 = ca.Experiment(np.array([[0, 9], [7, 4]]), sparse=False,
                         sample_metadata=smd_16s, feature_metadata=fmd_16s)

    # second experiment: 3 samples (s1, s2, s3) x 1 feature
    smd_its = pd.DataFrame({'category': ['A', 'B', 'C'], 'ph': [6.6, 7.7, 8.8]},
                           index=['s1', 's2', 's3'])
    fmd_its = pd.DataFrame({'motile': [None]}, index=['ITS1'])
    otu2 = ca.Experiment(np.array([[6], [8], [10]]), sparse=False,
                         sample_metadata=smd_its, feature_metadata=fmd_its)

    observed = otu1.join_experiments_featurewise(otu2, 'origin', ('16S', 'ITS'))

    # expected: only the shared samples s1 and s2, with all 3 features
    smd_expected = pd.DataFrame({'category': ['A', 'B'], 'ph': [6.6, 7.7]},
                                index=['s1', 's2'])
    fmd_expected = pd.DataFrame({'motile': ['y', 'n', None],
                                 'origin': ['16S', '16S', 'ITS']},
                                index=['16S1', '16S2', 'ITS1'])
    expected = ca.Experiment(np.array([[7, 4, 6], [0, 9, 8]]), sparse=False,
                             sample_metadata=smd_expected, feature_metadata=fmd_expected)

    # reorder the samples
    observed = observed.filter_ids(expected.sample_metadata.index, axis=0)
    assert_experiment_equal(observed, expected)

def test_agg_by_metadata(self):
# test default conditions - on samples, not inplace, mean method
newexp = self.test1.aggregate_by_metadata('group')
Expand Down

0 comments on commit 422648d

Please sign in to comment.