Skip to content

Commit

Permalink
Add adjusted q-value to return from multiple hypothesis testing (#203)
Browse files Browse the repository at this point in the history
Add q-value (corrected p-values) to dsFDR (in diff_abundance() and correlation())
  • Loading branch information
amnona committed Jul 6, 2020
1 parent 23059d0 commit e128102
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 31 deletions.
33 changes: 20 additions & 13 deletions calour/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,12 @@

_CALOUR_STAT = '_calour_stat'
_CALOUR_PVAL = '_calour_pval'
_CALOUR_QVAL = '_calour_qval'
_CALOUR_DIRECTION = '_calour_direction'


@Experiment._record_sig
@format_docstring(_CALOUR_PVAL, _CALOUR_STAT)
@format_docstring(_CALOUR_PVAL, _CALOUR_QVAL, _CALOUR_STAT, _CALOUR_DIRECTION)
def correlation(exp: Experiment, field, method='spearman', nonzero=False, transform=None, numperm=1000, alpha=0.1, fdr_method='dsfdr', random_seed=None):
'''Find features with correlation to a numeric metadata field.
Expand Down Expand Up @@ -91,13 +92,14 @@ def correlation(exp: Experiment, field, method='spearman', nonzero=False, transf
Experiment
The experiment with only correlated features, sorted according to correlation coefficient.
* '{}' : the FDR adjusted p-values for each feature
* '{}' : the non-adjusted p-values for each feature
* '{}' : the FDR-adjusted q-values for each feature
* '{}' : the statistics (correlation coefficient if
the `method` is 'spearman' or 'pearson'). If it
is larger than zero for a given feature, it indicates this
feature is positively correlated with the sample metadata;
otherwise, negatively correlated.
* '{}' : in which of the 2 sample groups this given feature is increased.
'''
if field not in exp.sample_metadata.columns:
raise ValueError('Field %s not in sample_metadata. Possible fields are: %s' % (field, exp.sample_metadata.columns))
Expand Down Expand Up @@ -127,16 +129,16 @@ def correlation(exp: Experiment, field, method='spearman', nonzero=False, transf
else:
raise ValueError('Cannot use nonzero=True on methods except "pearson" or "spearman"')
# find the significant features
keep, odif, pvals = dsfdr.dsfdr(data, labels, method=method, transform_type=transform, alpha=alpha, numperm=numperm, fdr_method=fdr_method)
keep, odif, pvals, qvals = dsfdr.dsfdr(data, labels, method=method, transform_type=transform, alpha=alpha, numperm=numperm, fdr_method=fdr_method)
logger.info('Positive correlated features : %d. Negative correlated features : %d. total %d'
% (np.sum(odif[keep] > 0), np.sum(odif[keep] < 0), np.sum(keep)))
newexp = _new_experiment_from_pvals(cexp, exp, keep, odif, pvals)
newexp = _new_experiment_from_pvals(cexp, exp, keep, odif, pvals, qvals)
newexp.feature_metadata[_CALOUR_DIRECTION] = [field if x > 0 else 'Anti-%s' % field for x in newexp.feature_metadata[_CALOUR_STAT]]
return newexp


@Experiment._record_sig
@format_docstring(_CALOUR_PVAL, _CALOUR_STAT, _CALOUR_DIRECTION)
@format_docstring(_CALOUR_PVAL, _CALOUR_QVAL, _CALOUR_STAT, _CALOUR_DIRECTION)
def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', transform='rankdata', numperm=1000, alpha=0.1, fdr_method='dsfdr', random_seed=None):
'''Differential abundance test between 2 groups of samples for all the features.
Expand Down Expand Up @@ -195,7 +197,8 @@ def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', t
according to their effect size. The new experiment contains
additional ``feature_metadata`` fields that include:
* '{}' : the FDR adjusted p-values for each feature
* '{}' : the non-adjusted p-values for each feature
* '{}' : the FDR-adjusted q-values for each feature
* '{}' : the effect size (t-statistic). If it is larger than
zero for a given feature, it indicates this feature is
increased in the first group of samples (``val1``); if
Expand Down Expand Up @@ -231,10 +234,10 @@ def diff_abundance(exp: Experiment, field, val1, val2=None, method='meandiff', t
labels = np.zeros(len(cexp.sample_metadata))
labels[cexp.sample_metadata[field].isin(val1).values] = 1
logger.info('%d samples with value 1 (%s)' % (np.sum(labels), val1))
keep, odif, pvals = dsfdr.dsfdr(data, labels, method=method, transform_type=transform, alpha=alpha, numperm=numperm, fdr_method=fdr_method)
keep, odif, pvals, qvals = dsfdr.dsfdr(data, labels, method=method, transform_type=transform, alpha=alpha, numperm=numperm, fdr_method=fdr_method)
logger.info('number of higher in {}: {}. number of higher in {} : {}. total {}'.format(
grp1, np.sum(odif[keep] > 0), grp2, np.sum(odif[keep] < 0), np.sum(keep)))
newexp = _new_experiment_from_pvals(cexp, exp, keep, odif, pvals)
newexp = _new_experiment_from_pvals(cexp, exp, keep, odif, pvals, qvals)
newexp.feature_metadata[_CALOUR_DIRECTION] = [grp1 if x > 0 else grp2 for x in newexp.feature_metadata[_CALOUR_STAT]]
return newexp

Expand Down Expand Up @@ -296,13 +299,13 @@ def diff_abundance_kw(exp: Experiment, field, transform='rankdata', numperm=1000
for idx, clabel in enumerate(exp.sample_metadata[field].unique()):
labels[exp.sample_metadata[field].values == clabel] = idx
logger.debug('Found %d unique sample labels' % (idx + 1))
keep, odif, pvals = dsfdr.dsfdr(data, labels, method='kruwallis', transform_type=transform, alpha=alpha, numperm=numperm, fdr_method=fdr_method)
keep, odif, pvals, qvals = dsfdr.dsfdr(data, labels, method='kruwallis', transform_type=transform, alpha=alpha, numperm=numperm, fdr_method=fdr_method)

logger.info('Found %d significant features' % (np.sum(keep)))
return _new_experiment_from_pvals(cexp, exp, keep, odif, pvals)
return _new_experiment_from_pvals(cexp, exp, keep, odif, pvals, qvals)


def _new_experiment_from_pvals(cexp, exp, keep, odif, pvals):
def _new_experiment_from_pvals(cexp, exp, keep, odif, pvals, qvals):
'''Combine the pvalues and effect size into a new experiment.
Keep only the significant features, sort the features by the effect size
Expand All @@ -317,8 +320,10 @@ def _new_experiment_from_pvals(cexp, exp, keep, odif, pvals):
One entry per exp feature. True for the features which are significant (following FDR correction)
odif : np.array of float
One entry per exp feature. The effect size per feature (can be positive or negative)
pval : np.array of float
pvals : np.array of float
One entry per exp feature. The p-value associated with each feature (not FDR corrected)
qvals: np.array of float
One entry per exp feature. The q-value associated with each feature (FDR corrected)
Returns
-------
Expand All @@ -335,10 +340,12 @@ def _new_experiment_from_pvals(cexp, exp, keep, odif, pvals):
newexp = exp.filter_ids(newexp.feature_metadata.index.values)
odif = odif[keep[0]]
pvals = pvals[keep[0]]
qvals = qvals[keep[0]]
si = np.argsort(odif, kind='mergesort')
odif = odif[si]
pvals = pvals[si]
newexp = newexp.reorder(si, axis=1)
newexp.feature_metadata[_CALOUR_STAT] = odif
newexp.feature_metadata[_CALOUR_PVAL] = pvals
newexp.feature_metadata[_CALOUR_QVAL] = qvals
return newexp
39 changes: 24 additions & 15 deletions calour/dsfdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',
input:
data : N x S numpy array
each column is a sample (S total), each row an OTU (N total)
each column is a sample (S total), each row a feature (N total)
labels : a 1d numpy array (length S)
the labels of each sample (same order as data) with the group
(0/1 if binary, 0-G-1 if G groups, or numeric values for correlation)
Expand All @@ -112,8 +112,8 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',
transform_type : str or None
transformation to apply to the data before caluculating
the test statistic
'rankdata' : rank transfrom each OTU reads
'log2data' : calculate log2 for each OTU using minimal cutoff of 2
'rankdata' : rank transfrom each feature
'log2data' : calculate log2 for each feature using minimal cutoff of 2
'normdata' : normalize the data to constant sum per samples
'binarydata' : convert to binary absence/presence
None : no transformation to perform
Expand Down Expand Up @@ -146,11 +146,13 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',
output:
reject : np array of bool (length N)
True for OTUs where the null hypothesis is rejected
True for features where the null hypothesis is rejected
tstat : np array of float (length N)
the test statistic value for each OTU (for effect size)
the test statistic value for each feature (for effect size)
pvals : np array of float (length N)
the p-value for each OTU
the p-value (uncorrected) for each feature
qvals: np array of float (length N)
the q-value (corrected p-value) for each feature.
'''

logger.debug('dsfdr using fdr method: %s' % fdr_method)
Expand Down Expand Up @@ -304,8 +306,7 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',
u[:, cperm] = rt
u = np.abs(u)
else:
print('unsupported method %s' % method)
return None, None
raise ValueError('unsupported method %s' % method)

# fix floating point errors (important for permutation values!)
# https://github.com/numpy/numpy/issues/8116
Expand All @@ -315,6 +316,7 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',

# calculate permutation p-vals
pvals = np.zeros([numbact]) # p-value for original test statistic t
qvals = np.ones([numbact]) # q-value (corrected p-value) for each feature.
pvals_u = np.zeros([numbact, numperm])
# pseudo p-values for permutated test statistic u
for crow in range(numbact):
Expand Down Expand Up @@ -342,30 +344,37 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',
allfdr.append(fdr)
allt.append(cp)
if fdr <= alpha:
realcp = cp
foundit = True
break
if not foundit:
realcp = cp
foundit = True

if not foundit:
# no good threshold was found
reject = np.repeat([False], numbact)
return reject, tstat, pvals
return reject, tstat, pvals, qvals

# fill the reject null hypothesis
reject = np.zeros(numbact, dtype=int)
reject = (pvals <= realcp)

# fill the q-values
for idx, cfdr in enumerate(allfdr):
# fix for qval > 1 (since we count on all features in random permutation)
cfdr = np.min([cfdr, 1])
cpval = allt[idx]
qvals[pvals == cpval] = cfdr

elif fdr_method == 'bhfdr' or fdr_method == 'filterBH':
t_star = np.array([t, ] * numperm).transpose()
pvals = (np.sum(u >= t_star, axis=1) + 1) / (numperm + 1)
reject = multipletests(pvals, alpha=alpha, method='fdr_bh')[0]
reject, qvals, *_ = multipletests(pvals, alpha=alpha, method='fdr_bh')

elif fdr_method == 'byfdr':
t_star = np.array([t, ] * numperm).transpose()
pvals = (np.sum(u >= t_star, axis=1) + 1) / (numperm + 1)
reject = multipletests(pvals, alpha=alpha, method='fdr_by')[0]
reject, qvals, *_ = multipletests(pvals, alpha=alpha, method='fdr_by')

else:
raise ValueError('fdr method %s not supported' % fdr_method)

return reject, tstat, pvals
return reject, tstat, pvals, qvals
10 changes: 7 additions & 3 deletions calour/tests/test_dsfdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,19 +126,21 @@ def test_dsfdr(self):
res_ds = dsfdr.dsfdr(self.data, self.labels, method='meandiff',
transform_type=None, alpha=0.1, numperm=1000,
fdr_method='dsfdr')
self.assertEqual(np.shape(res_ds)[0], self.data.shape[0])
self.assertEqual(np.shape(res_ds[0])[0], self.data.shape[0])
np.testing.assert_array_equal(res_ds[0], [True, False, False])
# test the qvals behave logically
self.assertEqual(np.sum(res_ds[3] <= 0.1), np.sum(res_ds[0]))

res_bh = dsfdr.dsfdr(self.data, self.labels, method='meandiff',
transform_type=None, alpha=0.1, numperm=1000,
fdr_method='bhfdr')
self.assertEqual(np.shape(res_bh)[0], self.data.shape[0])
self.assertEqual(np.shape(res_bh[0])[0], self.data.shape[0])
np.testing.assert_array_equal(res_bh[0], [True, False, False])

res_by = dsfdr.dsfdr(self.data, self.labels, method='meandiff',
transform_type=None, alpha=0.1, numperm=1000,
fdr_method='byfdr')
self.assertEqual(np.shape(res_by)[0], self.data.shape[0])
self.assertEqual(np.shape(res_by[0])[0], self.data.shape[0])
np.testing.assert_array_equal(res_by[0], [True, False, False])

# test on simulated self.data_sim
Expand All @@ -148,6 +150,8 @@ def test_dsfdr(self):
alpha=0.1, numperm=1000, fdr_method='dsfdr')[0]
fdr_ds2 = (np.sum(np.where(res_ds2)[0] >= 100)) / np.sum(res_ds2)
np.testing.assert_equal(fdr_ds2 <= 0.1, True)
# test the qvals behave logically
self.assertEqual(np.sum(res_ds2[3] <= 0.1), np.sum(res_ds2[0] is True))

np.random.seed(31)
res_bh2 = dsfdr.dsfdr(self.data_sim, self.labels_sim,
Expand Down

0 comments on commit e128102

Please sign in to comment.