-
Notifications
You must be signed in to change notification settings - Fork 22
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ms1 experiment #213
Ms1 experiment #213
Changes from all commits
a3fd298
a923f53
9d94cef
a437253
9736fb8
87a664e
2c0f99a
df37c5d
97d6530
28dea54
eac3b60
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,9 +23,10 @@ | |
from logging import getLogger | ||
|
||
import matplotlib as mpl | ||
import numpy as np | ||
|
||
from .experiment import Experiment | ||
|
||
from .util import _to_list | ||
|
||
logger = getLogger(__name__) | ||
|
||
|
@@ -91,9 +92,200 @@ def heatmap(self, *args, **kwargs): | |
''' | ||
if 'norm' not in kwargs: | ||
kwargs['norm'] = mpl.colors.LogNorm() | ||
if 'mz_rt' in self.feature_metadata.columns: | ||
if 'yticklabel_len' not in kwargs: | ||
kwargs['yticklabel_len'] = None | ||
if 'feature_field' not in kwargs: | ||
kwargs['feature_field'] = 'mz_rt' | ||
if 'yticklabel_kwargs' not in kwargs: | ||
kwargs['yticklabel_kwargs'] = {'size': 6, 'rotation': 0} | ||
super().heatmap(*args, **kwargs) | ||
|
||
def __repr__(self): | ||
'''Return a string representation of this object.''' | ||
return 'MS1Experiment %s with %d samples, %d features' % ( | ||
self.description, self.data.shape[0], self.data.shape[1]) | ||
|
||
def get_spurious_duplicates(self, mz_tolerance=0.001, rt_tolerance=2, corr_thresh=0.8, inplace=False, negate=False): | ||
'''Get subgroups of metabolites that are suspected ms1 alignment artifacts. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can describe how the filtering is performed? users have to look at code to understand what it is doing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed. is that clear enough now? |
||
The function returns a calour.MS1Experiment with groups of metabolites that (within each group) have similar m/z and rt, and are highly | ||
correlated/anti-correlated. These are usually due to incorrect feature detection/alignment and can be used to optimize the feature selection parameters. | ||
correlation could be due to incomplete removal of isotopes or same metabolite in multiple RTs | ||
anti-correlation could be due to RT drift (splitting of one true metabolite) | ||
Metabolites in the new experiment are ordered by correlation clusters | ||
|
||
Parameters | ||
---------- | ||
mz_tolerance: float, optional | ||
the M/Z tolerance. Metabolites are similar if abs(metabolite_mz - mz) <= mz_tolerance | ||
rt_tolerance: float, optional | ||
the retention time tolerance. Metabolites are similar if abs(metabolite_rt - rt) <= rt_tolerance | ||
corr_threshold: float, optional | ||
the minimal (abs) correlation/anti-correlation value in order to call features correlated | ||
inplace: bool, optional | ||
True to replace current experiment, False to create new experiment with results | ||
negate: bool, optional | ||
If False, keep only metabolites that show a correlation with another metabolite | ||
If True, remove metabolites showing correlation | ||
|
||
Returns | ||
------- | ||
MS1Experiment | ||
features filtered and ordered basen on m/z and rt similarity and correlation | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what do you do here? remove (semi-) duplicate features? or combine them? you need a better function name! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. any ideas? this is the best name i could come up with :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so these are spurious duplicate features? if so, how about get_spurious_duplicates? what's the workfflow here? what do u do after you get these features? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. feature selection in ms is complicated... and lots of parameters |
||
''' | ||
features = self.feature_metadata.copy() | ||
keep_features = [] | ||
data = self.get_data(sparse=False) | ||
while len(features) > 0: | ||
# get the first feature | ||
cfeature = features.iloc[0] | ||
features.drop(index=cfeature.name, inplace=True) | ||
# find all mz/rt neighbors of the feature | ||
mzdist = np.abs(features['MZ'] - cfeature['MZ']) | ||
rtdist = np.abs(features['RT'] - cfeature['RT']) | ||
okf = features[np.logical_and(mzdist <= mz_tolerance, rtdist <= rt_tolerance)] | ||
if len(okf) == 0: | ||
continue | ||
# test the correlation of each neighbor | ||
odat = data[:, self.feature_metadata.index.get_loc(cfeature.name)] | ||
ckeep = [] | ||
for cf, *_ in okf.iterrows(): | ||
cdat = data[:, self.feature_metadata.index.get_loc(cf)] | ||
corrcf = np.corrcoef(odat, cdat)[0, 1] | ||
if np.abs(corrcf) >= corr_thresh: | ||
ckeep.append(cf) | ||
# store the result and remove all the correlated features from the features left to process | ||
if len(ckeep) > 0: | ||
keep_features.append(cfeature.name) | ||
keep_features.extend(ckeep) | ||
features.drop(index=ckeep, inplace=True) | ||
return self.filter_ids(keep_features, negate=negate, inplace=inplace) | ||
|
||
def merge_similar_features(self, mz_tolerance=0.001, rt_tolerance=0.5): | ||
'''Merge metabolites with similar mz/rt to a single metabolite | ||
|
||
Metabolites are initially sorted by frequency and a greedy clustering algorithm (starting from the highest freq.) is used to join together | ||
metabolites that are close in m/z and r/t, combining them to a signle metabolite with freq=sum(freq) of all metabolites in the cluster. | ||
|
||
Parameters | ||
---------- | ||
mz_tolerance: float, optional | ||
metabolites with abs(metabolite_mz - mz) <= mz_tolerance are joined | ||
rt_tolerance: float, optional | ||
metabolites with abs(metabolite_rt - rt) <= rt_tolerance are joined | ||
|
||
Returns | ||
------- | ||
MS1Experiment | ||
With close metabolites joined to a single metabolite. | ||
The m/z and rt of the new metabolite are the m/z and rt of the highest freq. metabolite. Frequency of the new metabolite is the sum of frequencies | ||
of all joined metabolites. | ||
New feature_metadata fields: _calour_merge_number, _calour_merge_ids are added listing the number and ids of the metabolites joined for each new metabolite | ||
''' | ||
exp = self.sort_abundance(reverse=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why sorting? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We want the center of each cluster to be the highest frequency metabolite and to join to it all the close metabolites. |
||
features = exp.feature_metadata | ||
features['_metabolite_group'] = np.zeros(len(features)) - 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. instead of of messing and clutter feature metadata data frame, could you modify There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do you mean to make the field var in agg_be metadata can be a list/pd.series etc? |
||
gpos = list(features.columns).index('_metabolite_group') | ||
cgroup = 0 | ||
for cgroup, cfeature in features.iterrows(): | ||
mzdist = np.abs(features['MZ'] - cfeature['MZ']) | ||
rtdist = np.abs(features['RT'] - cfeature['RT']) | ||
ok = (mzdist <= mz_tolerance) & (rtdist <= rt_tolerance) & (features['_metabolite_group'] == -1) | ||
okpos = np.where(ok)[0] | ||
for cpos in okpos: | ||
features.iat[cpos, gpos] = cgroup | ||
exp = exp.aggregate_by_metadata('_metabolite_group', agg='sum', axis='f') | ||
exp.feature_metadata.drop('_metabolite_group', axis='columns', inplace=True) | ||
logger.info('%d metabolites remaining after merge' % len(exp.feature_metadata)) | ||
return exp | ||
|
||
def filter_mz_rt(self, mz=None, rt=None, mz_tolerance=0.05, rt_tolerance=0.2, inplace=False, negate=False): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. by the way, we should consistent with naming - is this more appropriate to name it "filter_by_mz_rt"? |
||
'''Filter metabolites based on m/z and/or retention time | ||
|
||
Keep (or remove if negate=True) metabolites that have an m/z and/or retention time close (up to tolerance) | ||
to the requested mz and/or rt (or list of mz and/or rt). | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. requested -> input |
||
If both mz and rt are provided, they should be matched (i.e. filtering is performed using each mz and rt pair with same index) | ||
|
||
Parameters | ||
---------- | ||
mz: float or list of float or None, optional | ||
the M/Z to filter | ||
if None, do not filter based on M/Z | ||
rt: float or list of float or None, optional | ||
the retention time to filter | ||
if None, do not filter based on rt | ||
mz_tolerance: float, optional | ||
the M/Z tolerance. filter metabolites with abs(metabolite_mz - mz) <= mz_tolerance | ||
rt_tolerance: float, optional | ||
the rt tolerance. filter metabolites with abs(metabolite_rt - rt) <= rt_tolerance | ||
inplace: bool, optional | ||
True to replace current experiment, False to create new experiment with results | ||
negate: bool, optional | ||
If False, keep only metabolites matching mz | ||
If True, remove metabolites matching mz | ||
|
||
Returns | ||
------- | ||
MS1Experiment | ||
features filtered based on mz | ||
''' | ||
if mz is None and rt is None: | ||
raise ValueError('at least one of "mz" and "rt" must not be None') | ||
if mz is not None: | ||
if 'MZ' not in self.feature_metadata.columns: | ||
raise ValueError('The Experiment does not contain the column "MZ". cannot filter by mz') | ||
else: | ||
mz = _to_list(mz) | ||
if rt is not None: | ||
if 'RT' not in self.feature_metadata.columns: | ||
raise ValueError('The Experiment does not contain the column "RT". cannot filter by rt') | ||
else: | ||
rt = _to_list(rt) | ||
|
||
select = np.zeros(len(self.feature_metadata), dtype='?') | ||
notfound = 0 | ||
if mz is None: | ||
mz = [None] * len(rt) | ||
if rt is None: | ||
rt = [None] * len(mz) | ||
if len(mz) != len(rt): | ||
raise ValueError('mz and rt must have same length') | ||
|
||
for cmz, crt in zip(mz, rt): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mz and rt should be matched if they are list-like? pls document this in the docstring. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
if cmz is not None: | ||
mzdiff = np.abs(self.feature_metadata['MZ'] - cmz) | ||
keepmz = mzdiff <= mz_tolerance | ||
else: | ||
keepmz = np.full([len(self.feature_metadata)], True) | ||
if crt is not None: | ||
rtdiff = np.abs(self.feature_metadata['RT'] - crt) | ||
keeprt = rtdiff <= rt_tolerance | ||
else: | ||
keeprt = np.full([len(self.feature_metadata)], True) | ||
bothok = np.logical_and(keepmz, keeprt) | ||
if bothok.sum() == 0: | ||
notfound += 1 | ||
select = np.logical_or(select, bothok) | ||
|
||
logger.info('Total from mz/rt list with no match: %d' % notfound) | ||
if negate: | ||
select = np.logical_not(select) | ||
return self.reorder(select, axis='f', inplace=inplace) | ||
|
||
def sort_mz_rt(self, inplace=False): | ||
'''Sort features according to m/z and retention time. | ||
|
||
This is a convenience function wrapping calour.sort_by_metadata() | ||
|
||
Parameters | ||
---------- | ||
inplace: bool, optional | ||
True to replace current experiment, False to create new experiment with results | ||
|
||
Returns | ||
------- | ||
MS1Experiment | ||
Sorted according to m/z and retention time | ||
''' | ||
return self.sort_by_metadata('mz_rt', axis='f', inplace=inplace) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i don't understand why we need this. this function add more code maintainance burden but doesn't provide any benefits beyond There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it helps make the analysis notebook more easy to understand. Important for non-expert calour users. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# ---------------------------------------------------------------------------- | ||
# Copyright (c) 2016--, Calour development team. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file COPYING.txt, distributed with this software. | ||
# ---------------------------------------------------------------------------- | ||
|
||
import numpy.testing as npt | ||
|
||
from calour._testing import Tests | ||
import calour as ca | ||
|
||
|
||
class ExperimentTests(Tests): | ||
def setUp(self): | ||
super().setUp() | ||
self.test1 = ca.read_amplicon(self.test1_biom, self.test1_samp, | ||
min_reads=1000, normalize=10000) | ||
|
||
def test_filter_mz_rt(self): | ||
# load an mzmine2 metabolomics table, and associated gnps clusterinfo file | ||
exp = ca.read_ms(self.mzmine2_csv, sample_metadata_file=self.gnps_map, | ||
data_file_type='mzmine2', use_gnps_id_from_AllFiles=False, normalize=None) | ||
|
||
# mz filtering | ||
res = exp.filter_mz_rt(100) | ||
self.assertEqual(len(res.feature_metadata), 1) | ||
self.assertEqual(res.feature_metadata['MZ'].values, [100]) | ||
|
||
res = exp.filter_mz_rt([100, 201]) | ||
self.assertEqual(len(res.feature_metadata), 1) | ||
self.assertEqual(res.feature_metadata['MZ'].values, [100]) | ||
|
||
res = exp.filter_mz_rt([100, 201], mz_tolerance=1) | ||
self.assertEqual(len(res.feature_metadata), 2) | ||
npt.assert_array_equal(res.feature_metadata['MZ'].values, [100, 200]) | ||
|
||
res = exp.filter_mz_rt([100, 201], negate=True) | ||
self.assertEqual(len(res.feature_metadata), 5) | ||
|
||
# rt filtering | ||
res = exp.filter_mz_rt(rt=[1, 2.5]) | ||
self.assertEqual(len(res.feature_metadata), 1) | ||
self.assertEqual(res.feature_metadata['RT'].values, [1]) | ||
|
||
res = exp.filter_mz_rt(rt=[1, 2.5], rt_tolerance=0.5) | ||
self.assertEqual(len(res.feature_metadata), 3) | ||
npt.assert_array_equal(res.feature_metadata['RT'].values, [1, 2, 3]) | ||
|
||
# complex - both mz and rt | ||
res = exp.filter_mz_rt([101, 200, 400, 505], [1, 3, 4, 5], mz_tolerance=2) | ||
self.assertEqual(res.shape[1], 2) | ||
|
||
def test_get_spurious_duplicates(self): | ||
# load an mzmine2 metabolomics table, and associated gnps clusterinfo file | ||
exp = ca.read_ms(self.mzmine2_csv, sample_metadata_file=self.gnps_map, | ||
data_file_type='mzmine2', use_gnps_id_from_AllFiles=False, normalize=None) | ||
# get rid of the all 0s metabolite (to get rid of std=0 warning) | ||
exp = exp.filter_sum_abundance(0.1) | ||
|
||
res = exp.get_spurious_duplicates() | ||
# no samples filtered away | ||
self.assertEqual(res.shape[0], 6) | ||
# default parameters don't identify and suspicious features | ||
self.assertEqual(res.shape[1], 0) | ||
|
||
res = exp.get_spurious_duplicates(mz_tolerance=100, rt_tolerance=0.5) | ||
self.assertEqual(res.shape[1], 0) | ||
|
||
res = exp.get_spurious_duplicates(rt_tolerance=1) | ||
self.assertEqual(res.shape[1], 0) | ||
|
||
res = exp.get_spurious_duplicates(mz_tolerance=100, rt_tolerance=1) | ||
self.assertEqual(res.shape[1], 2) | ||
|
||
res = exp.get_spurious_duplicates(mz_tolerance=100, rt_tolerance=1, corr_thresh=0.2) | ||
self.assertEqual(res.shape[1], 4) | ||
|
||
def test_merge_similar_features(self): | ||
# load an mzmine2 metabolomics table, and associated gnps clusterinfo file | ||
exp = ca.read_ms(self.mzmine2_csv, sample_metadata_file=self.gnps_map, | ||
data_file_type='mzmine2', use_gnps_id_from_AllFiles=False, normalize=None) | ||
# no merging since features are far away | ||
res = exp.merge_similar_features() | ||
self.assertEqual(res.shape[1], 6) | ||
|
||
# a little merging | ||
res = exp.merge_similar_features(mz_tolerance=100, rt_tolerance=1) | ||
self.assertEqual(res.shape[1], 3) | ||
self.assertEqual(res.feature_metadata.at[85022, '_calour_merge_ids'], '85022;93277') | ||
|
||
# a lot of merging | ||
res = exp.merge_similar_features(mz_tolerance=400, rt_tolerance=6) | ||
self.assertEqual(res.shape[1], 2) | ||
self.assertEqual(res.feature_metadata.at[121550, '_calour_merge_ids'], '121550') |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this function should use your new
filter_mz_rt
function? you have redundant code in the 2 functions.