-
Notifications
You must be signed in to change notification settings - Fork 22
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ms1 experiment #213
Ms1 experiment #213
Changes from 5 commits
a3fd298
a923f53
9d94cef
a437253
9736fb8
87a664e
2c0f99a
df37c5d
97d6530
28dea54
eac3b60
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,9 +23,10 @@ | |
from logging import getLogger | ||
|
||
import matplotlib as mpl | ||
import numpy as np | ||
|
||
from .experiment import Experiment | ||
|
||
from .util import _to_list | ||
|
||
logger = getLogger(__name__) | ||
|
||
|
@@ -91,9 +92,102 @@ def heatmap(self, *args, **kwargs): | |
''' | ||
if 'norm' not in kwargs: | ||
kwargs['norm'] = mpl.colors.LogNorm() | ||
if 'mz_rt' in self.feature_metadata.columns: | ||
if 'yticklabel_len' not in kwargs: | ||
kwargs['yticklabel_len'] = None | ||
if 'feature_field' not in kwargs: | ||
kwargs['feature_field'] = 'mz_rt' | ||
if 'yticklabel_kwargs' not in kwargs: | ||
kwargs['yticklabel_kwargs'] = {'size': 6, 'rotation': 0} | ||
super().heatmap(*args, **kwargs) | ||
|
||
def __repr__(self): | ||
'''Return a string representation of this object.''' | ||
return 'MS1Experiment %s with %d samples, %d features' % ( | ||
self.description, self.data.shape[0], self.data.shape[1]) | ||
|
||
def filter_mz(self, mz, tolerance=0.001, inplace=False, negate=False): | ||
'''Filter metabolites based on m/z | ||
|
||
Parameters | ||
---------- | ||
mz: float or list of float | ||
the M/Z to filter | ||
tolerance: float, optional | ||
the M/Z tolerance. filter metabolites with abs(metabolite_mz - mz) <= tolerance | ||
inplace: bool, optional | ||
True to replace current experiment, False to create new experiment with results | ||
negate: bool, optional | ||
If False, keep only metabolites matching mz | ||
If True, remove metabolites matching mz | ||
|
||
Returns | ||
------- | ||
calour.MS1Experiment | ||
features filtered based on mz | ||
''' | ||
if 'MZ' not in self.feature_metadata.columns: | ||
raise ValueError('The Experiment does not contain the column "MZ". cannot filter by mz') | ||
mz = _to_list(mz) | ||
keep = set() | ||
for cmz in mz: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. better to use a boolean mask? like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. changed to filter_mz_rt() to enable filtering based on mz / rt/ or both |
||
mzdiff = np.abs(self.feature_metadata['MZ'] - cmz) | ||
keep = keep.union(set(np.where(mzdiff <= tolerance)[0])) | ||
if negate: | ||
keep = set(np.arange(len(self.feature_metadata))).difference(keep) | ||
logger.info('after filtering %d features remaining' % len(keep)) | ||
return self.reorder(sorted(list(keep)), axis='f', inplace=inplace) | ||
|
||
def get_bad_features(self, mz_tolerance=0.001, rt_tolerance=2, corr_thresh=0.8, inplace=False, negate=False): | ||
'''Get metabolites that have similar m/z and rt, and are correlated/anti-correlated. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a blank line. |
||
These are usually due to incorrect feature detection. | ||
correlation could be due to incomplete removal of isotopes or same metabolite in multiple RTs | ||
anti-correlation could be due to RT drift (splitting of one true metabolite) | ||
Metabolites in the new experiment are ordered by correlation clusters | ||
|
||
Parameters | ||
---------- | ||
mz_tolerance: float, optional | ||
the M/Z tolerance. Metabolites are similar if abs(metabolite_mz - mz) <= mz_tolerance | ||
rt_tolerance: float, optional | ||
the retention time tolerance. Metabolites are similar if abs(metabolite_rt - rt) <= rt_tolerance | ||
corr_threshold: float, optional | ||
the minimal (abs) correlation/anti-correlation value in order to call features correlated | ||
inplace: bool, optional | ||
True to replace current experiment, False to create new experiment with results | ||
negate: bool, optional | ||
If False, keep only metabolites that show a correlation with another metabolite | ||
If True, remove metabolites showing correlation | ||
|
||
Returns | ||
------- | ||
calour.MS1Experiment | ||
features filtered and ordered basen on m/z and rt similarity and correlation | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what do you do here? remove (semi-) duplicate features? or combine them? you need a better function name! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. any ideas? this is the best name i could come up with :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so these are spurious duplicate features? if so, how about get_spurious_duplicates? what's the workfflow here? what do u do after you get these features? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. feature selection in ms is complicated... and lots of parameters |
||
''' | ||
features = self.feature_metadata.copy() | ||
keep_features = [] | ||
data = self.get_data(sparse=False) | ||
while len(features) > 0: | ||
# get the first feature | ||
cfeature = features.iloc[0] | ||
features.drop(index=cfeature.name, inplace=True) | ||
# find all mz/rt neighbors of the feature | ||
mzdist = np.abs(features['MZ'] - cfeature['MZ']) | ||
rtdist = np.abs(features['RT'] - cfeature['RT']) | ||
okf = features[np.logical_and(mzdist <= mz_tolerance, rtdist <= rt_tolerance)] | ||
if len(okf) == 0: | ||
continue | ||
# test the correlation of each neighbor | ||
odat = data[:, self.feature_metadata.index.get_loc(cfeature.name)] | ||
ckeep = [] | ||
for cf, *_ in okf.iterrows(): | ||
cdat = data[:, self.feature_metadata.index.get_loc(cf)] | ||
corrcf = np.corrcoef(odat, cdat)[0, 1] | ||
if np.abs(corrcf) >= corr_thresh: | ||
ckeep.append(cf) | ||
# store the result and remove all the correlated features from the features left to process | ||
if len(ckeep) > 0: | ||
keep_features.append(cfeature.name) | ||
keep_features.extend(ckeep) | ||
features.drop(index=ckeep, inplace=True) | ||
return self.filter_ids(keep_features, negate=negate, inplace=inplace) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# ---------------------------------------------------------------------------- | ||
# Copyright (c) 2016--, Calour development team. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file COPYING.txt, distributed with this software. | ||
# ---------------------------------------------------------------------------- | ||
|
||
import numpy.testing as npt | ||
|
||
from calour._testing import Tests | ||
import calour as ca | ||
|
||
|
||
class ExperimentTests(Tests): | ||
def setUp(self): | ||
super().setUp() | ||
self.test1 = ca.read_amplicon(self.test1_biom, self.test1_samp, | ||
min_reads=1000, normalize=10000) | ||
|
||
def test_filter_mz(self): | ||
# load an mzmine2 metabolomics table, and associated gnps clusterinfo file | ||
exp = ca.read_ms(self.mzmine2_csv, sample_metadata_file=self.gnps_map, | ||
data_file_type='mzmine2', use_gnps_id_from_AllFiles=False, normalize=None) | ||
|
||
res = exp.filter_mz(100) | ||
self.assertEqual(len(res.feature_metadata), 1) | ||
self.assertEqual(res.feature_metadata['MZ'].values, [100]) | ||
|
||
res = exp.filter_mz([100, 201]) | ||
self.assertEqual(len(res.feature_metadata), 1) | ||
self.assertEqual(res.feature_metadata['MZ'].values, [100]) | ||
|
||
res = exp.filter_mz([100, 201], tolerance=1) | ||
self.assertEqual(len(res.feature_metadata), 2) | ||
npt.assert_array_equal(res.feature_metadata['MZ'].values, [100, 200]) | ||
|
||
res = exp.filter_mz([100, 201], negate=True) | ||
self.assertEqual(len(res.feature_metadata), 5) | ||
|
||
def test_get_bad_features(self): | ||
# load an mzmine2 metabolomics table, and associated gnps clusterinfo file | ||
exp = ca.read_ms(self.mzmine2_csv, sample_metadata_file=self.gnps_map, | ||
data_file_type='mzmine2', use_gnps_id_from_AllFiles=False, normalize=None) | ||
# get rid of the all 0s metabolite (to get rid of std=0 warning) | ||
exp = exp.filter_sum_abundance(0.1) | ||
|
||
res = exp.get_bad_features() | ||
# no samples filtered away | ||
self.assertEqual(res.shape[0], 6) | ||
# default parameters don't identify and suspicious features | ||
self.assertEqual(res.shape[1], 0) | ||
|
||
res = exp.get_bad_features(mz_tolerance=100, rt_tolerance=0.5) | ||
self.assertEqual(res.shape[1], 0) | ||
|
||
res = exp.get_bad_features(rt_tolerance=1) | ||
self.assertEqual(res.shape[1], 0) | ||
|
||
res = exp.get_bad_features(mz_tolerance=100, rt_tolerance=1) | ||
self.assertEqual(res.shape[1], 2) | ||
|
||
res = exp.get_bad_features(mz_tolerance=100, rt_tolerance=1, corr_thresh=0.2) | ||
self.assertEqual(res.shape[1], 4) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can describe how the filtering is performed? users have to look at code to understand what it is doing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed. is that clear enough now?