Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ms1 experiment #213

Merged
merged 11 commits into from
Jul 17, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion calour/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,10 @@ def read_ms(data_file, sample_metadata_file=None, feature_metadata_file=None, gn
# record the original total read count into sample metadata
exp.normalize(total=normalize, inplace=True)

# Create the combined field for easy sorting/plotting
if 'MZ' in exp.feature_metadata and 'RT' in exp.feature_metadata:
exp.feature_metadata['mz_rt'] = ['%08.4f_%05.2f' % (x[1]['MZ'], x[1]['RT']) for x in exp.feature_metadata.iterrows()]

if gnps_file:
# load the gnps table
gnps_data = pd.read_csv(gnps_file, sep='\t')
Expand All @@ -719,7 +723,6 @@ def read_ms(data_file, sample_metadata_file=None, feature_metadata_file=None, gn
# initialize the call history
param = ['{0!s}={1!r}'.format(k, v) for k, v in fparams.items()]
exp._call_history = ['{0}({1})'.format('read_amplicon', ','.join(param))]

return exp


Expand Down
96 changes: 95 additions & 1 deletion calour/ms1_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@
from logging import getLogger

import matplotlib as mpl
import numpy as np

from .experiment import Experiment

from .util import _to_list

logger = getLogger(__name__)

Expand Down Expand Up @@ -91,9 +92,102 @@ def heatmap(self, *args, **kwargs):
'''
if 'norm' not in kwargs:
kwargs['norm'] = mpl.colors.LogNorm()
if 'mz_rt' in self.feature_metadata.columns:
if 'yticklabel_len' not in kwargs:
kwargs['yticklabel_len'] = None
if 'feature_field' not in kwargs:
kwargs['feature_field'] = 'mz_rt'
if 'yticklabel_kwargs' not in kwargs:
kwargs['yticklabel_kwargs'] = {'size': 6, 'rotation': 0}
super().heatmap(*args, **kwargs)

def __repr__(self):
'''Return a string representation of this object.'''
return 'MS1Experiment %s with %d samples, %d features' % (
self.description, self.data.shape[0], self.data.shape[1])

def filter_mz(self, mz, tolerance=0.001, inplace=False, negate=False):
'''Filter metabolites based on m/z

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can describe how the filtering is performed? users have to look at code to understand what it is doing.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed. is that clear enough now?

Parameters
----------
mz: float or list of float
the M/Z to filter
tolerance: float, optional
the M/Z tolerance. filter metabolites with abs(metabolite_mz - mz) <= tolerance
inplace: bool, optional
True to replace current experiment, False to create new experiment with results
negate: bool, optional
If False, keep only metabolites matching mz
If True, remove metabolites matching mz

Returns
-------
calour.MS1Experiment
features filtered based on mz
'''
if 'MZ' not in self.feature_metadata.columns:
raise ValueError('The Experiment does not contain the column "MZ". cannot filter by mz')
mz = _to_list(mz)
keep = set()
for cmz in mz:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

better to use a boolean mask? like select in filtering.py

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

changed to filter_mz_rt() to enable filtering based on mz / rt/ or both

mzdiff = np.abs(self.feature_metadata['MZ'] - cmz)
keep = keep.union(set(np.where(mzdiff <= tolerance)[0]))
if negate:
keep = set(np.arange(len(self.feature_metadata))).difference(keep)
logger.info('after filtering %d features remaining' % len(keep))
return self.reorder(sorted(list(keep)), axis='f', inplace=inplace)

def get_bad_features(self, mz_tolerance=0.001, rt_tolerance=2, corr_thresh=0.8, inplace=False, negate=False):
'''Get metabolites that have similar m/z and rt, and are correlated/anti-correlated.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a blank line.

These are usually due to incorrect feature detection.
correlation could be due to incomplete removal of isotopes or same metabolite in multiple RTs
anti-correlation could be due to RT drift (splitting of one true metabolite)
Metabolites in the new experiment are ordered by correlation clusters

Parameters
----------
mz_tolerance: float, optional
the M/Z tolerance. Metabolites are similar if abs(metabolite_mz - mz) <= mz_tolerance
rt_tolerance: float, optional
the retention time tolerance. Metabolites are similar if abs(metabolite_rt - rt) <= rt_tolerance
corr_threshold: float, optional
the minimal (abs) correlation/anti-correlation value in order to call features correlated
inplace: bool, optional
True to replace current experiment, False to create new experiment with results
negate: bool, optional
If False, keep only metabolites that show a correlation with another metabolite
If True, remove metabolites showing correlation

Returns
-------
calour.MS1Experiment
features filtered and ordered basen on m/z and rt similarity and correlation
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do you do here? remove (semi-) duplicate features? or combine them?

you need a better function name!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any ideas? this is the best name i could come up with :)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so these are spurious duplicate features? if so, how about get_spurious_duplicates?

what's the workfflow here? what do u do after you get these features?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

feature selection in ms is complicated... and lots of parameters
the workflow is that after you set your parameters and do feature selection, you load the resulting table and look at it also using get_spurious_duplicates(). If your feature selection is too sensitive, you get lots of metabolites with similar mz/rt that show anti-correlation or correlation (depending on the exact problem).
Then, if you get too many, you go back and redo feature selection with different params...
(but you will always get some bad features - it's a sensitivity/specificity payoff... it depends on how many and how severe... and how it affects your downstream analysis)

'''
features = self.feature_metadata.copy()
keep_features = []
data = self.get_data(sparse=False)
while len(features) > 0:
# get the first feature
cfeature = features.iloc[0]
features.drop(index=cfeature.name, inplace=True)
# find all mz/rt neighbors of the feature
mzdist = np.abs(features['MZ'] - cfeature['MZ'])
rtdist = np.abs(features['RT'] - cfeature['RT'])
okf = features[np.logical_and(mzdist <= mz_tolerance, rtdist <= rt_tolerance)]
if len(okf) == 0:
continue
# test the correlation of each neighbor
odat = data[:, self.feature_metadata.index.get_loc(cfeature.name)]
ckeep = []
for cf, *_ in okf.iterrows():
cdat = data[:, self.feature_metadata.index.get_loc(cf)]
corrcf = np.corrcoef(odat, cdat)[0, 1]
if np.abs(corrcf) >= corr_thresh:
ckeep.append(cf)
# store the result and remove all the correlated features from the features left to process
if len(ckeep) > 0:
keep_features.append(cfeature.name)
keep_features.extend(ckeep)
features.drop(index=ckeep, inplace=True)
return self.filter_ids(keep_features, negate=negate, inplace=inplace)
2 changes: 2 additions & 0 deletions calour/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,10 @@ def test_read_open_ms(self):
# test we get the MZ and RT correct
self.assertIn('MZ', exp.feature_metadata)
self.assertIn('RT', exp.feature_metadata)
self.assertIn('mz_rt', exp.feature_metadata)
self.assertEqual(exp.feature_metadata['MZ'].iloc[1], 118.0869)
self.assertEqual(exp.feature_metadata['RT'].iloc[1], 23.9214)
self.assertEqual(exp.feature_metadata['mz_rt'].iloc[1], '118.0869_23.92')
# test normalizing
exp = ca.read_ms(self.openms_csv, normalize=10000, data_file_type='openms')
assert_array_almost_equal(exp.data.sum(axis=1), np.ones(exp.shape[0]) * 10000)
Expand Down
64 changes: 64 additions & 0 deletions calour/tests/test_ms1experiment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2016--, Calour development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import numpy.testing as npt

from calour._testing import Tests
import calour as ca


class ExperimentTests(Tests):
def setUp(self):
super().setUp()
self.test1 = ca.read_amplicon(self.test1_biom, self.test1_samp,
min_reads=1000, normalize=10000)

def test_filter_mz(self):
# load an mzmine2 metabolomics table, and associated gnps clusterinfo file
exp = ca.read_ms(self.mzmine2_csv, sample_metadata_file=self.gnps_map,
data_file_type='mzmine2', use_gnps_id_from_AllFiles=False, normalize=None)

res = exp.filter_mz(100)
self.assertEqual(len(res.feature_metadata), 1)
self.assertEqual(res.feature_metadata['MZ'].values, [100])

res = exp.filter_mz([100, 201])
self.assertEqual(len(res.feature_metadata), 1)
self.assertEqual(res.feature_metadata['MZ'].values, [100])

res = exp.filter_mz([100, 201], tolerance=1)
self.assertEqual(len(res.feature_metadata), 2)
npt.assert_array_equal(res.feature_metadata['MZ'].values, [100, 200])

res = exp.filter_mz([100, 201], negate=True)
self.assertEqual(len(res.feature_metadata), 5)

def test_get_bad_features(self):
# load an mzmine2 metabolomics table, and associated gnps clusterinfo file
exp = ca.read_ms(self.mzmine2_csv, sample_metadata_file=self.gnps_map,
data_file_type='mzmine2', use_gnps_id_from_AllFiles=False, normalize=None)
# get rid of the all 0s metabolite (to get rid of std=0 warning)
exp = exp.filter_sum_abundance(0.1)

res = exp.get_bad_features()
# no samples filtered away
self.assertEqual(res.shape[0], 6)
# default parameters don't identify and suspicious features
self.assertEqual(res.shape[1], 0)

res = exp.get_bad_features(mz_tolerance=100, rt_tolerance=0.5)
self.assertEqual(res.shape[1], 0)

res = exp.get_bad_features(rt_tolerance=1)
self.assertEqual(res.shape[1], 0)

res = exp.get_bad_features(mz_tolerance=100, rt_tolerance=1)
self.assertEqual(res.shape[1], 2)

res = exp.get_bad_features(mz_tolerance=100, rt_tolerance=1, corr_thresh=0.2)
self.assertEqual(res.shape[1], 4)