Ms1 experiment #213

Merged
merged 11 commits into from Jul 17, 2020
5 changes: 4 additions & 1 deletion calour/io.py
@@ -705,6 +705,10 @@ def read_ms(data_file, sample_metadata_file=None, feature_metadata_file=None, gn
# record the original total read count into sample metadata
exp.normalize(total=normalize, inplace=True)

# Create the combined field for easy sorting/plotting
if 'MZ' in exp.feature_metadata and 'RT' in exp.feature_metadata:
exp.feature_metadata['mz_rt'] = ['%08.4f_%05.2f' % (x[1]['MZ'], x[1]['RT']) for x in exp.feature_metadata.iterrows()]

if gnps_file:
# load the gnps table
gnps_data = pd.read_csv(gnps_file, sep='\t')
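
For reference, a minimal sketch of what the combined field looks like (the value mirrors the test added in test_io.py below); the '0' flag zero-pads m/z to 8 characters (4 decimals) and rt to 5 characters (2 decimals):

'%08.4f_%05.2f' % (118.0869, 23.9214)  # -> '118.0869_23.92'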
@@ -719,7 +723,6 @@
# initialize the call history
param = ['{0!s}={1!r}'.format(k, v) for k, v in fparams.items()]
exp._call_history = ['{0}({1})'.format('read_amplicon', ','.join(param))]

return exp


194 changes: 193 additions & 1 deletion calour/ms1_experiment.py
@@ -23,9 +23,10 @@
from logging import getLogger

import matplotlib as mpl
import numpy as np

from .experiment import Experiment

from .util import _to_list

logger = getLogger(__name__)

@@ -91,9 +92,200 @@ def heatmap(self, *args, **kwargs):
'''
if 'norm' not in kwargs:
kwargs['norm'] = mpl.colors.LogNorm()
if 'mz_rt' in self.feature_metadata.columns:
if 'yticklabel_len' not in kwargs:
kwargs['yticklabel_len'] = None
if 'feature_field' not in kwargs:
kwargs['feature_field'] = 'mz_rt'
if 'yticklabel_kwargs' not in kwargs:
kwargs['yticklabel_kwargs'] = {'size': 6, 'rotation': 0}
super().heatmap(*args, **kwargs)
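
A hedged usage sketch of the defaults set above (exp stands for any MS1Experiment whose feature_metadata contains 'mz_rt'): the two calls below are equivalent.

exp.heatmap()
exp.heatmap(norm=mpl.colors.LogNorm(), yticklabel_len=None,
            feature_field='mz_rt', yticklabel_kwargs={'size': 6, 'rotation': 0})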

def __repr__(self):
'''Return a string representation of this object.'''
return 'MS1Experiment %s with %d samples, %d features' % (
self.description, self.data.shape[0], self.data.shape[1])

def get_spurious_duplicates(self, mz_tolerance=0.001, rt_tolerance=2, corr_thresh=0.8, inplace=False, negate=False):
Collaborator: This function should use your new filter_mz_rt function? You have redundant code in the two functions.

'''Get subgroups of metabolites that are suspected ms1 alignment artifacts.

Collaborator: Can you describe how the filtering is performed? Users have to look at the code to understand what it is doing.

Author: Fixed. Is that clear enough now?

The function returns a calour.MS1Experiment containing groups of metabolites that (within each group) have similar m/z and rt and are highly correlated or anti-correlated. Such groups are usually due to incorrect feature detection/alignment and can be used to optimize the feature selection parameters: correlation can result from incomplete removal of isotopes or from the same metabolite appearing at multiple RTs, while anti-correlation can result from RT drift (splitting of one true metabolite). Metabolites in the new experiment are ordered by correlation clusters.

Parameters
----------
mz_tolerance: float, optional
the M/Z tolerance. Metabolites are similar if abs(metabolite_mz - mz) <= mz_tolerance
rt_tolerance: float, optional
the retention time tolerance. Metabolites are similar if abs(metabolite_rt - rt) <= rt_tolerance
corr_thresh: float, optional
the minimal absolute correlation/anti-correlation value required to call two features correlated
inplace: bool, optional
True to replace current experiment, False to create new experiment with results
negate: bool, optional
If False, keep only metabolites that show a correlation with another metabolite
If True, remove metabolites showing correlation

Returns
-------
MS1Experiment
features filtered and ordered based on m/z and rt similarity and correlation

Collaborator: What do you do here? Remove (semi-)duplicate features, or combine them? You need a better function name!

Author: Any ideas? This is the best name I could come up with :)

Collaborator: So these are spurious duplicate features? If so, how about get_spurious_duplicates? What's the workflow here? What do you do after you get these features?

Author: Feature selection in MS is complicated, with lots of parameters. The workflow is: after you set your parameters and do feature selection, you load the resulting table and also inspect it using get_spurious_duplicates(). If your feature selection is too sensitive, you get lots of metabolites with similar mz/rt that show anti-correlation or correlation (depending on the exact problem). Then, if you get too many, you go back and redo feature selection with different parameters. (You will always get some bad features; it's a sensitivity/specificity tradeoff, and it depends on how many there are, how severe they are, and how they affect your downstream analysis.)

'''
features = self.feature_metadata.copy()
keep_features = []
data = self.get_data(sparse=False)
while len(features) > 0:
# get the first feature
cfeature = features.iloc[0]
features.drop(index=cfeature.name, inplace=True)
# find all mz/rt neighbors of the feature
mzdist = np.abs(features['MZ'] - cfeature['MZ'])
rtdist = np.abs(features['RT'] - cfeature['RT'])
okf = features[np.logical_and(mzdist <= mz_tolerance, rtdist <= rt_tolerance)]
if len(okf) == 0:
continue
# test the correlation of each neighbor
odat = data[:, self.feature_metadata.index.get_loc(cfeature.name)]
ckeep = []
for cf, *_ in okf.iterrows():
cdat = data[:, self.feature_metadata.index.get_loc(cf)]
corrcf = np.corrcoef(odat, cdat)[0, 1]
if np.abs(corrcf) >= corr_thresh:
ckeep.append(cf)
# store the result and remove all the correlated features from the features left to process
if len(ckeep) > 0:
keep_features.append(cfeature.name)
keep_features.extend(ckeep)
features.drop(index=ckeep, inplace=True)
return self.filter_ids(keep_features, negate=negate, inplace=inplace)
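
A sketch of the workflow the author describes above (file names are hypothetical placeholders):

import calour as ca

# load the feature table produced by the current feature selection parameters
exp = ca.read_ms('features.csv', sample_metadata_file='samples.txt',
                 data_file_type='mzmine2', normalize=None)
# many remaining features with similar mz/rt and high (anti-)correlation
# suggest the upstream feature selection is too sensitive
suspects = exp.get_spurious_duplicates()
print(suspects)
suspects.heatmap()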

def merge_similar_features(self, mz_tolerance=0.001, rt_tolerance=0.5):
'''Merge metabolites with similar mz/rt to a single metabolite

Metabolites are initially sorted by frequency, and a greedy clustering algorithm (starting from the highest frequency) is used to join together
metabolites that are close in m/z and rt, combining them into a single metabolite with freq=sum(freq) of all metabolites in the cluster.

Parameters
----------
mz_tolerance: float, optional
metabolites with abs(metabolite_mz - mz) <= mz_tolerance are joined
rt_tolerance: float, optional
metabolites with abs(metabolite_rt - rt) <= rt_tolerance are joined

Returns
-------
MS1Experiment
With close metabolites joined to a single metabolite.
The m/z and rt of the new metabolite are those of the highest-frequency metabolite in the cluster. The frequency of the new metabolite is the sum
of the frequencies of all joined metabolites.
New feature_metadata fields _calour_merge_number and _calour_merge_ids are added, listing the number and ids of the metabolites joined into each new metabolite.
'''
exp = self.sort_abundance(reverse=False)
Collaborator: Why the sorting?

Author: We want the center of each cluster to be the highest-frequency metabolite, and to join all the close metabolites to it.

features = exp.feature_metadata
features['_metabolite_group'] = np.zeros(len(features)) - 1
Collaborator: Instead of messing with and cluttering the feature metadata data frame, could you modify aggregate_by_metadata to accept a list-like (pd.Series, tuple, etc.) besides a column name for the group-then-apply manipulation? It is a very light change and avoids adding an intermediary column to the metadata data frame.

Author: Do you mean making the field parameter of aggregate_by_metadata accept a list/pd.Series etc.? I think that would be confusing for the API of the function (field can be a tuple...), and adding it as a different parameter instead of field would also be confusing for the API. The new column is deleted from the final result, so the user does not notice anything. I think making the API clean is more important than making the code clean.

gpos = list(features.columns).index('_metabolite_group')
cgroup = 0
for cgroup, cfeature in features.iterrows():
mzdist = np.abs(features['MZ'] - cfeature['MZ'])
rtdist = np.abs(features['RT'] - cfeature['RT'])
ok = (mzdist <= mz_tolerance) & (rtdist <= rt_tolerance) & (features['_metabolite_group'] == -1)
okpos = np.where(ok)[0]
for cpos in okpos:
features.iat[cpos, gpos] = cgroup
exp = exp.aggregate_by_metadata('_metabolite_group', agg='sum', axis='f')
exp.feature_metadata.drop('_metabolite_group', axis='columns', inplace=True)
logger.info('%d metabolites remaining after merge' % len(exp.feature_metadata))
return exp
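
A short usage example mirroring the unit test added in test_ms1experiment.py below (exp is the mzmine2 test experiment loaded there):

# merge features whose m/z are within 100 and rt within 1 of each other
res = exp.merge_similar_features(mz_tolerance=100, rt_tolerance=1)
# one merged feature lists its members as '85022;93277'
print(res.feature_metadata['_calour_merge_ids'])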

def filter_mz_rt(self, mz=None, rt=None, mz_tolerance=0.05, rt_tolerance=0.2, inplace=False, negate=False):
Collaborator: By the way, we should be consistent with naming; would it be more appropriate to name this "filter_by_mz_rt"?

'''Filter metabolites based on m/z and/or retention time

Keep (or remove if negate=True) metabolites whose m/z and/or retention time is close (within tolerance)
to the input mz and/or rt (or list of mz and/or rt).
Collaborator: requested -> input

If both mz and rt are provided, they should be matched (i.e. filtering is performed using each mz and rt pair with the same index)

Parameters
----------
mz: float or list of float or None, optional
the M/Z to filter
if None, do not filter based on M/Z
rt: float or list of float or None, optional
the retention time to filter
if None, do not filter based on rt
mz_tolerance: float, optional
the M/Z tolerance. filter metabolites with abs(metabolite_mz - mz) <= mz_tolerance
rt_tolerance: float, optional
the rt tolerance. filter metabolites with abs(metabolite_rt - rt) <= rt_tolerance
inplace: bool, optional
True to replace current experiment, False to create new experiment with results
negate: bool, optional
If False, keep only metabolites matching the given mz/rt
If True, remove metabolites matching the given mz/rt

Returns
-------
MS1Experiment
features filtered based on m/z and/or rt
'''
if mz is None and rt is None:
raise ValueError('at least one of "mz" and "rt" must not be None')
if mz is not None:
if 'MZ' not in self.feature_metadata.columns:
raise ValueError('The Experiment does not contain the column "MZ". cannot filter by mz')
else:
mz = _to_list(mz)
if rt is not None:
if 'RT' not in self.feature_metadata.columns:
raise ValueError('The Experiment does not contain the column "RT". cannot filter by rt')
else:
rt = _to_list(rt)

select = np.zeros(len(self.feature_metadata), dtype='?')
notfound = 0
if mz is None:
mz = [None] * len(rt)
if rt is None:
rt = [None] * len(mz)
if len(mz) != len(rt):
raise ValueError('mz and rt must have same length')

for cmz, crt in zip(mz, rt):
Collaborator: mz and rt should be matched if they are list-like? Please document this in the docstring.

Author: Fixed.

if cmz is not None:
mzdiff = np.abs(self.feature_metadata['MZ'] - cmz)
keepmz = mzdiff <= mz_tolerance
else:
keepmz = np.full([len(self.feature_metadata)], True)
if crt is not None:
rtdiff = np.abs(self.feature_metadata['RT'] - crt)
keeprt = rtdiff <= rt_tolerance
else:
keeprt = np.full([len(self.feature_metadata)], True)
bothok = np.logical_and(keepmz, keeprt)
if bothok.sum() == 0:
notfound += 1
select = np.logical_or(select, bothok)

logger.info('Total from mz/rt list with no match: %d' % notfound)
if negate:
select = np.logical_not(select)
return self.reorder(select, axis='f', inplace=inplace)
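
A brief sketch of the matched-pair behaviour, with values taken from the test added below: each mz/rt pair with the same index is applied together, so a metabolite is kept only if it matches both members of some pair.

res = exp.filter_mz_rt([101, 200, 400, 505], [1, 3, 4, 5], mz_tolerance=2)
# keeps only the metabolites whose (MZ, RT) is close to one of the four pairs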

def sort_mz_rt(self, inplace=False):
'''Sort features according to m/z and retention time.

This is a convenience function wrapping calour.sort_by_metadata()

Parameters
----------
inplace: bool, optional
True to replace current experiment, False to create new experiment with results

Returns
-------
MS1Experiment
Sorted according to m/z and retention time
'''
return self.sort_by_metadata('mz_rt', axis='f', inplace=inplace)
Collaborator: I don't understand why we need this. This function adds more code maintenance burden but doesn't provide any benefit beyond sort_by_metadata.

Author: I think it helps make the analysis notebooks easier to understand, which is important for non-expert Calour users; similar to sort_samples().
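
As the discussion above notes, this is a thin convenience wrapper; the call is equivalent to using the generic Calour API directly:

exp_sorted = exp.sort_by_metadata('mz_rt', axis='f')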

2 changes: 2 additions & 0 deletions calour/tests/test_io.py
@@ -176,8 +176,10 @@ def test_read_open_ms(self):
# test we get the MZ and RT correct
self.assertIn('MZ', exp.feature_metadata)
self.assertIn('RT', exp.feature_metadata)
self.assertIn('mz_rt', exp.feature_metadata)
self.assertEqual(exp.feature_metadata['MZ'].iloc[1], 118.0869)
self.assertEqual(exp.feature_metadata['RT'].iloc[1], 23.9214)
self.assertEqual(exp.feature_metadata['mz_rt'].iloc[1], '118.0869_23.92')
# test normalizing
exp = ca.read_ms(self.openms_csv, normalize=10000, data_file_type='openms')
assert_array_almost_equal(exp.data.sum(axis=1), np.ones(exp.shape[0]) * 10000)
96 changes: 96 additions & 0 deletions calour/tests/test_ms1experiment.py
@@ -0,0 +1,96 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2016--, Calour development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import numpy.testing as npt

from calour._testing import Tests
import calour as ca


class ExperimentTests(Tests):
def setUp(self):
super().setUp()
self.test1 = ca.read_amplicon(self.test1_biom, self.test1_samp,
min_reads=1000, normalize=10000)

def test_filter_mz_rt(self):
# load an mzmine2 metabolomics table, and associated gnps clusterinfo file
exp = ca.read_ms(self.mzmine2_csv, sample_metadata_file=self.gnps_map,
data_file_type='mzmine2', use_gnps_id_from_AllFiles=False, normalize=None)

# mz filtering
res = exp.filter_mz_rt(100)
self.assertEqual(len(res.feature_metadata), 1)
self.assertEqual(res.feature_metadata['MZ'].values, [100])

res = exp.filter_mz_rt([100, 201])
self.assertEqual(len(res.feature_metadata), 1)
self.assertEqual(res.feature_metadata['MZ'].values, [100])

res = exp.filter_mz_rt([100, 201], mz_tolerance=1)
self.assertEqual(len(res.feature_metadata), 2)
npt.assert_array_equal(res.feature_metadata['MZ'].values, [100, 200])

res = exp.filter_mz_rt([100, 201], negate=True)
self.assertEqual(len(res.feature_metadata), 5)

# rt filtering
res = exp.filter_mz_rt(rt=[1, 2.5])
self.assertEqual(len(res.feature_metadata), 1)
self.assertEqual(res.feature_metadata['RT'].values, [1])

res = exp.filter_mz_rt(rt=[1, 2.5], rt_tolerance=0.5)
self.assertEqual(len(res.feature_metadata), 3)
npt.assert_array_equal(res.feature_metadata['RT'].values, [1, 2, 3])

# complex - both mz and rt
res = exp.filter_mz_rt([101, 200, 400, 505], [1, 3, 4, 5], mz_tolerance=2)
self.assertEqual(res.shape[1], 2)

def test_get_spurious_duplicates(self):
# load an mzmine2 metabolomics table, and associated gnps clusterinfo file
exp = ca.read_ms(self.mzmine2_csv, sample_metadata_file=self.gnps_map,
data_file_type='mzmine2', use_gnps_id_from_AllFiles=False, normalize=None)
# get rid of the all 0s metabolite (to get rid of std=0 warning)
exp = exp.filter_sum_abundance(0.1)

res = exp.get_spurious_duplicates()
# no samples filtered away
self.assertEqual(res.shape[0], 6)
# default parameters don't identify any suspicious features
self.assertEqual(res.shape[1], 0)

res = exp.get_spurious_duplicates(mz_tolerance=100, rt_tolerance=0.5)
self.assertEqual(res.shape[1], 0)

res = exp.get_spurious_duplicates(rt_tolerance=1)
self.assertEqual(res.shape[1], 0)

res = exp.get_spurious_duplicates(mz_tolerance=100, rt_tolerance=1)
self.assertEqual(res.shape[1], 2)

res = exp.get_spurious_duplicates(mz_tolerance=100, rt_tolerance=1, corr_thresh=0.2)
self.assertEqual(res.shape[1], 4)

def test_merge_similar_features(self):
# load an mzmine2 metabolomics table, and associated gnps clusterinfo file
exp = ca.read_ms(self.mzmine2_csv, sample_metadata_file=self.gnps_map,
data_file_type='mzmine2', use_gnps_id_from_AllFiles=False, normalize=None)
# no merging since features are far away
res = exp.merge_similar_features()
self.assertEqual(res.shape[1], 6)

# a little merging
res = exp.merge_similar_features(mz_tolerance=100, rt_tolerance=1)
self.assertEqual(res.shape[1], 3)
self.assertEqual(res.feature_metadata.at[85022, '_calour_merge_ids'], '85022;93277')

# a lot of merging
res = exp.merge_similar_features(mz_tolerance=400, rt_tolerance=6)
self.assertEqual(res.shape[1], 2)
self.assertEqual(res.feature_metadata.at[121550, '_calour_merge_ids'], '121550')