biocore · RNAer · Jul 17, 2020 · Jul 11, 2020 · Jul 12, 2020 · Jul 13, 2020
diff --git a/calour/io.py b/calour/io.py
@@ -705,6 +705,10 @@ def read_ms(data_file, sample_metadata_file=None, feature_metadata_file=None, gn
         # record the original total read count into sample metadata
         exp.normalize(total=normalize, inplace=True)
 
+    # Create the combined field for easy sorting/plotting
+    if 'MZ' in exp.feature_metadata and 'RT' in exp.feature_metadata:
+        exp.feature_metadata['mz_rt'] = ['%08.4f_%05.2f' % (x[1]['MZ'], x[1]['RT']) for x in exp.feature_metadata.iterrows()]
+
     if gnps_file:
         # load the gnps table
         gnps_data = pd.read_csv(gnps_file, sep='\t')
@@ -719,7 +723,6 @@ def read_ms(data_file, sample_metadata_file=None, feature_metadata_file=None, gn
     # initialize the call history
     param = ['{0!s}={1!r}'.format(k, v) for k, v in fparams.items()]
     exp._call_history = ['{0}({1})'.format('read_amplicon', ','.join(param))]
-
     return exp
 
 

diff --git a/calour/ms1_experiment.py b/calour/ms1_experiment.py
@@ -23,9 +23,10 @@
 from logging import getLogger
 
 import matplotlib as mpl
+import numpy as np
 
 from .experiment import Experiment
-
+from .util import _to_list
 
 logger = getLogger(__name__)
 
@@ -91,9 +92,102 @@ def heatmap(self, *args, **kwargs):
         '''
         if 'norm' not in kwargs:
             kwargs['norm'] = mpl.colors.LogNorm()
+        if 'mz_rt' in self.feature_metadata.columns:
+            if 'yticklabel_len' not in kwargs:
+                kwargs['yticklabel_len'] = None
+            if 'feature_field' not in kwargs:
+                kwargs['feature_field'] = 'mz_rt'
+            if 'yticklabel_kwargs' not in kwargs:
+                kwargs['yticklabel_kwargs'] = {'size': 6, 'rotation': 0}
         super().heatmap(*args, **kwargs)
 
     def __repr__(self):
         '''Return a string representation of this object.'''
         return 'MS1Experiment %s with %d samples, %d features' % (
             self.description, self.data.shape[0], self.data.shape[1])
+
+    def filter_mz(self, mz, tolerance=0.001, inplace=False, negate=False):
+        '''Filter metabolites based on m/z
+
+        Parameters
+        ----------
+        mz: float or list of float
+            the M/Z to filter
+        tolerance: float, optional
+            the M/Z tolerance. filter metabolites with abs(metabolite_mz - mz) <= tolerance
+        inplace: bool, optional
+            True to replace current experiment, False to create new experiment with results
+        negate: bool, optional
+            If False, keep only metabolites matching mz
+            If True, remove metabolites matching mz
+
+        Returns
+        -------
+        calour.MS1Experiment
+            features filtered based on mz
+        '''
+        if 'MZ' not in self.feature_metadata.columns:
+            raise ValueError('The Experiment does not contain the column "MZ". cannot filter by mz')
+        mz = _to_list(mz)
+        keep = set()
+        for cmz in mz:
+            mzdiff = np.abs(self.feature_metadata['MZ'] - cmz)
+            keep = keep.union(set(np.where(mzdiff <= tolerance)[0]))
+        if negate:
+            keep = set(np.arange(len(self.feature_metadata))).difference(keep)
+        logger.info('after filtering %d features remaining' % len(keep))
+        return self.reorder(sorted(list(keep)), axis='f', inplace=inplace)
+
+    def get_bad_features(self, mz_tolerance=0.001, rt_tolerance=2, corr_thresh=0.8, inplace=False, negate=False):
+        '''Get metabolites that have similar m/z and rt, and are correlated/anti-correlated.
+        These are usually due to incorrect feature detection.
+        correlation could be due to incomplete removal of isotopes or same metabolite in multiple RTs
+        anti-correlation could be due to RT drift (splitting of one true metabolite)
+        Metabolites in the new experiment are ordered by correlation clusters
+
+        Parameters
+        ----------
+        mz_tolerance: float, optional
+            the M/Z tolerance. Metabolites are similar if abs(metabolite_mz - mz) <= mz_tolerance
+        rt_tolerance: float, optional
+            the retention time tolerance. Metabolites are similar if abs(metabolite_rt - rt) <= rt_tolerance
+        corr_threshold: float, optional
+            the minimal (abs) correlation/anti-correlation value in order to call features correlated
+        inplace: bool, optional
+            True to replace current experiment, False to create new experiment with results
+        negate: bool, optional
+            If False, keep only metabolites that show a correlation with another metabolite
+            If True, remove metabolites showing correlation
+
+        Returns
+        -------
+        calour.MS1Experiment
+            features filtered and ordered basen on m/z and rt similarity and correlation
+        '''
+        features = self.feature_metadata.copy()
+        keep_features = []
+        data = self.get_data(sparse=False)
+        while len(features) > 0:
+            # get the first feature
+            cfeature = features.iloc[0]
+            features.drop(index=cfeature.name, inplace=True)
+            # find all mz/rt neighbors of the feature
+            mzdist = np.abs(features['MZ'] - cfeature['MZ'])
+            rtdist = np.abs(features['RT'] - cfeature['RT'])
+            okf = features[np.logical_and(mzdist <= mz_tolerance, rtdist <= rt_tolerance)]
+            if len(okf) == 0:
+                continue
+            # test the correlation of each neighbor
+            odat = data[:, self.feature_metadata.index.get_loc(cfeature.name)]
+            ckeep = []
+            for cf, *_ in okf.iterrows():
+                cdat = data[:, self.feature_metadata.index.get_loc(cf)]
+                corrcf = np.corrcoef(odat, cdat)[0, 1]
+                if np.abs(corrcf) >= corr_thresh:
+                    ckeep.append(cf)
+            # store the result and remove all the correlated features from the features left to process
+            if len(ckeep) > 0:
+                keep_features.append(cfeature.name)
+                keep_features.extend(ckeep)
+                features.drop(index=ckeep, inplace=True)
+        return self.filter_ids(keep_features, negate=negate, inplace=inplace)
diff --git a/calour/tests/test_io.py b/calour/tests/test_io.py
@@ -176,8 +176,10 @@ def test_read_open_ms(self):
         # test we get the MZ and RT correct
         self.assertIn('MZ', exp.feature_metadata)
         self.assertIn('RT', exp.feature_metadata)
+        self.assertIn('mz_rt', exp.feature_metadata)
         self.assertEqual(exp.feature_metadata['MZ'].iloc[1], 118.0869)
         self.assertEqual(exp.feature_metadata['RT'].iloc[1], 23.9214)
+        self.assertEqual(exp.feature_metadata['mz_rt'].iloc[1], '118.0869_23.92')
         # test normalizing
         exp = ca.read_ms(self.openms_csv, normalize=10000, data_file_type='openms')
         assert_array_almost_equal(exp.data.sum(axis=1), np.ones(exp.shape[0]) * 10000)

diff --git a/calour/tests/test_ms1experiment.py b/calour/tests/test_ms1experiment.py
@@ -0,0 +1,64 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016--,  Calour development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import numpy.testing as npt
+
+from calour._testing import Tests
+import calour as ca
+
+
+class ExperimentTests(Tests):
+    def setUp(self):
+        super().setUp()
+        self.test1 = ca.read_amplicon(self.test1_biom, self.test1_samp,
+                                      min_reads=1000, normalize=10000)
+
+    def test_filter_mz(self):
+        # load an mzmine2 metabolomics table, and associated gnps clusterinfo file
+        exp = ca.read_ms(self.mzmine2_csv, sample_metadata_file=self.gnps_map,
+                         data_file_type='mzmine2', use_gnps_id_from_AllFiles=False, normalize=None)
+
+        res = exp.filter_mz(100)
+        self.assertEqual(len(res.feature_metadata), 1)
+        self.assertEqual(res.feature_metadata['MZ'].values, [100])
+
+        res = exp.filter_mz([100, 201])
+        self.assertEqual(len(res.feature_metadata), 1)
+        self.assertEqual(res.feature_metadata['MZ'].values, [100])
+
+        res = exp.filter_mz([100, 201], tolerance=1)
+        self.assertEqual(len(res.feature_metadata), 2)
+        npt.assert_array_equal(res.feature_metadata['MZ'].values, [100, 200])
+
+        res = exp.filter_mz([100, 201], negate=True)
+        self.assertEqual(len(res.feature_metadata), 5)
+
+    def test_get_bad_features(self):
+        # load an mzmine2 metabolomics table, and associated gnps clusterinfo file
+        exp = ca.read_ms(self.mzmine2_csv, sample_metadata_file=self.gnps_map,
+                         data_file_type='mzmine2', use_gnps_id_from_AllFiles=False, normalize=None)
+        # get rid of the all 0s metabolite (to get rid of std=0 warning)
+        exp = exp.filter_sum_abundance(0.1)
+
+        res = exp.get_bad_features()
+        # no samples filtered away
+        self.assertEqual(res.shape[0], 6)
+        # default parameters don't identify and suspicious features
+        self.assertEqual(res.shape[1], 0)
+
+        res = exp.get_bad_features(mz_tolerance=100, rt_tolerance=0.5)
+        self.assertEqual(res.shape[1], 0)
+
+        res = exp.get_bad_features(rt_tolerance=1)
+        self.assertEqual(res.shape[1], 0)
+
+        res = exp.get_bad_features(mz_tolerance=100, rt_tolerance=1)
+        self.assertEqual(res.shape[1], 2)
+
+        res = exp.get_bad_features(mz_tolerance=100, rt_tolerance=1, corr_thresh=0.2)
+        self.assertEqual(res.shape[1], 4)