Ms1 experiment #213

Merged
merged 11 commits into from Jul 17, 2020
Changes from 2 commits
34 changes: 18 additions & 16 deletions calour/ms1_experiment.py
@@ -131,7 +131,7 @@ def get_spurious_duplicates(self, mz_tolerance=0.001, rt_tolerance=2, corr_thres

Returns
-------
calour.MS1Experiment
MS1Experiment
features filtered and ordered based on m/z and rt similarity and correlation
Collaborator

what do you do here? remove (semi-) duplicate features? or combine them?

you need a better function name!

Collaborator Author

any ideas? this is the best name i could come up with :)

Collaborator

so these are spurious duplicate features? if so, how about get_spurious_duplicates?

what's the workflow here? what do you do after you get these features?

Collaborator Author

feature selection in MS is complicated... and has lots of parameters
the workflow is that after you set your parameters and do feature selection, you load the resulting table and also look at it using get_spurious_duplicates(). If your feature selection is too sensitive, you get lots of metabolites with similar mz/rt that show anti-correlation or correlation (depending on the exact problem).
Then, if you get too many, you go back and redo feature selection with different params...
(but you will always get some bad features - it's a sensitivity/specificity tradeoff... it depends on how many and how severe... and how it affects your downstream analysis)
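A minimal sketch of that iterate-and-inspect loop (illustrative only; how `exp` is loaded is outside this PR, and the tolerance values are just the signature defaults):

```python
# `exp` is a calour MS1Experiment loaded from the feature-selection output
# (the loading step is not part of this PR, so it is left abstract here)
suspects = exp.get_spurious_duplicates(mz_tolerance=0.001, rt_tolerance=2)
print(suspects.feature_metadata[['MZ', 'RT']])
# many near-duplicate mz/rt features that (anti-)correlate suggests the
# upstream feature selection was too sensitive: redo it with other params
```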

'''
features = self.feature_metadata.copy()
@@ -165,8 +165,8 @@ def get_spurious_duplicates(self, mz_tolerance=0.001, rt_tolerance=2, corr_thres
def merge_similar_features(self, mz_tolerance=0.001, rt_tolerance=0.5):
'''Merge metabolites with similar mz/rt to a single metabolite

metabolites are initially sorted by frequency and a greefy clustering algorithm (starting from the highest freq.) is used to join together
metabolites that are close in m/z and r/t.
Metabolites are initially sorted by frequency and a greedy clustering algorithm (starting from the highest freq.) is used to join together
metabolites that are close in m/z and r/t, combining them to a single metabolite with freq=sum(freq) of all metabolites in the cluster.

Parameters
----------
@@ -177,9 +177,9 @@ def merge_similar_features(self, mz_tolerance=0.001, rt_tolerance=0.5):

Returns
-------
calour.MS1Experiment with close metabolites joined to a single metabolite.
The m/z and rt of the new metabolite are the m/z and rt of the highest freq. metabolite.
new feature_metadata fields: _calour_merge_number, _calour_merge_ids are added listing the number and ids of the metabolites joined for each new metabolite
MS1Experiment
With close metabolites joined to a single metabolite.
The m/z and rt of the new metabolite are the m/z and rt of the highest freq. metabolite. Frequency of the new metabolite is the sum of frequencies
of all joined metabolites.
New feature_metadata fields: _calour_merge_number, _calour_merge_ids are added listing the number and ids of the metabolites joined for each new metabolite
'''
exp = self.sort_abundance(reverse=False)
Collaborator

why sorting?

Collaborator Author

We want the center of each cluster to be the highest frequency metabolite and to join to it all the close metabolites.
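A minimal sketch of that greedy pass, mirroring the loop below with made-up mz/rt values (plain numpy/pandas; the toy numbers are not from the PR):

```python
import numpy as np
import pandas as pd

# toy feature table, already sorted by descending frequency
features = pd.DataFrame({'MZ': [100.0000, 100.0005, 100.2000],
                         'RT': [5.0, 5.1, 5.0],
                         '_metabolite_group': [-1, -1, -1]})
mz_tolerance, rt_tolerance = 0.001, 0.5
gpos = features.columns.get_loc('_metabolite_group')
for cgroup, cfeature in features.iterrows():
    mzdist = np.abs(features['MZ'] - cfeature['MZ'])
    rtdist = np.abs(features['RT'] - cfeature['RT'])
    # only grab features not yet assigned to any cluster
    ok = (mzdist <= mz_tolerance) & (rtdist <= rt_tolerance) \
        & (features['_metabolite_group'] == -1)
    for cpos in np.where(ok)[0]:
        features.iat[cpos, gpos] = cgroup
print(features['_metabolite_group'].tolist())  # [0, 0, 2]
```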

features = exp.feature_metadata
@@ -189,8 +191,7 @@ def merge_similar_features(self, mz_tolerance=0.001, rt_tolerance=0.5):
for cgroup, cfeature in features.iterrows():
mzdist = np.abs(features['MZ'] - cfeature['MZ'])
rtdist = np.abs(features['RT'] - cfeature['RT'])
ok = np.logical_and(mzdist <= mz_tolerance, rtdist <= rt_tolerance)
ok = np.logical_and(ok, features['_metabolite_group'] == -1)
ok = (mzdist <= mz_tolerance) & (rtdist <= rt_tolerance) & (features['_metabolite_group'] == -1)
okpos = np.where(ok)[0]
for cpos in okpos:
features.iat[cpos, gpos] = cgroup
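Usage is then a one-liner (a sketch, assuming an MS1Experiment `exp`; the tolerances are the defaults from the signature above):

```python
merged = exp.merge_similar_features(mz_tolerance=0.001, rt_tolerance=0.5)
# the merge bookkeeping lands in the new feature_metadata fields
print(merged.feature_metadata[['_calour_merge_number', '_calour_merge_ids']])
```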
@@ -204,6 +205,7 @@ def filter_mz_rt(self, mz=None, rt=None, mz_tolerance=0.05, rt_tolerance=0.2, in

Keep (or remove if negate=True) metabolites that have an m/z and/or retention time close (up to tolerance)
to the requested mz and/or rt (or list of mz and/or rt).
Collaborator

requested -> input

If both mz and rt are provided, they should be matched (i.e. filtering is performed using each mz and rt pair with the same index)
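For example (a hedged sketch; the mz/rt values are made up):

```python
# keep features close to (300.05, 120) or (420.10, 95);
# mz[i] is paired with rt[i]
filtered = exp.filter_mz_rt(mz=[300.05, 420.10], rt=[120, 95],
                            mz_tolerance=0.05, rt_tolerance=0.2)
```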

Parameters
----------
@@ -225,7 +227,7 @@ def filter_mz_rt(self, mz=None, rt=None, mz_tolerance=0.05, rt_tolerance=0.2, in

Returns
-------
calour.MS1Experiment
MS1Experiment
features filtered based on mz
'''
if mz is None and rt is None:
@@ -241,7 +243,7 @@ def filter_mz_rt(self, mz=None, rt=None, mz_tolerance=0.05, rt_tolerance=0.2, in
else:
rt = _to_list(rt)

keep = set()
select = np.zeros(len(self.feature_metadata), dtype='?')
notfound = 0
if mz is None:
mz = [None] * len(rt)
@@ -264,12 +266,12 @@ def filter_mz_rt(self, mz=None, rt=None, mz_tolerance=0.05, rt_tolerance=0.2, in
bothok = np.logical_and(keepmz, keeprt)
if bothok.sum() == 0:
notfound += 1
keep = keep.union(set(np.where(bothok)[0]))
select = np.logical_or(select, bothok)

logger.info('total from mz/rt list not found: %d' % notfound)
logger.info('Total from mz/rt list with no match: %d' % notfound)
if negate:
keep = set(np.arange(len(self.feature_metadata))).difference(keep)
return self.reorder(sorted(list(keep)), axis='f', inplace=inplace)
select = np.logical_not(select)
return self.reorder(select, axis='f', inplace=inplace)
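The refactor above swaps the integer index set for a boolean selection vector; a minimal numpy sketch of the equivalence (toy mask, not the PR's data):

```python
import numpy as np

n = 5
bothok = np.array([False, True, False, True, False])

# old approach: a set of matching positions, negated via set difference
keep = set(np.where(bothok)[0])
negated_old = sorted(set(np.arange(n)).difference(keep))

# new approach: accumulate a boolean mask, negate with logical_not
select = np.zeros(n, dtype='?')  # '?' is the numpy bool dtype
select = np.logical_or(select, bothok)
negated_new = list(np.where(np.logical_not(select))[0])

assert negated_new == negated_old  # [0, 2, 4]
```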

def sort_mz_rt(self, inplace=False):
'''Sort features according to m/z and retention time.
@@ -283,7 +285,7 @@ def sort_mz_rt(self, inplace=False):

Returns
-------
calour.MS1Experiment
Sorted according to m/z and retention time
MS1Experiment
Sorted according to m/z and retention time
'''
return self.sort_by_metadata('mz_rt', axis='f', inplace=inplace)
Collaborator

i don't understand why we need this. this function adds more code maintenance burden but doesn't provide any benefit beyond sort_by_metadata.

Collaborator Author

I think it helps make the analysis notebook easier to understand. Important for non-expert calour users.
similar to sort_samples()
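For reference, the wrapper is one line of sugar over the generic call (a sketch, assuming an MS1Experiment `exp`):

```python
# these two lines are equivalent; the wrapper reads better in a notebook
exp_sorted = exp.sort_mz_rt()
exp_sorted = exp.sort_by_metadata('mz_rt', axis='f', inplace=False)
```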

14 changes: 14 additions & 0 deletions calour/transforming.py
@@ -370,3 +370,17 @@ def subsample_count(exp: Experiment, total, replace=False, inplace=False, random
exp.reorder([i not in drops for i in range(exp.data.shape[0])], inplace=True)
exp.normalized = total
return exp


def _subsample(data, depth):
Collaborator

do you want this in this PR or is it just an accidental commit? if not accidental, could you add a test?

Collaborator Author

oops wrong branch... deleted

    for csamp in range(data.shape[0]):
        # expand sample csamp into one entry per read, holding the feature index
        totreads = data[csamp, :].sum()
        reads = np.zeros([totreads], dtype=int)
        cpos = 0
        for idx in range(data.shape[1]):
            reads[cpos:cpos + data[csamp, idx]] = idx
            cpos += data[csamp, idx]
        # shuffle the reads, keep the first `depth`, then count per feature
        new_reads = np.random.permutation(reads)[: depth]
        # res = np.unique(new_reads, return_counts=True)
        res = np.bincount(new_reads)
    # note: res is overwritten on each pass, so only a single sample's
    # 1-d counts are returned (see the review comment below)
    return res
Collaborator

res is 1-dimensional? isn't this function supposed to accept a 2-d and also return a 2-d?

Collaborator Author

deleted
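Although the helper was dropped from this PR, a sketch of the 2-d in / 2-d out behavior the reviewer describes might look like this (hypothetical code, not the author's):

```python
import numpy as np

def _subsample_2d(data, depth):
    # hypothetical variant: subsample every sample (row), return a 2-d array
    out = np.zeros(data.shape, dtype=int)
    for csamp in range(data.shape[0]):
        # expand the counts into one entry per read
        reads = np.repeat(np.arange(data.shape[1]), data[csamp, :])
        new_reads = np.random.permutation(reads)[:depth]
        # minlength keeps every row the same width
        out[csamp, :] = np.bincount(new_reads, minlength=data.shape[1])
    return out
```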