Merge 46f6ca2 into fd40a28

biocore · Jul 4, 2020 · 3884aa9 · 3884aa9
2 parents fd40a28 + 46f6ca2
commit 3884aa9
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 15 deletions.
diff --git a/calour/amplicon_experiment.py b/calour/amplicon_experiment.py
@@ -23,10 +23,9 @@
 from logging import getLogger
 
 import numpy as np
-import skbio
 
 from .experiment import Experiment
-from .util import _get_taxonomy_string, _to_list
+from .util import _get_taxonomy_string, _to_list, _iter_fasta
 
 
 logger = getLogger(__name__)
@@ -140,9 +139,10 @@ def filter_fasta(exp: Experiment, filename, negate=False, inplace=False):
         logger.debug('Filter by sequence using fasta file %s' % filename)
         okpos = []
         tot_seqs = 0
-        for cseq in skbio.read(filename, format='fasta'):
+
+        for chead, cseq in _iter_fasta(filename):
             tot_seqs += 1
-            cseq = str(cseq).upper()
+            cseq = cseq.upper()
             if cseq in exp.feature_metadata.index:
                 pos = exp.feature_metadata.index.get_loc(cseq)
                 okpos.append(pos)

diff --git a/calour/io.py b/calour/io.py
@@ -36,10 +36,9 @@
 import pandas as pd
 import numpy as np
 import biom
-import skbio
 
 from . import Experiment, AmpliconExperiment, MS1Experiment
-from .util import get_file_md5, get_data_md5, _get_taxonomy_string
+from .util import get_file_md5, get_data_md5, _get_taxonomy_string, _iter_fasta
 from ._doc import ds
 from .database import _get_database_class
 
@@ -180,9 +179,9 @@ def read_qiime2(fp, sample_metadata_file=None, rep_seq_file=None, taxonomy_file=
             rs_name = _file_from_zip(tempdir, rep_seq_file, internal_data='data/dna-sequences.fasta')
             rseqs = []
             rids = []
-            for cseq in skbio.read(rs_name, format='fasta'):
-                rseqs.append(str(cseq).upper())
-                rids.append(cseq.metadata['id'])
+            for chead, cseq in _iter_fasta(rs_name):
+                rseqs.append(cseq.upper())
+                rids.append(chead)
             rep_seqs = pd.Series(data=rseqs, index=rids, name='_feature_id')
 
             # test if all hashes are identical to the rep_seqs file supplied

diff --git a/calour/tests/data/seqs1.fasta b/calour/tests/data/seqs1.fasta
@@ -1,4 +1,6 @@
 >real_seq_6
 TT
->not real seq
-AACGGAGGATGCGAGCGTTATCTGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGGTTGATAAGTCAGAGGTGAAAGCGCTTAGCTCAACTAAGCAACTGCCTTTGAAACTGTCAGTCTTGAATGATTGTGAAGTAGTTGGAATGTGTAG
+>  not real seq
+AACGGAGGATGCGAGCGTTATCTGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGGTTGATAAGTCAGAGGTGAAAGCGCTTAGCTC
+AACTAAGCAACTGCCTTTGAAACTGTCAGTCTTGAATGATTGTGAAGTAGTTGGAATGTGTAG
+
diff --git a/calour/tests/test_io.py b/calour/tests/test_io.py
@@ -13,7 +13,6 @@
 import shutil
 import logging
 
-import skbio
 import scipy.sparse
 import numpy as np
 import pandas as pd
@@ -22,6 +21,7 @@
 import calour as ca
 from calour._testing import Tests, assert_experiment_equal
 from calour.io import _create_biom_table_from_exp
+from calour.util import _iter_fasta
 
 
 class IOTests(Tests):
@@ -229,7 +229,7 @@ def test_read_open_ms_samples_rows(self):
         self.assertAlmostEqual(exp.feature_metadata['MZ'].iloc[1], 118.0869)
         self.assertAlmostEqual(exp.feature_metadata['RT'].iloc[1], 23.9214)
 
-    def test_read_qiim2(self):
+    def test_read_qiime2(self):
         # test the non-hashed table
         exp = ca.read_qiime2(self.qiime2table, normalize=None, min_reads=None)
         self.assertEqual(exp.shape, (104, 658))
@@ -258,8 +258,8 @@ def test_save_fasta(self):
         f = join(d, 'test1.fasta')
         exp.save_fasta(f)
         seqs = []
-        for seq in skbio.read(f, format='fasta'):
-            seqs.append(str(seq))
+        for chead, cseq in _iter_fasta(f):
+            seqs.append(cseq)
         self.assertCountEqual(seqs, exp.feature_metadata.index.values)
         shutil.rmtree(d)
 

diff --git a/calour/tests/test_util.py b/calour/tests/test_util.py
@@ -23,6 +23,15 @@ def setUp(self):
         super().setUp()
         self.test1 = ca.read(self.test1_biom, self.test1_samp, normalize=None)
 
+    def test_iter_fasta(self):
+        seqs = []
+        heads = []
+        for chead, cseq in util._iter_fasta(self.seqs1_fasta):
+            seqs.append(cseq)
+            heads.append(chead)
+        self.assertListEqual(heads, ['real_seq_6', 'not real seq'])
+        self.assertListEqual(seqs, ['TT', 'AACGGAGGATGCGAGCGTTATCTGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGGTTGATAAGTCAGAGGTGAAAGCGCTTAGCTCAACTAAGCAACTGCCTTTGAAACTGTCAGTCTTGAATGATTGTGAAGTAGTTGGAATGTGTAG'])
+
     def test_get_taxonomy_string(self):
         orig_tax = list(self.test1.feature_metadata['taxonomy'].values)
         # test string taxonomy

diff --git a/calour/util.py b/calour/util.py
@@ -43,6 +43,43 @@
 logger = getLogger(__name__)
 
 
+def _iter_fasta(fp):
+    '''Iterate over fasta file.
+
+    Fasta file must contain header line (starting with ">") and one or more sequence lines.
+
+    Parameters
+    ----------
+    fp: str
+        name of the fasta file
+
+    Yields
+    ------
+    header: str
+        the header line (without ">")
+    sequence: str
+        the sequence ('ACGT'). Both header and sequence are whitespace stripped.
+    '''
+    # skip non-header lines at beginning of file
+    with open(fp, 'r') as fl:
+        for cline in fl:
+            if cline[0] == ">":
+                title = cline[1:].rstrip()
+                break
+            logger.warning('Fasta file %s has no headers' % fp)
+            return
+
+        lines = []
+        for cline in fl:
+            if cline[0] == ">":
+                yield title, ''.join(lines)
+                lines = []
+                title = cline[1:].strip()
+                continue
+            lines.append(cline.strip())
+        yield title, "".join(lines)
+
+
 def compute_prevalence(abundance):
     '''Return the prevalence at each abundance cutoffs.