From 12565d47c194700b8e3fe3510c9b3254f2777832 Mon Sep 17 00:00:00 2001
From: amnona <amnonim@gmail.com>
Date: Mon, 18 Feb 2019 18:34:10 +0200
Subject: [PATCH 1/4] fix indentation in dsfdr (who changed it?)

---
 calour/dsfdr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/calour/dsfdr.py b/calour/dsfdr.py
index 6c263594..0cbf50e0 100644
--- a/calour/dsfdr.py
+++ b/calour/dsfdr.py
@@ -183,7 +183,7 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff',
     elif transform_type == 'normdata':
         data = normdata(data)
     elif transform_type is None:
-            pass
+        pass
     else:
         raise ValueError('transform type %s not supported' % transform_type)
 

From f28babcb645a1f7982c30d5583f5b872b3043de3 Mon Sep 17 00:00:00 2001
From: amnona <amnonim@gmail.com>
Date: Fri, 3 Jul 2020 16:42:05 +0300
Subject: [PATCH 2/4] read fasta using our own function

---
 calour/amplicon_experiment.py |  8 ++++----
 calour/io.py                  |  9 ++++-----
 calour/tests/data/seqs1.fasta |  6 ++++--
 calour/tests/test_io.py       |  2 +-
 calour/tests/test_util.py     |  9 +++++++++
 calour/util.py                | 38 +++++++++++++++++++++++++++++++++++
 6 files changed, 60 insertions(+), 12 deletions(-)

diff --git a/calour/amplicon_experiment.py b/calour/amplicon_experiment.py
index 59da6bfa..ad75d09a 100644
--- a/calour/amplicon_experiment.py
+++ b/calour/amplicon_experiment.py
@@ -23,10 +23,9 @@
 from logging import getLogger
 
 import numpy as np
-import skbio
 
 from .experiment import Experiment
-from .util import _get_taxonomy_string, _to_list
+from .util import _get_taxonomy_string, _to_list, _iter_fasta
 
 
 logger = getLogger(__name__)
@@ -140,9 +139,10 @@ def filter_fasta(exp: Experiment, filename, negate=False, inplace=False):
         logger.debug('Filter by sequence using fasta file %s' % filename)
         okpos = []
         tot_seqs = 0
-        for cseq in skbio.read(filename, format='fasta'):
+
+        for chead, cseq in _iter_fasta(filename):
             tot_seqs += 1
-            cseq = str(cseq).upper()
+            cseq = cseq.upper()
             if cseq in exp.feature_metadata.index:
                 pos = exp.feature_metadata.index.get_loc(cseq)
                 okpos.append(pos)
diff --git a/calour/io.py b/calour/io.py
index 2efa2f7e..d5676a42 100644
--- a/calour/io.py
+++ b/calour/io.py
@@ -36,10 +36,9 @@
 import pandas as pd
 import numpy as np
 import biom
-import skbio
 
 from . import Experiment, AmpliconExperiment, MS1Experiment
-from .util import get_file_md5, get_data_md5, _get_taxonomy_string
+from .util import get_file_md5, get_data_md5, _get_taxonomy_string, _iter_fasta
 from ._doc import ds
 from .database import _get_database_class
 
@@ -180,9 +179,9 @@ def read_qiime2(fp, sample_metadata_file=None, rep_seq_file=None, taxonomy_file=
             rs_name = _file_from_zip(tempdir, rep_seq_file, internal_data='data/dna-sequences.fasta')
             rseqs = []
             rids = []
-            for cseq in skbio.read(rs_name, format='fasta'):
-                rseqs.append(str(cseq).upper())
-                rids.append(cseq.metadata['id'])
+            for chead, cseq in _iter_fasta(rs_name):
+                rseqs.append(cseq.upper())
+                rids.append(chead)
             rep_seqs = pd.Series(data=rseqs, index=rids, name='_feature_id')
 
             # test if all hashes are identical to the rep_seqs file supplied
diff --git a/calour/tests/data/seqs1.fasta b/calour/tests/data/seqs1.fasta
index 4cdd083a..0d1882b7 100644
--- a/calour/tests/data/seqs1.fasta
+++ b/calour/tests/data/seqs1.fasta
@@ -1,4 +1,6 @@
 >real_seq_6
 TT
->not real seq
-AACGGAGGATGCGAGCGTTATCTGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGGTTGATAAGTCAGAGGTGAAAGCGCTTAGCTCAACTAAGCAACTGCCTTTGAAACTGTCAGTCTTGAATGATTGTGAAGTAGTTGGAATGTGTAG
+>  not real seq
+AACGGAGGATGCGAGCGTTATCTGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGGTTGATAAGTCAGAGGTGAAAGCGCTTAGCTC
+AACTAAGCAACTGCCTTTGAAACTGTCAGTCTTGAATGATTGTGAAGTAGTTGGAATGTGTAG
+
diff --git a/calour/tests/test_io.py b/calour/tests/test_io.py
index 785e7261..2542e819 100644
--- a/calour/tests/test_io.py
+++ b/calour/tests/test_io.py
@@ -229,7 +229,7 @@ def test_read_open_ms_samples_rows(self):
         self.assertAlmostEqual(exp.feature_metadata['MZ'].iloc[1], 118.0869)
         self.assertAlmostEqual(exp.feature_metadata['RT'].iloc[1], 23.9214)
 
-    def test_read_qiim2(self):
+    def test_read_qiime2(self):
         # test the non-hashed table
         exp = ca.read_qiime2(self.qiime2table, normalize=None, min_reads=None)
         self.assertEqual(exp.shape, (104, 658))
diff --git a/calour/tests/test_util.py b/calour/tests/test_util.py
index 48d356f9..82b2f665 100644
--- a/calour/tests/test_util.py
+++ b/calour/tests/test_util.py
@@ -23,6 +23,15 @@ def setUp(self):
         super().setUp()
         self.test1 = ca.read(self.test1_biom, self.test1_samp, normalize=None)
 
+    def test_iter_fasta(self):
+        seqs = []
+        heads = []
+        for chead, cseq in util._iter_fasta(self.seqs1_fasta):
+            seqs.append(cseq)
+            heads.append(chead)
+        self.assertListEqual(heads, ['real_seq_6', 'not real seq'])
+        self.assertListEqual(seqs, ['TT', 'AACGGAGGATGCGAGCGTTATCTGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGGTTGATAAGTCAGAGGTGAAAGCGCTTAGCTCAACTAAGCAACTGCCTTTGAAACTGTCAGTCTTGAATGATTGTGAAGTAGTTGGAATGTGTAG'])
+
     def test_get_taxonomy_string(self):
         orig_tax = list(self.test1.feature_metadata['taxonomy'].values)
         # test string taxonomy
diff --git a/calour/util.py b/calour/util.py
index af4ca394..0f37ddaf 100644
--- a/calour/util.py
+++ b/calour/util.py
@@ -43,6 +43,44 @@
 logger = getLogger(__name__)
 
 
+def _iter_fasta(file_name):
+    '''Iterate over fasta file.
+    Fasta file must contain header line (starting with ">") and one or more sequence lines.
+
+    Parameters
+    ----------
+    file_name: str
+        name of the fasta file
+
+    Yields
+    ------
+    (header, sequence)
+    header: str
+        the header line (without ">")
+    sequence: str
+        the sequence ('ACGT')
+    NOTE: header and sequence are stripped (no new line/spaces at beginning or end)
+    '''
+    # skip non-header lines at beginning of file
+    with open(file_name, 'r') as fl:
+        for cline in fl:
+            if cline[0] == ">":
+                title = cline[1:].rstrip()
+                break
+            logger.warning('Fasta file %s has no headers' % file_name)
+            return
+
+        lines = []
+        for cline in fl:
+            if cline[0] == ">":
+                yield title, ''.join(lines)
+                lines = []
+                title = cline[1:].strip()
+                continue
+            lines.append(cline.strip())
+        yield title, "".join(lines)
+
+
 def compute_prevalence(abundance):
     '''Return the prevalence at each abundance cutoffs.
 

From 072a1dffad71e17c39a5ddefb60f881253336a16 Mon Sep 17 00:00:00 2001
From: amnona <amnonim@gmail.com>
Date: Fri, 3 Jul 2020 17:07:24 +0300
Subject: [PATCH 3/4] use _iter_fasta instead of skbio in test_io

---
 calour/tests/test_io.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/calour/tests/test_io.py b/calour/tests/test_io.py
index 2542e819..899d13c2 100644
--- a/calour/tests/test_io.py
+++ b/calour/tests/test_io.py
@@ -13,7 +13,6 @@
 import shutil
 import logging
 
-import skbio
 import scipy.sparse
 import numpy as np
 import pandas as pd
@@ -22,6 +21,7 @@
 import calour as ca
 from calour._testing import Tests, assert_experiment_equal
 from calour.io import _create_biom_table_from_exp
+from calour.util import _iter_fasta
 
 
 class IOTests(Tests):
@@ -258,8 +258,8 @@ def test_save_fasta(self):
         f = join(d, 'test1.fasta')
         exp.save_fasta(f)
         seqs = []
-        for seq in skbio.read(f, format='fasta'):
-            seqs.append(str(seq))
+        for chead, cseq in _iter_fasta(f):
+            seqs.append(cseq)
         self.assertCountEqual(seqs, exp.feature_metadata.index.values)
         shutil.rmtree(d)
 

From 46f6ca2b2cd0ee030a0d358815cdd5e3f059053a Mon Sep 17 00:00:00 2001
From: amnona <amnonim@gmail.com>
Date: Sat, 4 Jul 2020 13:54:45 +0300
Subject: [PATCH 4/4] pr fixes: rename _iter_fasta argument to fp, tidy docstring

---
 calour/util.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/calour/util.py b/calour/util.py
index 0f37ddaf..06162186 100644
--- a/calour/util.py
+++ b/calour/util.py
@@ -43,31 +43,34 @@
 logger = getLogger(__name__)
 
 
-def _iter_fasta(file_name):
+def _iter_fasta(fp):
     '''Iterate over fasta file.
+
     Fasta file must contain header line (starting with ">") and one or more sequence lines.
+    Lines before the first header are skipped. If the file contains no header
+    line, a warning is logged and nothing is yielded.
 
     Parameters
     ----------
-    file_name: str
+    fp: str
         name of the fasta file
 
     Yields
     ------
-    (header, sequence)
     header: str
         the header line (without ">")
     sequence: str
-        the sequence ('ACGT')
-    NOTE: header and sequence are stripped (no new line/spaces at beginning or end)
+        the sequence ('ACGT'). Both header and sequence are whitespace stripped.
     '''
     # skip non-header lines at beginning of file
-    with open(file_name, 'r') as fl:
+    with open(fp, 'r') as fl:
         for cline in fl:
             if cline[0] == ">":
-                title = cline[1:].rstrip()
+                title = cline[1:].strip()
                 break
-            logger.warning('Fasta file %s has no headers' % file_name)
+        else:
+            # no header found in the whole file (also covers an empty file)
+            logger.warning('Fasta file %s has no headers' % fp)
             return
 
         lines = []