From 12565d47c194700b8e3fe3510c9b3254f2777832 Mon Sep 17 00:00:00 2001 From: amnona Date: Mon, 18 Feb 2019 18:34:10 +0200 Subject: [PATCH 1/4] fix indentation in dsfdr (who changed it?) --- calour/dsfdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/calour/dsfdr.py b/calour/dsfdr.py index 6c263594..0cbf50e0 100644 --- a/calour/dsfdr.py +++ b/calour/dsfdr.py @@ -183,7 +183,7 @@ def dsfdr(data, labels, transform_type='rankdata', method='meandiff', elif transform_type == 'normdata': data = normdata(data) elif transform_type is None: - pass + pass else: raise ValueError('transform type %s not supported' % transform_type) From f28babcb645a1f7982c30d5583f5b872b3043de3 Mon Sep 17 00:00:00 2001 From: amnona Date: Fri, 3 Jul 2020 16:42:05 +0300 Subject: [PATCH 2/4] read fasta using our own function --- calour/amplicon_experiment.py | 8 ++++---- calour/io.py | 9 ++++----- calour/tests/data/seqs1.fasta | 6 ++++-- calour/tests/test_io.py | 2 +- calour/tests/test_util.py | 9 +++++++++ calour/util.py | 38 +++++++++++++++++++++++++++++++++++ 6 files changed, 60 insertions(+), 12 deletions(-) diff --git a/calour/amplicon_experiment.py b/calour/amplicon_experiment.py index 59da6bfa..ad75d09a 100644 --- a/calour/amplicon_experiment.py +++ b/calour/amplicon_experiment.py @@ -23,10 +23,9 @@ from logging import getLogger import numpy as np -import skbio from .experiment import Experiment -from .util import _get_taxonomy_string, _to_list +from .util import _get_taxonomy_string, _to_list, _iter_fasta logger = getLogger(__name__) @@ -140,9 +139,10 @@ def filter_fasta(exp: Experiment, filename, negate=False, inplace=False): logger.debug('Filter by sequence using fasta file %s' % filename) okpos = [] tot_seqs = 0 - for cseq in skbio.read(filename, format='fasta'): + + for chead, cseq in _iter_fasta(filename): tot_seqs += 1 - cseq = str(cseq).upper() + cseq = cseq.upper() if cseq in exp.feature_metadata.index: pos = exp.feature_metadata.index.get_loc(cseq) okpos.append(pos) diff --git a/calour/io.py b/calour/io.py index 2efa2f7e..d5676a42 100644 --- a/calour/io.py +++ b/calour/io.py @@ -36,10 +36,9 @@ import pandas as pd import numpy as np import biom -import skbio from . import Experiment, AmpliconExperiment, MS1Experiment -from .util import get_file_md5, get_data_md5, _get_taxonomy_string +from .util import get_file_md5, get_data_md5, _get_taxonomy_string, _iter_fasta from ._doc import ds from .database import _get_database_class @@ -180,9 +179,9 @@ def read_qiime2(fp, sample_metadata_file=None, rep_seq_file=None, taxonomy_file= rs_name = _file_from_zip(tempdir, rep_seq_file, internal_data='data/dna-sequences.fasta') rseqs = [] rids = [] - for cseq in skbio.read(rs_name, format='fasta'): - rseqs.append(str(cseq).upper()) - rids.append(cseq.metadata['id']) + for chead, cseq in _iter_fasta(rs_name): + rseqs.append(cseq.upper()) + rids.append(chead) rep_seqs = pd.Series(data=rseqs, index=rids, name='_feature_id') # test if all hashes are identical to the rep_seqs file supplied diff --git a/calour/tests/data/seqs1.fasta b/calour/tests/data/seqs1.fasta index 4cdd083a..0d1882b7 100644 --- a/calour/tests/data/seqs1.fasta +++ b/calour/tests/data/seqs1.fasta @@ -1,4 +1,6 @@ >real_seq_6 TT ->not real seq -AACGGAGGATGCGAGCGTTATCTGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGGTTGATAAGTCAGAGGTGAAAGCGCTTAGCTCAACTAAGCAACTGCCTTTGAAACTGTCAGTCTTGAATGATTGTGAAGTAGTTGGAATGTGTAG +> not real seq +AACGGAGGATGCGAGCGTTATCTGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGGTTGATAAGTCAGAGGTGAAAGCGCTTAGCTC +AACTAAGCAACTGCCTTTGAAACTGTCAGTCTTGAATGATTGTGAAGTAGTTGGAATGTGTAG + diff --git a/calour/tests/test_io.py b/calour/tests/test_io.py index 785e7261..2542e819 100644 --- a/calour/tests/test_io.py +++ b/calour/tests/test_io.py @@ -229,7 +229,7 @@ def test_read_open_ms_samples_rows(self): self.assertAlmostEqual(exp.feature_metadata['MZ'].iloc[1], 118.0869) self.assertAlmostEqual(exp.feature_metadata['RT'].iloc[1], 23.9214) - def test_read_qiim2(self): + def test_read_qiime2(self): # test the non-hashed table exp = ca.read_qiime2(self.qiime2table, normalize=None, min_reads=None) self.assertEqual(exp.shape, (104, 658)) diff --git a/calour/tests/test_util.py b/calour/tests/test_util.py index 48d356f9..82b2f665 100644 --- a/calour/tests/test_util.py +++ b/calour/tests/test_util.py @@ -23,6 +23,15 @@ def setUp(self): super().setUp() self.test1 = ca.read(self.test1_biom, self.test1_samp, normalize=None) + def test_iter_fasta(self): + seqs = [] + heads = [] + for chead, cseq in util._iter_fasta(self.seqs1_fasta): + seqs.append(cseq) + heads.append(chead) + self.assertListEqual(heads, ['real_seq_6', 'not real seq']) + self.assertListEqual(seqs, ['TT', 'AACGGAGGATGCGAGCGTTATCTGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGGTTGATAAGTCAGAGGTGAAAGCGCTTAGCTCAACTAAGCAACTGCCTTTGAAACTGTCAGTCTTGAATGATTGTGAAGTAGTTGGAATGTGTAG']) + def test_get_taxonomy_string(self): orig_tax = list(self.test1.feature_metadata['taxonomy'].values) # test string taxonomy diff --git a/calour/util.py b/calour/util.py index af4ca394..0f37ddaf 100644 --- a/calour/util.py +++ b/calour/util.py @@ -43,6 +43,44 @@ logger = getLogger(__name__) +def _iter_fasta(file_name): + '''Iterate over fasta file. + Fasta file must contain header line (starting with ">") and one or more sequence lines. + + Parameters + ---------- + file_name: str + name of the fasta file + + Yields + ------ + (header, sequence) + header: str + the header line (without ">") + sequence: str + the sequence ('ACGT') + NOTE: header and sequence are stripped (no new line/spaces at beginning or end) + ''' + # skip non-header lines at beginning of file + with open(file_name, 'r') as fl: + for cline in fl: + if cline[0] == ">": + title = cline[1:].rstrip() + break + logger.warning('Fasta file %s has no headers' % file_name) + return + + lines = [] + for cline in fl: + if cline[0] == ">": + yield title, ''.join(lines) + lines = [] + title = cline[1:].strip() + continue + lines.append(cline.strip()) + yield title, "".join(lines) + + def compute_prevalence(abundance): '''Return the prevalence at each abundance cutoffs. From 072a1dffad71e17c39a5ddefb60f881253336a16 Mon Sep 17 00:00:00 2001 From: amnona Date: Fri, 3 Jul 2020 17:07:24 +0300 Subject: [PATCH 3/4] read fasta using our own function --- calour/tests/test_io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/calour/tests/test_io.py b/calour/tests/test_io.py index 2542e819..899d13c2 100644 --- a/calour/tests/test_io.py +++ b/calour/tests/test_io.py @@ -13,7 +13,6 @@ import shutil import logging -import skbio import scipy.sparse import numpy as np import pandas as pd @@ -22,6 +21,7 @@ import calour as ca from calour._testing import Tests, assert_experiment_equal from calour.io import _create_biom_table_from_exp +from calour.util import _iter_fasta class IOTests(Tests): @@ -258,8 +258,8 @@ def test_save_fasta(self): f = join(d, 'test1.fasta') exp.save_fasta(f) seqs = [] - for seq in skbio.read(f, format='fasta'): - seqs.append(str(seq)) + for chead, cseq in _iter_fasta(f): + seqs.append(cseq) self.assertCountEqual(seqs, exp.feature_metadata.index.values) shutil.rmtree(d) From 46f6ca2b2cd0ee030a0d358815cdd5e3f059053a Mon Sep 17 00:00:00 2001 From: amnona Date: Sat, 4 Jul 2020 13:54:45 +0300 Subject: [PATCH 4/4] pr fixes --- calour/util.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/calour/util.py b/calour/util.py index 0f37ddaf..06162186 100644 --- a/calour/util.py +++ b/calour/util.py @@ -43,31 +43,30 @@ logger = getLogger(__name__) -def _iter_fasta(file_name): +def _iter_fasta(fp): '''Iterate over fasta file. + Fasta file must contain header line (starting with ">") and one or more sequence lines. Parameters ---------- - file_name: str + fp: str name of the fasta file Yields ------ - (header, sequence) header: str the header line (without ">") sequence: str - the sequence ('ACGT') - NOTE: header and sequence are stripped (no new line/spaces at beginning or end) + the sequence ('ACGT'). Both header and sequence are whitespace stripped. ''' # skip non-header lines at beginning of file - with open(file_name, 'r') as fl: + with open(fp, 'r') as fl: for cline in fl: if cline[0] == ">": title = cline[1:].rstrip() break - logger.warning('Fasta file %s has no headers' % file_name) + logger.warning('Fasta file %s has no headers' % fp) return lines = []