Skip to content

Commit

Permalink
Merge 46f6ca2 into fd40a28
Browse files Browse the repository at this point in the history
  • Loading branch information
amnona committed Jul 4, 2020
2 parents fd40a28 + 46f6ca2 commit 3884aa9
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 15 deletions.
8 changes: 4 additions & 4 deletions calour/amplicon_experiment.py
Expand Up @@ -23,10 +23,9 @@
from logging import getLogger

import numpy as np
import skbio

from .experiment import Experiment
from .util import _get_taxonomy_string, _to_list
from .util import _get_taxonomy_string, _to_list, _iter_fasta


logger = getLogger(__name__)
Expand Down Expand Up @@ -140,9 +139,10 @@ def filter_fasta(exp: Experiment, filename, negate=False, inplace=False):
logger.debug('Filter by sequence using fasta file %s' % filename)
okpos = []
tot_seqs = 0
for cseq in skbio.read(filename, format='fasta'):

for chead, cseq in _iter_fasta(filename):
tot_seqs += 1
cseq = str(cseq).upper()
cseq = cseq.upper()
if cseq in exp.feature_metadata.index:
pos = exp.feature_metadata.index.get_loc(cseq)
okpos.append(pos)
Expand Down
9 changes: 4 additions & 5 deletions calour/io.py
Expand Up @@ -36,10 +36,9 @@
import pandas as pd
import numpy as np
import biom
import skbio

from . import Experiment, AmpliconExperiment, MS1Experiment
from .util import get_file_md5, get_data_md5, _get_taxonomy_string
from .util import get_file_md5, get_data_md5, _get_taxonomy_string, _iter_fasta
from ._doc import ds
from .database import _get_database_class

Expand Down Expand Up @@ -180,9 +179,9 @@ def read_qiime2(fp, sample_metadata_file=None, rep_seq_file=None, taxonomy_file=
rs_name = _file_from_zip(tempdir, rep_seq_file, internal_data='data/dna-sequences.fasta')
rseqs = []
rids = []
for cseq in skbio.read(rs_name, format='fasta'):
rseqs.append(str(cseq).upper())
rids.append(cseq.metadata['id'])
for chead, cseq in _iter_fasta(rs_name):
rseqs.append(cseq.upper())
rids.append(chead)
rep_seqs = pd.Series(data=rseqs, index=rids, name='_feature_id')

# test if all hashes are identical to the rep_seqs file supplied
Expand Down
6 changes: 4 additions & 2 deletions calour/tests/data/seqs1.fasta
@@ -1,4 +1,6 @@
>real_seq_6
TT
>not real seq
AACGGAGGATGCGAGCGTTATCTGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGGTTGATAAGTCAGAGGTGAAAGCGCTTAGCTCAACTAAGCAACTGCCTTTGAAACTGTCAGTCTTGAATGATTGTGAAGTAGTTGGAATGTGTAG
> not real seq
AACGGAGGATGCGAGCGTTATCTGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGGTTGATAAGTCAGAGGTGAAAGCGCTTAGCTC
AACTAAGCAACTGCCTTTGAAACTGTCAGTCTTGAATGATTGTGAAGTAGTTGGAATGTGTAG

8 changes: 4 additions & 4 deletions calour/tests/test_io.py
Expand Up @@ -13,7 +13,6 @@
import shutil
import logging

import skbio
import scipy.sparse
import numpy as np
import pandas as pd
Expand All @@ -22,6 +21,7 @@
import calour as ca
from calour._testing import Tests, assert_experiment_equal
from calour.io import _create_biom_table_from_exp
from calour.util import _iter_fasta


class IOTests(Tests):
Expand Down Expand Up @@ -229,7 +229,7 @@ def test_read_open_ms_samples_rows(self):
self.assertAlmostEqual(exp.feature_metadata['MZ'].iloc[1], 118.0869)
self.assertAlmostEqual(exp.feature_metadata['RT'].iloc[1], 23.9214)

def test_read_qiim2(self):
def test_read_qiime2(self):
# test the non-hashed table
exp = ca.read_qiime2(self.qiime2table, normalize=None, min_reads=None)
self.assertEqual(exp.shape, (104, 658))
Expand Down Expand Up @@ -258,8 +258,8 @@ def test_save_fasta(self):
f = join(d, 'test1.fasta')
exp.save_fasta(f)
seqs = []
for seq in skbio.read(f, format='fasta'):
seqs.append(str(seq))
for chead, cseq in _iter_fasta(f):
seqs.append(cseq)
self.assertCountEqual(seqs, exp.feature_metadata.index.values)
shutil.rmtree(d)

Expand Down
9 changes: 9 additions & 0 deletions calour/tests/test_util.py
Expand Up @@ -23,6 +23,15 @@ def setUp(self):
super().setUp()
self.test1 = ca.read(self.test1_biom, self.test1_samp, normalize=None)

def test_iter_fasta(self):
seqs = []
heads = []
for chead, cseq in util._iter_fasta(self.seqs1_fasta):
seqs.append(cseq)
heads.append(chead)
self.assertListEqual(heads, ['real_seq_6', 'not real seq'])
self.assertListEqual(seqs, ['TT', 'AACGGAGGATGCGAGCGTTATCTGGAATCATTGGGTTTAAAGGGTCCGTAGGCGGGTTGATAAGTCAGAGGTGAAAGCGCTTAGCTCAACTAAGCAACTGCCTTTGAAACTGTCAGTCTTGAATGATTGTGAAGTAGTTGGAATGTGTAG'])

def test_get_taxonomy_string(self):
orig_tax = list(self.test1.feature_metadata['taxonomy'].values)
# test string taxonomy
Expand Down
37 changes: 37 additions & 0 deletions calour/util.py
Expand Up @@ -43,6 +43,43 @@
logger = getLogger(__name__)


def _iter_fasta(fp):
'''Iterate over fasta file.
Fasta file must contain header line (starting with ">") and one or more sequence lines.
Parameters
----------
fp: str
name of the fasta file
Yields
------
header: str
the header line (without ">")
sequence: str
the sequence ('ACGT'). Both header and sequence are whitespace stripped.
'''
# skip non-header lines at beginning of file
with open(fp, 'r') as fl:
for cline in fl:
if cline[0] == ">":
title = cline[1:].rstrip()
break
logger.warning('Fasta file %s has no headers' % fp)
return

lines = []
for cline in fl:
if cline[0] == ">":
yield title, ''.join(lines)
lines = []
title = cline[1:].strip()
continue
lines.append(cline.strip())
yield title, "".join(lines)


def compute_prevalence(abundance):
'''Return the prevalence at each abundance cutoffs.
Expand Down

0 comments on commit 3884aa9

Please sign in to comment.