Skip to content

Commit

Permalink
Merged drop-biopython branch into master; removes biopython dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
reece committed Oct 23, 2018
2 parents 3876f36 + c0cdb79 commit 0382b86
Showing 1 changed file with 130 additions and 0 deletions.
130 changes: 130 additions & 0 deletions bioutils/sequences.py
Expand Up @@ -8,6 +8,7 @@

import six

from six.moves import range

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -39,6 +40,73 @@

# Inverse of aa3_to_aa1_lut: each value there becomes a key here.
aa1_to_aa3_lut = dict(
    (aa1, aa3) for aa3, aa1 in six.iteritems(aa3_to_aa1_lut)
)

# NCBI standard translation table, mapping each DNA codon to a one-letter
# amino acid code ('*' marks a stop codon).
#
# Codons are generated in lexical order over the bases A, C, G, T (first
# base varies slowest), and paired positionally with the amino acid string
# below.  Each row of the string covers one first base; within a row, each
# run of four letters covers one second base, and the third base cycles
# through A, C, G, T.
dna_to_aa1_lut = dict(zip(
    (b1 + b2 + b3 for b1 in "ACGT" for b2 in "ACGT" for b3 in "ACGT"),
    "KNKNTTTTRSRSIIMI"    # AAx ACx AGx ATx
    "QHQHPPPPRRRRLLLL"    # CAx CCx CGx CTx
    "EDEDAAAAGGGGVVVV"    # GAx GCx GGx GTx
    "*Y*YSSSS*CWCLFLF",   # TAx TCx TGx TTx
))


if six.PY2: # pragma: no cover
# flake8: noqa
Expand Down Expand Up @@ -251,6 +319,68 @@ def to_ascii(s):
return s if isinstance(s, six.binary_type) else s.encode("ASCII")


def translate_cds(seq, full_codons=True, ter_symbol="*"):
    """Translate a DNA or RNA sequence into a single-letter amino acid
    sequence using the standard translation table (``dna_to_aa1_lut``).

    RNA input is accepted: the sequence is passed through
    ``replace_u_to_t`` and upper-cased before codons are looked up.

    :param seq: nucleotide sequence to translate, or None.
    :param full_codons: if True (the default), a sequence whose length
        isn't a multiple of three raises ValueError.  If False, the
        complete codons are translated and ``ter_symbol`` is appended
        once to stand in for the trailing one or two bases.
    :param ter_symbol: symbol appended for a trailing partial codon when
        ``full_codons`` is False.  It is not substituted for in-frame
        stop codons; those are reported exactly as the codon table
        gives them.
    :returns: the amino acid string; None if ``seq`` is None; ``""`` if
        ``seq`` is empty.
    :raises ValueError: if ``full_codons`` is True and the length isn't
        a multiple of three, or if a codon is not in the codon table.

    >>> translate_cds("ATGCGA")
    'MR'

    >>> translate_cds("AUGCGA")
    'MR'

    >>> translate_cds(None)

    >>> translate_cds("")
    ''

    >>> translate_cds("AUGCG")
    Traceback (most recent call last):
    ...
    ValueError: Sequence length must be a multiple of three

    >>> translate_cds("AUGCG", full_codons=False)
    'M*'

    >>> translate_cds("AUGCGQ")
    Traceback (most recent call last):
    ...
    ValueError: Codon CGQ at position 4..6 is undefined in codon table

    """
    if seq is None:
        return None

    if not seq:
        return ""

    if full_codons and len(seq) % 3 != 0:
        raise ValueError("Sequence length must be a multiple of three")

    seq = replace_u_to_t(seq).upper()

    # Number of bases left over after the last complete codon; measured on
    # the normalized sequence, which is what the loop below walks.
    trailing = len(seq) % 3

    protein_seq = []
    for i in range(0, len(seq) - trailing, 3):
        codon = seq[i:i + 3]
        # Use .get() rather than try/except KeyError so the ValueError is
        # raised outside an exception handler and carries no unrelated
        # KeyError context in its traceback.
        aa = dna_to_aa1_lut.get(codon)
        if aa is None:
            raise ValueError(
                "Codon {} at position {}..{} is undefined in codon table".format(
                    codon, i + 1, i + 3))
        protein_seq.append(aa)

    # Trailing bases that don't form a complete codon are represented by a
    # single ter_symbol (only reachable when full_codons is False).
    if not full_codons and trailing:
        protein_seq.append(ter_symbol)

    return ''.join(protein_seq)


# Legacy equivalents: underscore-prefixed aliases bound to the same
# function objects as the public names.
# NOTE(review): presumably kept for callers that still use the old
# private names -- confirm before removing.
_looks_like_aa3_p = looks_like_aa3_p
_to_unicode = to_unicode
Expand Down

0 comments on commit 0382b86

Please sign in to comment.