In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio.Data.CodonTable import TranslationError
from Bio.Seq import CodonTable

import pandas as pd
import numpy as np
from itertools import izip, combinations

In [2]:
def safe_translate(sequence, report_exceptions=False):
    """Returns an amino acid translation of the given nucleotide sequence accounting
    for gaps in the given sequence.

    The whole sequence is first translated in a single call. If that raises a
    ``TranslationError``, the sequence is re-translated codon by codon: a
    fully gapped codon ('---') becomes '-', a codon found in the ambiguous
    standard codon table becomes its amino acid, and anything else becomes
    'X'. Trailing bases that do not fill a complete codon are dropped in the
    codon-by-codon path.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence to translate; may contain '-' gap characters.
    report_exceptions : bool, optional
        When True, also report whether the initial whole-sequence translation
        raised a ``TranslationError``.

    Returns
    -------
    str or (str, bool)
        The translated sequence, or a tuple of the translated sequence and
        the exception flag when ``report_exceptions`` is True.

    >>> safe_translate("ATG")
    'M'
    >>> safe_translate("ATGGT-")
    'MX'
    >>> safe_translate("ATG---")
    'M-'
    >>> safe_translate("ATGTAG")
    'M*'
    >>> safe_translate("")
    ''
    >>> safe_translate("ATGT")
    'M'
    >>> safe_translate("ATG", report_exceptions=True)
    ('M', False)
    >>> safe_translate("ATGA-G", report_exceptions=True)
    ('MX', True)
    """
    translation_exception = False

    try:
        # Attempt to translate the whole sequence in one call; with gap='-',
        # in-frame gaps of three ('---') translate as '-'.
        translated_sequence = str(Seq(sequence).translate(gap='-'))
    except TranslationError:
        translation_exception = True

        # Codons that the whole-sequence translation rejects (for example a
        # partially gapped codon such as '-AA') end up here. Translate codons
        # one by one instead.
        codon_table = CodonTable.ambiguous_dna_by_name['Standard'].forward_table
        str_seq = str(sequence)

        # Split into native-str codons, ignoring any trailing partial codon.
        # Plain slicing (rather than a numpy 'S3' byte-string view) keeps each
        # codon the same type as the table keys and the '---' literal, so the
        # lookup and comparison below behave the same on Python 2 and 3.
        usable_length = len(str_seq) - len(str_seq) % 3
        codons = [str_seq[i:i + 3] for i in range(0, usable_length, 3)]
        assert len(codons) > 0
        aas = []

        for c in codons:
            # Parse result of single codon translation, add amino acids as
            # appropriate.
            try:
                aa = codon_table.get(c)
                if aa is None:
                    if c == '---':
                        aas.append('-')
                    else:
                        aas.append('X')
                else:
                    aas.append(aa)
            except (TranslationError, ValueError):
                # The table lookup itself rejected this codon.
                aas.append('X')

        translated_sequence = "".join(aas)

    if report_exceptions:
        return translated_sequence, translation_exception
    else:
        return translated_sequence

In [4]:
# Read the nucleotide alignment and translate every record. The context
# manager closes the input file as soon as all records have been parsed.
with open('./titered_strains_alignment.mfa', 'r') as alignment_file:
    seqs = list(SeqIO.parse(alignment_file, 'fasta'))

# Map each record id to its amino acid translation.
aa_seqs = {s.id : safe_translate(str(s.seq)) for s in seqs}

In [5]:
# Write the translated sequences as FASTA records ('>' + name, then the
# sequence on the next line). The context manager closes the file even if a
# write fails part-way through.
with open('./titered_E_aa.mfa', 'w') as ofile:
    for name, seq in aa_seqs.items():
        ofile.write('>'+name+'\n'+seq+'\n')

In [17]:
def aa_dist(seq1, seq2):
    """Count the positions at which two sequences differ.

    The sequences are compared position by position (a per-position mismatch
    count, not an alignment-based edit distance). If the sequences differ in
    length, comparison stops at the end of the shorter one and the remaining
    positions are not counted.

    Parameters
    ----------
    seq1, seq2 : iterable
        Sequences to compare, e.g. amino acid strings.

    Returns
    -------
    int
        Number of compared positions whose elements are not equal.
    """
    # The builtin zip pairs positions the same way itertools.izip did and is
    # available on both Python 2 and Python 3.
    return sum(1 for aa1, aa2 in zip(seq1, seq2) if aa1 != aa2)

# Compare every unordered pair of strains and record, for each pair, the
# two strain names and the number of differing amino acid positions.
edit_distances = []
for strain1, strain2 in combinations(aa_seqs, 2):
    seq1 = aa_seqs[strain1]
    seq2 = aa_seqs[strain2]
    edit_distances.append({
        'strain1': strain1,
        'strain2': strain2,
        'edit': aa_dist(seq1, seq2),
    })
    

In [18]:
# Turn the per-pair records into a table. The column order is pinned
# explicitly because this frame is written to CSV without a header row, so
# the column positions are the only thing identifying each field. Without
# `columns`, pandas < 0.25 sorts the dict keys alphabetically while newer
# versions keep insertion order; the alphabetical order is kept here.
edit_distances = pd.DataFrame(edit_distances, columns=['edit', 'strain1', 'strain2'])

In [20]:
# Persist the pairwise distances as a bare CSV: no index column and no
# header row, so each line holds only the values of one strain pair.
edit_distances.to_csv('./titered_strains_aa_dist.csv', index=False, header=False)