In [1]:
import pandas as pd
import subprocess
from tRNA_position import *
pd.set_option('display.max_colwidth',1000)
pd.set_option('display.max_rows',1000)

What if we only used the complete eukaryotic model to align all tRNAs? Then comparing identity elements across isotypes by position becomes much easier.

Also: track acceptor arm length, loop length, etc.

In [2]:
isotypes = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile', 'iMet', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val']
for isotype in isotypes:
  # create new alignment file
  model = '/projects/lowelab/users/blin/tRNAscan/models/current/TRNAinf-euk.cm'
  fasta = '/projects/lowelab/users/blin/tRNAscan/models/1.6/fasta/euk-' + isotype + '-r2-031616.fa'
  alignment = 'alignments/euk-' + isotype + '.sto'
  subprocess.call('cmalign -g --notrunc --matchonly -o {} {} {} > /dev/null'.format(alignment, model, fasta), shell=True)

In [27]:
identities = pd.DataFrame()
for isotype in isotypes:
  # get positions
  alignment = 'alignments/euk-' + isotype + '.sto'
  positions = get_positions(alignment)
  
  # get identities
  df = pd.concat(pd.DataFrame({'Position': [position.position], 
                               'Symbol': [symbol],
                               'Frequency': [freq],
                               'Counts': [position.counts],
                               'Isotype': isotype,
                               'Clade': 'Eukaryota'}) for position, symbol, freq in position_generator(positions, threshold=0.95))
  # combine into larger df
  identities = pd.concat([identities, df])
  
identities.head()

Unnamed: 0,Clade,Counts,Frequency,Isotype,Position,Symbol
0,Mammalia,"{'G:U': 23, 'G:G': 1, 'U:C': 9, 'C:C': 4, 'A:A': 23, 'U:G': 3, '-:-': 4, 'G:A': 9, 'C:A': 1, 'N:C': 2, 'A:U': 6, 'C:G': 12, 'A:G': 52, 'G:C': 2022, 'U:A': 38, 'A:C': 13}",0.909991,Ala,1:89,G:C
0,Mammalia,"{'G:U': 41, 'G:G': 2, 'C:C': 23, 'U:U': 1, '-:C': 2, 'C:U': 3, 'A:A': 1, 'U:G': 1, '-:-': 4, 'G:A': 8, 'C:A': 2, 'N:C': 1, 'C:G': 12, 'A:G': 1, 'U:C': 28, 'G:C': 2074, 'U:A': 8, 'A:C': 10}",0.933393,Ala,2:88,G:C
0,Mammalia,"{'G:-': 2, 'G:U': 1935, 'G:G': 15, 'N:U': 1, '-:A': 1, 'U:U': 12, 'A:A': 6, 'U:G': 71, '-:-': 1, '-:G': 1, 'N:G': 1, 'G:A': 44, 'C:A': 1, 'A:U': 47, 'C:G': 16, '-:U': 2, 'A:G': 2, 'G:C': 43, 'U:A': 20, 'A:C': 1}",0.90279,Ala,3:87,W:O
0,Mammalia,"{'G:U': 42, 'C:A': 3, 'C:C': 3, '-:C': 1, 'C:U': 2, 'A:A': 1, '-:-': 2, '-:G': 1, 'G:A': 2, 'N:C': 1, 'A:U': 18, 'C:G': 282, 'U:C': 1, 'G:C': 1848, 'U:A': 7, 'A:C': 8}",0.958596,Ala,4:86,S:S
0,Mammalia,"{'G:-': 1, 'G:U': 104, 'G:G': 1, 'N:U': 1, 'C:C': 4, 'U:U': 7, 'C:U': 1, 'A:A': 7, 'U:G': 14, '-:U': 1, '-:G': 1, 'A:-': 2, 'A:U': 1082, 'C:G': 101, 'A:G': 1, 'U:C': 2, 'G:C': 793, 'U:A': 34, 'A:C': 60, 'G:A': 5}",0.889289,Ala,5:85,V:B


In [41]:
identities[(identities.Isotype == "Gly") & (identities.Position == "55")]

Unnamed: 0,Clade,Counts,Frequency,Isotype,Position,Symbol,Element
0,Mammalia,"{'G': 5, 'C': 42, 'U': 1641}",0.972156,Gly,55,U,55U


Great table, but I want to aggregate it by position. Later, I will want to add Sprinzl numbering too. It's best to do so now while I have the full data, and it's best to do it by hand. Here's the alignment:

```
(((((((,,<<<<________>>>>,<<<<<_______>>>>>,,<<<<<<<____>>>>>>>,,<<<<<_______>>>>>))))))):
123456789012345678901234567890123456789012345VVVVVVVV6VVVVVVVVV789012345678901234567890123
```

For some reason, the alignment puts base 46 in the middle of the variable loop.

In [38]:
# Switch to Sprinzl numbering
sprinzl_dict = {'1:89': '1:72', '2:88': '2:71', '3:87': '3:70', '4:86': '4:69', '5:85': '5:68', '6:84': '6:67', '7:83': '7:66', '8': '8', '9': '9', '10:25': '10:25', '11:24': '11:24', '12:23': '12:23', '13:22': '13:22', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '26': '26', '27:43': '27:43', '28:42': '28:42', '29:41': '29:41', '30:40': '30:40', '31:39': '31:39', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '44': '44', '45': '45', '46:63': 'V11:V21', '47:62': 'V12:V22', '48:61': 'V13:V23', '49:60': 'V14:V24', '50:59': 'V15:V25', '51:58': 'V16:V26', '52:57': 'V17:V27', '53': 'V1', '54': 'V2', '55': 'V3', '56': 'V4', '64': '47', '65': '48', '66:82': '49:65', '67:81': '50:64', '68:80': '51:63', '69:79': '52:62', '70:78': '53:61', '71': '54', '72': '55', '73': '56', '74': '57', '75': '58', '76': '59', '77': '69', '90': '73'}
identities.Position = [sprinzl_dict[position] for position in identities.Position]

In [39]:
# Add column for pretty printing for identity elements
def combine(position, symbol):
  if ':' in position and ':' in symbol:
    position1, position2 = position.split(':')
    symbol1, symbol2 = symbol.split(':')
    return '{}{}:{}{}'.format(position1, symbol1, position2, symbol2)
  elif symbol == '':
    if ':' in position: return '{}-:{}-'.format(position.split(':')[0], position.split(':')[1])
    else: return position + '-'
  else:
    return position + symbol

identities['Element'] = [combine(row.Position, row.Symbol) for i, row in identities.iterrows()]
identities

Unnamed: 0,Clade,Counts,Frequency,Isotype,Position,Symbol,Element
0,Mammalia,"{'G:U': 23, 'G:G': 1, 'U:C': 9, 'C:C': 4, 'A:A': 23, 'U:G': 3, '-:-': 4, 'G:A': 9, 'C:A': 1, 'N:C': 2, 'A:U': 6, 'C:G': 12, 'A:G': 52, 'G:C': 2022, 'U:A': 38, 'A:C': 13}",0.909991,Ala,1:72,G:C,1G:72C
0,Mammalia,"{'G:U': 41, 'G:G': 2, 'C:C': 23, 'U:U': 1, '-:C': 2, 'C:U': 3, 'A:A': 1, 'U:G': 1, '-:-': 4, 'G:A': 8, 'C:A': 2, 'N:C': 1, 'C:G': 12, 'A:G': 1, 'U:C': 28, 'G:C': 2074, 'U:A': 8, 'A:C': 10}",0.933393,Ala,2:71,G:C,2G:71C
0,Mammalia,"{'G:-': 2, 'G:U': 1935, 'G:G': 15, 'N:U': 1, '-:A': 1, 'U:U': 12, 'A:A': 6, 'U:G': 71, '-:-': 1, '-:G': 1, 'N:G': 1, 'G:A': 44, 'C:A': 1, 'A:U': 47, 'C:G': 16, '-:U': 2, 'A:G': 2, 'G:C': 43, 'U:A': 20, 'A:C': 1}",0.902790,Ala,3:70,W:O,3W:70O
0,Mammalia,"{'G:U': 42, 'C:A': 3, 'C:C': 3, '-:C': 1, 'C:U': 2, 'A:A': 1, '-:-': 2, '-:G': 1, 'G:A': 2, 'N:C': 1, 'A:U': 18, 'C:G': 282, 'U:C': 1, 'G:C': 1848, 'U:A': 7, 'A:C': 8}",0.958596,Ala,4:69,S:S,4S:69S
0,Mammalia,"{'G:-': 1, 'G:U': 104, 'G:G': 1, 'N:U': 1, 'C:C': 4, 'U:U': 7, 'C:U': 1, 'A:A': 7, 'U:G': 14, '-:U': 1, '-:G': 1, 'A:-': 2, 'A:U': 1082, 'C:G': 101, 'A:G': 1, 'U:C': 2, 'G:C': 793, 'U:A': 34, 'A:C': 60, 'G:A': 5}",0.889289,Ala,5:68,V:B,5V:68B
0,Mammalia,"{'G:U': 24, 'C:A': 5, 'U:-': 3, 'C:C': 1, 'U:U': 153, 'C:U': 1, 'A:A': 4, 'U:G': 71, 'A:-': 1, '-:A': 2, 'G:A': 1, 'A:U': 445, 'C:G': 84, 'U:C': 6, 'G:C': 11, 'U:A': 1408, 'A:C': 2}",0.871737,Ala,6:67,H:D,6H:67D
0,Mammalia,"{'G:U': 86, 'G:G': 1, 'U:U': 2, 'U:A': 433, 'C:U': 1, 'A:A': 2, 'U:G': 3, '-:-': 2, '-:G': 1, 'G:A': 4, 'C:A': 1, 'A:-': 1, 'A:U': 244, 'C:G': 19, 'U:C': 3, 'G:C': 1397, '-:C': 2, 'A:C': 20}",0.933393,Ala,7:66,D:H,7D:66H
0,Mammalia,"{'G': 2, 'C': 6, 'U': 2214}",0.996400,Ala,8,U,8U
0,Mammalia,"{'A': 1983, 'G': 236, 'U': 3}",0.998650,Ala,9,R,9R
0,Mammalia,"{'U:G': 2, 'C:G': 1, 'G:U': 252, 'G:G': 3, 'A:U': 6, 'U:U': 3, 'U:C': 3, 'G:C': 1944, 'G:A': 5, 'A:C': 3}",0.874887,Ala,10:25,G:C,10G:25C


First, let's print all of the identity elements for each isotype.

In [6]:
identities.ix[(identities.Frequency >= 0.95) & (identities.Symbol != "C:O") & (identities.Symbol != "W:C")].groupby("Isotype").apply(lambda x: ', '.join(x.Element)).reset_index()

Unnamed: 0,Isotype,0
0,Ala,"8U, 9R, 12S:23S, 14A, 15G, 16Y, 18G, 19G, 20Y, 21A, 26R, 27Y:43R, 31B:39V, 32Y, 33U, 37A, 38H, 44A, 45V, 47R, 48Y, 50B:64V, 52R:62Y, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 59D, 69H, 73W"
1,Arg,"8U, 9R, 12S:23S, 14A, 15G, 16Y, 18G, 19G, 20Y, 21A, 26R, 27Y:43R, 31B:39V, 32Y, 33U, 37A, 38H, 44A, 45V, 47R, 48Y, 50B:64V, 52R:62Y, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 59D, 69H, 73W"
2,Asn,"8U, 9R, 12S:23S, 14A, 15G, 16Y, 18G, 19G, 20Y, 21A, 26R, 27Y:43R, 31B:39V, 32Y, 33U, 37A, 38H, 44A, 45V, 47R, 48Y, 50B:64V, 52R:62Y, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 59D, 69H, 73W"
3,Asp,"8U, 9R, 12S:23S, 14A, 15G, 16Y, 18G, 19G, 20Y, 21A, 26R, 27Y:43R, 31B:39V, 32Y, 33U, 37A, 38H, 44A, 45V, 47R, 48Y, 50B:64V, 52R:62Y, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 59D, 69H, 73W"
4,Cys,"8U, 9R, 12S:23S, 14A, 15G, 16Y, 18G, 19G, 20Y, 21A, 26R, 27Y:43R, 31B:39V, 32Y, 33U, 37A, 38H, 44A, 45V, 47R, 48Y, 50B:64V, 52R:62Y, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 59D, 69H, 73W"
5,Gln,"8U, 9R, 12S:23S, 14A, 15G, 16Y, 18G, 19G, 20Y, 21A, 26R, 27Y:43R, 31B:39V, 32Y, 33U, 37A, 38H, 44A, 45V, 47R, 48Y, 50B:64V, 52R:62Y, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 59D, 69H, 73W"
6,Glu,"8U, 9R, 12S:23S, 14A, 15G, 16Y, 18G, 19G, 20Y, 21A, 26R, 27Y:43R, 31B:39V, 32Y, 33U, 37A, 38H, 44A, 45V, 47R, 48Y, 50B:64V, 52R:62Y, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 59D, 69H, 73W"
7,Gly,"8U, 9R, 12S:23S, 14A, 15G, 16Y, 18G, 19G, 20Y, 21A, 26R, 27Y:43R, 31B:39V, 32Y, 33U, 37A, 38H, 44A, 45V, 47R, 48Y, 50B:64V, 52R:62Y, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 59D, 69H, 73W"
8,His,"8U, 9R, 12S:23S, 14A, 15G, 16Y, 18G, 19G, 20Y, 21A, 26R, 27Y:43R, 31B:39V, 32Y, 33U, 37A, 38H, 44A, 45V, 47R, 48Y, 50B:64V, 52R:62Y, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 59D, 69H, 73W"
9,Ile,"8U, 9R, 12S:23S, 14A, 15G, 16Y, 18G, 19G, 20Y, 21A, 26R, 27Y:43R, 31B:39V, 32Y, 33U, 37A, 38H, 44A, 45V, 47R, 48Y, 50B:64V, 52R:62Y, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 59D, 69H, 73W"


Next, we print all the isotypes sharing the same identity element, for each position.

In [7]:
# Aggregate isotypes
key_ides = identities.ix[identities.Frequency >= 0.95].groupby(["Position", "Symbol"])
key_ides = key_ides.apply(lambda x: list(x.Isotype)).reset_index()
key_ides.columns = ['Position', "Symbol", "Isotypes"]

# Sort by position
df = pd.DataFrame()
for position in sorted(set(key_ides['Position']), key=lambda x: int(x.split(':')[0]) if 'V' not in x else 45):
  df = pd.concat([df, key_ides[key_ides.Position == position]])

key_ides = df
key_ides

Unnamed: 0,Position,Symbol,Isotypes
30,6:67,C:O,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]"
32,8,U,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]"
33,9,R,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]"
0,12:23,S:S,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]"
1,14,A,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]"
2,15,G,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]"
3,16,Y,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]"
4,18,G,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]"
5,19,G,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]"
6,20,Y,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]"


Even better. Now, I'd like to categorize isotypes into ranked groups, much like for bases and base pairs.

In [8]:
def resolve_isotype_group(isotypes):
  amino_acids = set(['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile', 'iMet', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val'])
  combos = {'Type I': amino_acids - set(['Leu', 'Ser', 'Tyr']),
            'Type II': set(['Leu', 'Ser', 'Tyr']),
            'Synthetase class I': set(['Arg', 'Cys', 'Glu', 'Gln', 'Ile', 'iMet', 'Leu', 'Met', 'Trp', 'Tyr', 'Val']),
            'Synthetase class II': amino_acids - set(['Arg', 'Cys', 'Glu', 'Gln', 'Ile', 'iMet', 'Leu', 'Met', 'Trp', 'Tyr', 'Val']),
            'Methionine': set(['Met', 'iMet'])
           }
  if len(isotypes) == 1: return isotypes[0]
  if len(isotypes) == 21: return "Universal"
  if len(amino_acids - set(isotypes)) == 1: return "Not " + (amino_acids - set(isotypes)).pop()
  for combo in combos:
    if set(isotypes) == combos[combo] or (set(isotypes) < combos[combo] and len(set(isotypes)) >= len(combos[combo]) - 1):
      return combo
  for combo in combos:
    if set(isotypes) == amino_acids - combos[combo]:
      return "Not " + combo
  return "N/A"
key_ides['Isotype group'] = [resolve_isotype_group(isotypes) for isotypes in key_ides.Isotypes]
key_ides

Unnamed: 0,Position,Symbol,Isotypes,Isotype group
30,6:67,C:O,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]",Universal
32,8,U,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]",Universal
33,9,R,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]",Universal
0,12:23,S:S,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]",Universal
1,14,A,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]",Universal
2,15,G,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]",Universal
3,16,Y,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]",Universal
4,18,G,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]",Universal
5,19,G,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]",Universal
6,20,Y,"[Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, His, Ile, iMet, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val]",Universal
