In [1]:
import pandas as pd
import subprocess
from tRNA_position import *
pd.set_option('display.max_colwidth',1000)
pd.set_option('display.max_rows',1000)

What if we only used the complete eukaryotic model to align all tRNAs? Then comparing identity elements across isotypes by position becomes much easier.

Also: track acceptor arm length, loop length, etc.

In [2]:
isotypes = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile', 'iMet', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val']
identities = pd.DataFrame()
for isotype in isotypes:
  # create new alignment file
  model = '/projects/lowelab/users/blin/tRNAscan/models/current/TRNAinf-euk.cm'
  fasta = '/projects/lowelab/users/blin/tRNAscan/models/1.6/fasta/euk-' + isotype + '-r2-031616.fa'
  alignment = 'alignments/euk-' + isotype + '.sto'
  subprocess.call('cmalign -g --notrunc --matchonly -o {} {} {} > /dev/null'.format(alignment, model, fasta), shell=True)
  
  # get positions
  positions = get_positions(alignment)
  
  # get identities
  df = pd.concat(pd.DataFrame({'Position': [position.position], 
                               'Symbol': [symbol],
                               'Frequency': [freq],
                               'Isotype': isotype,
                               'Clade': 'Mammalia'}) for position, symbol, freq in position_generator(positions))
  # combine into larger df
  identities = pd.concat([identities, df])
  
identities.head()

Unnamed: 0,Clade,Frequency,Isotype,Position,Symbol
0,Mammalia,0.909991,Ala,1:89,G:C
0,Mammalia,0.933393,Ala,2:88,G:C
0,Mammalia,0.90279,Ala,3:87,W:O
0,Mammalia,0.958596,Ala,4:86,S:S
0,Mammalia,0.889289,Ala,5:85,V:B


Great table, but I want to aggregate it by position. Later, I will want to add Sprinzl numbering too. It's best to do so now while I have the full data, and it's best to do it by hand. Here's the alignment:

```
(((((((,,<<<<________>>>>,<<<<<_______>>>>>,,<<<<<<<____>>>>>>>,,<<<<<_______>>>>>))))))):
123456789012345678901234567890123456789012345VVVVVVVV6VVVVVVVVV789012345678901234567890123
```

For some reason, the alignment puts base 46 in the middle of the variable loop.

In [3]:
# Switch to Sprinzl numbering
sprinzl_dict = {'1:89': '1:72', '2:88': '2:71', '3:87': '3:70', '4:86': '4:69', '5:85': '5:68', '6:84': '6:67', '7:83': '7:66', '8': '8', '9': '9', '10:25': '10:25', '11:24': '11:24', '12:23': '12:23', '13:22': '13:22', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '26': '26', '27:43': '27:43', '28:42': '28:42', '29:41': '29:41', '30:40': '30:40', '31:39': '31:39', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '44': '44', '45': '45', '46:63': 'V11:V21', '47:62': 'V12:V22', '48:61': 'V13:V23', '49:60': 'V14:V24', '50:59': 'V15:V25', '51:58': 'V16:V26', '52:57': 'V17:V27', '53': 'V1', '54': 'V2', '55': 'V3', '56': 'V4', '64': '47', '65': '48', '66:82': '49:65', '67:81': '50:64', '68:80': '51:63', '69:79': '52:62', '70:78': '53:61', '71': '54', '72': '55', '73': '56', '74': '57', '75': '58', '76': '59', '77': '69', '90': '73'}
identities.Position = [sprinzl_dict[position] for position in identities.Position]

In [4]:
# Add column for pretty printing for identity elements
def combine(position, symbol):
  if ':' in position and ':' in symbol:
    position1, position2 = position.split(':')
    symbol1, symbol2 = symbol.split(':')
    return '{}{}:{}{}'.format(position1, symbol1, position2, symbol2)
  elif symbol == '':
    if ':' in position: return '{}-:{}-'.format(position.split(':')[0], position.split(':')[1])
    else: return position + '-'
  else:
    return position + symbol

identities['Element'] = [combine(row.Position, row.Symbol) for i, row in identities.iterrows()]
identities

Unnamed: 0,Clade,Frequency,Isotype,Position,Symbol,Element
0,Mammalia,0.909991,Ala,1:72,G:C,1G:72C
0,Mammalia,0.933393,Ala,2:71,G:C,2G:71C
0,Mammalia,0.902790,Ala,3:70,W:O,3W:70O
0,Mammalia,0.958596,Ala,4:69,S:S,4S:69S
0,Mammalia,0.889289,Ala,5:68,V:B,5V:68B
0,Mammalia,0.871737,Ala,6:67,H:D,6H:67D
0,Mammalia,0.933393,Ala,7:66,D:H,7D:66H
0,Mammalia,0.996400,Ala,8,U,8U
0,Mammalia,0.998650,Ala,9,R,9R
0,Mammalia,0.874887,Ala,10:25,G:C,10G:25C


First, let's print all of the identity elements for each isotype.

In [9]:
identities.ix[(identities.Frequency >= 0.95) & (identities.Symbol != "C:O") & (identities.Symbol != "W:C")].groupby("Isotype").apply(lambda x: ', '.join(x.Element)).reset_index()

Unnamed: 0,Isotype,0
0,Ala,"4S:69S, 8U, 9R, 11Y:24R, 14A, 15R, 16D, 18G, 19G, 20U, 21W, 26R, 27Y:43R, 29Y:41R, 31C:39G, 32Y, 33U, 37A, 38H, 44A, 45R, 47G, 48V, 52G:62C, 53G:61C, 54W, 55U, 56C, 57R, 58A, 59H, 73W"
1,Arg,"3B:70V, 8U, 9R, 10G:25C, 11C:24G, 12B:23V, 14A, 15R, 16H, 18G, 19G, 20H, 21A, 26R, 28Y:42R, 32Y, 33U, 37R, 38H, 44D, 47R, 48Y, 51R:63Y, 52G:62C, 53G:61C, 54U, 55U, 56C, 57R, 58A, 69Y, 73D"
2,Asn,"3Y:70R, 8U, 9R, 14A, 15R, 16Y, 18G, 19G, 20Y, 21A, 26R, 31R:39Y, 32C, 33U, 37A, 38A, 44H, 45R, 47R, 48Y, 52R:62Y, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 69Y, 73R"
3,Asp,"8Y, 9R, 11Y:24R, 12H:23D, 14A, 15R, 16H, 18G, 19G, 20H, 21A, 30G:40N, 32Y, 33U, 37R, 38H, 44D, 47G, 48Y, 52G:62C, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 59U, 69Y, 73R"
4,Cys,"4G:69N, 8U, 9R, 11C:24G, 14A, 15G, 16B, 18G, 19G, 20Y, 21A, 26R, 30G:40C, 31A:39U, 32Y, 33U, 37R, 38W, 44W, 45R, 47R, 48Y, 50Y:64R, 53G:61C, 54U, 55U, 56C, 57R, 58A, 59D, 69Y, 73W"
5,Gln,"1G:72C, 2S:71S, 5B:68V, 6Y:67R, 7R:66Y, 8U, 9R, 10G:25C, 11U:24A, 12S:23S, 14A, 15R, 16Y, 18G, 19G, 20Y, 21A, 26D, 29V:41B, 30G:40C, 31R:39Y, 32Y, 33U, 37A, 38W, 44H, 47R, 48Y, 49V:65B, 51V:63B, 52R:62Y, 53G:61C, 54U, 55U, 56C, 57R, 58A, 59D, 69Y, 73W"
6,Glu,"1U:72A, 2C:71G, 3C:70G, 8U, 9V, 11U:24A, 12S:23S, 14A, 15R, 16B, 18G, 19G, 20Y, 21A, 26H, 32Y, 33U, 37A, 38H, 44H, 47R, 48Y, 49C:65G, 52G:62C, 53G:61C, 54U, 55U, 56Y, 57R, 58W, 59B, 69Y, 73R"
7,Gly,"8U, 9R, 11U:24A, 14A, 15R, 16Y, 18G, 19G, 20Y, 21A, 32Y, 33Y, 37R, 38H, 47G, 48Y, 49V:65B, 53G:61C, 54U, 55Y, 56Y, 57R, 58A, 59H, 69U, 73A"
8,His,"1G:72C, 2B:71V, 8U, 9V, 11Y:24R, 14A, 15R, 16U, 18G, 19G, 20W, 21A, 26D, 29D:41H, 32Y, 33U, 37R, 38H, 44H, 47R, 48Y, 50Y:64R, 53G:61C, 54Y, 55U, 56C, 57R, 58A, 59W, 69U"
9,Ile,"1G:72C, 2S:71S, 3B:70V, 7D:66H, 8U, 9R, 12B:23V, 14A, 15R, 16H, 18G, 19G, 20U, 21A, 26D, 27Y:43R, 32C, 33U, 37R, 38A, 44V, 45D, 47R, 48Y, 51V:63B, 52R:62Y, 53G:61C, 54U, 55U, 56Y, 57R, 58A, 69H, 73A"


Next, we print all the isotypes sharing the same identity element, for each position.

In [10]:
# Aggregate isotypes
key_ides = identities.ix[identities.Frequency >= 0.95].groupby(["Position", "Symbol"])
key_ides = key_ides.apply(lambda x: list(x.Isotype)).reset_index()
key_ides.columns = ['Position', "Symbol", "Isotypes"]

# Sort by position
df = pd.DataFrame()
for position in sorted(set(key_ides['Position']), key=lambda x: int(x.split(':')[0]) if 'V' not in x else 45):
  df = pd.concat([df, key_ides[key_ides.Position == position]])

key_ides = df
key_ides

Unnamed: 0,Position,Symbol,Isotypes
28,1:72,A:U,[iMet]
29,1:72,G:C,"[Gln, His, Ile, Pro, Ser]"
30,1:72,R:Y,"[Leu, Met]"
31,1:72,S:S,[Tyr]
32,1:72,U:A,[Glu]
61,2:71,B:V,"[His, Tyr]"
62,2:71,C:G,[Glu]
63,2:71,S:S,"[Gln, Ile, Pro, Thr]"
64,2:71,W:C,[Asn]
87,3:70,B:V,"[Arg, Ile, Met, Tyr]"


Even better. Now, I'd like to categorize isotypes into ranked groups, much like for bases and base pairs.

In [11]:
def resolve_isotype_group(isotypes):
  amino_acids = set(['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile', 'iMet', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val'])
  combos = {'Type I': amino_acids - set(['Leu', 'Ser', 'Tyr']),
            'Type II': set(['Leu', 'Ser', 'Tyr']),
            'Synthetase class I': set(['Arg', 'Cys', 'Glu', 'Gln', 'Ile', 'iMet', 'Leu', 'Met', 'Trp', 'Tyr', 'Val']),
            'Synthetase class II': amino_acids - set(['Arg', 'Cys', 'Glu', 'Gln', 'Ile', 'iMet', 'Leu', 'Met', 'Trp', 'Tyr', 'Val']),
            'Methionine': set(['Met', 'iMet'])
           }
  if len(isotypes) == 1: return isotypes[0]
  if len(isotypes) == 21: return "Universal"
  if len(amino_acids - set(isotypes)) == 1: return "Not " + (amino_acids - set(isotypes)).pop()
  for combo in combos:
    if set(isotypes) == combos[combo] or (set(isotypes) < combos[combo] and len(set(isotypes)) >= len(combos[combo]) - 1):
      return combo
  for combo in combos:
    if set(isotypes) == amino_acids - combos[combo]:
      return "Not " + combo
  return "N/A"
key_ides['Isotype group'] = [resolve_isotype_group(isotypes) for isotypes in key_ides.Isotypes]
key_ides

Unnamed: 0,Position,Symbol,Isotypes,Isotype group
28,1:72,A:U,[iMet],iMet
29,1:72,G:C,"[Gln, His, Ile, Pro, Ser]",
30,1:72,R:Y,"[Leu, Met]",
31,1:72,S:S,[Tyr],Tyr
32,1:72,U:A,[Glu],Glu
61,2:71,B:V,"[His, Tyr]",
62,2:71,C:G,[Glu],Glu
63,2:71,S:S,"[Gln, Ile, Pro, Thr]",
64,2:71,W:C,[Asn],Asn
87,3:70,B:V,"[Arg, Ile, Met, Tyr]",
