In [17]:
import pandas as pd
import subprocess
import math
from tRNA_position import *
pd.set_option('display.max_colwidth',10000)
pd.set_option('display.width', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows',1000)
isotypes = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile', 'iMet', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val']

## Introduction

A global view of identity elements versus biological features would be a powerful tool for predicting tRNA function using primary sequence. To do this, I'll need to align eukaryotic tRNAs to our established models and annotate positions based on universal numbering. Then, I'll extract sequence information such as position, clade, or variable arm length.

## Process tRNAs
### Species information

In [3]:
species_table = pd.read_table('/projects/lowelab/users/blin/tRNAscan/models/taxonomy/genome_table_20151017.txt', header=None, names=['shortname', 'name', 'longname', 'domain', 'clade', 'subclade'])
# set NaN to empty strings
species_table.ix[species_table.name.apply(lambda x: type(x) is float), 'name'] = ''
# special cases
species_table.ix[species_table.shortname == "Bdist", 'name'] = "braDis1"
species_table.ix[species_table.shortname == "Gmax2", 'name'] = "glyMax2"
species_table.ix[species_table.shortname == "Mtrun", 'name'] = "mt3"
species_table.ix[species_table.shortname == "Phama1", 'name'] = "papHam1"
species_table.ix[species_table.shortname == "Ppate", 'name'] = "phyPat1.1"
species_table.ix[species_table.shortname == "Ptric", 'name'] = "popTri2"
species_table.ix[species_table.shortname == "Vvini", 'name'] = "vitVin1"
species_table.ix[species_table.shortname == "Zmays5", 'name'] = "zeaMay5"
species_table.head()

Unnamed: 0,shortname,name,longname,domain,clade,subclade
0,Pfalc,plasFalc1,Plasmodium falciparum (Oct 2007),eukaryota,Apicomplexa,
1,Ppate,phyPat1.1,Physcomitrella patens (Version 1.1),eukaryota,Bryophyta,
2,Spurp,,Strongylocentrotus purpuratus (Sea urchin) (Version 2.1),eukaryota,Echinodermata,
3,Lmajo,,Leishmania major (Version 5.0),eukaryota,Euglenozoa,
4,Ashb_goss_ATCC_10895,ashbGoss_ATCC10895,Ashbya gossypii ATCC 10895,eukaryota,Fungi,


### Get high-quality tRNA set

There was talk about using the high quality tRNAs to look at identity elements. Instead, I will just annotate specific tRNAs as high-quality or not.

### Parse alignment positions

In [38]:
def position_base(positions, seq):
  for position_index, position in enumerate(positions):
    if position.paired:
      index1, index2 = position.position.split(':')
      index1, index2 = int(index1), int(index2)
      base_pair = "{}:{}".format(seq[index1 - 1], seq[index2 - 1])
      yield position.sprinzl, base_pair
    else:
      index = int(position.position)
      base = seq[index - 1]
      yield position.sprinzl, base
      
isotypes = ['Ala']
identities = pd.DataFrame()

for isotype in isotypes:
  alignment = 'alignments/euk-' + isotype + '.sto'
  # get positions
  alignment_fhandle = open(alignment)
  positions = [] # list containing each position in the tRNA
  # first, get secondary structure
  for line in alignment_fhandle:
    if line[0:12] == '#=GC SS_cons':
      ss = line.strip().split()[-1]
  # parse secondary structure into regions and positions
  positions = annotate_positions(ss)
  # get counts for each position by parsing Stockholm file
  alignment_fhandle = open(alignment)
  for line in alignment_fhandle:
    if line[0] in ["#", '\n', '/']: continue
    seqname, seq = line.strip().split()
    species = re.split('_|\\.trna', seqname, 1)[0]
    if any(species_table.shortname == species): row = species_table[species_table.shortname == species]
    elif any(species_table.name == species): row = species_table[species_table.name == species]
    elif any(species_table.name.apply(lambda x: bool(re.search(species, x)))): row = species_table[species_table.name.apply(lambda x: bool(re.search(species, x)))]
    else: continue # skip species that need too much babying
    domain = row.domain.values[0]
    clade = row.clade.values[0]
    species_long = row.longname.values[0]
    trna = {'domain': domain, 'clade': clade, 'species': species, 'species_long': species_long, 'seqname': seqname, 'isotype': isotype}
    trna = {**trna, **{sprinzl: base for sprinzl, base in position_base(positions, seq)}}
    identities = identities.append([trna], ignore_index=True)

identities.head()

Unnamed: 0,10:25,11:24,12:23,13:22,14,14i1,15,16,17,17i1,18,19,1:72,1i1,20,20i1,20i2,21,22i1,26,27:43,27i1,28:42,29:41,2:71,30:40,31:39,32,33,34,35,35i1,36,37,37i1,37i10,37i11,37i12,37i13,37i14,37i15,37i16,37i17,37i18,37i19,37i2,37i20,37i21,37i22,37i23,37i24,37i25,37i26,37i27,37i28,37i29,37i3,37i30,37i31,37i32,37i33,37i34,37i35,37i36,37i37,37i38,37i39,37i4,37i40,37i41,37i42,37i43,37i44,37i45,37i46,37i47,37i48,37i49,37i5,37i50,37i51,37i52,37i53,37i54,37i55,37i56,37i57,37i58,37i59,37i6,37i60,37i61,37i62,37i63,37i64,37i65,37i66,37i7,37i8,37i9,38,38i1,3:70,3i1,3i2,44,44i1,44i2,44i3,44i4,45,47,47i1,47i2,48,49:65,49i1,4:69,50:64,50i1,51:63,51i1,52:62,53:61,54,54i1,55,56,57,58,59,59i1,5:68,5i1,5i2,60,64i1,64i2,65i1,65i2,6:67,6i1,70i1,70i2,72i1,72i2,72i3,7:66,7i1,7i2,7i3,7i4,7i5,8,9,V1,V11:V21,V12:V22,V13:V23,V14:V24,V15:V25,V16:V26,V17:V27,V2,V3,V4,clade,domain,isotype,seqname,species,species_long
0,G:C,C:G,U:A,C:U,A,.,G,U,-,.,G,G,G:C,.,U,.,.,A,.,G,C:G,.,U:G,C:U,G:U,G:G,C:U,U,U,N,N,.,N,A,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,U,.,G:C,.,.,A,.,.,.,.,A,G,.,u,C,C:G,.,G:C,C:G,.,G:C,.,G:C,G:C,U,.,U,C,A,A,U,.,A:G,.,.,C,.,.,.,.,C:G,.,.,.,.,.,.,G:C,.,.,.,.,.,U,A,-,-:-,-:-,-:-,-:-,-:-,-:-,-:-,-,-,-,Insecta,eukaryota,Ala,Agamb_chr2R.trna81-AlaCGC,Agamb,Anopheles gambiae
1,G:C,C:G,U:A,C:U,A,.,G,U,-,.,G,G,G:C,.,U,.,.,A,.,G,U:A,.,U:G,C:U,G:U,G:G,C:U,U,U,N,N,.,N,A,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,U,.,G:C,.,.,A,.,.,.,.,G,G,.,c,C,C:G,.,G:C,C:G,.,G:C,.,G:C,G:C,U,.,U,C,G,A,U,.,A:G,.,.,C,.,.,.,.,U:G,.,.,.,.,.,.,G:C,.,.,.,.,.,U,A,-,-:-,-:-,-:-,-:-,-:-,-:-,-:-,-,-,-,Insecta,eukaryota,Ala,Agamb_chr2R.trna114-AlaTGC,Agamb,Anopheles gambiae
2,G:C,C:G,U:A,C:U,A,.,G,U,-,.,G,G,G:C,.,U,.,.,A,.,G,U:A,.,U:G,C:U,G:U,G:G,C:U,U,U,N,N,.,N,A,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,U,.,G:C,.,.,A,.,.,.,.,G,G,.,c,C,C:G,.,G:U,C:G,.,G:C,.,G:C,G:C,U,.,U,C,G,A,U,.,A:G,.,.,C,.,.,.,.,U:G,.,.,.,.,.,.,A:C,.,.,.,.,.,U,A,-,-:-,-:-,-:-,-:-,-:-,-:-,-:-,-,-,-,Insecta,eukaryota,Ala,Agamb_chr2R.trna16-AlaTGC,Agamb,Anopheles gambiae
3,G:C,C:G,U:A,C:U,A,.,G,U,-,.,G,G,G:C,.,U,.,.,A,.,G,C:G,.,U:G,C:U,G:U,G:G,C:U,U,U,N,N,.,N,A,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,U,.,G:C,.,.,A,.,.,.,.,A,G,.,u,C,C:G,.,G:C,C:G,.,G:C,.,G:C,G:C,U,.,U,C,A,A,U,.,A:G,.,.,C,.,.,.,.,U:G,.,.,.,.,.,.,G:C,.,.,.,.,.,U,A,-,-:-,-:-,-:-,-:-,-:-,-:-,-:-,-,-,-,Insecta,eukaryota,Ala,Agamb_chr2L.trna13-AlaCGC,Agamb,Anopheles gambiae
4,G:C,C:G,U:A,C:U,A,.,G,A,U,.,G,G,G:C,.,U,.,.,A,.,G,C:G,.,U:G,C:U,G:U,G:G,C:U,U,U,N,N,.,N,A,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,U,.,G:C,.,.,A,.,.,.,.,G,G,.,u,A,C:G,.,G:C,C:G,.,G:C,.,G:C,G:C,A,.,U,C,G,A,U,.,G:G,.,.,A,.,.,.,.,U:G,.,.,.,.,.,.,G:C,.,.,.,.,.,U,A,-,-:-,-:-,-:-,-:-,-:-,-:-,-:-,-,-,-,Insecta,eukaryota,Ala,Agamb_chr2L.trna126-AlaAGC,Agamb,Anopheles gambiae


In [165]:
# covariance model doesn't contain base 48. It's annotated as position 47i2 instead.

## Additional sequence information

In [164]:
identities['anticodon'] = identities.seqname.apply(lambda x: x[-3:])
identities['isotype_ac'] = identities.seqname.apply(lambda x: x[-6:-3])

# GC content
paired_cols = identities.columns[list(map(lambda x: (':' in x), identities.columns))]
identities['GC'] = identities[paired_cols].apply(lambda x: sum((x == "G:C") | (x == "C:G"))/len(paired_cols), axis=1)

# Loop sizes
dloop_cols = []
def bounds_to_cols(cols, start, end):
  selected_cols = []
  for col in cols:
    matches = re.findall('\d+', col)
    if len(matches) < 1: continue
    index = int(matches[0])
    if (index >= start and index <= end or col[0:3] == '{}i'.format(start - 1)) and col[0] != 'V':
      selected_cols.append(col)
  return selected_cols

dloop_cols = bounds_to_cols(identities.columns, 14, 21)
identities['D-loop'] = identities[dloop_cols].apply(lambda x: len(x[(x != '.') & (x != '-')]), axis=1)

acloop_cols = bounds_to_cols(identities.columns, 32, 38)
identities['AC-loop'] = identities[acloop_cols].apply(lambda x: len(x[(x != '.') & (x != '-')]), axis=1)

tpcloop_cols = bounds_to_cols(identities.columns, 54, 60)
identities['TPC-loop'] = identities[tpcloop_cols].apply(lambda x: len(x[(x != '.') & (x != '-')]), axis=1)

varm_cols = list(filter(lambda x: 'V' in x, identities.columns))
identities['V-arm'] = identities[varm_cols].apply(lambda x: len(x[(x != '.') & (x != '-')]), axis=1)

# Insertions
insertion_cols = list(filter(lambda x: re.match('\d+i', x), identities.columns))
identities['insertions'] = identities[insertion_cols].apply(lambda x: sum(x != '.'), axis=1)

# Deletions at positions that are not the variable arm
base_cols = list(filter(lambda x: re.match('^((\d+)|(\d+:\d+))$', x), identities.columns))
identities['deletions'] = identities[base_cols].apply(lambda x: ''.join(x).count('-'), axis=1)


Unnamed: 0,10:25,11:24,12:23,13:22,14,14i1,15,16,17,17i1,18,19,1:72,1i1,20,20i1,20i2,21,22i1,26,27:43,27i1,28:42,29:41,2:71,30:40,31:39,32,33,34,35,35i1,36,37,37i1,37i10,37i11,37i12,37i13,37i14,37i15,37i16,37i17,37i18,37i19,37i2,37i20,37i21,37i22,37i23,37i24,37i25,37i26,37i27,37i28,37i29,37i3,37i30,37i31,37i32,37i33,37i34,37i35,37i36,37i37,37i38,37i39,37i4,37i40,37i41,37i42,37i43,37i44,37i45,37i46,37i47,37i48,37i49,37i5,37i50,37i51,37i52,37i53,37i54,37i55,37i56,37i57,37i58,37i59,37i6,37i60,37i61,37i62,37i63,37i64,37i65,37i66,37i7,37i8,37i9,38,38i1,3:70,3i1,3i2,44,44i1,44i2,44i3,44i4,45,47,47i1,47i2,48,49:65,49i1,4:69,50:64,50i1,51:63,51i1,52:62,53:61,54,54i1,55,56,57,58,59,59i1,5:68,5i1,5i2,60,64i1,64i2,65i1,65i2,6:67,6i1,70i1,70i2,72i1,72i2,72i3,7:66,7i1,7i2,7i3,7i4,7i5,8,9,V1,V11:V21,V12:V22,V13:V23,V14:V24,V15:V25,V16:V26,V17:V27,V2,V3,V4,clade,domain,isotype,seqname,species,species_long,anticodon,isotype_ac,GC,D-loop,AC-loop,TPC-loop,V-arm
0,G:C,C:G,U:A,C:U,A,.,G,U,-,.,G,G,G:C,.,U,.,.,A,.,G,C:G,.,U:G,C:U,G:U,G:G,C:U,U,U,N,N,.,N,A,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,U,.,G:C,.,.,A,.,.,.,.,A,G,.,u,C,C:G,.,G:C,C:G,.,G:C,.,G:C,G:C,U,.,U,C,A,A,U,.,A:G,.,.,C,.,.,.,.,C:G,.,.,.,.,.,.,G:C,.,.,.,.,.,U,A,-,-:-,-:-,-:-,-:-,-:-,-:-,-:-,-,-,-,Insecta,eukaryota,Ala,Agamb_chr2R.trna81-AlaCGC,Agamb,Anopheles gambiae,CGC,Ala,0.464286,7,7,7,7
1,G:C,C:G,U:A,C:U,A,.,G,U,-,.,G,G,G:C,.,U,.,.,A,.,G,U:A,.,U:G,C:U,G:U,G:G,C:U,U,U,N,N,.,N,A,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,U,.,G:C,.,.,A,.,.,.,.,G,G,.,c,C,C:G,.,G:C,C:G,.,G:C,.,G:C,G:C,U,.,U,C,G,A,U,.,A:G,.,.,C,.,.,.,.,U:G,.,.,.,.,.,.,G:C,.,.,.,.,.,U,A,-,-:-,-:-,-:-,-:-,-:-,-:-,-:-,-,-,-,Insecta,eukaryota,Ala,Agamb_chr2R.trna114-AlaTGC,Agamb,Anopheles gambiae,TGC,Ala,0.392857,7,7,7,7
2,G:C,C:G,U:A,C:U,A,.,G,U,-,.,G,G,G:C,.,U,.,.,A,.,G,U:A,.,U:G,C:U,G:U,G:G,C:U,U,U,N,N,.,N,A,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,U,.,G:C,.,.,A,.,.,.,.,G,G,.,c,C,C:G,.,G:U,C:G,.,G:C,.,G:C,G:C,U,.,U,C,G,A,U,.,A:G,.,.,C,.,.,.,.,U:G,.,.,.,.,.,.,A:C,.,.,.,.,.,U,A,-,-:-,-:-,-:-,-:-,-:-,-:-,-:-,-,-,-,Insecta,eukaryota,Ala,Agamb_chr2R.trna16-AlaTGC,Agamb,Anopheles gambiae,TGC,Ala,0.321429,7,7,7,7
3,G:C,C:G,U:A,C:U,A,.,G,U,-,.,G,G,G:C,.,U,.,.,A,.,G,C:G,.,U:G,C:U,G:U,G:G,C:U,U,U,N,N,.,N,A,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,U,.,G:C,.,.,A,.,.,.,.,A,G,.,u,C,C:G,.,G:C,C:G,.,G:C,.,G:C,G:C,U,.,U,C,A,A,U,.,A:G,.,.,C,.,.,.,.,U:G,.,.,.,.,.,.,G:C,.,.,.,.,.,U,A,-,-:-,-:-,-:-,-:-,-:-,-:-,-:-,-,-,-,Insecta,eukaryota,Ala,Agamb_chr2L.trna13-AlaCGC,Agamb,Anopheles gambiae,CGC,Ala,0.428571,7,7,7,7
4,G:C,C:G,U:A,C:U,A,.,G,A,U,.,G,G,G:C,.,U,.,.,A,.,G,C:G,.,U:G,C:U,G:U,G:G,C:U,U,U,N,N,.,N,A,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,U,.,G:C,.,.,A,.,.,.,.,G,G,.,u,A,C:G,.,G:C,C:G,.,G:C,.,G:C,G:C,A,.,U,C,G,A,U,.,G:G,.,.,A,.,.,.,.,U:G,.,.,.,.,.,.,G:C,.,.,.,.,.,U,A,-,-:-,-:-,-:-,-:-,-:-,-:-,-:-,-,-,-,Insecta,eukaryota,Ala,Agamb_chr2L.trna126-AlaAGC,Agamb,Anopheles gambiae,AGC,Ala,0.428571,8,7,7,7
