In [17]:
import math
import re
import subprocess

import pandas as pd

from tRNA_position import *
# Isotype labels, one per tRNA alignment model; 'iMet' is listed separately from 'Met'.
isotypes = [
  'Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu',
  'Gly', 'His', 'Ile', 'iMet', 'Leu', 'Lys', 'Met',
  'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val',
]

# Raise the pandas display limits so wide per-position tables render without truncation.
pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.width', 10000)
pd.set_option('display.max_colwidth',10000)

## Introduction

A global view of identity elements versus biological features would be a powerful tool for predicting tRNA function from primary sequence. To build it, I'll need to align eukaryotic tRNAs to our established models and annotate positions based on universal numbering. Then, for each tRNA, I'll extract features such as the base at each position, the clade, and the variable arm length.

## Process tRNAs
### Species information

In [3]:
# Load the genome/taxonomy table. The file has no header row, so column names are supplied here.
# NOTE(review): absolute, site-specific path -- consider moving it to a configurable constant.
species_table = pd.read_table(
  '/projects/lowelab/users/blin/tRNAscan/models/taxonomy/genome_table_20151017.txt',
  header=None,
  names=['shortname', 'name', 'longname', 'domain', 'clade', 'subclade'],
)

# set NaN to empty strings, so later string comparisons/searches on `name` never see a float
species_table['name'] = species_table['name'].fillna('')

# special cases: shortname -> name value to force for that species.
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in pandas 1.0.
name_overrides = {
  "Bdist": "braDis1",
  "Gmax2": "glyMax2",
  "Mtrun": "mt3",
  "Phama1": "papHam1",
  "Ppate": "phyPat1.1",
  "Ptric": "popTri2",
  "Vvini": "vitVin1",
  "Zmays5": "zeaMay5",
}
for shortname, name in name_overrides.items():
  species_table.loc[species_table['shortname'] == shortname, 'name'] = name

species_table.head()

Unnamed: 0,shortname,name,longname,domain,clade,subclade
0,Pfalc,plasFalc1,Plasmodium falciparum (Oct 2007),eukaryota,Apicomplexa,
1,Ppate,phyPat1.1,Physcomitrella patens (Version 1.1),eukaryota,Bryophyta,
2,Spurp,,Strongylocentrotus purpuratus (Sea urchin) (Version 2.1),eukaryota,Echinodermata,
3,Lmajo,,Leishmania major (Version 5.0),eukaryota,Euglenozoa,
4,Ashb_goss_ATCC_10895,ashbGoss_ATCC10895,Ashbya gossypii ATCC 10895,eukaryota,Fungi,


### Align tRNAs to isotype-specific model

**TODO:** Patricia will send me the high-quality tRNA set soon; the alignments will be updated at that time.

### Parse alignment positions

In [None]:
def position_base(positions, seq):
  """Yield a (label, base) tuple for every annotated position of one aligned sequence.

  Args:
    positions: iterable of objects exposing `.paired`, `.position` and `.sprinzl`.
      `.position` holds 1-based alignment column(s): a single integer for an
      unpaired position, or two integers joined by ':' for a paired one.
    seq: the aligned sequence string to read bases from.

  Yields:
    (position.sprinzl, value), where value is the single character at that
    column for unpaired positions, or "X:Y" built from the two paired columns.
  """
  for entry in positions:
    if not entry.paired:
      # Unpaired: a single 1-based column.
      single_base = seq[int(entry.position) - 1]
      yield entry.sprinzl, single_base
      continue
    # Paired: two 1-based columns separated by ':'.
    first_col, second_col = entry.position.split(':')
    first_col = int(first_col)
    second_col = int(second_col)
    paired_bases = "{}:{}".format(seq[first_col - 1], seq[second_col - 1])
    yield entry.sprinzl, paired_bases
      
# NOTE: this deliberately narrows `isotypes` to a single isotype, replacing any
# longer list defined in an earlier cell.
isotypes = ['Ala']

# One dict per tRNA. The DataFrame is built once after the loop: row-by-row
# DataFrame.append is quadratic and was removed in pandas 2.0.
trna_records = []

for isotype in isotypes:
  alignment = 'alignments/euk-' + isotype + '.sto'
  # Read the Stockholm file once; the context manager guarantees the handle is closed.
  with open(alignment) as alignment_fhandle:
    alignment_lines = alignment_fhandle.readlines()

  # first, get secondary structure (if several '#=GC SS_cons' lines exist, the last one wins)
  ss = None
  for line in alignment_lines:
    if line[0:12] == '#=GC SS_cons':
      ss = line.strip().split()[-1]
  if ss is None:
    # Fail loudly instead of silently reusing the structure from a previous isotype.
    raise ValueError('No "#=GC SS_cons" line found in ' + alignment)

  # parse secondary structure into regions and positions
  positions = annotate_positions(ss)

  # get the base at each position for every sequence line of the Stockholm file
  for line in alignment_lines:
    # skip markup ('#'), blank lines and the '//' terminator
    if line[0] in ["#", '\n', '/']: continue
    seqname, seq = line.strip().split()
    # species identifier = everything before the first '_' or '.trna'
    species = re.split('_|\\.trna', seqname, maxsplit=1)[0]

    # Look the species up by shortname, then by exact name, then by substring of name.
    # The substring test is literal (regex=False) so identifiers containing regex
    # metacharacters such as '.' cannot mis-match or raise.
    row = species_table[species_table['shortname'] == species]
    if row.empty:
      row = species_table[species_table['name'] == species]
    if row.empty:
      row = species_table[species_table['name'].str.contains(species, regex=False, na=False)]
    if row.empty:
      continue # skip species that need too much babying

    trna = {
      'domain': row['domain'].values[0],
      'clade': row['clade'].values[0],
      'species': species,
      'species_long': row['longname'].values[0],
      'seqname': seqname,
    }
    # add one column per annotated position: label -> base or base pair
    trna.update(position_base(positions, seq))
    trna_records.append(trna)

identities = pd.DataFrame(trna_records)
identities.head()

## Get sequence information

In [None]:
# Isodecoder


# GC content


# Stem and loop sizes


# Insertions and deletions

# Reorder, rename columns