In [74]:
import pandas as pd
import subprocess
import math
from tRNA_position import *
pd.set_option('display.max_colwidth',1000)
pd.set_option('display.max_rows',1000)

## Introduction

A global view of identity elements versus biological features would be a powerful tool for predicting tRNA function using primary sequence. To do this, I'll need to align eukaryotic tRNAs to our established models and annotate positions based on universal numbering. Then, I'll extract sequence information such as position, clade, or variable arm length.

## Process tRNAs
### Species information

In [143]:
species_table = pd.read_table('/projects/lowelab/users/blin/tRNAscan/models/taxonomy/genome_table_20151017.txt', header=None, names=['shortname', 'name', 'longname', 'domain', 'clade', 'subclade'])
# set NaN to empty strings
species_table.ix[species_table.name.apply(lambda x: type(x) is float), 'name'] = ''
# special cases
species_table.ix[species_table.shortname == "Bdist", 'name'] = "braDis1"
species_table.ix[species_table.shortname == "Gmax2", 'name'] = "glyMax2"
species_table.ix[species_table.shortname == "Mtrun", 'name'] = "mt3"
species_table.ix[species_table.shortname == "Phama1", 'name'] = "papHam1"
species_table.ix[species_table.shortname == "Ppate", 'name'] = "phyPat1.1"
species_table.ix[species_table.shortname == "Ptric", 'name'] = "popTri2"
species_table.ix[species_table.shortname == "Vvini", 'name'] = "vitVin1"
species_table.ix[species_table.shortname == "Zmays5", 'name'] = "zeaMay5"

### Sprinzl numbering for each isotype

Each isotype comes with a different consensus secondary structure, with different insertion points and loop lengths. Here, I build a dictionary of dictionaries for mapping covariance model positions to Sprinzl numbering.

### Align tRNAs to general model

In [None]:
isotypes = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile', 'iMet', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val']
for isotype in isotypes:
  # create new alignment file
  model = '/projects/lowelab/users/blin/tRNAscan/models/current/TRNAinf-euk.cm'
  fasta = '/projects/lowelab/users/blin/tRNAscan/models/1.6/fasta/euk-' + isotype + '-r2-031616.fa'
  alignment = 'alignments/euk-' + isotype + '.sto'
  subprocess.call('cmalign -g --notrunc --matchonly -o {} {} {} > /dev/null'.format(alignment, model, fasta), shell=True)

### Parse alignment positions

In [25]:
def position_base(positions, seq):
  for position_index, position in enumerate(positions):
    if position.paired:
      index1, index2 = position.position.split(':')
      index1, index2 = int(index1), int(index2)
      base_pair = "{}:{}".format(seq[index1 - 1], seq[index2 - 1])
      yield base_pair
    else:
      index = int(position.position)
      base = seq[index - 1]
      yield base_pair
isotypes = ['Ala']
identities = pd.DataFrame()

for isotype in isotypes:
  alignment = 'alignments/euk-' + isotype + '.sto'
  # get positions
  alignment_fhandle = open(alignment)
  positions = [] # list containing each position in the tRNA
  # first, get secondary structure
  for line in alignment_fhandle:
    if line[0:12] == '#=GC SS_cons':
      ss = line.strip().split()[-1]
  # parse secondary structure into regions and positions
  positions = annotate_positions(ss)
  # get counts for each position by parsing Stockholm file
  alignment_fhandle = open(alignment)
  for line in alignment_fhandle:
    if line[0] in ["#", '\n', '/']: continue
    seqname, seq = line.strip().split()
    species = re.split('_|\\.trna', seqname, 1)[0]
    if any(species_table.shortname == species): row = species_table[species_table.shortname == species]
    elif any(species_table.name == species): row = species_table[species_table.name == species]
    elif any(species_table.name.apply(lambda x: bool(re.search(species, x)))): row = species_table[species_table.name.apply(lambda x: bool(re.search(species, x)))]
    else: continue # skip species that need too much babying
    domain = row.domain.values[0]
    clade = row.clade.values[0]
    trna = [domain, clade, species, seqname]
    trna.extend([base for base in position_base(positions, seq)])
    identities = identities.append([trna], ignore_index=True)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,eukaryota,Insecta,Agamb,Agamb_chr2R.trna81-AlaCGC,G:C,G:C,G:U,G:C,A:U,C:G,...,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C
1,eukaryota,Insecta,Agamb,Agamb_chr2R.trna114-AlaTGC,G:C,G:C,G:U,G:C,A:U,U:A,...,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C
2,eukaryota,Insecta,Agamb,Agamb_chr2R.trna16-AlaTGC,G:C,G:C,G:U,G:C,A:U,U:A,...,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C
3,eukaryota,Insecta,Agamb,Agamb_chr2L.trna13-AlaCGC,G:C,G:C,G:U,G:C,A:U,U:A,...,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C
4,eukaryota,Insecta,Agamb,Agamb_chr2L.trna126-AlaAGC,G:C,G:C,G:U,G:C,G:U,U:A,...,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C
5,eukaryota,Insecta,Agamb,Agamb_chr3R.trna35-AlaAGC,G:C,G:C,G:U,G:C,G:U,U:A,...,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C
6,eukaryota,Insecta,Agamb,Agamb_chr3R.trna14-AlaAGC,G:C,G:C,G:U,G:C,A:U,U:A,...,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C
7,eukaryota,Insecta,Agamb,Agamb_chr3R.trna51-AlaAGC,G:C,G:C,G:U,G:C,G:U,U:A,...,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C
8,eukaryota,Mammalia,ailMel1,ailMel1_GL193666.1.trna55-AlaAGC,G:C,G:C,G:U,G:C,A:U,U:A,...,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C
9,eukaryota,Mammalia,ailMel1,ailMel1_GL193897.1.trna17-AlaTGC,G:C,G:C,G:U,G:C,G:C,U:A,...,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C,G:C


## Calculate sequence information

In [None]:
### Isodecoder


### GC content


### Stem and loop sizes


### Insertions and deletions