In [118]:
import glob
import math
import os, sys
import re
import subprocess
from collections import defaultdict, Counter

import pandas as pd
from Bio import SeqIO, SeqRecord, Seq

from tRNA_position import *
# Raise pandas' display limits so wide tables and long cell contents are shown in full.
for option_name, option_value in [
  ('display.max_colwidth', 10000),
  ('display.width', 10000),
  ('display.max_columns', 10000),
  ('display.max_rows', 1000),
]:
  pd.set_option(option_name, option_value)

# Isotype labels; each one names a per-isotype fasta file, alignment and covariance model.
isotypes = [
  'Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu',
  'Gly', 'His', 'Ile', 'iMet', 'Leu', 'Lys', 'Met',
  'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val',
]

## Introduction

A global view of identity elements versus biological features would be a powerful tool for predicting tRNA function from primary sequence. To build it, I'll need to align eukaryotic tRNAs to our established covariance models and annotate each position using universal numbering. Then, I'll extract sequence information such as position, clade, or variable arm length.

## Process tRNAs
### Species information

In [44]:
# Load the genome table. The file has no header row, so column names are supplied here.
species_table = pd.read_table(
  'genome_table+.txt',
  header=None,
  names=['shortname', 'name', 'longname', 'domain', 'clade', 'subclade'],
)
# Missing `name` entries are read as NaN; store '' instead so the column holds
# only strings. (`.loc` replaces `.ix`, which was removed in pandas 1.0.)
species_table.loc[species_table['name'].isna(), 'name'] = ''
species_table.head(20)

Unnamed: 0,shortname,name,longname,domain,clade,subclade
0,Bdist3,braDis3,Brachypodium distachyon Bd21 (JGI v3.0),eukaryota,Streptophyta,
1,Mtrun4,medTru4,Medicago truncatula (Mt4.0v1),eukaryota,Streptophyta,
2,Graim2,gosRai2,Gossypium raimondii (Cotton v2),eukaryota,Streptophyta,
3,Ptric3,popTri3,Populus trichocarpa (JGI v3.0),eukaryota,Streptophyta,
4,Sbico3,sorBic3,Sorghum bicolor (Version 3.1),eukaryota,Streptophyta,
5,Oarie1,oviAri3,Ovis aries (sheep) (Feb 2010),eukaryota,Mammalia,
6,Zmays6,zeaMay6,Zea mays B73 (Version AGPv3),eukaryota,Streptophyta,
7,Gmax2,glyMax2,Glycine max (soybean) (Wm82.a2),eukaryota,Streptophyta,
8,Rnorv6,rn6,Rattus norvegicus (Rat Jul. 2014 RGSC 6.0/rn6),eukaryota,Mammalia,
9,Vpaco2,vicPac2,Vicugna pacos (Alpaca Mar. 2013 Vicugna_pacos-2.0.1/vicPac2),eukaryota,Mammalia,


### Align tRNAs to existing CMs

We have a "quality set" of tRNAs, which includes tRNAs from both previous and newer versions of genomes. It is simpler and easier to create a new fasta file and alignment for each isotype, based on the `.iso` files from each run.

I also need to manually curate the genome table file to include the new species. And while I'm doing that, I might as well fix the broken entries.

In [39]:
# Collect the sequences of every species into per-isotype lists.
#
# For each `iso/<species>-tRNAs.iso` file, read `tRNAs/<species>-tRNAs.fa`, drop
# pseudogenes and sequences whose "Sc:" score is below MIN_SCORE, then file each
# remaining sequence under the isotype column with the highest score in the
# .iso table. `isotype_seqs` maps isotype -> list of SeqRecords.

ISO_SUFFIX = '-tRNAs.iso'
MIN_SCORE = 55  # sequences with an "Sc:" score below this are discarded

# Species identifier = file name without directory and without the '-tRNAs.iso' suffix.
species = [os.path.basename(path)[:-len(ISO_SUFFIX)] for path in sorted(glob.glob("iso/*.iso"))]
isotype_seqs = defaultdict(list)
for sp in species:
  sys.stdout.write('processing ' + sp + '...')
  sys.stdout.flush()
  tRNA_file = 'tRNAs/' + sp + '-tRNAs.fa'
  if not os.path.exists(tRNA_file):
    print(sp + ' tRNA file not found, skipping')
    continue

  # First pass over the fasta file: keep only confident, non-pseudogene sequences.
  seqs = []
  seqids = []
  for seq in SeqIO.parse(tRNA_file, 'fasta'):
    if "pseudogene" in seq.description: continue
    # The description carries the score as "Sc: <number>"; raises IndexError if absent.
    score = float(re.findall(r'Sc: [\d\.]+', seq.description)[0].split()[-1])
    if score < MIN_SCORE: continue
    seqs.append(seq)
    seqids.append(seq.id)

  # Isotype score table for this species, restricted to the sequences kept above.
  # `.copy()` so the column assignments below act on an independent frame
  # instead of a slice of the table that was read.
  iso_scores = pd.read_table('iso/' + sp + ISO_SUFFIX, header=0)
  iso_scores = iso_scores[iso_scores.tRNAscanID.isin(seqids)].copy()
  # Name of the highest-scoring column, looking at the third column onwards.
  # NOTE(review): assumes every column from position 2 on is a numeric isotype score - confirm.
  iso_scores['best'] = iso_scores.iloc[:, 2:].idxmax(axis=1)
  # Row maximum over all numeric columns of the table.
  iso_scores['score'] = iso_scores.max(axis=1, numeric_only=True)
  iso_scores.index = iso_scores.tRNAscanID.values
  iso_scores = iso_scores[['best', 'score']]

  for seq in seqs:
    best_hit = iso_scores.loc[seq.id]
    isotype = best_hit.best
    if isotype == 'SeC' or 'mito' in isotype: continue
    isoscore = best_hit.score
    # Fold species, original description and isotype score into the record id,
    # and blank the description so it is not written out a second time.
    seq.id = '{}_{} Iso: {}'.format(sp, seq.description, isoscore)
    seq.description = ''
    isotype_seqs[isotype].append(seq)

  print('finished')
  sys.stdout.flush()

processing Mus129S1_SvImJ_1509...Mus129S1_SvImJ_1509 tRNA file not found, skipping
processing MusAKR_J_1509...MusAKR_J_1509 tRNA file not found, skipping
processing MusA_J_1509...MusA_J_1509 tRNA file not found, skipping
processing MusBALB_cJ_1509...MusBALB_cJ_1509 tRNA file not found, skipping
processing MusC3H_HeJ_1509...MusC3H_HeJ_1509 tRNA file not found, skipping
processing MusC57BL_6NJ_1509...MusC57BL_6NJ_1509 tRNA file not found, skipping
processing MusCAROLI_EiJ_1509...MusCAROLI_EiJ_1509 tRNA file not found, skipping
processing MusCAST_EiJ_1509...MusCAST_EiJ_1509 tRNA file not found, skipping
processing MusCBA_J_1509...MusCBA_J_1509 tRNA file not found, skipping
processing MusDBA_2J_1509...MusDBA_2J_1509 tRNA file not found, skipping
processing MusFVB_NJ_1509...MusFVB_NJ_1509 tRNA file not found, skipping
processing MusLP_J_1509...MusLP_J_1509 tRNA file not found, skipping
processing MusNOD_ShiLtJ_1509...MusNOD_ShiLtJ_1509 tRNA file not found, skipping
processing MusNZO_HlLtJ_1

In [40]:
# Run this only after successfully running the above
for isotype in isotypes:
  fasta = 'fasta/euk-' + isotype + '.fa'
  if os.path.exists(fasta): # Force user to manually check whether it needs to be rewritten
    print("File exists: " + fasta)
    continue
  fasta_handle = open(fasta, 'w')
  SeqIO.write(isotype_seqs[isotype], fasta_handle, 'fasta')
  fasta_handle.close()

In [41]:
# Align each isotype's fasta file to that isotype's covariance model with cmalign,
# writing the result to alignments/euk-<isotype>.sto.
model_dir = '/projects/lowelab/users/blin/tRNAscan/models/current'
for isotype in isotypes:
  fasta = 'fasta/euk-' + isotype + '.fa'
  alignment = 'alignments/euk-' + isotype + '.sto'
  model = model_dir + '/TRNAinf-euk-' + isotype + '.cm'
  # An argument list (no shell) keeps paths with spaces or shell metacharacters intact.
  # stdout is discarded, as before; stderr is still shown.
  # A missing cmalign executable now raises FileNotFoundError instead of failing silently.
  returncode = subprocess.call(
    ['cmalign', '-g', '--notrunc', '-o', alignment, model, fasta],
    stdout=subprocess.DEVNULL,
  )
  if returncode != 0:
    # Report the failure but keep going with the remaining isotypes.
    print('cmalign exited with status {} for {}'.format(returncode, isotype))

### Parse alignment positions

In [None]:
def position_base(positions, seq):
  """Yield a (sprinzl label, base or base pair) tuple for each annotated position.

  positions: iterable of objects exposing `paired`, `position` and `sprinzl`.
    When `paired` is true, `position` is a string "i:j" of two 1-based column
    indices; otherwise it holds a single 1-based column index.
  seq: one aligned sequence, indexed by column.

  Paired positions yield the two characters joined as "X:Y"; unpaired
  positions yield the single character at that column.
  """
  for pos in positions:
    if not pos.paired:
      column = int(pos.position)
      yield pos.sprinzl, seq[column - 1]
      continue
    left, right = pos.position.split(':')
    pair = "{}:{}".format(seq[int(left) - 1], seq[int(right) - 1])
    yield pos.sprinzl, pair
      
# Build `identities`: one row per aligned tRNA, with species metadata and one
# column per annotated position (base, or "X:Y" for a base pair).

# First genome-table row for each species name, as plain dicts, so the
# per-sequence lookup below is a dictionary access instead of a table scan.
species_info = (
  species_table.drop_duplicates(subset='name')
  .set_index('name')[['domain', 'clade', 'longname']]
  .to_dict('index')
)

# Rows are collected in a list and turned into a DataFrame once at the end.
# Growing a DataFrame row by row is quadratic, and DataFrame.append was removed in pandas 2.0.
records = []

for isotype in isotypes:
  sys.stdout.write('processing ' + isotype + '...')
  alignment = 'alignments/euk-' + isotype + '.sto'
  # Read the file once; `with` guarantees the handle is closed.
  with open(alignment) as alignment_fhandle:
    alignment_lines = alignment_fhandle.readlines()

  # first, get secondary structure (the last '#=GC SS_cons' line wins)
  ss = None
  for line in alignment_lines:
    if line[0:12] == '#=GC SS_cons':
      ss = line.strip().split()[-1]
  if ss is None:
    # Without this check the structure of the previous isotype would be reused silently.
    raise ValueError('no "#=GC SS_cons" line found in ' + alignment)
  # parse secondary structure into regions and positions
  positions = annotate_positions(ss)

  # get the base at each position by parsing the sequence lines of the Stockholm file
  for line in alignment_lines:
    if line[0] in ["#", '\n', '/']: continue
    seqname, seq = line.strip().split()
    # Species identifier = everything before the first '_' or '.trna' in the sequence name.
    species_name = re.split('_|\\.trna', seqname, maxsplit=1)[0]
    info = species_info.get(species_name)
    if info is None: continue # skip species that need too much babying
    trna = {
      'domain': info['domain'],
      'clade': info['clade'],
      'species': species_name,
      'species_long': info['longname'],
      'seqname': seqname,
      'isotype': isotype,
    }
    trna.update(position_base(positions, seq))
    records.append(trna)

  print('done')

identities = pd.DataFrame(records)
identities.head()

processing Ala...

## Additional sequence information

In [None]:
# Isotype, anticodon, score
seqinfo = []
for isotype in isotypes:
  alignment = 'alignments/euk-' + isotype + '.sto'
  for line in open(alignment):
    if line[0:4] != "#=GS": continue
    _, seqname, _, _, _, isotype, anticodon, _, _, _, score, _, isoscore = line.strip().split()
    seqinfo.append([seqname, isotype, anticodon[1:-1], float(score), float(isoscore)])
seqinfo = pd.DataFrame(seqinfo, columns=['seqname', 'isotype_ac', 'anticodon', 'score', 'isoscore'])
identities = identities.merge(seqinfo, on='seqname')

# GC content
paired_cols = identities.columns[list(map(lambda x: (':' in x), identities.columns))]
identities['GC'] = identities[paired_cols].apply(lambda x: sum((x == "G:C") | (x == "C:G"))/len(paired_cols), axis=1)

### Loop sizes

In [None]:
def bounds_to_cols(cols, start, end):
  """Select the column names that fall inside a range of position numbers.

  A column is selected when the first integer in its name lies in
  [start, end], or when its name begins with '<start - 1>i' (an insertion
  column directly before `start`). Columns whose name starts with 'V' and
  columns containing no digits are never selected.

  cols: iterable of column-name strings.
  start, end: inclusive bounds on the position number.
  Returns the selected names as a list, in their original order.
  """
  # Prefix test instead of a fixed three-character slice, so it also works when
  # `start - 1` does not have exactly two digits.
  insertion_prefix = '{}i'.format(start - 1)
  selected_cols = []
  for col in cols:
    matches = re.findall(r'\d+', col)
    if len(matches) < 1: continue
    if col[0] == 'V': continue
    index = int(matches[0])
    if start <= index <= end or col.startswith(insertion_prefix):
      selected_cols.append(col)
  return selected_cols

def _count_bases(frame, cols):
  """Per-row count of cells in `cols` that hold a base.

  '.' and '-' are not counted. NaN cells (a column that has no value for that
  row) are not counted either.
  """
  cells = frame[cols]
  return (cells.notna() & ~cells.isin(['.', '-'])).sum(axis=1)

dloop_cols = bounds_to_cols(identities.columns, 14, 21)
identities['D-loop'] = _count_bases(identities, dloop_cols)

acloop_cols = bounds_to_cols(identities.columns, 32, 38)
identities['AC-loop'] = _count_bases(identities, acloop_cols)

tpcloop_cols = bounds_to_cols(identities.columns, 54, 60)
identities['TPC-loop'] = _count_bases(identities, tpcloop_cols)

# Exclude the derived 'V-arm' column itself: it also contains a 'V', so re-running
# this cell would otherwise count it as a variable-arm position.
varm_cols = [col for col in identities.columns if 'V' in col and col != 'V-arm']
identities['V-arm'] = _count_bases(identities, varm_cols)

### Insertions/deletions

In [None]:
# Insertions
insertion_cols = list(filter(lambda x: re.match('\d+i', x), identities.columns))
identities['insertions'] = identities[insertion_cols].apply(lambda x: sum(x != '.'), axis=1)

# Deletions at positions that are not the variable arm
base_cols = list(filter(lambda x: re.match('^((\d+)|(\d+:\d+))$', x), identities.columns))
identities['deletions'] = identities[base_cols].apply(lambda x: ''.join(x).count('-'), axis=1)

### High-quality tRNA set

There was talk about using the high quality tRNAs to look at identity elements. Instead, I will just annotate specific tRNAs as high-quality or not. We can sort out the differences later.

In [None]:
# Import list of quality tRNAs
quality_trnas = [line.strip() for line in open('quality-set.out')]
identities['quality'] = identities.seqname.isin(quality_trnas)

### Restrict tRNAs by species

We may also want to limit the contribution of any single species, similar to how we built the isotype-specific models.

In [None]:
# Preview the first rows of `identities`.
identities.head()