# Introduction

This is an exploratory run for high-throughput analysis of tRNA identity elements. 

To build the list of fungal genomes from the current GtRNAdb dump, run: grep "Fungi" genome_table_20151017.txt > gtrnadb-fungi.txt
# also: 


In [None]:

# now get all of the fungi tRNAs
def copy_tRNAs(table_path='gtrnadb-fungi.txt',
               src_root='/projects/lowelab/users/pchan/GtRNAdb2/tRNA-runs/Eukaryota',
               dest_dir='./genomes'):
  """Copy the tRNA FASTA file of every genome listed in a table.

  Each non-blank line of the tab-separated table is read as
  ``<subdirectory>\\t<genome name>\\t...``; the file
  ``<src_root>/<subdirectory>/<genome name>-tRNAs.fa`` is copied to
  ``<dest_dir>/<genome name>-tRNAs.fa`` with the ``cp`` command.

  Args:
    table_path: tab-separated genome table to read.
    src_root: directory that contains one subdirectory per first-column value.
    dest_dir: existing directory that receives the copies.

  The exit status of ``cp`` is not checked, so a genome whose source file is
  missing is reported by ``cp`` itself and the loop moves on to the next one.
  """
  import os
  import subprocess
  with open(table_path) as genome_table:
    for line in genome_table:
      if not line.strip():
        continue  # tolerate blank lines instead of failing on tabs[1]
      tabs = line.strip().split('\t')
      fasta_name = tabs[1] + '-tRNAs.fa'
      src_file_name = os.path.join(src_root, tabs[0], fasta_name)
      dest_file_name = os.path.join(dest_dir, fasta_name)
      # Pass an argument list (no shell) so that spaces or shell
      # metacharacters in a genome name cannot split or alter the command.
      subprocess.call(['cp', src_file_name, dest_file_name])

# Run the copy step as soon as this cell/script is executed.
copy_tRNAs()

# extract leucine tRNAs, since leucine is well studied
# Scere: A73, A35, G37
def isolate_leucine_tRNAs():
  """Gather every tRNA record whose name contains "Leu" into one FASTA file.

  Reads the genome names from the second tab-separated column of
  'gtrnadb-fungi.txt', parses './genomes/<genome name>-tRNAs.fa' for each
  of them, and writes the matching records to 'fungi-Leu-tRNAs.fa'
  (overwriting any previous file).
  """
  from Bio import SeqIO
  # Context managers guarantee the output is flushed and closed before any
  # later step reads 'fungi-Leu-tRNAs.fa'.
  with open('gtrnadb-fungi.txt') as genome_table, \
       open('fungi-Leu-tRNAs.fa', 'w') as leucine_tRNA_fhandle:
    for line in genome_table:
      if not line.strip():
        continue  # tolerate blank lines instead of failing on tabs[1]
      tabs = line.strip().split('\t')
      file_name = './genomes/' + tabs[1] + '-tRNAs.fa'
      for seq in SeqIO.parse(file_name, "fasta"):
        # Substring match on the record name.
        if "Leu" in seq.name:
          SeqIO.write(seq, leucine_tRNA_fhandle, 'fasta')

#isolate_leucine_tRNAs()

# align leucine tRNAs to leucine model (just for numbering)
def align_Leu_tRNAs():
  """Filter the leucine tRNA candidates with cmsearch, then align them with cmalign.

  Steps:
    1. Run cmsearch on 'fungi-Leu-tRNAs.fa' against the eukaryotic Leu model,
       writing tabular hits to 'fungi-Leu-tRNAs.out'.
    2. Keep only the FASTA records whose name appears as the first field of a
       hit row, writing them to 'fungi-Leu-tRNAs-filtered.fa'.
    3. Run cmalign on the filtered records, writing 'fungi-Leu-tRNAs.sto'.

  The exit status of both external programs is ignored.
  """
  import subprocess
  from Bio import SeqIO
  model = '/projects/lowelab/users/blin/tRNAscan/models/current/TRNAinf-euk-Leu.cm'

  # filter out tRNAs that are clearly not cytosolic leucines (using the eukaryotic model)
  # we can afford to be stringent with this, so set to 50 bit cutoff
  subprocess.call(['cmsearch', '-g', '--notrunc', '--toponly', '-T', '50',
                   '--tblout', 'fungi-Leu-tRNAs.out', model, 'fungi-Leu-tRNAs.fa'])

  # A set gives constant-time membership tests in the filtering loop below.
  leucine_tRNAs = set()
  with open('fungi-Leu-tRNAs.out') as hits_fhandle:
    for line in hits_fhandle:
      fields = line.split()
      if not fields or line.startswith('#'):
        continue  # skip blank lines and comment/header rows
      leucine_tRNAs.add(fields[0])

  with open('fungi-Leu-tRNAs-filtered.fa', 'w') as filtered_Leu_fhandle:
    for seq in SeqIO.parse('fungi-Leu-tRNAs.fa', 'fasta'):
      if seq.name in leucine_tRNAs:
        SeqIO.write(seq, filtered_Leu_fhandle, 'fasta')

  # we use --matchonly to make parsing easier. But it's possible that there's
  # a conserved insertion within a clade, which this alignment will not show.
  subprocess.call(['cmalign', '-g', '--notrunc', '--matchonly',
                   '-o', 'fungi-Leu-tRNAs.sto', model, 'fungi-Leu-tRNAs-filtered.fa'])

#align_Leu_tRNAs()

# now parse .sto file
# build a table with organism, tRNA, and the 3 positions
class trna_identities:
  """Plain record describing one aligned tRNA.

  Fields (all default to None and are filled in by parse_alignment):
    name      -- sequence name as it appears in the alignment
    seq       -- aligned sequence string
    organism  -- part of the name before '_tRNA'
    anticodon -- three characters following 'tRNA-Leu-' in the name
  """
  def __init__(self, name=None, seq=None, organism=None, anticodon=None):
    self.name = name
    self.seq = seq
    self.organism = organism
    self.anticodon = anticodon


def _identity_columns(ss_cons):
  """Return the 0-based alignment columns of positions 35, 37 and 73.

  ss_cons is the consensus secondary-structure string; runs of '_' are
  treated as loops. The first run is discarded (D loop) and the second is
  taken to be the anticodon loop, whose fourth column is position 35.
  Position 73 is taken to be the last alignment column.

  Raises ValueError if the string contains fewer than two '_' runs.
  """
  import re
  loop_starts = [match.start() for match in re.finditer('_+', ss_cons)]
  if len(loop_starts) < 2:
    raise ValueError('consensus structure has fewer than two loops: ' + ss_cons)
  A35_position = loop_starts[1] + 3
  G37_position = A35_position + 2
  A73_position = len(ss_cons) - 1  # last valid string index
  return A35_position, G37_position, A73_position


def parse_alignment():
  """Tabulate the bases at positions 35, 37 and 73 of every aligned tRNA.

  Reads 'fungi-Leu-tRNAs.sto' and writes 'fungi-Leu-identities.txt' with one
  tab-separated row per tRNA: organism, tRNA name, anticodon, base at 35,
  base at 37, base at 73.

  Sequence lines are collected until the first '#=GC SS_cons' line, so only
  the first alignment block is read (assumes a non-interleaved file -- TODO
  confirm for long alignments). Lines containing "chrM" are skipped.

  NOTE: an earlier draft sketched an extra 'consensus' row taken from the
  line after SS_cons; it was never enabled and is not written.

  Raises ValueError if no '#=GC SS_cons' line is found or the consensus
  structure has fewer than two loops.
  """
  trnas = []
  ss_cons = None
  with open('fungi-Leu-tRNAs.sto') as alignment_fhandle:
    for line in alignment_fhandle:
      if line.startswith('#=GC SS_cons'):
        # parse consensus secondary structure; nothing after it is needed
        ss_cons = line.strip().split()[2]
        break
      if line.startswith('//'):
        break  # end of alignment reached without a consensus line
      if line.startswith('#') or not line.strip():
        continue  # other markup lines and blank lines carry no sequence
      if "chrM" in line:
        continue
      trna = trna_identities()
      trna.name, trna.seq = line.strip().split()
      trna.organism = trna.name.split('tRNA')[0][:-1]  # cut off tRNA and the underscore before it
      trna.anticodon = trna.name.split('tRNA-Leu-')[1][0:3]
      trnas.append(trna)

  if ss_cons is None:
    raise ValueError("no '#=GC SS_cons' line found in fungi-Leu-tRNAs.sto")

  A35_position, G37_position, A73_position = _identity_columns(ss_cons)

  # The output is opened only after parsing succeeded, so a malformed
  # alignment does not truncate an existing results file.
  with open('fungi-Leu-identities.txt', 'w') as identities_fhandle:
    for trna in trnas:
      identities_fhandle.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
          trna.organism, trna.name, trna.anticodon,
          trna.seq[A35_position], trna.seq[G37_position], trna.seq[A73_position]))

# Run the parsing step as soon as this cell/script is executed; it reads
# 'fungi-Leu-tRNAs.sto' from the current working directory.
parse_alignment()