Skip to content

Commit

Permalink
Fixes #115
Browse files Browse the repository at this point in the history
Some maker data was provided, but it's technically incorrect.
I validated my results against the aragorn report, and found
inconsistencies with what Maker has done, and what ARAGORN
has done. This may be a result of maker using another tRNA caller
or just plain ol' bugs.

Nevertheless, despite the ugly code (a lot better than it was
previously), this should be technically correct.
  • Loading branch information
Eric Rasche committed May 7, 2015
1 parent 172105c commit 70e8d72
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 160 deletions.
11 changes: 7 additions & 4 deletions tools/rna_tools/trna_prediction/aragorn.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@
$mtRNA
$mam_mtRNA
$topology
-fasta
$introns | python \$TRNAPRED_SCRIPT_PATH/aragorn_out_to_gff3.py > $gff3_output_file;
$introns
-w
| python \$TRNAPRED_SCRIPT_PATH/aragorn_out_to_gff3.py $gff3_model > $gff3_output_file;
#end if
]]>
</command>
Expand Down Expand Up @@ -65,6 +66,7 @@
<param name='introns' type='boolean' label='Search for tRNA genes with introns in anticodon loop ...' truevalue='-i' falsevalue='' checked="false" help='...with maximum length 3000 bases (-i).' />
<param name='secondary_structure' type='boolean' label='Print out secondary structure (-fasta,-fon)' truevalue='-fasta' falsevalue='-fon' checked="false" help='' />
<param name="gff3_output" type='boolean' label='Convert output to GFF3' truevalue='True' falsevalue='' checked="false" help='' />
<param name="gff3_model" type='boolean' label='Full gene model for GFF3 data' truevalue='--full' checked='false' help='' />
</inputs>
<outputs>
<data name="output" format="fasta">
Expand Down Expand Up @@ -197,8 +199,9 @@ Dean Laslett and Bjorn Canback
ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide sequences Nucl. Acids Res. (2004) 32(1): 11-16
doi:10.1093/nar/gkh152
]]>
</help>
<citations>
<citation type="doi">10.1093/nar/gkh152</citation>
</citations>
</tool>
253 changes: 97 additions & 156 deletions tools/rna_tools/trna_prediction/aragorn_out_to_gff3.py
Original file line number Diff line number Diff line change
@@ -1,165 +1,106 @@
#!/usr/bin/env python
import re
import sys

def start_pattern(string):
return re.match(r'^[0-9]+\.$', string) \
or string.startswith('Number of possible') \
or string.startswith('Searching for')
full_gene_model = False
if '--full' in sys.argv:
full_gene_model = True

def blank_line(string):
return re.match(r'^\s*$', string)

def blocks(iterable):
accumulator = []
run_of_blanklines = 0
for line in iterable:
# Count blank lines
if blank_line(line):
run_of_blanklines += 1
else:
run_of_blanklines = 0

if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line:
if accumulator:
yield accumulator
accumulator = [line]
else:
accumulator.append(line)
if accumulator:
yield accumulator

IMPORTANT_INFO = {
'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'),
'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'),
'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'),
'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'),
}
INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo')

def important_info(block):
info = {}
for line in block:
for matcher in IMPORTANT_INFO:
matches = IMPORTANT_INFO[matcher].search(line)
if matches:
for group in INFO_GROUPS:
try:
info[group] = matches.group(group)
except:
pass
return info

IMPORTANT_INFO_TMRNA = {
'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'),
'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
}
INFO_GROUPS_TMRNA = ('start', 'end', 'pep')

def important_info_tmrna(block):
info = {}
for line in block:
for matcher in IMPORTANT_INFO_TMRNA:
matches = IMPORTANT_INFO_TMRNA[matcher].search(line)
if matches:
for group in INFO_GROUPS_TMRNA:
try:
info[group] = matches.group(group)
except:
pass
return info

import fileinput
genome_id = None
stdin_data = []
for line in fileinput.input():
stdin_data.append(line)
KEY_ORDER = ('parent', 'source', 'type', 'start', 'end', 'score', 'strand',
'8', 'quals')

possible_blocks = [line for line in blocks(stdin_data)]
def output_line(gff3):
print '\t'.join(str(gff3[x]) for x in KEY_ORDER)

seqid = None
print '##gff-version-3'
# We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible
# awful things
for block_idx in range(len(possible_blocks)):
block = possible_blocks[block_idx]
data = None
fasta_defline = None

if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]:
# Try and get a sequence ID out of it
try:
fasta_defline = block[-2].strip()
except:
# Failing that, ignore it.
pass
print '##gff-version 3'
for line in sys.stdin:
if line.startswith('>'):
genome_id = line[1:].strip()
else:
# They DUPLICATE results in multiple places, including a fasta version
# in the 'full report'.
possible_ugliness = [x for x in block if x.startswith('>t')]
if len(possible_ugliness) > 0:
continue

# However, if it didn't have one of those all important pieces of
# information, then it's either a different important piece of
# information, or complete junk
data = important_info(block)

# I am not proud of any of this. We essentially say "if that block
# didn't come up with useful info, then try making it a tmrna"
if len(data.keys()) == 0:
data = important_info_tmrna(block)
# And if that fails, just none it.
if len(data.keys()) == 0:
data = None
else:
# But if it didn't, confirm that we're a tmRNA
data['type'] = 'tmRNA'
else:
# If we did have keys, and didn't pass through any of the tmRNA
# checks, we're tRNA
data['type'] = 'tRNA'

# If we got a sequence ID in this block, set the defline
if 'nucleotides in sequence' in block[-1]:
try:
fasta_defline = block[-2].strip()
except:
pass

# if a defline is available, try and extract the fasta header ID
if fasta_defline is not None:
try:
seqid = fasta_defline[0:fasta_defline.index(' ')]
except:
seqid = fasta_defline

# If there's data
if data is not None and len(data.keys()) > 1:

# Deal with our flags/notes.
if data['type'] == 'tRNA':
# Are these acceptable GFF3 tags?
notes = {
'Codon': data['codon'],
'Anticodon': data['anticodon'],
}
if 'pseudo' in data:
notes['Note'] = 'Possible pseudogene'
else:
notes = {
'Note': 'Tag peptide: ' + data['pep'] + ''
data = line.split()
if len(data) == 5:
# Parse data
strand = '-' if data[2].startswith('c') else '+'
start, end = data[2][data[2].index('[') + 1:-1].split(',')

gff3 = {
'parent': genome_id,
'source': 'aragorn',
'start': int(start),
'end': int(end),
'strand': strand,
'score': '.',
'8': '.',
}

notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()])

print '\t'.join([
seqid,
'aragorn',
data['type'],
data['start'],
data['end'],
'.',
'.',
'.',
notestr
])
if not full_gene_model:
gff3.update({
'type': 'tRNA',
'quals': 'ID=tRNA{};Name={}'.format(*data),
})
output_line(gff3)
else:
gff3.update({
'type': 'gene',
'quals': 'ID=gene{};Name={}-gene'.format(*data),
})
output_line(gff3)
gff3.update({
'type': 'tRNA',
'quals': 'ID=tRNA{0};Parent=gene{0};Name={1}'.format(*data),
})
output_line(gff3)

# If no introns
if ')i(' not in data[4]:
gff3['type'] = 'exon'
gff3['quals'] = 'Parent=tRNA{0}'.format(*data)
output_line(gff3)
else:
intron_location = data[4][data[4].rindex('(') + 1:-1].split(',')
intron_start, intron_length = map(int, intron_location)
if strand == '+':
original_end = gff3['end']
else:
original_end = gff3['start']

# EXON
gff3.update({
'type': 'exon',
'quals': 'Parent=tRNA{0}'.format(*data),
})
if strand == '+':
gff3['end'] = gff3['start'] + intron_start - 2
else:
gff3['start'] = gff3['end'] - intron_start + 2

output_line(gff3)

# INTRON
gff3.update({
'type': 'intron',
'quals': 'Parent=tRNA{0}'.format(*data),
})
if strand == '+':
gff3['start'] = gff3['end'] + 1
gff3['end'] = gff3['start'] + intron_length + 2
else:
gff3['end'] = gff3['start'] - 1
gff3['start'] = gff3['end'] - intron_length + 1

output_line(gff3)

# EXON
gff3.update({
'type': 'exon',
'quals': 'Parent=tRNA{0}'.format(*data),
})
if strand == '+':
gff3['start'] = gff3['end'] + 1
gff3['end'] = original_end
else:
gff3['end'] = gff3['start'] - 1
gff3['start'] = original_end

output_line(gff3)

0 comments on commit 70e8d72

Please sign in to comment.