Fixes #115

Some maker data was provided, but it's technically incorrect. I validated my results against the aragorn report, and found inconsistencies with what Maker has done, and what ARAGORN has done. This may be a result of maker using another tRNA caller or just plain ol' bugs. Nevertheless, despite the ugly code (a lot better than it was previously), this should be technically correct.
bgruening · May 7, 2015 · 70e8d72 · 70e8d72
1 parent 172105c
commit 70e8d72
Show file tree

Hide file tree

Showing 2 changed files with 104 additions and 160 deletions.
diff --git a/tools/rna_tools/trna_prediction/aragorn.xml b/tools/rna_tools/trna_prediction/aragorn.xml
@@ -27,8 +27,9 @@
             $mtRNA
             $mam_mtRNA
             $topology
-            -fasta
-            $introns | python \$TRNAPRED_SCRIPT_PATH/aragorn_out_to_gff3.py > $gff3_output_file;
+            $introns
+            -w
+             | python \$TRNAPRED_SCRIPT_PATH/aragorn_out_to_gff3.py $gff3_model > $gff3_output_file;
 #end if
 ]]>
     </command>
@@ -65,6 +66,7 @@
         <param name='introns' type='boolean' label='Search for tRNA genes with introns in anticodon loop ...' truevalue='-i' falsevalue='' checked="false" help='...with maximum length 3000 bases (-i).' />
         <param name='secondary_structure' type='boolean' label='Print out secondary structure (-fasta,-fon)' truevalue='-fasta' falsevalue='-fon' checked="false" help='' />
         <param name="gff3_output" type='boolean' label='Convert output to GFF3' truevalue='True' falsevalue='' checked="false" help='' />
+        <param name="gff3_model" type='boolean' label='Full gene model for GFF3 data' truevalue='--full' checked='false' help='' />
     </inputs>
     <outputs>
         <data name="output" format="fasta">
@@ -197,8 +199,9 @@ Dean Laslett and Bjorn Canback
 
 ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide sequences Nucl. Acids Res. (2004) 32(1): 11-16
 
-doi:10.1093/nar/gkh152
-
 ]]>
     </help>
+    <citations>
+        <citation type="doi">10.1093/nar/gkh152</citation>
+    </citations>
 </tool>
diff --git a/tools/rna_tools/trna_prediction/aragorn_out_to_gff3.py b/tools/rna_tools/trna_prediction/aragorn_out_to_gff3.py
@@ -1,165 +1,106 @@
 #!/usr/bin/env python
-import re
+import sys
 
-def start_pattern(string):
-    return re.match(r'^[0-9]+\.$', string) \
-        or string.startswith('Number of possible') \
-        or string.startswith('Searching for')
+full_gene_model = False
+if '--full' in sys.argv:
+    full_gene_model = True
 
-def blank_line(string):
-    return re.match(r'^\s*$', string)
-
-def blocks(iterable):
-    accumulator = []
-    run_of_blanklines = 0
-    for line in iterable:
-        # Count blank lines
-        if blank_line(line):
-            run_of_blanklines += 1
-        else:
-            run_of_blanklines = 0
-
-        if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line:
-            if accumulator:
-                yield accumulator
-                accumulator = [line]
-        else:
-            accumulator.append(line)
-    if accumulator:
-        yield accumulator
-
-IMPORTANT_INFO = {
-    'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'),
-    'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'),
-    'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'),
-    'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
-    'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'),
-}
-INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo')
-
-def important_info(block):
-    info = {}
-    for line in block:
-        for matcher in IMPORTANT_INFO:
-            matches = IMPORTANT_INFO[matcher].search(line)
-            if matches:
-                for group in INFO_GROUPS:
-                    try:
-                        info[group] = matches.group(group)
-                    except:
-                        pass
-    return info
-
-IMPORTANT_INFO_TMRNA = {
-    'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'),
-    'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
-}
-INFO_GROUPS_TMRNA = ('start', 'end', 'pep')
-
-def important_info_tmrna(block):
-    info = {}
-    for line in block:
-        for matcher in IMPORTANT_INFO_TMRNA:
-            matches = IMPORTANT_INFO_TMRNA[matcher].search(line)
-            if matches:
-                for group in INFO_GROUPS_TMRNA:
-                    try:
-                        info[group] = matches.group(group)
-                    except:
-                        pass
-    return info
-
-import fileinput
+genome_id = None
 stdin_data = []
-for line in fileinput.input():
-    stdin_data.append(line)
+KEY_ORDER = ('parent', 'source', 'type', 'start', 'end', 'score', 'strand',
+             '8', 'quals')
 
-possible_blocks = [line for line in blocks(stdin_data)]
+def output_line(gff3):
+    print '\t'.join(str(gff3[x]) for x in KEY_ORDER)
 
-seqid = None
-print '##gff-version-3'
-# We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible
-# awful things
-for block_idx in range(len(possible_blocks)):
-    block = possible_blocks[block_idx]
-    data = None
-    fasta_defline = None
-
-    if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]:
-        # Try and get a sequence ID out of it
-        try:
-            fasta_defline = block[-2].strip()
-        except:
-            # Failing that, ignore it.
-            pass
+print '##gff-version 3'
+for line in sys.stdin:
+    if line.startswith('>'):
+        genome_id = line[1:].strip()
     else:
-        # They DUPLICATE results in multiple places, including a fasta version
-        # in the 'full report'.
-        possible_ugliness = [x for x in block if x.startswith('>t')]
-        if len(possible_ugliness) > 0:
-            continue
-
-        # However, if it didn't have one of those all important pieces of
-        # information, then it's either a different important piece of
-        # information, or complete junk
-        data = important_info(block)
-
-        # I am not proud of any of this. We essentially say "if that block
-        # didn't come up with useful info, then try making it a tmrna"
-        if len(data.keys()) == 0:
-            data = important_info_tmrna(block)
-            # And if that fails, just none it.
-            if len(data.keys()) == 0:
-                data = None
-            else:
-                # But if it didn't, confirm that we're a tmRNA
-                data['type'] = 'tmRNA'
-        else:
-            # If we did have keys, and didn't pass through any of the tmRNA
-            # checks, we're tRNA
-            data['type'] = 'tRNA'
-
-    # If we got a sequence ID in this block, set the defline
-    if 'nucleotides in sequence' in block[-1]:
-        try:
-            fasta_defline = block[-2].strip()
-        except:
-            pass
-
-    # if a defline is available, try and extract the fasta header ID
-    if fasta_defline is not None:
-        try:
-            seqid = fasta_defline[0:fasta_defline.index(' ')]
-        except:
-            seqid = fasta_defline
-
-    # If there's data
-    if data is not None and len(data.keys()) > 1:
-
-        # Deal with our flags/notes.
-        if data['type'] == 'tRNA':
-            # Are these acceptable GFF3 tags?
-            notes = {
-                'Codon': data['codon'],
-                'Anticodon': data['anticodon'],
-            }
-            if 'pseudo' in data:
-                notes['Note'] = 'Possible pseudogene'
-        else:
-            notes = {
-                'Note': 'Tag peptide: ' + data['pep'] + ''
+        data = line.split()
+        if len(data) == 5:
+            # Parse data
+            strand = '-' if data[2].startswith('c') else '+'
+            start, end = data[2][data[2].index('[') + 1:-1].split(',')
+
+            gff3 = {
+                'parent': genome_id,
+                'source': 'aragorn',
+                'start': int(start),
+                'end': int(end),
+                'strand': strand,
+                'score': '.',
+                '8': '.',
             }
 
-        notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()])
-
-        print '\t'.join([
-            seqid,
-            'aragorn',
-            data['type'],
-            data['start'],
-            data['end'],
-            '.',
-            '.',
-            '.',
-            notestr
-        ])
+            if not full_gene_model:
+                gff3.update({
+                    'type': 'tRNA',
+                    'quals': 'ID=tRNA{};Name={}'.format(*data),
+                })
+                output_line(gff3)
+            else:
+                gff3.update({
+                    'type': 'gene',
+                    'quals': 'ID=gene{};Name={}-gene'.format(*data),
+                })
+                output_line(gff3)
+                gff3.update({
+                    'type': 'tRNA',
+                    'quals': 'ID=tRNA{0};Parent=gene{0};Name={1}'.format(*data),
+                })
+                output_line(gff3)
+
+                # If no introns
+                if ')i(' not in data[4]:
+                    gff3['type'] = 'exon'
+                    gff3['quals'] = 'Parent=tRNA{0}'.format(*data)
+                    output_line(gff3)
+                else:
+                    intron_location = data[4][data[4].rindex('(') + 1:-1].split(',')
+                    intron_start, intron_length = map(int, intron_location)
+                    if strand == '+':
+                        original_end = gff3['end']
+                    else:
+                        original_end = gff3['start']
+
+                    # EXON
+                    gff3.update({
+                        'type': 'exon',
+                        'quals': 'Parent=tRNA{0}'.format(*data),
+                    })
+                    if strand == '+':
+                        gff3['end'] = gff3['start'] + intron_start - 2
+                    else:
+                        gff3['start'] = gff3['end'] - intron_start + 2
+
+                    output_line(gff3)
+
+                    # INTRON
+                    gff3.update({
+                        'type': 'intron',
+                        'quals': 'Parent=tRNA{0}'.format(*data),
+                    })
+                    if strand == '+':
+                        gff3['start'] = gff3['end'] + 1
+                        gff3['end'] = gff3['start'] + intron_length + 2
+                    else:
+                        gff3['end'] = gff3['start'] - 1
+                        gff3['start'] = gff3['end'] - intron_length + 1
+
+                    output_line(gff3)
+
+                    # EXON
+                    gff3.update({
+                        'type': 'exon',
+                        'quals': 'Parent=tRNA{0}'.format(*data),
+                    })
+                    if strand == '+':
+                        gff3['start'] = gff3['end'] + 1
+                        gff3['end'] = original_end
+                    else:
+                        gff3['end'] = gff3['start'] - 1
+                        gff3['start'] = original_end
+
+                    output_line(gff3)