-
Notifications
You must be signed in to change notification settings - Fork 226
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Some maker data was provided, but it's technically incorrect. I validated my results against the aragorn report, and found inconsistencies with what Maker has done, and what ARAGORN has done. This may be a result of maker using another tRNA caller or just plain ol' bugs. Nevertheless, despite the ugly code (a lot better than it was previously), this should be technically correct.
- Loading branch information
Eric Rasche
committed
May 7, 2015
1 parent
172105c
commit 70e8d72
Showing
2 changed files
with
104 additions
and
160 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
253 changes: 97 additions & 156 deletions
253
tools/rna_tools/trna_prediction/aragorn_out_to_gff3.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,165 +1,106 @@ | ||
#!/usr/bin/env python | ||
import re | ||
import sys | ||
|
||
def start_pattern(string): | ||
return re.match(r'^[0-9]+\.$', string) \ | ||
or string.startswith('Number of possible') \ | ||
or string.startswith('Searching for') | ||
full_gene_model = False | ||
if '--full' in sys.argv: | ||
full_gene_model = True | ||
|
||
def blank_line(string): | ||
return re.match(r'^\s*$', string) | ||
|
||
def blocks(iterable): | ||
accumulator = [] | ||
run_of_blanklines = 0 | ||
for line in iterable: | ||
# Count blank lines | ||
if blank_line(line): | ||
run_of_blanklines += 1 | ||
else: | ||
run_of_blanklines = 0 | ||
|
||
if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line: | ||
if accumulator: | ||
yield accumulator | ||
accumulator = [line] | ||
else: | ||
accumulator.append(line) | ||
if accumulator: | ||
yield accumulator | ||
|
||
IMPORTANT_INFO = { | ||
'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'), | ||
'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'), | ||
'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'), | ||
'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'), | ||
'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'), | ||
} | ||
INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo') | ||
|
||
def important_info(block): | ||
info = {} | ||
for line in block: | ||
for matcher in IMPORTANT_INFO: | ||
matches = IMPORTANT_INFO[matcher].search(line) | ||
if matches: | ||
for group in INFO_GROUPS: | ||
try: | ||
info[group] = matches.group(group) | ||
except: | ||
pass | ||
return info | ||
|
||
IMPORTANT_INFO_TMRNA = { | ||
'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'), | ||
'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'), | ||
} | ||
INFO_GROUPS_TMRNA = ('start', 'end', 'pep') | ||
|
||
def important_info_tmrna(block): | ||
info = {} | ||
for line in block: | ||
for matcher in IMPORTANT_INFO_TMRNA: | ||
matches = IMPORTANT_INFO_TMRNA[matcher].search(line) | ||
if matches: | ||
for group in INFO_GROUPS_TMRNA: | ||
try: | ||
info[group] = matches.group(group) | ||
except: | ||
pass | ||
return info | ||
|
||
import fileinput | ||
genome_id = None | ||
stdin_data = [] | ||
for line in fileinput.input(): | ||
stdin_data.append(line) | ||
KEY_ORDER = ('parent', 'source', 'type', 'start', 'end', 'score', 'strand', | ||
'8', 'quals') | ||
|
||
possible_blocks = [line for line in blocks(stdin_data)] | ||
def output_line(gff3): | ||
print '\t'.join(str(gff3[x]) for x in KEY_ORDER) | ||
|
||
seqid = None | ||
print '##gff-version-3' | ||
# We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible | ||
# awful things | ||
for block_idx in range(len(possible_blocks)): | ||
block = possible_blocks[block_idx] | ||
data = None | ||
fasta_defline = None | ||
|
||
if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]: | ||
# Try and get a sequence ID out of it | ||
try: | ||
fasta_defline = block[-2].strip() | ||
except: | ||
# Failing that, ignore it. | ||
pass | ||
print '##gff-version 3' | ||
for line in sys.stdin: | ||
if line.startswith('>'): | ||
genome_id = line[1:].strip() | ||
else: | ||
# They DUPLICATE results in multiple places, including a fasta version | ||
# in the 'full report'. | ||
possible_ugliness = [x for x in block if x.startswith('>t')] | ||
if len(possible_ugliness) > 0: | ||
continue | ||
|
||
# However, if it didn't have one of those all important pieces of | ||
# information, then it's either a different important piece of | ||
# information, or complete junk | ||
data = important_info(block) | ||
|
||
# I am not proud of any of this. We essentially say "if that block | ||
# didn't come up with useful info, then try making it a tmrna" | ||
if len(data.keys()) == 0: | ||
data = important_info_tmrna(block) | ||
# And if that fails, just none it. | ||
if len(data.keys()) == 0: | ||
data = None | ||
else: | ||
# But if it didn't, confirm that we're a tmRNA | ||
data['type'] = 'tmRNA' | ||
else: | ||
# If we did have keys, and didn't pass through any of the tmRNA | ||
# checks, we're tRNA | ||
data['type'] = 'tRNA' | ||
|
||
# If we got a sequence ID in this block, set the defline | ||
if 'nucleotides in sequence' in block[-1]: | ||
try: | ||
fasta_defline = block[-2].strip() | ||
except: | ||
pass | ||
|
||
# if a defline is available, try and extract the fasta header ID | ||
if fasta_defline is not None: | ||
try: | ||
seqid = fasta_defline[0:fasta_defline.index(' ')] | ||
except: | ||
seqid = fasta_defline | ||
|
||
# If there's data | ||
if data is not None and len(data.keys()) > 1: | ||
|
||
# Deal with our flags/notes. | ||
if data['type'] == 'tRNA': | ||
# Are these acceptable GFF3 tags? | ||
notes = { | ||
'Codon': data['codon'], | ||
'Anticodon': data['anticodon'], | ||
} | ||
if 'pseudo' in data: | ||
notes['Note'] = 'Possible pseudogene' | ||
else: | ||
notes = { | ||
'Note': 'Tag peptide: ' + data['pep'] + '' | ||
data = line.split() | ||
if len(data) == 5: | ||
# Parse data | ||
strand = '-' if data[2].startswith('c') else '+' | ||
start, end = data[2][data[2].index('[') + 1:-1].split(',') | ||
|
||
gff3 = { | ||
'parent': genome_id, | ||
'source': 'aragorn', | ||
'start': int(start), | ||
'end': int(end), | ||
'strand': strand, | ||
'score': '.', | ||
'8': '.', | ||
} | ||
|
||
notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()]) | ||
|
||
print '\t'.join([ | ||
seqid, | ||
'aragorn', | ||
data['type'], | ||
data['start'], | ||
data['end'], | ||
'.', | ||
'.', | ||
'.', | ||
notestr | ||
]) | ||
if not full_gene_model: | ||
gff3.update({ | ||
'type': 'tRNA', | ||
'quals': 'ID=tRNA{};Name={}'.format(*data), | ||
}) | ||
output_line(gff3) | ||
else: | ||
gff3.update({ | ||
'type': 'gene', | ||
'quals': 'ID=gene{};Name={}-gene'.format(*data), | ||
}) | ||
output_line(gff3) | ||
gff3.update({ | ||
'type': 'tRNA', | ||
'quals': 'ID=tRNA{0};Parent=gene{0};Name={1}'.format(*data), | ||
}) | ||
output_line(gff3) | ||
|
||
# If no introns | ||
if ')i(' not in data[4]: | ||
gff3['type'] = 'exon' | ||
gff3['quals'] = 'Parent=tRNA{0}'.format(*data) | ||
output_line(gff3) | ||
else: | ||
intron_location = data[4][data[4].rindex('(') + 1:-1].split(',') | ||
intron_start, intron_length = map(int, intron_location) | ||
if strand == '+': | ||
original_end = gff3['end'] | ||
else: | ||
original_end = gff3['start'] | ||
|
||
# EXON | ||
gff3.update({ | ||
'type': 'exon', | ||
'quals': 'Parent=tRNA{0}'.format(*data), | ||
}) | ||
if strand == '+': | ||
gff3['end'] = gff3['start'] + intron_start - 2 | ||
else: | ||
gff3['start'] = gff3['end'] - intron_start + 2 | ||
|
||
output_line(gff3) | ||
|
||
# INTRON | ||
gff3.update({ | ||
'type': 'intron', | ||
'quals': 'Parent=tRNA{0}'.format(*data), | ||
}) | ||
if strand == '+': | ||
gff3['start'] = gff3['end'] + 1 | ||
gff3['end'] = gff3['start'] + intron_length + 2 | ||
else: | ||
gff3['end'] = gff3['start'] - 1 | ||
gff3['start'] = gff3['end'] - intron_length + 1 | ||
|
||
output_line(gff3) | ||
|
||
# EXON | ||
gff3.update({ | ||
'type': 'exon', | ||
'quals': 'Parent=tRNA{0}'.format(*data), | ||
}) | ||
if strand == '+': | ||
gff3['start'] = gff3['end'] + 1 | ||
gff3['end'] = original_end | ||
else: | ||
gff3['end'] = gff3['start'] - 1 | ||
gff3['start'] = original_end | ||
|
||
output_line(gff3) |