Skip to content

Commit

Permalink
Merge pull request #41 from antgonza/mv-list-to-files
Browse files Browse the repository at this point in the history
Refactor to move large lists to simple files
  • Loading branch information
ElDeveloper committed Dec 22, 2015
2 parents 3b9e790 + 06ba3ac commit 405de6e
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 66 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Expand Up @@ -4,6 +4,9 @@ Platypus Conquistador 0.9.0-dev (changes since Platypus Conquistador 0.9.0 go he
-----------------------------------------------------------------------------------

- Changed parse.parse_m9 so it doesn't load the full file into memory
- Improved memory requirements by writing the labels of database hits directly
to simple files instead of accumulating labels and counts in memory. Each file
now lists one label per hit sequence; for a count summary, run: `cat your_file | sort | uniq -c`.

Version 0.9.0 (2015-04-26)
--------------------------
Expand Down
24 changes: 3 additions & 21 deletions platypus/commands.py
Expand Up @@ -8,7 +8,6 @@
from __future__ import division

from os.path import join, basename
from operator import itemgetter

from click import BadParameter
from skbio.util import create_dir
Expand Down Expand Up @@ -50,10 +49,10 @@ def compare(interest_fp, other_fp, output_dir='blast-results-compare',
other database search results. If None is passed, it defaults to
`[50]`.
hits_to_first : bool, optional defaults to False
Outputs the labels and counts of the sequences being hit in the first
Outputs all the labels of the sequences being hit in the first
database.
hits_to_second : bool, optional defaults to False
Outputs the labels and counts of the sequences being hit in the second
Outputs all the labels of the sequences being hit in the second
database.
Raises
Expand Down Expand Up @@ -102,7 +101,7 @@ def compare(interest_fp, other_fp, output_dir='blast-results-compare',
# parse results
results = process_results(interest_pcts, interest_alg_lens,
other_pcts, other_alg_lens, best_hits,
output_dir)
output_dir, hits_to_first, hits_to_second)

# Collating output and writing full results
for i, item in enumerate(results):
Expand All @@ -126,23 +125,6 @@ def compare(interest_fp, other_fp, output_dir='blast-results-compare',
combined_results[4].append(str(item['equal']))
combined_results[5].append(str(no_hits))

# small helper that persists hit labels/counts for one database
def save_hits(data, name):
    # Order the (label, count) pairs by count, largest first, and write
    # them to a file in output_dir as tab-separated lines; pairs with a
    # zero count are omitted.
    ordered = sorted(data, key=itemgetter(1), reverse=True)
    lines = ['%s\t%d' % (label, count)
             for label, count in ordered if count != 0]
    with open(join(output_dir, name), 'w') as out:
        out.write('\n'.join(lines))

if hits_to_first:
save_hits(item['db_seqs_counts']['a'].items(),
"hits_to_first_db_%s.txt" % item['filename'])

if hits_to_second:
save_hits(item['db_seqs_counts']['b'].items(),
"hits_to_second_db_%s.txt" % item['filename'])

# saving collated results
with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
compiled_output.write('\n'.join(['\t'.join(item)
Expand Down
71 changes: 42 additions & 29 deletions platypus/parse.py
Expand Up @@ -198,7 +198,8 @@ def parse_second_database(db, best_hits, percentage_ids_other,


def process_results(percentage_ids, alignment_lengths, percentage_ids_other,
alignment_lengths_other, best_hits, output_dir):
alignment_lengths_other, best_hits, output_dir,
hits_to_first, hits_to_second):
"""Format the results into a summary dictionary
Parameters
Expand All @@ -219,6 +220,12 @@ def process_results(percentage_ids, alignment_lengths, percentage_ids_other,
A dictionary with the best hits found in the databases.
output_dir : str
File path to the output directory.
hits_to_first : bool
Outputs all the labels of the sequences being hit in the first
database.
hits_to_second : bool
Outputs all the labels of the sequences being hit in the second
database.
Returns
-------
Expand All @@ -232,19 +239,29 @@ def process_results(percentage_ids, alignment_lengths, percentage_ids_other,
iter_b = product(percentage_ids_other, alignment_lengths_other)

for (perc_id_a, aln_len_a), (perc_id_b, aln_len_b) in izip(iter_a, iter_b):
filename = "p1_%d-a1_%d_p2_%d-a2_%d" % (perc_id_a, aln_len_a,
perc_id_b, aln_len_b)
summary_filename = join(output_dir, "summary_" + filename + ".txt")
summary_fh = open(summary_filename, 'w')
# basic filename for each combination of options
fn = "p1_%d-a1_%d_p2_%d-a2_%d" % (perc_id_a, aln_len_a,
perc_id_b, aln_len_b)
# filename, handler and header for the summary results
summary_fn = join(output_dir, "summary_" + fn + ".txt")
summary_fh = open(summary_fn, 'w')
summary_fh.write('#SeqId\tFirst\tSecond\n')
results.append({
'filename': filename,
'db_interest': 0,
'db_other': 0,
'perfect_interest': 0,
'equal': 0,
'summary_fh': summary_fh,
'db_seqs_counts': {'a': {}, 'b': {}}})
# filename for the hits to first/second databases
hits_to_first_fn = join(output_dir, "hits_to_first_db_%s.txt" % fn)
hits_to_second_fn = join(output_dir, "hits_to_second_db_%s.txt" % fn)
# generating basic element
tmp = {'filename': fn,
'db_interest': 0,
'db_other': 0,
'perfect_interest': 0,
'equal': 0,
'summary_fh': summary_fh,
'db_seqs_counts': {'a': None, 'b': None}}
if hits_to_first:
tmp['db_seqs_counts']['a'] = open(hits_to_first_fn, 'w')
if hits_to_second:
tmp['db_seqs_counts']['b'] = open(hits_to_second_fn, 'w')
results.append(tmp)

for seq_name, values in best_hits.items():
seq_name = seq_name.split(' ')[0].strip()
Expand All @@ -256,38 +273,34 @@ def process_results(percentage_ids, alignment_lengths, percentage_ids_other,
db_seqs_counts_a = results[i]['db_seqs_counts']['a']
db_seqs_counts_b = results[i]['db_seqs_counts']['b']

# validating duplicated results in the databases
# do this step in a different script early in the pipeline
if subject_id_a not in db_seqs_counts_a:
db_seqs_counts_a[subject_id_a] = 0
if subject_id_a == db_seqs_counts_b:
raise Warning("%s is in both databases" % subject_id_a)
if subject_id_b not in db_seqs_counts_b:
db_seqs_counts_b[subject_id_b] = 0
if subject_id_b == db_seqs_counts_a:
raise Warning("%s is in both databases" % subject_id_b)

# Comparing bit_scores to create outputs
if vals['a']['bit_score'] == vals['b']['bit_score']:
results[i]['equal'] += 1
results[i]['summary_fh'].write('%s\t%s\t%s\n' % (
seq_name, subject_id_a, subject_id_b))
db_seqs_counts_a[subject_id_a] += 1
db_seqs_counts_b[subject_id_b] += 1
if db_seqs_counts_a:
db_seqs_counts_a.write('%s\n' % subject_id_a)
if db_seqs_counts_b:
db_seqs_counts_b.write('%s\n' % subject_id_b)
elif vals['a']['bit_score'] > vals['b']['bit_score']:
if not subject_id_b:
results[i]['perfect_interest'] += 1
results[i]['summary_fh'].write('%s\t%s\t\n' % (
seq_name, subject_id_a))
db_seqs_counts_a[subject_id_a] += 1
if db_seqs_counts_a:
db_seqs_counts_a.write('%s\n' % subject_id_a)
else:
results[i]['db_other'] += 1
results[i]['summary_fh'].write('%s\t\t\n' % (seq_name))

db_seqs_counts_b[subject_id_b] += 1
if db_seqs_counts_b:
db_seqs_counts_b.write('%s\n' % subject_id_b)

# closing files handlers
for r in results:
r['summary_fh'].close()
if r['db_seqs_counts']['a']:
r['db_seqs_counts']['a'].close()
if r['db_seqs_counts']['b']:
r['db_seqs_counts']['b'].close()

return results
8 changes: 4 additions & 4 deletions scripts/platypus
Expand Up @@ -54,11 +54,11 @@ DIR_TYPE = click.Path(exists=False, file_okay=False, dir_okay=True,
'in the other database search results. If None is passed the '
'values from --interest_alg_lengths will be used.', default=None)
@click.option('--hits_to_first', required=False, is_flag=True, default=False,
help='Outputs the labels and counts of the sequences being hit '
'in the first database.', show_default=True)
help='Outputs all the labels of the sequences being hit in the '
'first database.', show_default=True)
@click.option('--hits_to_second', required=False, is_flag=True, default=False,
help='Outputs the labels and counts of the sequences being hit '
'in the second database.', show_default=True)
help='Outputs all the labels of the sequences being hit in the '
'second database.', show_default=True)
def compare(interest_fp, other_fp, output_dir='blast-results-compare',
interest_pcts=None, interest_alg_lens=None, other_pcts=None,
other_alg_lens=None, hits_to_first=None, hits_to_second=None):
Expand Down
@@ -1,3 +1,3 @@
NZ_ACZD01000120_647000262 1
NZ_ABEH01000005_641736102 1
NZ_ABEH01000018_641736102 1
NZ_ACZD01000120_647000262
NZ_ABEH01000005_641736102
NZ_ABEH01000018_641736102
15 changes: 6 additions & 9 deletions tests/test_parse.py
Expand Up @@ -199,20 +199,17 @@ def test_process_results(self):
'alg_length': 10},
'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
'bit_score': 959.0, 'percentage_id': 100,
'alg_length': 900}}]
'alg_length': 900}}],
'NO-VALS': [None]
}

out_results = process_results([0.80], [50], [0.30], [30], best_hits,
self.base)
# removing the summary_fh pointer so we don't need to test
self.base, False, False)
# removing the file pointers so we don't need to test
out_results[0].pop('summary_fh')
out_results[0].pop('db_seqs_counts')
self.assertEquals(out_results, [{
'db_interest': 0, 'db_other': 1, 'db_seqs_counts': {
'a': {'NZ_ABEH01000005_641736102': 1,
'RESULT-A': 1,
'NZ_ABEH01000018_641736102': 1},
'b': {None: 0, 'RESULT-B': 2}},
'perfect_interest': 2, 'equal': 1,
'db_interest': 0, 'db_other': 1, 'perfect_interest': 2, 'equal': 1,
'filename': 'p1_0-a1_50_p2_0-a2_30'}])

if __name__ == "__main__":
Expand Down

0 comments on commit 405de6e

Please sign in to comment.