Skip to content

Commit

Permalink
Merge pull request #41 from antgonza/mv-list-to-files
Browse files Browse the repository at this point in the history
Refactor to move large lists to simple files
  • Loading branch information
ElDeveloper committed Dec 22, 2015
2 parents 3b9e790 + 06ba3ac commit 405de6e
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 66 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Expand Up @@ -4,6 +4,9 @@ Platypus Conquistador 0.9.0-dev (changes since Platypus Conquistador 0.9.0 go he
-----------------------------------------------------------------------------------

- Changed parse.parse_m9 so it doesn't load the full file into memory
- Improved memory requirements by writing the labels of database hits directly
to simple files instead of accumulating labels and counts in memory. Each file
now lists one label per hit sequence; for a count summary, run: `cat your_file | sort | uniq -c`.

Version 0.9.0 (2015-04-26)
--------------------------
Expand Down
24 changes: 3 additions & 21 deletions platypus/commands.py
Expand Up @@ -8,7 +8,6 @@
from __future__ import division

from os.path import join, basename
from operator import itemgetter

from click import BadParameter
from skbio.util import create_dir
Expand Down Expand Up @@ -50,10 +49,10 @@ def compare(interest_fp, other_fp, output_dir='blast-results-compare',
other database search results. If None is passed, it defaults to
`[50]`.
hits_to_first : bool, optional defaults to False
Outputs the labels and counts of the sequences being hit in the first
Outputs all the labels of the sequences being hit in the first
database.
hits_to_second : bool, optional defaults to False
Outputs the labels and counts of the sequences being hit in the second
Outputs all the labels of the sequences being hit in the second
database.
Raises
Expand Down Expand Up @@ -102,7 +101,7 @@ def compare(interest_fp, other_fp, output_dir='blast-results-compare',
# parse results
results = process_results(interest_pcts, interest_alg_lens,
other_pcts, other_alg_lens, best_hits,
output_dir)
output_dir, hits_to_first, hits_to_second)

# Collating output and writing full results
for i, item in enumerate(results):
Expand All @@ -126,23 +125,6 @@ def compare(interest_fp, other_fp, output_dir='blast-results-compare',
combined_results[4].append(str(item['equal']))
combined_results[5].append(str(no_hits))

# small helper that persists hit labels/counts for one database
def save_hits(data, name):
    # Order the (label, count) pairs by count, largest first, and write
    # them to a file in output_dir as tab-separated lines; pairs with a
    # zero count are omitted.
    ordered = sorted(data, key=itemgetter(1), reverse=True)
    lines = ['%s\t%d' % (label, count)
             for label, count in ordered if count != 0]
    with open(join(output_dir, name), 'w') as out:
        out.write('\n'.join(lines))

if hits_to_first:
save_hits(item['db_seqs_counts']['a'].items(),
"hits_to_first_db_%s.txt" % item['filename'])

if hits_to_second:
save_hits(item['db_seqs_counts']['b'].items(),
"hits_to_second_db_%s.txt" % item['filename'])

# saving collated results
with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
compiled_output.write('\n'.join(['\t'.join(item)
Expand Down
71 changes: 42 additions & 29 deletions platypus/parse.py
Expand Up @@ -198,7 +198,8 @@ def parse_second_database(db, best_hits, percentage_ids_other,


def process_results(percentage_ids, alignment_lengths, percentage_ids_other,
alignment_lengths_other, best_hits, output_dir):
alignment_lengths_other, best_hits, output_dir,
hits_to_first, hits_to_second):
"""Format the results into a summary dictionary
Parameters
Expand All @@ -219,6 +220,12 @@ def process_results(percentage_ids, alignment_lengths, percentage_ids_other,
A dictionary with the best hits found in the databases.
output_dir : str
File path to the output directory.
hits_to_first : bool
Outputs all the labels of the sequences being hit in the first
database.
hits_to_second : bool
Outputs all the labels of the sequences being hit in the second
database.
Returns
-------
Expand All @@ -232,19 +239,29 @@ def process_results(percentage_ids, alignment_lengths, percentage_ids_other,
iter_b = product(percentage_ids_other, alignment_lengths_other)

for (perc_id_a, aln_len_a), (perc_id_b, aln_len_b) in izip(iter_a, iter_b):
filename = "p1_%d-a1_%d_p2_%d-a2_%d" % (perc_id_a, aln_len_a,
perc_id_b, aln_len_b)
summary_filename = join(output_dir, "summary_" + filename + ".txt")
summary_fh = open(summary_filename, 'w')
# basic filename for each combination of options
fn = "p1_%d-a1_%d_p2_%d-a2_%d" % (perc_id_a, aln_len_a,
perc_id_b, aln_len_b)
# filename, handler and header for the summary results
summary_fn = join(output_dir, "summary_" + fn + ".txt")
summary_fh = open(summary_fn, 'w')
summary_fh.write('#SeqId\tFirst\tSecond\n')
results.append({
'filename': filename,
'db_interest': 0,
'db_other': 0,
'perfect_interest': 0,
'equal': 0,
'summary_fh': summary_fh,
'db_seqs_counts': {'a': {}, 'b': {}}})
# filename for the hits to first/second databases
hits_to_first_fn = join(output_dir, "hits_to_first_db_%s.txt" % fn)
hits_to_second_fn = join(output_dir, "hits_to_second_db_%s.txt" % fn)
# generating basic element
tmp = {'filename': fn,
'db_interest': 0,
'db_other': 0,
'perfect_interest': 0,
'equal': 0,
'summary_fh': summary_fh,
'db_seqs_counts': {'a': None, 'b': None}}
if hits_to_first:
tmp['db_seqs_counts']['a'] = open(hits_to_first_fn, 'w')
if hits_to_second:
tmp['db_seqs_counts']['b'] = open(hits_to_second_fn, 'w')
results.append(tmp)

for seq_name, values in best_hits.items():
seq_name = seq_name.split(' ')[0].strip()
Expand All @@ -256,38 +273,34 @@ def process_results(percentage_ids, alignment_lengths, percentage_ids_other,
db_seqs_counts_a = results[i]['db_seqs_counts']['a']
db_seqs_counts_b = results[i]['db_seqs_counts']['b']

# validating duplicated results in the databases
# do this step in a different script early in the pipeline
if subject_id_a not in db_seqs_counts_a:
db_seqs_counts_a[subject_id_a] = 0
if subject_id_a == db_seqs_counts_b:
raise Warning("%s is in both databases" % subject_id_a)
if subject_id_b not in db_seqs_counts_b:
db_seqs_counts_b[subject_id_b] = 0
if subject_id_b == db_seqs_counts_a:
raise Warning("%s is in both databases" % subject_id_b)

# Comparing bit_scores to create outputs
if vals['a']['bit_score'] == vals['b']['bit_score']:
results[i]['equal'] += 1
results[i]['summary_fh'].write('%s\t%s\t%s\n' % (
seq_name, subject_id_a, subject_id_b))
db_seqs_counts_a[subject_id_a] += 1
db_seqs_counts_b[subject_id_b] += 1
if db_seqs_counts_a:
db_seqs_counts_a.write('%s\n' % subject_id_a)
if db_seqs_counts_b:
db_seqs_counts_b.write('%s\n' % subject_id_b)
elif vals['a']['bit_score'] > vals['b']['bit_score']:
if not subject_id_b:
results[i]['perfect_interest'] += 1
results[i]['summary_fh'].write('%s\t%s\t\n' % (
seq_name, subject_id_a))
db_seqs_counts_a[subject_id_a] += 1
if db_seqs_counts_a:
db_seqs_counts_a.write('%s\n' % subject_id_a)
else:
results[i]['db_other'] += 1
results[i]['summary_fh'].write('%s\t\t\n' % (seq_name))

db_seqs_counts_b[subject_id_b] += 1
if db_seqs_counts_b:
db_seqs_counts_b.write('%s\n' % subject_id_b)

# closing files handlers
for r in results:
r['summary_fh'].close()
if r['db_seqs_counts']['a']:
r['db_seqs_counts']['a'].close()
if r['db_seqs_counts']['b']:
r['db_seqs_counts']['b'].close()

return results
8 changes: 4 additions & 4 deletions scripts/platypus
Expand Up @@ -54,11 +54,11 @@ DIR_TYPE = click.Path(exists=False, file_okay=False, dir_okay=True,
'in the other database search results. If None is passed the '
'values from --interest_alg_lengths will be used.', default=None)
@click.option('--hits_to_first', required=False, is_flag=True, default=False,
help='Outputs the labels and counts of the sequences being hit '
'in the first database.', show_default=True)
help='Outputs all the labels of the sequences being hit in the '
'first database.', show_default=True)
@click.option('--hits_to_second', required=False, is_flag=True, default=False,
help='Outputs the labels and counts of the sequences being hit '
'in the second database.', show_default=True)
help='Outputs all the labels of the sequences being hit in the '
'second database.', show_default=True)
def compare(interest_fp, other_fp, output_dir='blast-results-compare',
interest_pcts=None, interest_alg_lens=None, other_pcts=None,
other_alg_lens=None, hits_to_first=None, hits_to_second=None):
Expand Down
@@ -1,3 +1,3 @@
NZ_ACZD01000120_647000262 1
NZ_ABEH01000005_641736102 1
NZ_ABEH01000018_641736102 1
NZ_ACZD01000120_647000262
NZ_ABEH01000005_641736102
NZ_ABEH01000018_641736102
15 changes: 6 additions & 9 deletions tests/test_parse.py
Expand Up @@ -199,20 +199,17 @@ def test_process_results(self):
'alg_length': 10},
'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
'bit_score': 959.0, 'percentage_id': 100,
'alg_length': 900}}]
'alg_length': 900}}],
'NO-VALS': [None]
}

out_results = process_results([0.80], [50], [0.30], [30], best_hits,
self.base)
# removing the summary_fh pointer so we don't need to test
self.base, False, False)
# removing the file pointers so we don't need to test
out_results[0].pop('summary_fh')
out_results[0].pop('db_seqs_counts')
self.assertEquals(out_results, [{
'db_interest': 0, 'db_other': 1, 'db_seqs_counts': {
'a': {'NZ_ABEH01000005_641736102': 1,
'RESULT-A': 1,
'NZ_ABEH01000018_641736102': 1},
'b': {None: 0, 'RESULT-B': 2}},
'perfect_interest': 2, 'equal': 1,
'db_interest': 0, 'db_other': 1, 'perfect_interest': 2, 'equal': 1,
'filename': 'p1_0-a1_50_p2_0-a2_30'}])

if __name__ == "__main__":
Expand Down

0 comments on commit 405de6e

Please sign in to comment.