Merge pull request #338 from broadinstitute/dp-empty-data

refactor tests and some kraken
broadinstitute · Jun 13, 2016 · 8062715 · 8062715
2 parents 887c5d3 + 569331a
commit 8062715
Show file tree

Hide file tree

Showing 10 changed files with 247 additions and 272 deletions.
diff --git a/metagenomics.py b/metagenomics.py
@@ -412,46 +412,29 @@ def kraken(inBam, db, outReport=None, outReads=None,
            filterThreshold=None, numThreads=1):
     assert outReads or outReport, (
         'Either --outReads or --outReport must be specified.')
-
-    tmp_fastq1 = util.file.mkstempfname('.1.fastq')
-    tmp_fastq2 = util.file.mkstempfname('.2.fastq')
-    picard = tools.picard.SamToFastqTool()
-    picard_opts = {
-        'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
-        'CLIPPING_ACTION': 'X'
-    }
-    picard.execute(inBam, tmp_fastq1, tmp_fastq2,
-                   picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
-                   JVMmemory=picard.jvmMemDefault)
-
     kraken_tool = tools.kraken.Kraken()
+
+    # kraken classify
     tmp_reads = util.file.mkstempfname('.kraken')
-    opts = {
-        '--paired': None,
-        '--threads': min(int(numThreads), util.misc.available_cpu_count()),
-    }
-    # Could be optimized in 3.5 piping directly to kraken-filter.
-    kraken_tool.classify(db, [tmp_fastq1, tmp_fastq2], tmp_reads, options=opts)
+    kraken_tool.classify(inBam, db, tmp_reads, numThreads=numThreads)
 
+    # kraken filter
     if filterThreshold:
-        opts = {
-            '--threshold': filterThreshold,
-        }
-
         tmp_filtered_reads = util.file.mkstempfname('.filtered-kraken')
-        kraken_tool.execute('kraken-filter', db, tmp_filtered_reads, args=[tmp_reads],
-                            options=opts)
+        kraken_tool.filter(tmp_reads, db, tmp_filtered_reads, filterThreshold)
+        os.unlink(tmp_reads)
     else:
         tmp_filtered_reads = tmp_reads
 
+    # copy outReads
     if outReads:
         with open(tmp_filtered_reads, 'rb') as f_in:
             with gzip.open(outReads, 'wb') as f_out:
                 shutil.copyfileobj(f_in, f_out)
 
+    # kraken report
     if outReport:
-        kraken_tool.execute('kraken-report', db, outReport,
-                            args=[tmp_filtered_reads])
+        kraken_tool.report(tmp_filtered_reads, db, outReport)
 
 
 def krona(inTsv, db, outHtml, queryColumn=None, taxidColumn=None,

diff --git a/test/input/empty.bam b/test/input/empty.bam
diff --git a/test/integration/test_assembly.py b/test/integration/test_assembly.py
@@ -47,3 +47,45 @@ def test_ref_assisted_assembly(self):
 
 # in order to test the actual de novo pipeline, we need to add a clip db for trimmomatic
 # then we should test from G5012.3.testreads.bam all the way through the assembly pipe
+
+class TestRefineAssembly(TestCaseWithTmp):
+    def test_ebov_refine1(self):
+        inDir = util.file.get_test_input_path(self)
+        inFasta = os.path.join(inDir, 'impute.ebov.fasta')
+        imputeFasta = util.file.mkstempfname('.imputed.fasta')
+        refine1Fasta = util.file.mkstempfname('.refine1.fasta')
+        shutil.copy(inFasta, imputeFasta)
+        tools.picard.CreateSequenceDictionaryTool().execute(imputeFasta, overwrite=True)
+        tools.novoalign.NovoalignTool().index_fasta(imputeFasta)
+        assembly.refine_assembly(
+            imputeFasta,
+            os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam'),
+            refine1Fasta,
+            # normally -r Random, but for unit tests, we want deterministic behavior
+            novo_params='-r None -l 30 -x 20 -t 502',
+            min_coverage=2,
+            threads=4)
+        actual = str(Bio.SeqIO.read(refine1Fasta, 'fasta').seq)
+        expected = str(Bio.SeqIO.read(os.path.join(inDir, 'expected.ebov.refine1.fasta'), 'fasta').seq)
+        self.assertEqual(actual, expected)
+
+    def test_ebov_refine2(self):
+        inDir = util.file.get_test_input_path(self)
+        inFasta = os.path.join(inDir, 'expected.ebov.refine1.fasta')
+        refine1Fasta = util.file.mkstempfname('.refine1.fasta')
+        refine2Fasta = util.file.mkstempfname('.refine2.fasta')
+        shutil.copy(inFasta, refine1Fasta)
+        tools.picard.CreateSequenceDictionaryTool().execute(refine1Fasta, overwrite=True)
+        tools.novoalign.NovoalignTool().index_fasta(refine1Fasta)
+        assembly.refine_assembly(
+            refine1Fasta,
+            os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam'),
+            refine2Fasta,
+            # normally -r Random, but for unit tests, we want deterministic behavior
+            novo_params='-r None -l 40 -x 20 -t 100',
+            min_coverage=3,
+            threads=4)
+        actual = str(Bio.SeqIO.read(refine2Fasta, 'fasta').seq)
+        expected = str(Bio.SeqIO.read(os.path.join(inDir, 'expected.ebov.refine2.fasta'), 'fasta').seq)
+        self.assertEqual(actual, expected)
+
diff --git a/test/integration/test_intrahost.py b/test/integration/test_intrahost.py
@@ -0,0 +1,80 @@
+# Unit tests for intrahost.py
+
+__author__ = "dpark@broadinstitute.org"
+
+# built-ins
+from collections import OrderedDict
+import os
+import os.path
+import shutil
+import tempfile
+import itertools
+import argparse
+import unittest
+
+# third-party
+import Bio
+import Bio.SeqRecord
+import Bio.Seq
+
+# module-specific
+import intrahost
+import util.file
+import util.vcf
+import test
+from intrahost import AlleleFieldParser
+import interhost
+import tools.mafft
+
+class TestPerSample(test.TestCaseWithTmp):
+    ''' This tests step 1 of the iSNV calling process
+        (intrahost.vphaser_one_sample), which runs V-Phaser2 on
+        a single sample, reformats the output slightly, and performs
+        strand-bias filtering and adds library-bias statistics.
+    '''
+
+    def test_vphaser_one_sample_indels(self):
+        # Files here were created as follows:
+        # ref.indels.fasta is Seed-stock-137_S2_L001_001.fasta
+        # in.indels.bam was created from Seed-stock-137_S2_L001_001.mapped.bam
+        #     as follows:
+        # Created two .sam files using samtools view, restricting to ranges
+        # 6811-7011 and 13081-13281, respectively. Paired reads were removed
+        # from those files by throwing out the second occurence of any read name
+        # and anding the flag fields with 16. Then, a random 90% of reads were
+        # removed, except that any reads containing the indel variants at
+        # 6911 and 13181 were kept. Then the resulting 2 files were combined.
+        myInputDir = util.file.get_test_input_path(self)
+        inBam = os.path.join(myInputDir, 'in.indels.bam')
+        refFasta = os.path.join(myInputDir, 'ref.indels.fasta')
+        outTab = util.file.mkstempfname('.txt')
+        intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=0)
+        expected = os.path.join(myInputDir, 'vphaser_one_sample_indels_expected.txt')
+        self.assertEqualContents(outTab, expected)
+
+    def test_vphaser_one_sample_2libs(self):
+        # in.2libs.bam was created by "manually" editing in.bam and moving about
+        # 1/3 of the reads to ReadGroup2.
+        myInputDir = util.file.get_test_input_path(self)
+        inBam = os.path.join(myInputDir, 'in.2libs.bam')
+        refFasta = os.path.join(myInputDir, 'ref.fasta')
+        outTab = util.file.mkstempfname('.txt')
+        intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=6, maxBias=3)
+        expected = os.path.join(myInputDir, 'vphaser_one_sample_2libs_expected.txt')
+        self.assertEqualContents(outTab, expected)
+
+    def test_vphaser_one_sample_3libs_and_chi2(self):
+        # In addition to testing that we can handle 3 libraries, this is testing
+        #    the chi2_contingency approximation to fisher_exact. The 4th, 5th,
+        #    and 6th rows have large enough minor allele count that their
+        #    p-values are calculated using the chi2 approximation. The other
+        #    rows are testing the 2 x 3 case of fisher_exact.
+        # in.3libs.bam was created by "manually" editing in.2libs.bam and moving
+        # about 1/2 of the reads in ReadGroup2 to ReadGroup3.
+        myInputDir = util.file.get_test_input_path(self)
+        inBam = os.path.join(myInputDir, 'in.3libs.bam')
+        refFasta = os.path.join(myInputDir, 'ref.fasta')
+        outTab = util.file.mkstempfname('.txt')
+        intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=6, maxBias=3)
+        expected = os.path.join(myInputDir, 'vphaser_one_sample_3libs_expected.txt')
+        self.assertEqualContents(outTab, expected)
diff --git a/test/integration/test_kraken.py b/test/integration/test_kraken.py
@@ -27,22 +27,6 @@ def fastq_to_sam():
     return tools.picard.FastqToSamTool()
 
 
-@pytest.fixture(scope='session')
-def sam_to_fastq():
-    return tools.picard.SamToFastqTool()
-
-
-def input_fastq_paths():
-    data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsSimple')
-    return [os.path.join(data_dir, f)
-            for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']]
-
-
-def input_bam_paths():
-    data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsViralMix')
-    return join(data_dir, 'test-reads.bam')
-
-
 @pytest.fixture(scope='session')
 def input_bam(request, tmpdir_factory, fastq_to_sam, db_type):
     data_dir = join(util.file.get_test_input_path(), db_type)
@@ -55,25 +39,8 @@ def input_bam(request, tmpdir_factory, fastq_to_sam, db_type):
         fastq_to_sam.execute(fastqs[0], fastqs[1], '', bam)
         return bam
 
-    return input_bam_paths()
-
-
-@pytest.fixture(scope='session')
-def input_fastqs(request, tmpdir_factory, sam_to_fastq, db_type):
-    data_dir = join(util.file.get_test_input_path(), db_type)
-    if db_type == 'TestMetagenomicsSimple':
-        fastqs = [join(data_dir, f)
-                  for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']]
-        return fastqs
-
-    bam = join(data_dir, 'test-reads.bam')
-    basename = os.path.basename(bam)
-    fastq1 = join(str(tmpdir_factory.getbasetemp()),
-                  '{}.1.fastq'.format(basename))
-    fastq2 = join(str(tmpdir_factory.getbasetemp()),
-                  '{}.2.fastq'.format(basename))
-    sam_to_fastq.execute(bam, fastq1, fastq2)
-    return fastq1, fastq2
+    data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsViralMix')
+    return join(data_dir, 'test-reads.bam')
 
 
 @pytest.fixture(scope='session')
@@ -138,18 +105,16 @@ def krona_db(request, tmpdir_factory, krona, db_type):
     return db
 
 
-def test_kraken_tool(tmpdir, kraken, kraken_db, input_fastqs):
+def test_kraken_tool(tmpdir, kraken, kraken_db, input_bam):
     outdir = tempfile.mkdtemp('-kraken')
     out = join(outdir, 'zaire_ebola.kraken')
     out_filtered = join(outdir, 'zaire_ebola.filtered-kraken')
     out_report = join(outdir, 'zaire_ebola.kraken-report')
-    assert kraken.classify(kraken_db, input_fastqs, out).returncode == 0
-    result = kraken.execute(
-        'kraken-filter', kraken_db, out_filtered, [out],
-        options={'--threshold': 0.05})
+    result = kraken.classify(input_bam, kraken_db, out)
+    assert result.returncode == 0
+    result = kraken.filter(out, kraken_db, out_filtered, 0.05)
     assert result.returncode == 0
-    result = kraken.execute(
-        'kraken-report', kraken_db, out_report, [out_filtered])
+    result = kraken.report(out_filtered, kraken_db, out_report)
     assert result.returncode == 0
 
     assert os.path.getsize(out_report) > 0

diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py
@@ -41,7 +41,7 @@ class TestAssembleTrinity(TestCaseWithTmp):
 
     def test_execution(self):
         inDir = util.file.get_test_input_path(self)
-        inBam = os.path.join(inDir, 'G3952.1.subsamp.bam')
+        inBam = os.path.join(inDir, '..', 'G5012.3.subset.bam')
         clipDb = os.path.join(inDir, 'clipDb.fasta')
         outFasta = util.file.mkstempfname('.fasta')
         assembly.assemble_trinity(inBam, outFasta, clipDb, threads=4)
@@ -229,47 +229,6 @@ def test_small_mummer(self):
             str(Bio.SeqIO.read(outFasta, 'fasta').seq),
             str(Bio.SeqIO.read(expected, 'fasta').seq))
 
-class TestRefineAssembly(TestCaseWithTmp):
-    def test_ebov_refine1(self):
-        inDir = util.file.get_test_input_path(self)
-        inFasta = os.path.join(inDir, 'impute.ebov.fasta')
-        imputeFasta = util.file.mkstempfname('.imputed.fasta')
-        refine1Fasta = util.file.mkstempfname('.refine1.fasta')
-        shutil.copy(inFasta, imputeFasta)
-        tools.picard.CreateSequenceDictionaryTool().execute(imputeFasta, overwrite=True)
-        tools.novoalign.NovoalignTool().index_fasta(imputeFasta)
-        assembly.refine_assembly(
-            imputeFasta,
-            os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam'),
-            refine1Fasta,
-            # normally -r Random, but for unit tests, we want deterministic behavior
-            novo_params='-r None -l 30 -x 20 -t 502',
-            min_coverage=2,
-            threads=4)
-        actual = str(Bio.SeqIO.read(refine1Fasta, 'fasta').seq)
-        expected = str(Bio.SeqIO.read(os.path.join(inDir, 'expected.ebov.refine1.fasta'), 'fasta').seq)
-        self.assertEqual(actual, expected)
-
-    def test_ebov_refine2(self):
-        inDir = util.file.get_test_input_path(self)
-        inFasta = os.path.join(inDir, 'expected.ebov.refine1.fasta')
-        refine1Fasta = util.file.mkstempfname('.refine1.fasta')
-        refine2Fasta = util.file.mkstempfname('.refine2.fasta')
-        shutil.copy(inFasta, refine1Fasta)
-        tools.picard.CreateSequenceDictionaryTool().execute(refine1Fasta, overwrite=True)
-        tools.novoalign.NovoalignTool().index_fasta(refine1Fasta)
-        assembly.refine_assembly(
-            refine1Fasta,
-            os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam'),
-            refine2Fasta,
-            # normally -r Random, but for unit tests, we want deterministic behavior
-            novo_params='-r None -l 40 -x 20 -t 100',
-            min_coverage=3,
-            threads=4)
-        actual = str(Bio.SeqIO.read(refine2Fasta, 'fasta').seq)
-        expected = str(Bio.SeqIO.read(os.path.join(inDir, 'expected.ebov.refine2.fasta'), 'fasta').seq)
-        self.assertEqual(actual, expected)
-
 
 class TestMutableSequence(unittest.TestCase):
     ''' Test the MutableSequence class '''

diff --git a/test/unit/test_intrahost.py b/test/unit/test_intrahost.py
@@ -1,6 +1,6 @@
 # Unit tests for intrahost.py
 
-__author__ = "PLACEHOLDER"
+__author__ = "dpark@broadinstitute.org"
 
 # built-ins
 from collections import OrderedDict
@@ -130,7 +130,7 @@ def __iter__(self):
                            for acount in acounts]
 
 
-class TestPerSample(test.TestCaseWithTmp):
+class TestIntrahostFilters(unittest.TestCase):
     ''' This tests step 1 of the iSNV calling process
         (intrahost.vphaser_one_sample), which runs V-Phaser2 on
         a single sample, reformats the output slightly, and performs
@@ -151,6 +151,13 @@ def test_single_strand_bias_hard_filter(self):
         self.assertAlmostEqual(float(output[0][6]), expected[6], places=4)
         self.assertEqual(output[0][7:], expected[7:])
 
+class TestPerSample(test.TestCaseWithTmp):
+    ''' This tests step 1 of the iSNV calling process
+        (intrahost.vphaser_one_sample), which runs V-Phaser2 on
+        a single sample, reformats the output slightly, and performs
+        strand-bias filtering and adds library-bias statistics.
+    '''
+
     def test_vphaser_one_sample(self):
         # Files here were created as follows:
         # - in.bam was copied from input directory for TestVPhaser2; see notes
@@ -172,52 +179,6 @@ def test_vphaser_one_sample(self):
         expected = os.path.join(myInputDir, 'vphaser_one_sample_expected.txt')
         self.assertEqualContents(outTab, expected)
 
-    def test_vphaser_one_sample_indels(self):
-        # Files here were created as follows:
-        # ref.indels.fasta is Seed-stock-137_S2_L001_001.fasta
-        # in.indels.bam was created from Seed-stock-137_S2_L001_001.mapped.bam
-        #     as follows:
-        # Created two .sam files using samtools view, restricting to ranges
-        # 6811-7011 and 13081-13281, respectively. Paired reads were removed
-        # from those files by throwing out the second occurence of any read name
-        # and anding the flag fields with 16. Then, a random 90% of reads were
-        # removed, except that any reads containing the indel variants at
-        # 6911 and 13181 were kept. Then the resulting 2 files were combined.
-        myInputDir = util.file.get_test_input_path(self)
-        inBam = os.path.join(myInputDir, 'in.indels.bam')
-        refFasta = os.path.join(myInputDir, 'ref.indels.fasta')
-        outTab = util.file.mkstempfname('.txt')
-        intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=0)
-        expected = os.path.join(myInputDir, 'vphaser_one_sample_indels_expected.txt')
-        self.assertEqualContents(outTab, expected)
-
-    def test_vphaser_one_sample_2libs(self):
-        # in.2libs.bam was created by "manually" editing in.bam and moving about
-        # 1/3 of the reads to ReadGroup2.
-        myInputDir = util.file.get_test_input_path(self)
-        inBam = os.path.join(myInputDir, 'in.2libs.bam')
-        refFasta = os.path.join(myInputDir, 'ref.fasta')
-        outTab = util.file.mkstempfname('.txt')
-        intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=6, maxBias=3)
-        expected = os.path.join(myInputDir, 'vphaser_one_sample_2libs_expected.txt')
-        self.assertEqualContents(outTab, expected)
-
-    def test_vphaser_one_sample_3libs_and_chi2(self):
-        # In addition to testing that we can handle 3 libraries, this is testing
-        #    the chi2_contingency approximation to fisher_exact. The 4th, 5th,
-        #    and 6th rows have large enough minor allele count that their
-        #    p-values are calculated using the chi2 approximation. The other
-        #    rows are testing the 2 x 3 case of fisher_exact.
-        # in.3libs.bam was created by "manually" editing in.2libs.bam and moving
-        # about 1/2 of the reads in ReadGroup2 to ReadGroup3.
-        myInputDir = util.file.get_test_input_path(self)
-        inBam = os.path.join(myInputDir, 'in.3libs.bam')
-        refFasta = os.path.join(myInputDir, 'ref.fasta')
-        outTab = util.file.mkstempfname('.txt')
-        intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=6, maxBias=3)
-        expected = os.path.join(myInputDir, 'vphaser_one_sample_3libs_expected.txt')
-        self.assertEqualContents(outTab, expected)
-
 
 class VcfMergeRunner:
     ''' This creates test data and feeds it to intrahost.merge_to_vcf