Skip to content

Commit

Permalink
Merge pull request #338 from broadinstitute/dp-empty-data
Browse files Browse the repository at this point in the history
refactor tests and some kraken
  • Loading branch information
dpark01 committed Jun 13, 2016
2 parents 887c5d3 + 569331a commit 8062715
Show file tree
Hide file tree
Showing 10 changed files with 247 additions and 272 deletions.
35 changes: 9 additions & 26 deletions metagenomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,46 +412,29 @@ def kraken(inBam, db, outReport=None, outReads=None,
filterThreshold=None, numThreads=1):
assert outReads or outReport, (
'Either --outReads or --outReport must be specified.')

tmp_fastq1 = util.file.mkstempfname('.1.fastq')
tmp_fastq2 = util.file.mkstempfname('.2.fastq')
picard = tools.picard.SamToFastqTool()
picard_opts = {
'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
'CLIPPING_ACTION': 'X'
}
picard.execute(inBam, tmp_fastq1, tmp_fastq2,
picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
JVMmemory=picard.jvmMemDefault)

kraken_tool = tools.kraken.Kraken()

# kraken classify
tmp_reads = util.file.mkstempfname('.kraken')
opts = {
'--paired': None,
'--threads': min(int(numThreads), util.misc.available_cpu_count()),
}
# Could be optimized in 3.5 piping directly to kraken-filter.
kraken_tool.classify(db, [tmp_fastq1, tmp_fastq2], tmp_reads, options=opts)
kraken_tool.classify(inBam, db, tmp_reads, numThreads=numThreads)

# kraken filter
if filterThreshold:
opts = {
'--threshold': filterThreshold,
}

tmp_filtered_reads = util.file.mkstempfname('.filtered-kraken')
kraken_tool.execute('kraken-filter', db, tmp_filtered_reads, args=[tmp_reads],
options=opts)
kraken_tool.filter(tmp_reads, db, tmp_filtered_reads, filterThreshold)
os.unlink(tmp_reads)
else:
tmp_filtered_reads = tmp_reads

# copy outReads
if outReads:
with open(tmp_filtered_reads, 'rb') as f_in:
with gzip.open(outReads, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)

# kraken report
if outReport:
kraken_tool.execute('kraken-report', db, outReport,
args=[tmp_filtered_reads])
kraken_tool.report(tmp_filtered_reads, db, outReport)


def krona(inTsv, db, outHtml, queryColumn=None, taxidColumn=None,
Expand Down
Binary file added test/input/empty.bam
Binary file not shown.
42 changes: 42 additions & 0 deletions test/integration/test_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,45 @@ def test_ref_assisted_assembly(self):

# in order to test the actual de novo pipeline, we need to add a clip db for trimmomatic
# then we should test from G5012.3.testreads.bam all the way through the assembly pipe

class TestRefineAssembly(TestCaseWithTmp):
def test_ebov_refine1(self):
inDir = util.file.get_test_input_path(self)
inFasta = os.path.join(inDir, 'impute.ebov.fasta')
imputeFasta = util.file.mkstempfname('.imputed.fasta')
refine1Fasta = util.file.mkstempfname('.refine1.fasta')
shutil.copy(inFasta, imputeFasta)
tools.picard.CreateSequenceDictionaryTool().execute(imputeFasta, overwrite=True)
tools.novoalign.NovoalignTool().index_fasta(imputeFasta)
assembly.refine_assembly(
imputeFasta,
os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam'),
refine1Fasta,
# normally -r Random, but for unit tests, we want deterministic behavior
novo_params='-r None -l 30 -x 20 -t 502',
min_coverage=2,
threads=4)
actual = str(Bio.SeqIO.read(refine1Fasta, 'fasta').seq)
expected = str(Bio.SeqIO.read(os.path.join(inDir, 'expected.ebov.refine1.fasta'), 'fasta').seq)
self.assertEqual(actual, expected)

def test_ebov_refine2(self):
inDir = util.file.get_test_input_path(self)
inFasta = os.path.join(inDir, 'expected.ebov.refine1.fasta')
refine1Fasta = util.file.mkstempfname('.refine1.fasta')
refine2Fasta = util.file.mkstempfname('.refine2.fasta')
shutil.copy(inFasta, refine1Fasta)
tools.picard.CreateSequenceDictionaryTool().execute(refine1Fasta, overwrite=True)
tools.novoalign.NovoalignTool().index_fasta(refine1Fasta)
assembly.refine_assembly(
refine1Fasta,
os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam'),
refine2Fasta,
# normally -r Random, but for unit tests, we want deterministic behavior
novo_params='-r None -l 40 -x 20 -t 100',
min_coverage=3,
threads=4)
actual = str(Bio.SeqIO.read(refine2Fasta, 'fasta').seq)
expected = str(Bio.SeqIO.read(os.path.join(inDir, 'expected.ebov.refine2.fasta'), 'fasta').seq)
self.assertEqual(actual, expected)

80 changes: 80 additions & 0 deletions test/integration/test_intrahost.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Unit tests for intrahost.py

__author__ = "dpark@broadinstitute.org"

# built-ins
from collections import OrderedDict
import os
import os.path
import shutil
import tempfile
import itertools
import argparse
import unittest

# third-party
import Bio
import Bio.SeqRecord
import Bio.Seq

# module-specific
import intrahost
import util.file
import util.vcf
import test
from intrahost import AlleleFieldParser
import interhost
import tools.mafft

class TestPerSample(test.TestCaseWithTmp):
''' This tests step 1 of the iSNV calling process
(intrahost.vphaser_one_sample), which runs V-Phaser2 on
a single sample, reformats the output slightly, and performs
strand-bias filtering and adds library-bias statistics.
'''

def test_vphaser_one_sample_indels(self):
# Files here were created as follows:
# ref.indels.fasta is Seed-stock-137_S2_L001_001.fasta
# in.indels.bam was created from Seed-stock-137_S2_L001_001.mapped.bam
# as follows:
# Created two .sam files using samtools view, restricting to ranges
# 6811-7011 and 13081-13281, respectively. Paired reads were removed
# from those files by throwing out the second occurence of any read name
# and anding the flag fields with 16. Then, a random 90% of reads were
# removed, except that any reads containing the indel variants at
# 6911 and 13181 were kept. Then the resulting 2 files were combined.
myInputDir = util.file.get_test_input_path(self)
inBam = os.path.join(myInputDir, 'in.indels.bam')
refFasta = os.path.join(myInputDir, 'ref.indels.fasta')
outTab = util.file.mkstempfname('.txt')
intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=0)
expected = os.path.join(myInputDir, 'vphaser_one_sample_indels_expected.txt')
self.assertEqualContents(outTab, expected)

def test_vphaser_one_sample_2libs(self):
# in.2libs.bam was created by "manually" editing in.bam and moving about
# 1/3 of the reads to ReadGroup2.
myInputDir = util.file.get_test_input_path(self)
inBam = os.path.join(myInputDir, 'in.2libs.bam')
refFasta = os.path.join(myInputDir, 'ref.fasta')
outTab = util.file.mkstempfname('.txt')
intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=6, maxBias=3)
expected = os.path.join(myInputDir, 'vphaser_one_sample_2libs_expected.txt')
self.assertEqualContents(outTab, expected)

def test_vphaser_one_sample_3libs_and_chi2(self):
# In addition to testing that we can handle 3 libraries, this is testing
# the chi2_contingency approximation to fisher_exact. The 4th, 5th,
# and 6th rows have large enough minor allele count that their
# p-values are calculated using the chi2 approximation. The other
# rows are testing the 2 x 3 case of fisher_exact.
# in.3libs.bam was created by "manually" editing in.2libs.bam and moving
# about 1/2 of the reads in ReadGroup2 to ReadGroup3.
myInputDir = util.file.get_test_input_path(self)
inBam = os.path.join(myInputDir, 'in.3libs.bam')
refFasta = os.path.join(myInputDir, 'ref.fasta')
outTab = util.file.mkstempfname('.txt')
intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=6, maxBias=3)
expected = os.path.join(myInputDir, 'vphaser_one_sample_3libs_expected.txt')
self.assertEqualContents(outTab, expected)
49 changes: 7 additions & 42 deletions test/integration/test_kraken.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,6 @@ def fastq_to_sam():
return tools.picard.FastqToSamTool()


@pytest.fixture(scope='session')
def sam_to_fastq():
return tools.picard.SamToFastqTool()


def input_fastq_paths():
data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsSimple')
return [os.path.join(data_dir, f)
for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']]


def input_bam_paths():
data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsViralMix')
return join(data_dir, 'test-reads.bam')


@pytest.fixture(scope='session')
def input_bam(request, tmpdir_factory, fastq_to_sam, db_type):
data_dir = join(util.file.get_test_input_path(), db_type)
Expand All @@ -55,25 +39,8 @@ def input_bam(request, tmpdir_factory, fastq_to_sam, db_type):
fastq_to_sam.execute(fastqs[0], fastqs[1], '', bam)
return bam

return input_bam_paths()


@pytest.fixture(scope='session')
def input_fastqs(request, tmpdir_factory, sam_to_fastq, db_type):
data_dir = join(util.file.get_test_input_path(), db_type)
if db_type == 'TestMetagenomicsSimple':
fastqs = [join(data_dir, f)
for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']]
return fastqs

bam = join(data_dir, 'test-reads.bam')
basename = os.path.basename(bam)
fastq1 = join(str(tmpdir_factory.getbasetemp()),
'{}.1.fastq'.format(basename))
fastq2 = join(str(tmpdir_factory.getbasetemp()),
'{}.2.fastq'.format(basename))
sam_to_fastq.execute(bam, fastq1, fastq2)
return fastq1, fastq2
data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsViralMix')
return join(data_dir, 'test-reads.bam')


@pytest.fixture(scope='session')
Expand Down Expand Up @@ -138,18 +105,16 @@ def krona_db(request, tmpdir_factory, krona, db_type):
return db


def test_kraken_tool(tmpdir, kraken, kraken_db, input_fastqs):
def test_kraken_tool(tmpdir, kraken, kraken_db, input_bam):
outdir = tempfile.mkdtemp('-kraken')
out = join(outdir, 'zaire_ebola.kraken')
out_filtered = join(outdir, 'zaire_ebola.filtered-kraken')
out_report = join(outdir, 'zaire_ebola.kraken-report')
assert kraken.classify(kraken_db, input_fastqs, out).returncode == 0
result = kraken.execute(
'kraken-filter', kraken_db, out_filtered, [out],
options={'--threshold': 0.05})
result = kraken.classify(input_bam, kraken_db, out)
assert result.returncode == 0
result = kraken.filter(out, kraken_db, out_filtered, 0.05)
assert result.returncode == 0
result = kraken.execute(
'kraken-report', kraken_db, out_report, [out_filtered])
result = kraken.report(out_filtered, kraken_db, out_report)
assert result.returncode == 0

assert os.path.getsize(out_report) > 0
Expand Down
43 changes: 1 addition & 42 deletions test/unit/test_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class TestAssembleTrinity(TestCaseWithTmp):

def test_execution(self):
inDir = util.file.get_test_input_path(self)
inBam = os.path.join(inDir, 'G3952.1.subsamp.bam')
inBam = os.path.join(inDir, '..', 'G5012.3.subset.bam')
clipDb = os.path.join(inDir, 'clipDb.fasta')
outFasta = util.file.mkstempfname('.fasta')
assembly.assemble_trinity(inBam, outFasta, clipDb, threads=4)
Expand Down Expand Up @@ -229,47 +229,6 @@ def test_small_mummer(self):
str(Bio.SeqIO.read(outFasta, 'fasta').seq),
str(Bio.SeqIO.read(expected, 'fasta').seq))

class TestRefineAssembly(TestCaseWithTmp):
def test_ebov_refine1(self):
inDir = util.file.get_test_input_path(self)
inFasta = os.path.join(inDir, 'impute.ebov.fasta')
imputeFasta = util.file.mkstempfname('.imputed.fasta')
refine1Fasta = util.file.mkstempfname('.refine1.fasta')
shutil.copy(inFasta, imputeFasta)
tools.picard.CreateSequenceDictionaryTool().execute(imputeFasta, overwrite=True)
tools.novoalign.NovoalignTool().index_fasta(imputeFasta)
assembly.refine_assembly(
imputeFasta,
os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam'),
refine1Fasta,
# normally -r Random, but for unit tests, we want deterministic behavior
novo_params='-r None -l 30 -x 20 -t 502',
min_coverage=2,
threads=4)
actual = str(Bio.SeqIO.read(refine1Fasta, 'fasta').seq)
expected = str(Bio.SeqIO.read(os.path.join(inDir, 'expected.ebov.refine1.fasta'), 'fasta').seq)
self.assertEqual(actual, expected)

def test_ebov_refine2(self):
inDir = util.file.get_test_input_path(self)
inFasta = os.path.join(inDir, 'expected.ebov.refine1.fasta')
refine1Fasta = util.file.mkstempfname('.refine1.fasta')
refine2Fasta = util.file.mkstempfname('.refine2.fasta')
shutil.copy(inFasta, refine1Fasta)
tools.picard.CreateSequenceDictionaryTool().execute(refine1Fasta, overwrite=True)
tools.novoalign.NovoalignTool().index_fasta(refine1Fasta)
assembly.refine_assembly(
refine1Fasta,
os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam'),
refine2Fasta,
# normally -r Random, but for unit tests, we want deterministic behavior
novo_params='-r None -l 40 -x 20 -t 100',
min_coverage=3,
threads=4)
actual = str(Bio.SeqIO.read(refine2Fasta, 'fasta').seq)
expected = str(Bio.SeqIO.read(os.path.join(inDir, 'expected.ebov.refine2.fasta'), 'fasta').seq)
self.assertEqual(actual, expected)


class TestMutableSequence(unittest.TestCase):
''' Test the MutableSequence class '''
Expand Down
57 changes: 9 additions & 48 deletions test/unit/test_intrahost.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Unit tests for intrahost.py

__author__ = "PLACEHOLDER"
__author__ = "dpark@broadinstitute.org"

# built-ins
from collections import OrderedDict
Expand Down Expand Up @@ -130,7 +130,7 @@ def __iter__(self):
for acount in acounts]


class TestPerSample(test.TestCaseWithTmp):
class TestIntrahostFilters(unittest.TestCase):
''' This tests step 1 of the iSNV calling process
(intrahost.vphaser_one_sample), which runs V-Phaser2 on
a single sample, reformats the output slightly, and performs
Expand All @@ -151,6 +151,13 @@ def test_single_strand_bias_hard_filter(self):
self.assertAlmostEqual(float(output[0][6]), expected[6], places=4)
self.assertEqual(output[0][7:], expected[7:])

class TestPerSample(test.TestCaseWithTmp):
''' This tests step 1 of the iSNV calling process
(intrahost.vphaser_one_sample), which runs V-Phaser2 on
a single sample, reformats the output slightly, and performs
strand-bias filtering and adds library-bias statistics.
'''

def test_vphaser_one_sample(self):
# Files here were created as follows:
# - in.bam was copied from input directory for TestVPhaser2; see notes
Expand All @@ -172,52 +179,6 @@ def test_vphaser_one_sample(self):
expected = os.path.join(myInputDir, 'vphaser_one_sample_expected.txt')
self.assertEqualContents(outTab, expected)

def test_vphaser_one_sample_indels(self):
# Files here were created as follows:
# ref.indels.fasta is Seed-stock-137_S2_L001_001.fasta
# in.indels.bam was created from Seed-stock-137_S2_L001_001.mapped.bam
# as follows:
# Created two .sam files using samtools view, restricting to ranges
# 6811-7011 and 13081-13281, respectively. Paired reads were removed
# from those files by throwing out the second occurence of any read name
# and anding the flag fields with 16. Then, a random 90% of reads were
# removed, except that any reads containing the indel variants at
# 6911 and 13181 were kept. Then the resulting 2 files were combined.
myInputDir = util.file.get_test_input_path(self)
inBam = os.path.join(myInputDir, 'in.indels.bam')
refFasta = os.path.join(myInputDir, 'ref.indels.fasta')
outTab = util.file.mkstempfname('.txt')
intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=0)
expected = os.path.join(myInputDir, 'vphaser_one_sample_indels_expected.txt')
self.assertEqualContents(outTab, expected)

def test_vphaser_one_sample_2libs(self):
# in.2libs.bam was created by "manually" editing in.bam and moving about
# 1/3 of the reads to ReadGroup2.
myInputDir = util.file.get_test_input_path(self)
inBam = os.path.join(myInputDir, 'in.2libs.bam')
refFasta = os.path.join(myInputDir, 'ref.fasta')
outTab = util.file.mkstempfname('.txt')
intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=6, maxBias=3)
expected = os.path.join(myInputDir, 'vphaser_one_sample_2libs_expected.txt')
self.assertEqualContents(outTab, expected)

def test_vphaser_one_sample_3libs_and_chi2(self):
# In addition to testing that we can handle 3 libraries, this is testing
# the chi2_contingency approximation to fisher_exact. The 4th, 5th,
# and 6th rows have large enough minor allele count that their
# p-values are calculated using the chi2 approximation. The other
# rows are testing the 2 x 3 case of fisher_exact.
# in.3libs.bam was created by "manually" editing in.2libs.bam and moving
# about 1/2 of the reads in ReadGroup2 to ReadGroup3.
myInputDir = util.file.get_test_input_path(self)
inBam = os.path.join(myInputDir, 'in.3libs.bam')
refFasta = os.path.join(myInputDir, 'ref.fasta')
outTab = util.file.mkstempfname('.txt')
intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=6, maxBias=3)
expected = os.path.join(myInputDir, 'vphaser_one_sample_3libs_expected.txt')
self.assertEqualContents(outTab, expected)


class VcfMergeRunner:
''' This creates test data and feeds it to intrahost.merge_to_vcf
Expand Down
Loading

0 comments on commit 8062715

Please sign in to comment.