Skip to content

Commit

Permalink
Merge github.com:broadinstitute/viral-ngs into sy-pytest
Browse files Browse the repository at this point in the history
  • Loading branch information
yesimon committed May 4, 2016
2 parents 5fa0345 + 45cc8ed commit 2512d7c
Show file tree
Hide file tree
Showing 40 changed files with 1,194 additions and 71 deletions.
5 changes: 4 additions & 1 deletion reports.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python
''' Reports
''' Functions to create reports from genomics pipeline data.
'''

__author__ = "dpark@broadinstitute.org"
Expand Down Expand Up @@ -166,6 +166,9 @@ def parser_assembly_stats(parser=argparse.ArgumentParser()):


def alignment_summary(inFastaFileOne, inFastaFileTwo, outfileName=None, printCounts=False):
""" Write or print pairwise alignment summary information for sequences in two FASTA
files, including SNPs, ambiguous bases, and indels.
"""
gap = '-'
ambiguous = 'N'
aligner = tools.muscle.MuscleTool()
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Expand Up @@ -3,6 +3,6 @@

biopython==1.64
future>=0.15.2
pysam==0.8.1
pytest==2.9.1
pysam==0.8.3
PyYAML==3.11
87 changes: 77 additions & 10 deletions taxon_filter.py
Expand Up @@ -261,10 +261,10 @@ def filter_lastal_bam(
inBam,
db,
outBam,
max_gapless_alignments_per_position,
min_length_for_initial_matches,
max_length_for_initial_matches,
max_initial_matches_per_position,
max_gapless_alignments_per_position=1,
min_length_for_initial_matches=5,
max_length_for_initial_matches=50,
max_initial_matches_per_position=100,
JVMmemory=None
):
''' Restrict input reads to those that align to the given
Expand Down Expand Up @@ -329,7 +329,7 @@ def filter_lastal(
max_initial_matches_per_position=100
):
''' Restrict input reads to those that align to the given
reference database using LASTAL. Also, remove duplicates with prinseq.
reference database using LASTAL. Also, remove duplicates with prinseq.
'''
assert outFastq.endswith('.fastq')
tempFilePath = mkstempfname('.hits')
Expand Down Expand Up @@ -736,7 +736,7 @@ def no_blast_hits(blastOutCombined, inFastq, outFastq):
outf.write(line1 + line2 + line3 + line4)


def deplete_blastn(inFastq, outFastq, refDbs, threads):
def deplete_blastn(inFastq, outFastq, refDbs, threads=1, chunkSize=1000000):
'Use blastn to remove reads that match at least one of the databases.'

# Convert to fasta
Expand All @@ -747,7 +747,7 @@ def deplete_blastn(inFastq, outFastq, refDbs, threads):
blastOutFiles = []
for db in refDbs:
log.info("running blastn on %s against %s", inFastq, db)
blastOutFiles += blastn_chunked_fasta(inFasta, db, threads)
blastOutFiles += blastn_chunked_fasta(inFasta, db, chunkSize, threads)

# Combine results from different databases
blastOutCombined = mkstempfname('.txt')
Expand Down Expand Up @@ -846,7 +846,7 @@ def deplete_blastn_bam(inBam, db, outBam, threads, chunkSize=1000000, JVMmemory=
os.unlink(fastq1)
os.unlink(fastq2)
log.info("running blastn on %s pair 2 against %s", inBam, db)
blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize)
blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize, threads)
with open(blast_hits, 'wt') as outf:
for blastOutFile in blastOutFiles:
with open(blastOutFile, 'rt') as inf:
Expand Down Expand Up @@ -899,15 +899,15 @@ def wrapper(inBam, db, outBam, threads, JVMmemory=None):


def lastal_build_db(inputFasta, outputDirectory, outputFilePrefix):
    ''' build a database for use with last based on an input fasta file '''
    # Use the caller-supplied prefix when given; otherwise derive it from
    # the input file's name with the extension removed.
    outPrefix = outputFilePrefix or os.path.splitext(os.path.basename(inputFasta))[0]
    tools.last.Lastdb().build_database(inputFasta, os.path.join(outputDirectory, outPrefix))


def parser_lastal_build_db(parser=argparse.ArgumentParser()):
Expand All @@ -924,6 +924,73 @@ def parser_lastal_build_db(parser=argparse.ArgumentParser()):

__commands__.append(('lastal_build_db', parser_lastal_build_db))


# ========================
# *** blastn_build_db ***
# ========================

def blastn_build_db(inputFasta, outputDirectory, outputFilePrefix):
    """ Create a database for use with blastn from an input reference FASTA file.

        inputFasta       - the FASTA file to index
        outputDirectory  - directory in which the blast db files are written
        outputFilePrefix - optional basename for the db files; when falsy,
                           the input file's name (sans extension) is used
    """
    if outputFilePrefix:
        outPrefix = outputFilePrefix
    else:
        # default: "ref.fasta" -> "ref"
        outPrefix = os.path.splitext(os.path.basename(inputFasta))[0]

    # build_database returns the db path, but this command is invoked for
    # its side effect only, so the return value is deliberately discarded
    tools.blast.MakeblastdbTool().build_database(inputFasta, os.path.join(outputDirectory, outPrefix))


def parser_blastn_build_db(parser=argparse.ArgumentParser()):
    """ Build the argument parser for the blastn_build_db command. """
    positional_args = (
        ('inputFasta', 'Location of the input FASTA file'),
        ('outputDirectory', 'Location for the output files'),
    )
    for arg_name, arg_help in positional_args:
        parser.add_argument(arg_name, help=arg_help)
    parser.add_argument(
        '--outputFilePrefix',
        help='Prefix for the output file name (default: inputFasta name, sans ".fasta" extension)'
    )
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, blastn_build_db, split_args=True)
    return parser


__commands__.append(('blastn_build_db', parser_blastn_build_db))


# ========================
# *** bmtagger_build_db ***
# ========================

def bmtagger_build_db(inputFasta, outputDirectory, outputFilePrefix):
    """ Create a database for use with Bmtagger from an input FASTA file.

        inputFasta       - the FASTA file to index
        outputDirectory  - directory in which the *.bitmask and *.srprism
                           files are written
        outputFilePrefix - optional basename for the db files; when falsy,
                           the input file's name (sans extension) is used
    """
    if outputFilePrefix:
        outPrefix = outputFilePrefix
    else:
        # default: "ref.fasta" -> "ref"
        outPrefix = os.path.splitext(os.path.basename(inputFasta))[0]

    # bmtagger requires two companion indexes: a bmtool bitmask and an
    # srprism index; the returned paths are not needed by callers
    tools.bmtagger.BmtoolTool().build_database(inputFasta, os.path.join(outputDirectory, outPrefix + ".bitmask"))
    tools.bmtagger.SrprismTool().build_database(inputFasta, os.path.join(outputDirectory, outPrefix + ".srprism"))

def parser_bmtagger_build_db(parser=argparse.ArgumentParser()):
    """ Build the argument parser for the bmtagger_build_db command. """
    parser.add_argument('inputFasta',
                        help='Location of the input FASTA file')
    parser.add_argument('outputDirectory',
                        help='Location for the output files (Where *.bitmask and *.srprism files will be stored)')
    parser.add_argument('--outputFilePrefix',
                        help='Prefix for the output file name (default: inputFasta name, sans ".fasta" extension)')
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, bmtagger_build_db, split_args=True)
    return parser


__commands__.append(('bmtagger_build_db', parser_bmtagger_build_db))



# ========================


Expand Down
53 changes: 52 additions & 1 deletion test/__init__.py
Expand Up @@ -2,16 +2,67 @@

__author__ = "irwin@broadinstitute.org"

# built-ins
import filecmp
import os
import unittest
import util.file
import hashlib

# intra-project
import util.file
from tools.samtools import SamtoolsTool

def assert_equal_contents(testCase, filename1, filename2):
    """ Fail the given unittest.TestCase unless both files have identical contents. """
    files_match = filecmp.cmp(filename1, filename2, shallow=False)
    testCase.assertTrue(files_match)

def assert_equal_bam_reads(testCase, bam_filename1, bam_filename2):
    ''' Assert that two bam files contain equivalent reads.

        Each input is converted to headerless sam (plaintext) before
        comparison, since headers can legitimately vary between files.
        Test data should be stored in bam rather than sam format to save
        space and to guarantee both files go through the same bam->sam
        conversion.
    '''
    samtools = SamtoolsTool()
    sam_pair = [util.file.mkstempfname(".sam") for _ in range(2)]

    # dump each bam to sam format without the header (no -h flag)
    for bam, sam in zip((bam_filename1, bam_filename2), sam_pair):
        samtools.view(args=[], inFile=bam, outFile=sam)

    try:
        testCase.assertTrue(filecmp.cmp(sam_pair[0], sam_pair[1], shallow=False))
    finally:
        # always remove the temporary sam files, even on assertion failure
        for fname in sam_pair:
            if os.path.exists(fname):
                os.unlink(fname)

def assert_md5_equal_to_line_in_file(testCase, filename, checksum_file):
    ''' Assert that the md5 checksum of a test file matches the expected
        checksum stored on the first line of a second file, e.g. compare
        md5(test_output.bam) with the contents of expected_output.bam.md5
    '''
    # hash the file in fixed-size chunks so large files do not need to be
    # held in memory all at once
    hash_md5 = hashlib.md5()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)

    # the expected value is the first line of the checksum file; reading in
    # text mode plus strip() handles \r, \n, and stray surrounding whitespace
    # (the original binary read + decode + chained replace() was equivalent
    # but roundabout, and missed leading/trailing spaces)
    with open(checksum_file, "rt") as f:
        expected_checksum = f.readline().strip()

    # raise rather than assert: bare asserts are stripped under `python -O`,
    # which would silently skip this sanity check
    if not expected_checksum:
        raise ValueError("no checksum found in %s" % checksum_file)

    testCase.assertEqual(hash_md5.hexdigest(), expected_checksum)

class TestCaseWithTmp(unittest.TestCase):
'Base class for tests that use tempDir'
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,21 @@
20
Bundibugyo_ebolavirus,_complete_genome
Bundibugyo_ebolavirus_isolate_EboBund-14_2012,_complete_genome
Cote_d'Ivoire_ebolavirus,_complete_genome
Reston_ebolavirus_isolate_RESTV/M.fascicularis-tc/PHL-USA/1996/Ferlite,_Philippines/Alice,_TX,_complete_genome
Reston_ebolavirus_isolate_RESTV/Sus-wt/PHL/2009/09A_Farm_A,_complete_genome
Reston_ebolavirus_-_Reston_strain_Reston08-C,_complete_genome
Reston_ebolavirus_-_Reston_strain_Reston08-E,_complete_genome
Reston_Ebola_virus_strain_Pennsylvania,_complete_genome
Sudan_ebolavirus_isolate_EboSud-609_2012,_complete_genome
Sudan_ebolavirus_isolate_EboSud-639,_complete_genome
Sudan_ebolavirus_isolate_EBOV-S-2004_from_Sudan,_complete_genome
Sudan_ebolavirus_-_Nakisamata,_complete_genome
Sudan_ebolavirus_strain_Gulu,_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/COD/1977/Bonduni,_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/COD/2007/43_Luebo,_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/1Eko,_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/2Nza,_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/Ilembe,_complete_genome
Zaire_ebolavirus_isolate_H.sapiens-wt/GIN/2014/Gueckedou-C05,_complete_genome
Zaire_ebolavirus_strain_Zaire_1995,_complete_genome
@@ -0,0 +1 @@
7ecc55c01c27f9569f3926587d5b1743
Binary file not shown.
Empty file.
Binary file not shown.
@@ -0,0 +1 @@
R
Binary file not shown.
85 changes: 85 additions & 0 deletions test/input/TestDepleteHuman/5kb_human_from_chr6.fasta
@@ -0,0 +1,85 @@
>human_5000bp random human subsequence
GAAGTTTGGATTATGATATGCTCATTAATTATAAAACAGACTTCAGTAGTCAGAGAATAG
TACACATAGGCCATTGAGCAAGTAGGAAATTAAAACAAAATGAAAAAATAAACACTTTCT
GCTTTTTTTTTCTTCTTATGTATTGTGAAGAAAATCCAGGGATGGAGTGAATTATCAGTT
TAGTTCCCACCTTTGTATATCTGATCTGTCACACACAAATCCACATATACACTTATATGT
GAATATATGGCATTAAATGCTCACAAGCAGTCTATGCATATATTATTTGAAAAGATAATC
ATTTTTCAATAGTTTGAGAATATATTCCCTGTTAAACTGAATTTACTAACTAAATATTTA
CTAAATATACTCTTAACCAGGGTGATTTATGAGATCATAGGAATAACAAAGAATAAAAAT
ACCTTCTGCAAAGGAAGCTTTAAATTTTAATTTGATTTGTTTGAAAATTCACATGAAGTC
TTTGTAATCAATACATTCAGATCAGTATTCTACGAATCTTTCTGCATGTCAGAAATTAAA
ATTTTATTTAGGATTGAAGAGTAAATCATTTGCCAAGTAGGTTTCTAAAAGTGAAAACAA
ATGTCAGTTTCAAAAAAGAACAATTTAGTTTTACATTTCTGAGGAAGTCACATATATTAT
GTATTAATAAATTAACAGTCTCACATTCTCTTTGATCATTGATTGTTTCTTGGCAGTTTG
TTACATACTAGATTCTACACATCTCACAGCTATAAATTCAGTACCAGAATTTAGGGAGAT
AATTTTAATGTAAAACAAAGATTGTTGTAAGAGacaacatgtcatagtgtttaagggtca
gactccctgggttccagtcccaatttccagctctgtggcattgacacttttacacacttc
tctaaacatcagggttttgttttgtttcgtttttttgttgtttctgttttctgtgaagtg
gtgatgattttagtgaaataattggaagactgaatgaggtaatgcatgtaaaatacttag
cataatgtttggactgtgtagtaaaggctcaataaatgttaaattccattaATTGGGTAA
TCCTGGAACTCTTCACTGCAATTAGAATTCTTTATTACAACAAATAAAATTTTGATTCAT
CATAAAAAGTAATCAATTTCTTTAATGGACCTCAGAGAAATTAAAATGCCTCCTGGACTG
TTAAGTCCTCAGAAATGTATTGCATCTTTAATTCTTTATTTCCTCTGAATACTCATGCAC
ATGAGAAAAAATTTCTAAGTTAAGACATGTCTATTCTCCTTTAGTCCTCATTTCAAATAT
CAACCACTGTTTACAGTATCACCACTTTTTCCTTTTAAAATGGACTAATCAGTTTTATTT
AGAGATTGTTTGACAAAGGGTGATTGCCCCAAATGATGGTTTCCTGAAGACCAGGATATA
CCAACCAAGAAAAGGCAAGTATATTTTGTCCACCTAAGTAACTTCTTCTTCTGTGTATTT
TTCTGCAAATATGGATGTTCCTTTGGACTAGTCCTTTCTCAATCATATTGAAGTTTAAAT
ATATTTACTTTAATTGGTAAAATCAACTTAAAAATCTCTATTCTGATTTTTTCCTAAGGA
TTGGTATGCATGTTAAAAGTAACCAAAGTATAATTTATGTCTTGGCCTGTATTGAATGCC
TTTTTCCTCCTGAGCAGAGTCAATTGAAAAATATCTTGCTTGCTGCCTCCATTTTTAACA
TTTATTCTTAGGCTTAATGTAGGAAACTAGCAATAAAATATAAATCCATATTCTATTTTT
TAAAAGATCATACTGTTTTATTTTCAATTTTATGAAACTTTTACTTTTAGATATTACAAA
CAAGAGAATTAGAAATGTGACGTTAGACTTTTAACTCTCTGTAGTCCTTTGTATTTAGGA
GAGAATTTGGATTAGAAAACCACCTTCTGGAGGAGATTTTGACATTActgtgtgagcaaa
ttcaggcaagtcacataatctcttagagagaacttcagtttgctcagttataaagatcaa
aataatcatctgtaacacactggattatatggaaagtgtaatatacacgaaatctattta
gaaaagtgaaaaccagcatgcaaaaatgttgttgtaattattattattGCCAACATAAAC
TTGTAAATTTGTTCACATTTATTAGTTTAGTTCATTGGCCTGTATGTATGTGAAATAATG
GTCCAGTCCTTTGATGATAAATATCACAAATTGTTGGATAGATTTGCTGGGTTTGACATT
TTAAATTAAGACTCTGATATTCTGTTAGGAAAACATTTATTTCTTTGGTTTTTCTTTTTA
CTTTCTATATCACTGGAAGCAATGAGTTTCggcttcatattcacatatttcagagttcca
ctcccaaccctgctttccactagtatgacctttggcagtttgcttcacagtttggttcat
agaaaatatgaactttattttctgcatctgcaaaatagaaatcataTCTATTCTTTCTTA
ATATAGTCTGTACCAAAAGCAATATTAAGATTAATGCACAAATAATGCAGAAGTGGGAAG
TTCTCTTAATTCAGGTTTTTTAAGTAATCTAAAACGTTGTTTTAGGCTGCCCATAGAAAA
TGAAGTATAATGACTTCGATTTTATTCAATTCCAGTTTTACTCATCTGTAGATGAAGATC
AGAAATATAATGAGTTGCTTTATAATAGGCTTCCACTCTTACATCAATCAGACAAGCAGC
CTAAAAATAAACACATCCTTTCTGTCCTTGTGTTTCCTGGGAAAGTCAAGAGTTAGCATT
AAAGGAAAATGAGTACAAATGAAGATTTATCAAAAAATTTCCCCAAGTGCTAAAAACAAA
AATGGTCTGGGAGATATTGTAGTTTTACAGAACTATTTGTTTTCTGTAGGGGGCAAAGGC
TATATATAAATAAGCCTAGGTCTCAGTGCCATGTCATGTAACTAAGAATTAAATGTATTA
GTCTTTAAAGAATGCAAATGAGGTAAATTTTACATCAGTAAAAATTTTTAAACAGACGAA
AATATGGAAACGTGTTAACAGAGCTTCAAAATTTTATGCTATGTTTTTGAATCAGTCTTG
AAAATACTACAGATTGGTGTCTGTCCTCCTCTGCAGGTGATAGGTGGGACTATTTCAGAG
GACAAATGTGATCAAGCAGCTGGCCATGGTTTTGTTCTGATGGGATTACTCAGCAAATTT
AGTAGATAAACTAAACATTTTCCTATTACTTTCTACTTAAAATACTTACTCAACTTGTAA
AGGGTTTTATCAAACCTTAGACTTTGTAAACACTTTTAAAAACACTCAGTTTTGTGCCTT
TGAACAAGTGTAGCAAAATTAATAGGTGAATCTATATGCAAATTTGTATAGATCTTTGCT
TAATTCAGGTAATATCAGCCTAATATATCTATTTTGATTAATTATGTATTTGATTAATTA
AACACAACTGGCATTGCACAACTTAACCCCAAATGTATAAACTTTTCTGTTAACCTGGCA
TCTATGGGAAATGGCTATTTTAGTAATCCGTTAATGGAAATTTACCATCTGTAGTAGATA
TTTTGAGAATTCAAATGGAACAAACAACAGACTACTAAGCCCATACTTTGTATTTATTTA
TTTGTGTATTTCataatcacaatactaatgaattattcctgtgtgtcagataaaatttta
aatattaactctttcattcttcctaagagccccatgagatagctgttattattacttttg
ttttctggatgaaaagactgagacacagaaaagttaagtgacttgcatagtatcacattg
agaagaggcagagctgaattcaagttcattctctgagcccttacactataccaccactct
gTAAAGATACATATAACACAATATTAGTTAAAAACTCAAAAAGAACTTAGCTTATCCCAT
GTTTTGTAACTATAGGACGTTTCAAATAGAAAAATATATGTATTTTAAATCCTGCAAAGT
ACATCTCCAGATATGCTTAGTGGCCTCTGTCTGGTAATGTTTCTTCCACCTAGTGTTTTC
CAAGCTATAGCCTGCATGTATCCTCACTAATAAAAGGACTTTCTTCTTTAACCAAGCCTA
TTGCTCAAAGAATTCTGCAATCTCATCTCTCCCAGGAAACCTCCAAGTGATAAAAAATGG
AGTATTGATTTCGACCTGCTATGAGACATGCAAAGTTTACTTTTTACTCACTAATACAAA
GTTTTTTGGATTGCTACATTTTAAATTAGTATACATATTTATTTAGATATAAGTTTAGTC
TATGTTTAACATTTTGTTTTAGAAAAATTTAAATGTCTTAGGTTACAGGGGCATAGATGT
TAAAGGCTTTGCTGTTTAATTACCTTGAACTTTACTTAGCACCATATTAACACAACGAGC
TGCTTAATCTTTTCTGTAATTATGATTATAGAAGATCAAAATAAAACTAGAATTACAATT
ATCTTTTAATGATTTTAAGTTGGTAAGATGTCATTTCTACTAGACAAAAATAATTTATAG
TTAGATTAAGCATCTCTAGTACCAGAAGATACAGGATACCTGTAAAAGAAAGAGAGGTGA
AAAGTTCATTCCCTGGCTTAGAGaatattttatttaatttttgtctttaaaataattttC
ATAAGTTCTTTTGTAACTCTCAACTTTTATTTACCTGAGAACCAAAAATAAATATCCTAT
TAGAAAAAAATAAATCCTGATCCATTCTAACTAAAATAATTTTTTTGTGCTATTTTCTTA
ATTGTGGTAACACTCAATTACTGAAAATCAAGCTGATGCAATTTTTTTTAAATGATAAGT
GATTCAATGTTTTGGGGATGTGACAGTTTCCTCAGAGGAACTAATCAAAACAAAATTATT
TTTCTTGATAAAAAACAGCTACTTCATTTTTTCTAAAAAAAAAAGACAATTCAGAGAACT
TATGTAGGTCATTGTCAGTA
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 2512d7c

Please sign in to comment.