Skip to content

Commit

Permalink
Merge pull request #245 from broadinstitute/dp-mummer-sensitivity
Browse files Browse the repository at this point in the history
increase mummer alignment sensitivity, revert novoindex kmer settings
  • Loading branch information
dpark01 committed Apr 1, 2016
2 parents 4164f3e + 44599c0 commit cb8a366
Show file tree
Hide file tree
Showing 8 changed files with 98 additions and 13 deletions.
8 changes: 5 additions & 3 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def parser_assemble_trinity(parser=argparse.ArgumentParser()):

def order_and_orient(inFasta, inReference, outFasta,
breaklen=None, # aligner='nucmer', circular=False, trimmed_contigs=None,
maxgap=None, minmatch=None, mincluster=None,
maxgap=200, minmatch=10, mincluster=None,
min_pct_id=0.6, min_contig_len=200, min_pct_contig_aligned=0.6):
''' This step cleans up the de novo assembly with a known reference genome.
Uses MUMmer (nucmer or promer) to create a reference-based consensus
Expand Down Expand Up @@ -234,15 +234,17 @@ def parser_order_and_orient(parser=argparse.ArgumentParser()):
dest="breaklen")
parser.add_argument("--maxgap", "-g",
help="""Maximum gap between two adjacent matches in a cluster.
Our default is %(default)s.
nucmer default 90, promer default 30. Manual suggests going to 1000.""",
type=int,
default=None,
default=200,
dest="maxgap")
parser.add_argument("--minmatch", "-l",
help="""Minimum length of an maximal exact match.
Our default is %(default)s.
nucmer default 20, promer default 6.""",
type=int,
default=None,
default=10,
dest="minmatch")
parser.add_argument("--mincluster", "-c",
help="""Minimum cluster length.
Expand Down
2 changes: 1 addition & 1 deletion pipes/Broad_UGER/run-pipe.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ source "$VENVDIR/bin/activate"
# invoke Snakemake in cluster mode with custom wrapper scripts
snakemake --timestamp --rerun-incomplete --keep-going --nolock \
--jobs 100 --immediate-submit \
--latency-wait 20 \
--latency-wait 60 \
--config mode=UGER \
--directory . \
--jobscript "$BINDIR/pipes/Broad_UGER/jobscript.sh" \
Expand Down
4 changes: 2 additions & 2 deletions pipes/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ assembly_min_unambig: 0.80

# Alignment parameters for nucmer (MUMmer) during the scaffolding step.
# Shown below are defaults.
#assembly_nucmer_max_gap: 90
#assembly_nucmer_min_match: 20
#assembly_nucmer_max_gap: 200
#assembly_nucmer_min_match: 10
#assembly_nucmer_min_cluster: 65

# The number of threads to use for tools supporting multiple threads.
Expand Down
2 changes: 1 addition & 1 deletion pipes/rules/assembly.rules
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ rule map_reads_to_self:
params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
UGER=config.get('UGER_queues', {}).get('long', '-q long'),
logid="{sample}",
novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k -c 3",
novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k",
numThreads=str(config.get("number_of_threads", 1))
run:
makedirs([os.path.join(config["data_dir"], config["subdirs"]["align_self"]),
Expand Down
5 changes: 2 additions & 3 deletions pipes/rules/reports.rules
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,11 @@ rule spikein_report:
params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
UGER=config.get('UGER_queues', {}).get('short', '-q short'),
logid="{sample}",
spikeins_db=config["spikeins_db"],
numThreads=str(config.get("number_of_threads", 1))
spikeins_db=config["spikeins_db"]
run:
makedirs(os.path.join(config["reports_dir"], 'spike_count'))
makedirs(os.path.join(config["tmp_dir"], config["subdirs"]["depletion"]))
shell("{config[bin_dir]}/read_utils.py align_and_count_hits {input} {params.spikeins_db} {output} --threads={params.numThreads}")
shell("{config[bin_dir]}/read_utils.py align_and_count_hits {input} {params.spikeins_db} {output}")

rule consolidate_spike_count:
input: expand("{{dir}}/spike_count/{sample}.spike_count.txt", \
Expand Down
3 changes: 1 addition & 2 deletions read_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1036,7 +1036,7 @@ def parser_align_and_fix(parser=argparse.ArgumentParser()):
# =========================

def align_and_count_hits(inBam, refFasta, outCounts, includeZeros=False,
JVMmemory=None, threads=1):
JVMmemory=None):
''' Take reads, align to reference with Novoalign and return aligned
read counts for each reference sequence.
'''
Expand Down Expand Up @@ -1072,7 +1072,6 @@ def parser_align_and_count_hits(parser=argparse.ArgumentParser()):
action="store_true",
dest="includeZeros")
parser.add_argument('--JVMmemory', default='4g', help='JVM virtual memory size (default: %(default)s)')
parser.add_argument('--threads', default=8, help='Number of threads (default: %(default)s)')
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, align_and_count_hits, split_args=True)
return parser
Expand Down
85 changes: 85 additions & 0 deletions test/input/TestOrderAndOrient/expected.influenza.fasta
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,46 @@ GGGCAAAGAAGACAAGAGATATGGCCCAGCATTGAGCATCAATGAGCTGAGCAATCTTGC
AAAGGGAGAGAAGGCTAATGTGCTAATTGGGCAAGGAGACGTGGTGTTGGTGATGAAACG
GAAACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGC
CATCAATTAGTGTCGAATTGTTTA
>gi|8486134|ref|NC_002021.1|
AACCATTTGAATGGATGTCAATCCGACTTTACTTTTCTTGAAAGTTCCAGCGCAAAATGC
CATAAGCACCACATTCCCGTACACTGGAGACCCTCCATACAGCCATGGAACAGGAACAGG
ATACACCATGGACACGGTCAACAGAACACATCAATATTCAGAAAAGGGGAAATGGACAAC
AAACTCAGAGACTGGAGCACCCCAACTTAACCCAATTGATGGACCACTGCCCGAGGACAA
TGAGCCAAGTGGATATGCACAAACGGACTGTGTCCTTGAAGCAATGGCTTTCCTTGAAGA
ATCCCACCCAGGAATCTTTGAAAACTCGTGTCTTGAGACGATGGAAGTTGTTCAACAAAC
AAGAGTAGACAAATTGACCCAAGGCCGTCAGACCTATGATTGGACATTAAACAGGAATCA
ACCGGCTGCAACTGCATTAGCTAATACTATAGAGGTATTCAGATCGAACGGTCTGACAGC
TAATGAATCAGGAAGGCTAATAGATTTTCTCAAGGATGTGATGGAATCAATGGATAAAGA
GGAAATGGAAATAACAACGCATTTCCAAAGGAAAAGAAGAGTAAGGGACAACATGACCAA
GAAAATGGTCACACAAAGGACAATAGGAAAGAAGAAGCAGAGGCTAAATAAAAAGAGCTA
TCTAATAAGAGCCTTGACACTGAACACAATGACAAAAGACGCCGAAAGAGGCAAATTAAA
GAGAAGAGCAATTGCAACACCCGGAATGCAAATCAGAGGATTTGTATACTTTGTTGAAAC
ATTAGCAAGGAGCATTTGTGAGAAGCTTGAACAATCTGGACTCCCAGTTGGAGGCAATGA
AAAGAAGGCTAAACTGGCAAATGTTGTGAGGAAAATGATGACTAATTCACAAGATACAGA
ACTCTCCTTCACAATCACTGGAGACAACACCAAATGGAATGAAAACCAGAACCCTAGGAT
GTTTCTGGCAATGATAACATATATAACAAGAAACCAACCTGAATGGTTTAGGAATGTCTT
GAGCATTGCACCCATAATGTTCTCAAACAAAATGGCAAGGCTAGGGAAAGGATATATGTT
CGAAAGTAAGAGCATGAAGCTTCGAACACAAATACCGGCAGAAATGCTAGCAAGTATTGA
TCTGAAATATTTTAATGAGTCGACAAAAAAGAAAATTGAAAAGATAAGACCTCTTCTAAT
AGATGGTACAGCCTCATTGAGTCCTGGAATGATGATGGGCATGTTCAACATGCTAAGTAC
AGTTTTGGGAGTTTCGATTTTAAATCTAGGACAAAAGAGGTACACCAAAACAACATACTG
GTGGGACGGGCTCCAATCCTCTGATGACTTTGCTCTCATAGTGAATGCCCCGAATCATGA
GGGGATACAAGCAGGAGTAGACAGATTCTATAGAACCTGCAAGCTGGTTGGAATCAACAT
GAGCAAAAAGAAGTCCTACATAAACAGGACAGGAACATTTGAATTCACAAGTTTTTTCTA
TCGCTATGGATTTGTAGCCAATTTCAGTATGGAGTTGCCCAGCTTTGGAGTGTCTGGGAT
TAATGAATCTGCAGACATGAGCATTGGAGTAACAGTGATAAAGAACAACATGATAAACAA
TGATCTTGGACCAGCAACAGCTCAAATGGCTCTTCAGCTGTTCATCAAGGATTACAGATA
CACGTATCGGTGTCACAGAGGAGACACACAAATTCAGACAAGGAGGTCATTCGAGCTGAA
GAAATTGTGGGAACAAACCCGCTCAAAAGCGGGACTGCTGGTCTCAGACGGAGGACCAAA
CCTATACAATATCCGGAATCTACACATTCCGGAAGTCTGCTTAAAATGGGAGCTGATGGA
CGAAGACTATCAGGGAAGGCTTTGCAACCCCCTGAATCCGTTTGTCAGCCACAAGGAGAT
AGAGTCTGTAAACAATGCTGTGGTGATGCCAGCTCATGGCCCAGCCAAGAGCATGGAGTA
TGATGCTGTGGCTACTACACACTCCTGGATACCTAAGAGGAATCGCTCTATCCTCAATAC
AAGCCAAAGGGGAATCCTTGAGGATGAACAGATGTATCAAAAGTGCTGCAATCTATTCGA
GAAATTCTTCCCTAGCAGCTCATACAGGAGACCGGTTGGAATTTCCAGCATGGTAGAGGC
CATGGTTTCTAGGGCCCGAATCGATGCGCGAATTGACTTCGAATCTGGACGGATCAAGAA
AGAGGAGTTTGCTGAGATCATGAAGATCTGTTCCACCATTGAAGAACTCAGACGGCAGAA
ATAGTGAATTTTGCTTGTCCTTCATGAAAAAATGCCTTG
>gi|8486136|ref|NC_002022.1|
CAAAATGGAAGACTTTGTGCGACAATGCTTCAATCCAATGATCGTCGAGCTTGCGGAAAA
GGCAATGAAAGAATATGGGGAAGATCCGAAAATTGAAACTAACAAGTTTGCTGCAATATG
Expand Down Expand Up @@ -76,6 +116,36 @@ AGCTGAATCGAGAAAATTGCTTCTCATTGTTCAGGCACTTAGGGACAACCTGGAACCTGG
GACCTTCGATCTTGGGGGGCTATATGAAGCAATCGAGGAGTGCCTGATTAATGATCCCTG
GGTTTTGCTCAATGCATCTTGGTTCAACTCCTTCCTCACACATGCACTGAAATAGTTGTG
GCAATGCTACTATTTGCTATCCATACTGTCCAAAAAAGTACCTTGTT
>gi|8486125|ref|NC_002017.1|
GCAAAAGCAGGGGAAATTCAAATCAACCAAGATGGAAGCAAAACTGTTTGTGCTATTCTG
TACATTCACTGTACTGAAAGCTGACACCATCTGTGTGGGCTACCATGCAAACAACTCTAC
GGACACTGTTGACACAGTATTGGAAAAGAATGTAACTGTGACTCACTCAGTGAATTTGCT
CGAAGACAGCCATAATGGGAAGCTCTGCAGTCTGAATGGGATAGCTCCTTTACAATTGGG
GAAGTGTAATGTAGCGGGGTGGCTCCTGGGCAATCCAGAATGTGATCTCCTACTCACTGC
AAACTCATGGTCCTATATAATAGAGACGTCCAATTCAGAGAACGGGACATGCTATCCCGG
TGAATTCATAGATTATGAGGAATTAAGGGAGCAGCTGAGTTCAGTTTCTTCTTTTGAAAA
GTTTGAAATTTTCCCCAGGGCAAACTCATGGCCAAACCATGAGACAACCATAGGTGTTAC
AGCTGCTTGCTCTTACTCTGGAGCTAGCAGTTTTTATCGAAATTTGCTGTGGATAACAAA
GAAGGGGACTTCATATCCAAAACTCAGCAAATCATACACGAACAATAAGGGGAAAGAGGT
GCTCGTGCTCTGGGGAGTGCACCATCCTCCGACCACCAGTGAACAACAAAGTCTATACCA
GAATACTGATGCATATGTCTCAGTTGGGTCATCAAAGTACAACCGGAGATTCACCCCAGA
GATAGCAGCTAGACCCAAAGTTAGAGGGCAGGCAGGCAGAATGAACTATTATTGGACACT
ATTAGATCAAGGGGACACCATAACATTTGAGGCCACTGGGAATCTGATAGCACCATGGTA
TGCCTTCGCATTAAATAAAGGATCTGATTCAGGAATTATAACATCAGATGCTCCAGTTCA
CAATTGCGACACAAGATGCCAAACCCCCCATGGAGCGTTGAACAGTAGCCTTCCTTTCCA
GAATGTGCATCCTATCACCATTGGAGAATGTCCCAAATATGTCAAGAGCACCAAGCTAAG
GATGGCAACAGGACTAAGAAATGTCCCATCCATTCAATCCAGAGGGCTGTTTGGAGCAAT
TGCCGGATTCATTGAAGGAGGATGGACAGGCATGATAGATGGCTGGTATGGGTACCATCA
TCAAAATGAACAAGGATCAGGATATGCCGCTGATCAGAAAAGCACACAGAATGCAATTGA
TGGAATAACAAATAAGGTGAATTCGGTAATTGAAAAAATGAACACTCAATTCACTGCAGT
GGGTAAAGAATTCAACAACCTAGAGAGGAGGATTGAGAATCTGAATAAAAAAGTCGATGA
TGGGTTCCTGGATGTTTGGACATACAATGCTGAATTGCTCGTTCTATTGGAAAATGAAAG
AACTCTTGATTTTCATGATTCCAATGTGAGGAATTTGTATGAGAGAGTCAAATCACAACT
GAGGAATAACGCCAAAGAAATTGGAAATGGTTGCTTTGAGTTCTATCACAAATGTGATGA
TGAATGTATGGAAAGTGTGAAGAACGGCACATATGATTATCCCAAATATTCAGAAGAGTC
TAAATTGAATCGAGAAGAAATAGACGGAGTAAAACTAGAGTCGATGGGAGTTTACCAAAT
TCTGGCGATCTATTCCACAGTCGCCAGTTCCCTGGTCTTGTTAGTCTCCCTGGGGGCAAT
CAGCTTCTGGATGTGCTCTAATGGGTCGTTGCAATGCAGAATATGCAT
>gi|8486129|ref|NC_002019.1|
GTAGATAATCACTCACCGAGTGACATCCACATCATGGCGTCTCAAGGCACCAAACGATCT
TATGAGCAAATGGAAACTGGTGGAGAACGCCAGAATGCCACTGAAATCAGAGCATCTGTT
Expand Down Expand Up @@ -142,3 +212,18 @@ ATTGCCGCAAGTATCATTGGGATCTTGCACTTGATATTGTGGATTCTTGATCGTCTTTTC
TTCAAATGCATTTATCGTCGCCTTAAATACGGTTTGAAAAGAGGGCCTTCTACGGAAGGA
GTGCCTGAGTCTATGAGGGAAGAATATCGGCAGGAACAGCAGAGTGCTGTGGATGTTGAC
AATGGTCATTTTGTCAACATAGAGCTGGAGTAAAAAACTACCT
>gi|8486131|ref|NC_002020.1|
CAGGTAGATTGCTATCTATGGCACATAAGAAAGCTGCTCAGCATGAGAGACATGTGTGAT
GCTCCCTTTGATGATAGACTCAGAAGAGATCAGAAGGCATTAAAGGGGAGAGGCAGCACA
CTTGGACTCGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNACCGACATGAGCATAGAGGAAATAAGCAGGGAGTGGTACATGCTCATGCCAAGGCAG
AAAATAACAGGGGGTCTGATGGTGAAAATGGACCAGGCCATCATGGACAAGAGGATAATA
CTCAAGGCAAACTTCTCTGTCCTATTTGATCAACTGGAAACATTAGTCTCACTGAGGGCT
TTCACAGACGATGGCGCCATTGTAGCTGAAATATCTCCTATTCCTTCCATGCCAGGACAT
TCTACAGAGGATGTCAAAAATGCAATTGGAATCCTCATCGGTGGACTTGAATGGAATGAT
AACTCAATTCGAGCGTCTGAAAATATACAGAGATTCGCTTGGGGAGTCCGTGATGAGAAT
GGGGGACCTCCACTCCCTCCAAAACAGAAACGCTACATGGCGAGAAGAGTTGAGTCAGAA
GTTTGAAGAGATCAGATGGCTAATTGCAGAGTGCAGAAACATATTAACCAAAACTGAAAA
CAGCTTCGAGCAGATAACGTTCTTGCAGGCATTGCAACTCTTACTTGAGGTTGAGAGTGA
GATAAGGACATTTTCTTTTCAGCTTATTTAGTACTAAAAAAC
2 changes: 1 addition & 1 deletion tools/novoalign.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def align_one_rg_bam(self, inBam, refFasta, outBam, rgid=None, options=None, min
JVMmemory=JVMmemory
)

def index_fasta(self, refFasta, k=9, s=1):
def index_fasta(self, refFasta, k=None, s=None):
''' Index a FASTA file (reference genome) for use with Novoalign.
The input file name must end in ".fasta". This will create a
new ".nix" file in the same directory. If it already exists,
Expand Down

0 comments on commit cb8a366

Please sign in to comment.