Merge branch 'master' into ct-intrahost-changes-from-alin
tomkinsc committed Dec 7, 2016
2 parents 12cbaf7 + 5a83ae7 commit 7fc4eb1
Showing 9 changed files with 164 additions and 10 deletions.
9 changes: 7 additions & 2 deletions intrahost.py
@@ -123,10 +123,15 @@ def vphaser_one_sample(inBam, inConsFasta, outTab, vphaserNumThreads=None,
     bam_to_process = sorted_bam_file
     if removeDoublyMappedReads:
-        bam_to_process = util.file.mkstempfname('.mapped-withdoublymappedremoved.bam')
+        leading_or_trailing_indels_removed = util.file.mkstempfname('.mapped-leading_or_trailing_indels_removed.bam')
+        # vphaser crashes when cigar strings have leading or trailing indels
+        # so we will use filterByCigarString() with its default regex to remove such reads
+        samtoolsTool.filterByCigarString(sorted_bam_file, leading_or_trailing_indels_removed)

-        samtoolsTool.removeDoublyMappedReads(sorted_bam_file, bam_to_process)
+        bam_to_process = util.file.mkstempfname('.mapped-withdoublymappedremoved.bam')
+        samtoolsTool.removeDoublyMappedReads(leading_or_trailing_indels_removed, bam_to_process)
         samtoolsTool.index(bam_to_process)
+        os.unlink(leading_or_trailing_indels_removed)

     variantIter = Vphaser2Tool().iterate(bam_to_process, vphaserNumThreads)
     filteredIter = filter_strand_bias(variantIter, minReadsEach, maxBias)
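The pre-filter added above delegates to SamtoolsTool.filterByCigarString(). As a rough illustration of what such a filter does, here is a minimal pysam sketch; the regex and function name are assumptions for illustration, not necessarily the tool's actual default:

    import re
    import pysam

    # Assumed exclusion pattern: CIGAR starts or ends with an insertion/deletion.
    LEADING_OR_TRAILING_INDEL = re.compile(r'^\d+[ID]|\d+[ID]$')

    def drop_leading_trailing_indel_reads(in_bam, out_bam, regex=LEADING_OR_TRAILING_INDEL):
        with pysam.AlignmentFile(in_bam, 'rb') as inf, \
                pysam.AlignmentFile(out_bam, 'wb', template=inf) as outf:
            for read in inf:
                # unmapped reads carry no CIGAR string; keep them as-is
                if read.cigarstring is None or not regex.search(read.cigarstring):
                    outf.write(read)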
8 changes: 7 additions & 1 deletion pipes/config.yaml
@@ -55,9 +55,15 @@ diamond_db: "/idi/sabeti-scratch/yesimon/db/diamond_db/nr"
 krona_db: "/idi/sabeti-scratch/yesimon/db/krona_taxonomy"

 # BWA index prefix for metagenomic alignment.
-align_rna_db: "/idi/sabeti-scratch/shared-resources/rna_bwa/human_viral_16s"
+# This contains the full human reference genome, multiple diverse sequences covering human
+# pathogenic viruses, as well as LSU and SSU rRNA sequences from SILVA.
+# For pre-built hosted databases, you may download:
+# - https://storage.googleapis.com/sabeti-public/meta_dbs/human_viral_rrna.tar.lz4
+align_rna_db: "/idi/sabeti-scratch/shared-resources/rna_bwa/human_viral_rrna"

 # Directory of the NCBI taxonomy dmp files
+# For pre-packaged hosted databases, you may download:
+# - https://storage.googleapis.com/sabeti-public/meta_dbs/taxonomy.tar.lz4
 taxonomy_db: "/idi/sabeti-scratch/shared-resources/taxonomy"

 # These are variables that must be set
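For the hosted databases referenced above, fetching and unpacking a .tar.lz4 archive might look like the following sketch; it assumes the lz4 command-line tool is installed and that you then point align_rna_db (or taxonomy_db) at the extracted location:

    import subprocess
    import tarfile
    import urllib.request

    url = 'https://storage.googleapis.com/sabeti-public/meta_dbs/human_viral_rrna.tar.lz4'
    urllib.request.urlretrieve(url, 'human_viral_rrna.tar.lz4')
    # decompress with the lz4 CLI, then unpack the contained tar archive
    subprocess.check_call(['lz4', '-d', 'human_viral_rrna.tar.lz4', 'human_viral_rrna.tar'])
    with tarfile.open('human_viral_rrna.tar') as tar:
        tar.extractall('rna_bwa')  # then set align_rna_db to the extracted BWA index prefix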
11 changes: 10 additions & 1 deletion pipes/rules/hs_deplete.rules
@@ -63,6 +63,11 @@ rule filter_to_taxon:
         logid="{sample}"
     shell: "{config[bin_dir]}/taxon_filter.py filter_lastal_bam {input[0]} {params.lastalDb} {output}"

+
+class MergeInputException(Exception):
+    '''Raised when the inputs to merge for a multiplexed sample cannot be found'''
+
+
 def merge_one_per_sample_inputs(wildcards):
     if 'seqruns_demux' not in config or not os.path.isfile(config['seqruns_demux']):
         if wildcards.adjective == 'raw':
@@ -79,7 +84,12 @@ def merge_one_per_sample_inputs(wildcards):
         else:
             runs.add(os.path.join(config["data_dir"], config["subdirs"]["depletion"],
                 get_run_id(well) +'.'+ lane['flowcell'] +'.'+ lane['lane'] +'.'+ wildcards.adjective + '.bam'))
+    if not runs:
+        raise MergeInputException(
+            "Cannot find inputs to merge into '{}.{}.bam'. Check 'flowcells.txt' and 'barcodes.txt' "
+            "to ensure they match up with the sample bams.".format(wildcards.sample, wildcards.adjective))
     return sorted(runs)
+
 rule merge_one_per_sample:
     ''' All of the above depletion steps are run once per flowcell per lane per
         multiplexed sample. This reduces recomputation on the same data when
@@ -102,4 +112,3 @@ rule merge_one_per_sample:
         else:
             shell("{config[bin_dir]}/read_utils.py merge_bams {input} {params.tmpf_bam} --picardOptions SORT_ORDER=queryname")
         shell("{config[bin_dir]}/read_utils.py rmdup_mvicuna_bam {params.tmpf_bam} {output} --JVMmemory 8g")
-
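To make the new failure mode concrete: merge_one_per_sample_inputs() expects per-lane bams named sample.library.flowcell.lane.adjective.bam under the depletion subdirectory, and MergeInputException now fires when none exist. A small sketch of the path construction, with hypothetical values:

    import os

    config = {'data_dir': 'data', 'subdirs': {'depletion': '02_depletion'}}  # hypothetical layout
    run_id, flowcell, lane, adjective = 'sampleA.lib1', 'HVJH3ADXX', '1', 'cleaned'
    expected = os.path.join(config['data_dir'], config['subdirs']['depletion'],
                            run_id + '.' + flowcell + '.' + lane + '.' + adjective + '.bam')
    print(expected)  # data/02_depletion/sampleA.lib1.HVJH3ADXX.1.cleaned.bam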
2 changes: 1 addition & 1 deletion pipes/rules/metagenomics.rules
@@ -69,7 +69,7 @@ rule align_rna:
         lca=join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.lca.gz"),
         nodupes_lca=join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.lca_nodupes.gz")
     resources: cores=int(config.get("number_of_threads", 1)),
-               mem=64
+               mem=26
     params: numThreads=str(config.get("number_of_threads", 1)),
             UGER=config.get('UGER_queues', {}).get('long', '-q long')
     shell:
9 changes: 9 additions & 0 deletions reports.py
@@ -76,6 +76,15 @@ def get_assembly_stats(sample,
         # and leading/trailing dots are stripped from sample names in util.file.string_to_file_name()
         # TODO: replace this with better filtering?
         for bam in glob.glob(os.path.join(raw_reads_dir, sample + ".*.bam")))
+    sample_raw_fname = os.path.join(raw_reads_dir, sample + ".bam")
+    if os.path.isfile(sample_raw_fname):
+        # if "00_raw/sample.bam" exists, these were not demuxed by snakemake
+        if out['reads_raw']:
+            # if sample.bam AND sample.library.flowcell.lane.bam exist, we have a problem!
+            out['reads_raw'] = 'ambiguous filenames in raw reads directory!'
+        else:
+            # just count the sample.bam reads
+            out['reads_raw'] = samtools.count(sample_raw_fname)

     # pre-assembly stats
     out['assembled_trinity'] = os.path.isfile(os.path.join(assembly_tmp, sample +
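Restated as a standalone sketch, the raw-read counting logic above prefers demuxed per-lane bams, falls back to a single sample.bam, and flags the case where both exist; count_reads stands in for SamtoolsTool().count and is a hypothetical parameter name:

    import glob
    import os

    def raw_read_count(raw_reads_dir, sample, count_reads):
        demuxed = glob.glob(os.path.join(raw_reads_dir, sample + '.*.bam'))
        merged = os.path.join(raw_reads_dir, sample + '.bam')
        if os.path.isfile(merged):
            if demuxed:
                # both sample.bam and sample.library.flowcell.lane.bam exist
                return 'ambiguous filenames in raw reads directory!'
            return count_reads(merged)
        return sum(count_reads(bam) for bam in demuxed)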
