Merge branch 'master' into ct-intrahost-changes-from-alin
tomkinsc committed Dec 7, 2016
2 parents 12cbaf7 + 5a83ae7 commit 7fc4eb1
Showing 9 changed files with 164 additions and 10 deletions.
9 changes: 7 additions & 2 deletions intrahost.py
@@ -123,10 +123,15 @@ def vphaser_one_sample(inBam, inConsFasta, outTab, vphaserNumThreads=None,
     bam_to_process = sorted_bam_file
     if removeDoublyMappedReads:
-        bam_to_process = util.file.mkstempfname('.mapped-withdoublymappedremoved.bam')
+        leading_or_trailing_indels_removed = util.file.mkstempfname('.mapped-leading_or_trailing_indels_removed.bam')
+        # vphaser crashes when cigar strings have leading or trailing indels
+        # so we will use filterByCigarString() with its default regex to remove such reads
+        samtoolsTool.filterByCigarString(sorted_bam_file, leading_or_trailing_indels_removed)

-        samtoolsTool.removeDoublyMappedReads(sorted_bam_file, bam_to_process)
+        bam_to_process = util.file.mkstempfname('.mapped-withdoublymappedremoved.bam')
+        samtoolsTool.removeDoublyMappedReads(leading_or_trailing_indels_removed, bam_to_process)
         samtoolsTool.index(bam_to_process)
+        os.unlink(leading_or_trailing_indels_removed)

     variantIter = Vphaser2Tool().iterate(bam_to_process, vphaserNumThreads)
     filteredIter = filter_strand_bias(variantIter, minReadsEach, maxBias)
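The pre-filter added above delegates to SamtoolsTool.filterByCigarString(). As a rough illustration of what such a filter does, here is a minimal pysam sketch; the regex and function name are assumptions for illustration, not necessarily the tool's actual default:

    import re
    import pysam

    # Assumed exclusion pattern: CIGAR starts or ends with an insertion/deletion.
    LEADING_OR_TRAILING_INDEL = re.compile(r'^\d+[ID]|\d+[ID]$')

    def drop_leading_trailing_indel_reads(in_bam, out_bam, regex=LEADING_OR_TRAILING_INDEL):
        with pysam.AlignmentFile(in_bam, 'rb') as inf, \
                pysam.AlignmentFile(out_bam, 'wb', template=inf) as outf:
            for read in inf:
                # unmapped reads carry no CIGAR string; keep them as-is
                if read.cigarstring is None or not regex.search(read.cigarstring):
                    outf.write(read)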
8 changes: 7 additions & 1 deletion pipes/config.yaml
@@ -55,9 +55,15 @@ diamond_db: "/idi/sabeti-scratch/yesimon/db/diamond_db/nr"
 krona_db: "/idi/sabeti-scratch/yesimon/db/krona_taxonomy"

 # BWA index prefix for metagenomic alignment.
-align_rna_db: "/idi/sabeti-scratch/shared-resources/rna_bwa/human_viral_16s"
+# This contains the full human reference genome, multiple diverse sequences covering human
+# pathogenic viruses, as well as LSU and SSU rRNA sequences from SILVA.
+# For pre-built hosted databases, you may download:
+# - https://storage.googleapis.com/sabeti-public/meta_dbs/human_viral_rrna.tar.lz4
+align_rna_db: "/idi/sabeti-scratch/shared-resources/rna_bwa/human_viral_rrna"

 # Directory of the NCBI taxonomy dmp files
+# For pre-packaged hosted databases, you may download:
+# - https://storage.googleapis.com/sabeti-public/meta_dbs/taxonomy.tar.lz4
 taxonomy_db: "/idi/sabeti-scratch/shared-resources/taxonomy"

 # These are variables that must be set
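For the hosted databases referenced above, fetching and unpacking a .tar.lz4 archive might look like the following sketch; it assumes the lz4 command-line tool is installed and that you then point align_rna_db (or taxonomy_db) at the extracted location:

    import subprocess
    import tarfile
    import urllib.request

    url = 'https://storage.googleapis.com/sabeti-public/meta_dbs/human_viral_rrna.tar.lz4'
    urllib.request.urlretrieve(url, 'human_viral_rrna.tar.lz4')
    # decompress with the lz4 CLI, then unpack the contained tar archive
    subprocess.check_call(['lz4', '-d', 'human_viral_rrna.tar.lz4', 'human_viral_rrna.tar'])
    with tarfile.open('human_viral_rrna.tar') as tar:
        tar.extractall('rna_bwa')  # then set align_rna_db to the extracted BWA index prefix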
11 changes: 10 additions & 1 deletion pipes/rules/hs_deplete.rules
@@ -63,6 +63,11 @@ rule filter_to_taxon:
         logid="{sample}"
     shell: "{config[bin_dir]}/taxon_filter.py filter_lastal_bam {input[0]} {params.lastalDb} {output}"

+
+class MergeInputException(Exception):
+    '''Raised when the inputs to merge for a multiplexed sample cannot be found'''
+
+
 def merge_one_per_sample_inputs(wildcards):
     if 'seqruns_demux' not in config or not os.path.isfile(config['seqruns_demux']):
         if wildcards.adjective == 'raw':
@@ -79,7 +84,12 @@ def merge_one_per_sample_inputs(wildcards):
         else:
             runs.add(os.path.join(config["data_dir"], config["subdirs"]["depletion"],
                 get_run_id(well) +'.'+ lane['flowcell'] +'.'+ lane['lane'] +'.'+ wildcards.adjective + '.bam'))
+    if not runs:
+        raise MergeInputException(
+            "Cannot find inputs to merge into '{}.{}.bam'. Check 'flowcells.txt' and 'barcodes.txt' "
+            "to ensure they match up with the sample bams.".format(wildcards.sample, wildcards.adjective))
     return sorted(runs)
+
 rule merge_one_per_sample:
     ''' All of the above depletion steps are run once per flowcell per lane per
         multiplexed sample. This reduces recomputation on the same data when
@@ -102,4 +112,3 @@ rule merge_one_per_sample:
         else:
             shell("{config[bin_dir]}/read_utils.py merge_bams {input} {params.tmpf_bam} --picardOptions SORT_ORDER=queryname")
         shell("{config[bin_dir]}/read_utils.py rmdup_mvicuna_bam {params.tmpf_bam} {output} --JVMmemory 8g")
-
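To make the new failure mode concrete: merge_one_per_sample_inputs() expects per-lane bams named sample.library.flowcell.lane.adjective.bam under the depletion subdirectory, and MergeInputException now fires when none exist. A small sketch of the path construction, with hypothetical values:

    import os

    config = {'data_dir': 'data', 'subdirs': {'depletion': '02_depletion'}}  # hypothetical layout
    run_id, flowcell, lane, adjective = 'sampleA.lib1', 'HVJH3ADXX', '1', 'cleaned'
    expected = os.path.join(config['data_dir'], config['subdirs']['depletion'],
                            run_id + '.' + flowcell + '.' + lane + '.' + adjective + '.bam')
    print(expected)  # data/02_depletion/sampleA.lib1.HVJH3ADXX.1.cleaned.bam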
2 changes: 1 addition & 1 deletion pipes/rules/metagenomics.rules
@@ -69,7 +69,7 @@ rule align_rna:
         lca=join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.lca.gz"),
         nodupes_lca=join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.lca_nodupes.gz")
     resources: cores=int(config.get("number_of_threads", 1)),
-               mem=64
+               mem=26
     params: numThreads=str(config.get("number_of_threads", 1)),
             UGER=config.get('UGER_queues', {}).get('long', '-q long')
     shell:
9 changes: 9 additions & 0 deletions reports.py
@@ -76,6 +76,15 @@ def get_assembly_stats(sample,
         # and leading/trailing dots are stripped from sample names in util.file.string_to_file_name()
         # TODO: replace this with better filtering?
         for bam in glob.glob(os.path.join(raw_reads_dir, sample + ".*.bam")))
+    sample_raw_fname = os.path.join(raw_reads_dir, sample + ".bam")
+    if os.path.isfile(sample_raw_fname):
+        # if "00_raw/sample.bam" exists, these were not demuxed by snakemake
+        if out['reads_raw']:
+            # if sample.bam AND sample.library.flowcell.lane.bam exist, we have a problem!
+            out['reads_raw'] = 'ambiguous filenames in raw reads directory!'
+        else:
+            # just count the sample.bam reads
+            out['reads_raw'] = samtools.count(sample_raw_fname)

     # pre-assembly stats
     out['assembled_trinity'] = os.path.isfile(os.path.join(assembly_tmp, sample +
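Restated as a standalone sketch, the raw-read counting logic above prefers demuxed per-lane bams, falls back to a single sample.bam, and flags the case where both exist; count_reads stands in for SamtoolsTool().count and is a hypothetical parameter name:

    import glob
    import os

    def raw_read_count(raw_reads_dir, sample, count_reads):
        demuxed = glob.glob(os.path.join(raw_reads_dir, sample + '.*.bam'))
        merged = os.path.join(raw_reads_dir, sample + '.bam')
        if os.path.isfile(merged):
            if demuxed:
                # both sample.bam and sample.library.flowcell.lane.bam exist
                return 'ambiguous filenames in raw reads directory!'
            return count_reads(merged)
        return sum(count_reads(bam) for bam in demuxed)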
