Skip to content

Commit

Permalink
Merge branch 'master' into ct-ag100-exploration
Browse files Browse the repository at this point in the history
  • Loading branch information
tomkinsc committed Mar 26, 2018
2 parents 722d24f + 2e2e05a commit 33edff0
Show file tree
Hide file tree
Showing 33 changed files with 354 additions and 171 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM quay.io/broadinstitute/viral-baseimage:0.1.8
FROM quay.io/broadinstitute/viral-baseimage:0.1.9

LABEL maintainer "viral-ngs@broadinstitute.org"

Expand Down
29 changes: 23 additions & 6 deletions ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def tbl_transfer_common(cmap, ref_tbl, out_tbl, alt_chrlens, oob_clip=False):
if not ((refID.startswith('gb|') or refID.startswith('ref|')) and refID.endswith('|') and
len(refID) > 4):
raise Exception("reference annotation does not refer to a GenBank or RefSeq accession")
refID = refID[refID.find("|") + 1:-1]
refID = '|'.join(refID.split('|')[1:-1])
refSeqID = [x for x in cmap.keys() if refID in x][0]
#altid = cmap.mapChr(refSeqID, altid)
altid = list(set(cmap.keys()) - set([refSeqID]))[0] # cmap.mapChr(refSeqID, altid)
Expand Down Expand Up @@ -176,7 +176,7 @@ def tbl_transfer_prealigned(inputFasta, refFasta, refAnnotTblFiles, outputDir, o
# identify the correct feature table as the one that has an ID that is
# part of the ref seq ID
fileAccession = util.genbank.get_feature_table_id(tblFilename)
if fileAccession in matchingRefSeq.id:
if fileAccession == matchingRefSeq.id.split('|')[0]:
ref_tbl = tblFilename
break
if ref_tbl == "":
Expand Down Expand Up @@ -396,7 +396,7 @@ def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=N

def prep_genbank_files(templateFile, fasta_files, annotDir,
master_source_table=None, comment=None, sequencing_tech=None,
coverage_table=None, biosample_map=None):
coverage_table=None, biosample_map=None, organism=None, mol_type=None):
''' Prepare genbank submission files. Requires .fasta and .tbl files as input,
as well as numerous other metadata files for the submission. Creates a
directory full of files (.sqn in particular) that can be sent to GenBank.
Expand Down Expand Up @@ -436,13 +436,22 @@ def prep_genbank_files(templateFile, fasta_files, annotDir,
Bio.SeqIO.write(seq_obj, out_chr_fasta, "fasta")

# make .fsa files
fasta2fsa(out_file_name, annotDir, biosample=biosample.get(sample))
fasta2fsa(out_file_name, annotDir, biosample=biosample.get(sample_base))
# remove the .fasta file
os.unlink(out_file_name)

# make .src files
if master_source_table:
shutil.copy(master_source_table, os.path.join(annotDir, sample + '.src'))
out_src_fname = os.path.join(annotDir, sample + '.src')
with open(master_source_table, 'rt') as inf:
with open(out_src_fname, 'wt') as outf:
outf.write(inf.readline())
for line in inf:
row = line.rstrip('\n').split('\t')
if row[0] == sample_base:
row[0] = sample
outf.write('\t'.join(row) + '\n')

# make .cmt files
make_structured_comment_file(os.path.join(annotDir, sample + '.cmt'),
name=sample,
Expand All @@ -451,7 +460,13 @@ def prep_genbank_files(templateFile, fasta_files, annotDir,

# run tbl2asn (relies on filesnames matching by prefix)
tbl2asn = tools.tbl2asn.Tbl2AsnTool()
tbl2asn.execute(templateFile, annotDir, comment=comment, per_genome_comment=True)
source_quals = []
if organism:
source_quals.append(('organism', organism))
if mol_type:
source_quals.append(('mol_type', mol_type))
tbl2asn.execute(templateFile, annotDir, comment=comment,
per_genome_comment=True, source_quals=source_quals)


def parser_prep_genbank_files(parser=argparse.ArgumentParser()):
Expand All @@ -462,6 +477,8 @@ def parser_prep_genbank_files(parser=argparse.ArgumentParser()):
parser.add_argument('--comment', default=None, help='comment field')
parser.add_argument('--sequencing_tech', default=None, help='sequencing technology (e.g. Illumina HiSeq 2500)')
parser.add_argument('--master_source_table', default=None, help='source modifier table')
parser.add_argument('--organism', default=None, help='species name')
parser.add_argument('--mol_type', default=None, help='molecule type')
parser.add_argument("--biosample_map",
help="""A file with two columns and a header: sample and BioSample.
This file may refer to samples that are not included in this submission.""")
Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/dx-defaults-assemble_denovo_with_deplete.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"assemble_denovo_with_deplete.deplete_taxa.bmtaggerDbs": [
"dx://file-BYF8y0Q06PJ7G1fPvkB9q3fK"
"assemble_denovo_with_deplete.deplete_taxa.bwaDbs": [
"dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P"
],
"assemble_denovo_with_deplete.deplete_taxa.blastDbs": [
"dx://file-F8B3B6Q09y3bZg3j1FqK7bJ9",
Expand Down
14 changes: 14 additions & 0 deletions pipes/WDL/dx-defaults-contigs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"contigs.deplete.bwaDbs": [
"dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P"
],
"contigs.deplete.blastDbs": [
"dx://file-F8B3B6Q09y3bZg3j1FqK7bJ9",
"dx://file-F8BjgXj09y3gkfZGPPQZbZkK",
"dx://file-F8B3Pp809y3jBpXq7xjxbq94",
"dx://file-F8B3B6809y3kK1JP5X8Pg361"
],

"contigs.spades.trim_clip_db":
"dx://file-BXF0vYQ0QyBF509G9J12g927"
}
4 changes: 2 additions & 2 deletions pipes/WDL/dx-defaults-demux_plus.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
"demux_plus.spikein.spikein_db":
"dx://file-F6PXkF00Yqp3zVXq14fF98Kz",

"demux_plus.deplete.bmtaggerDbs": [
"dx://file-BYF8y0Q06PJ7G1fPvkB9q3fK"
"demux_plus.deplete.bwaDbs": [
"dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P"
],
"demux_plus.deplete.blastDbs": [
"dx://file-F8B3B6Q09y3bZg3j1FqK7bJ9",
Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/dx-defaults-deplete_only.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"deplete_only.deplete_taxa.bmtaggerDbs": [
"dx://file-BYF8y0Q06PJ7G1fPvkB9q3fK"
"deplete_only.deplete_taxa.bwaDbs": [
"dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P"
],
"deplete_only.deplete_taxa.blastDbs": [
"dx://file-F8B3B6Q09y3bZg3j1FqK7bJ9",
Expand Down
4 changes: 4 additions & 0 deletions pipes/WDL/dx-defaults-spikein.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"spikein.spikein_report.spikein_db":
"dx://file-F6PXkF00Yqp3zVXq14fF98Kz"
}
32 changes: 32 additions & 0 deletions pipes/WDL/workflows/align_and_annot.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import "interhost.wdl" as interhost
import "ncbi.wdl" as ncbi

# DX_SKIP_WORKFLOW
workflow align_and_annot {

File reference_fasta
Array[File]+ assemblies_fasta
Array[File]+ annotations_tbl

call interhost.multi_align_mafft_ref as mafft {
input:
reference_fasta = reference_fasta,
assemblies_fasta = assemblies_fasta
}

scatter(by_chr in zip(mafft.alignments_by_chr, annotations_tbl)) {
call ncbi.annot_transfer as annot {
input:
chr_mutli_aln_fasta = by_chr.left,
reference_fasta = reference_fasta,
reference_feature_table = by_chr.right
}
call ncbi.prepare_genbank as genbank {
input:
assemblies_fasta = assemblies_fasta,
annotations_tbl = annot.transferred_feature_tables,
out_prefix = basename(by_chr.right, '.tbl')
}
}
}
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/align_and_plot.wdl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import "tasks/reports.wdl" as reports
import "reports.wdl" as reports

workflow align_and_plot {
call reports.plot_coverage {
Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/workflows/assemble_denovo.wdl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import "tasks/taxon_filter.wdl" as taxon_filter
import "tasks/assembly.wdl" as assembly
import "taxon_filter.wdl" as taxon_filter
import "assembly.wdl" as assembly

workflow assemble_denovo {
File reads_unmapped_bam
Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/workflows/assemble_denovo_with_deplete.wdl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import "tasks/taxon_filter.wdl" as taxon_filter
import "tasks/assembly.wdl" as assembly
import "taxon_filter.wdl" as taxon_filter
import "assembly.wdl" as assembly

workflow assemble_denovo_with_deplete {

Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/workflows/assemble_refbased.wdl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import "tasks/assembly.wdl" as assembly
import "assembly.wdl" as assembly

workflow assemble_refbased {
call assembly.refine_2x_and_plot
}
}
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/classify_kraken.wdl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import "tasks/metagenomics.wdl" as metagenomics
import "metagenomics.wdl" as metagenomics
workflow classify_kraken {
call metagenomics.kraken
Expand Down
17 changes: 17 additions & 0 deletions pipes/WDL/workflows/contigs.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import "metagenomics.wdl" as metagenomics
import "taxon_filter.wdl" as taxon_filter
import "assembly.wdl" as assembly
workflow contigs {

call taxon_filter.deplete_taxa as deplete

call assembly.assemble as spades {
input:
assembler = "spades",
reads_unmapped_bam = deplete.cleaned_bam
}

# TO DO: taxonomic classification of contigs
}
10 changes: 5 additions & 5 deletions pipes/WDL/workflows/demux_metag.wdl
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#DX_SKIP_WORKFLOW
import "tasks/demux.wdl" as demux
import "tasks/metagenomics.wdl" as metagenomics
import "tasks/taxon_filter.wdl" as taxon_filter
import "tasks/assembly.wdl" as assembly
import "tasks/reports.wdl" as reports
import "demux.wdl" as demux
import "metagenomics.wdl" as metagenomics
import "taxon_filter.wdl" as taxon_filter
import "assembly.wdl" as assembly
import "reports.wdl" as reports
workflow demux_metag {
File krona_taxonomy_db_tgz
Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/workflows/demux_only.wdl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import "tasks/demux.wdl" as tasks_demux
import "demux.wdl" as tasks_demux

workflow demux_only {
call tasks_demux.illumina_demux
}
}
10 changes: 5 additions & 5 deletions pipes/WDL/workflows/demux_plus.wdl
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import "tasks/demux.wdl" as demux
import "tasks/metagenomics.wdl" as metagenomics
import "tasks/taxon_filter.wdl" as taxon_filter
import "tasks/assembly.wdl" as assembly
import "tasks/reports.wdl" as reports
import "demux.wdl" as demux
import "metagenomics.wdl" as metagenomics
import "taxon_filter.wdl" as taxon_filter
import "assembly.wdl" as assembly
import "reports.wdl" as reports
workflow demux_plus {

Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/workflows/deplete_only.wdl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import "tasks/taxon_filter.wdl" as taxon_filter
import "taxon_filter.wdl" as taxon_filter

workflow deplete_only {
call taxon_filter.deplete_taxa
}
}
7 changes: 7 additions & 0 deletions pipes/WDL/workflows/fetch_annotations.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import "ncbi.wdl" as ncbi

workflow fetch_annotations {

call ncbi.download_annotations as download

}
4 changes: 2 additions & 2 deletions pipes/WDL/workflows/scaffold_and_refine.wdl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import "tasks/assembly.wdl" as assembly
import "assembly.wdl" as assembly

workflow scaffold_and_refine {
File reads_unmapped_bam
Expand All @@ -13,4 +13,4 @@ workflow scaffold_and_refine {
assembly_fasta = scaffold.scaffold_fasta,
reads_unmapped_bam = reads_unmapped_bam
}
}
}
7 changes: 7 additions & 0 deletions pipes/WDL/workflows/spikein.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import "reports.wdl" as reports

workflow spikein {

call reports.spikein_report as spikein_report

}
29 changes: 18 additions & 11 deletions pipes/WDL/workflows/tasks/assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ task scaffold {
--outAlternateContigs ${sample_name}.scaffolding_alt_contigs.fasta \
--loglevel=DEBUG

grep '^>' ${sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | tr '\n' '\t' > ${sample_name}.scaffolding_chosen_ref.txt

assembly.py gapfill_gap2seq \
${sample_name}.intermediate_scaffold.fasta \
${reads_bam} \
Expand All @@ -132,7 +134,7 @@ task scaffold {
assembly.py impute_from_reference \
${sample_name}.intermediate_gapfill.fasta \
${sample_name}.scaffolding_chosen_ref.fasta \
${sample_name}.scaffold.fasta \
${sample_name}.scaffolded_imputed.fasta \
--newName ${sample_name} \
${'--replaceLength=' + replace_length} \
${'--minLengthFraction=' + min_length_fraction} \
Expand All @@ -142,14 +144,15 @@ task scaffold {
}

output {
File scaffold_fasta = "${sample_name}.scaffold.fasta"
File intermediate_scaffold_fasta = "${sample_name}.intermediate_scaffold.fasta"
File intermediate_gapfill_fasta = "${sample_name}.intermediate_gapfill.fasta"
Int assembly_preimpute_length = read_int("assembly_preimpute_length")
Int assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous")
File scaffolding_chosen_ref = "${sample_name}.scaffolding_chosen_ref.fasta"
File scaffolding_stats = "${sample_name}.scaffolding_stats.txt"
File scaffolding_alt_contigs = "${sample_name}.scaffolding_alt_contigs.fasta"
File scaffold_fasta = "${sample_name}.scaffolded_imputed.fasta"
File intermediate_scaffold_fasta = "${sample_name}.intermediate_scaffold.fasta"
File intermediate_gapfill_fasta = "${sample_name}.intermediate_gapfill.fasta"
Int assembly_preimpute_length = read_int("assembly_preimpute_length")
Int assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous")
String scaffolding_chosen_ref_name = read_string("${sample_name}.scaffolding_chosen_ref.txt")
File scaffolding_chosen_ref = "${sample_name}.scaffolding_chosen_ref.fasta"
File scaffolding_stats = "${sample_name}.scaffolding_stats.txt"
File scaffolding_alt_contigs = "${sample_name}.scaffolding_alt_contigs.fasta"
}

runtime {
Expand Down Expand Up @@ -307,7 +310,8 @@ task refine_2x_and_plot {
samtools flagstat ${sample_name}.all.bam | tee ${sample_name}.all.bam.flagstat.txt
grep properly ${sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage
#echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage
python -c "print (float("`cat bases_aligned`")/"`cat assembly_length`") if "`cat assembly_length`">0 else 0" > mean_coverage

# fastqc mapped bam
reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html
Expand All @@ -321,6 +325,7 @@ task refine_2x_and_plot {
--plotWidth 1100 \
--plotHeight 850 \
--plotDPI 100 \
--plotTitle "${sample_name} coverage plot" \
--loglevel=DEBUG
else
touch ${sample_name}.coverage_plot.pdf
Expand All @@ -333,16 +338,18 @@ task refine_2x_and_plot {
File refine2_sites_vcf_gz = "${sample_name}.refine2.pre_fasta.vcf.gz"
File final_assembly_fasta = "${sample_name}.fasta"
File aligned_bam = "${sample_name}.all.bam"
File aligned_bam_idx = "${sample_name}.all.bai"
File aligned_bam_flagstat = "${sample_name}.all.bam.flagstat.txt"
File aligned_only_reads_bam = "${sample_name}.mapped.bam"
File aligned_only_reads_bam_idx = "${sample_name}.mapped.bai"
File aligned_only_reads_fastqc = "${sample_name}.mapped_fastqc.html"
File coverage_plot = "${sample_name}.coverage_plot.pdf"
Int assembly_length = read_int("assembly_length")
Int assembly_length_unambiguous = read_int("assembly_length_unambiguous")
Int reads_aligned = read_int("reads_aligned")
Int read_pairs_aligned = read_int("read_pairs_aligned")
Int bases_aligned = read_int("bases_aligned")
Int mean_coverage = read_int("mean_coverage")
Float mean_coverage = read_float("mean_coverage")
}

runtime {
Expand Down
Loading

0 comments on commit 33edff0

Please sign in to comment.