Skip to content

Commit

Permalink
Work on WDL rules
Browse files Browse the repository at this point in the history
  • Loading branch information
yesimon committed Feb 28, 2018
1 parent 5058dda commit 9188175
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 16 deletions.
50 changes: 37 additions & 13 deletions metagenomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,13 @@ class TaxIdError(ValueError):
'''Taxonomy ID couldn't be determined.'''


def maybe_compressed(fn, missing_ok=False):
    """Resolve a path that may exist either plain or gzip-compressed.

    Args:
        fn: Path to look up (without the '.gz' suffix).
        missing_ok: When True, return None instead of raising if neither
            the plain nor the gzipped file exists.

    Returns:
        `fn` if it exists, else `fn + '.gz'` if that exists, else None
        (only when missing_ok is True).

    Raises:
        FileNotFoundError: Neither file exists and missing_ok is False.
    """
    fn_gz = fn + '.gz'
    if os.path.exists(fn):
        return fn
    if os.path.exists(fn_gz):
        return fn_gz
    if not missing_ok:
        raise FileNotFoundError(fn)
    return None  # explicit: caller opted in to a missing file


Expand All @@ -79,8 +79,8 @@ def __init__(
load_names=False
):
if tax_dir:
gis_paths = [maybe_compressed(join(tax_dir, 'gi_taxid_nucl.dmp')),
maybe_compressed(join(tax_dir, 'gi_taxid_prot.dmp'))]
gis_paths = [maybe_compressed(join(tax_dir, 'gi_taxid_nucl.dmp'), missing_ok=True),
maybe_compressed(join(tax_dir, 'gi_taxid_prot.dmp'), missing_ok=True)]
nodes_path = maybe_compressed(join(tax_dir, 'nodes.dmp'))
names_path = maybe_compressed(join(tax_dir, 'names.dmp'))
self.tax_dir = tax_dir
Expand Down Expand Up @@ -198,11 +198,8 @@ def blast_records(f):
if x != 'N/A']
for field in (2, 10, 11):
parts[field] = float(parts[field])
args = parts[:12]
extra = parts[12:]
args.append(extra)

yield BlastRecord(*args)
yield BlastRecord(*parts)


def paired_query_id(record):
Expand Down Expand Up @@ -721,15 +718,20 @@ def kraken_dfs_report(db, taxa_hits):
db.children = parents_to_children(db.parents)
total_hits = sum(taxa_hits.values())
lines = []
kraken_dfs(db, lines, taxa_hits, total_hits, 1, 0)
if total_hits:
kraken_dfs(db, lines, taxa_hits, total_hits, 1, 0)
unclassified_hits = taxa_hits.get(0, 0)
unclassified_hits += taxa_hits.get(-1, 0)

if unclassified_hits > 0 or not total_hits:
percent_covered = '%.2f' % (unclassified_hits / total_hits * 100)
if total_hits:
percent_unclassified = '%.2f' % (unclassified_hits / total_hits * 100)
else:
percent_unclassified = 0

lines.append(
'\t'.join([
str(percent_covered), str(unclassified_hits), str(unclassified_hits), 'U', '0', 'unclassified'
str(percent_unclassified), str(unclassified_hits), str(unclassified_hits), 'U', '0', 'unclassified'
])
)
return reversed(lines)
Expand Down Expand Up @@ -1066,9 +1068,28 @@ def extract_kraken_unclassified(inKrakenReads, inBam, outBam, p_threshold=0.5):
qname = parts[1]
taxid = parts[2]
length = parts[3]
p = float(parts[4].split('=')[1])
kmer_str = parts[5]
# Check if already filtered and P= added
if '=' in parts[4]:
p = float(parts[4].split('=')[1])
kmer_str = parts[5]
else:
kmer_str = parts[4]
kmers = [kmer for kmer in kmer_str.split(' ')]
kmer_counts = []
for kmer in kmers:
t = kmer.split(':')
kmer_counts.append((t[0], int(t[1])))
n_kmers = 0
n_classified_kmers = 0
for kmer_taxid, count in kmer_counts:
n_kmers += count
if kmer_taxid not in ('0', 'A'):
n_classified_kmers += count
p = n_classified_kmers / n_kmers

if p <= p_threshold:
if qname.endswith('/1') or qname.endswith('/2'):
qname = qname[:-2]
qnames.add(qname)

in_sam = pysam.AlignmentFile(inBam, 'rb', check_sq=False)
Expand All @@ -1089,6 +1110,7 @@ def parser_kraken_unclassified(parser=argparse.ArgumentParser()):
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, extract_kraken_unclassified, split_args=True)
return parser
__commands__.append(('kraken_unclassified', parser_kraken_unclassified))


def blast_report(tool, db, input_fn, tax_dir=None, tax_db=None,
Expand Down Expand Up @@ -1171,6 +1193,7 @@ def parser_blast_taxonomy(parser=argparse.ArgumentParser()):
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, blast_taxonomy, split_args=True)
return parser
__commands__.append(('blast_taxonomy', parser_blast_taxonomy))


def infernal_rna(db, inFasta, outTbl, numThreads=None):
Expand Down Expand Up @@ -1247,6 +1270,7 @@ def parser_cluster_contigs(parser=argparse.ArgumentParser()):
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, cluster_contigs, split_args=True)
return parser
__commands__.append(('cluster_contigs', parser_cluster_contigs))


def parser_metagenomic_report_merge(parser=argparse.ArgumentParser()):
Expand Down
101 changes: 101 additions & 0 deletions pipes/WDL/workflows/tasks/metagenomics.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,107 @@ task krona {
}


# Runs nucleotide BLAST of assembled contigs against the NCBI nt database,
# emitting raw tabular hits, per-contig LCA assignments, and a summary report.
task blastn_contigs {
File contigs_fasta

# Output filename prefix: input basename with the ".fasta" suffix stripped.
String contigs_basename = basename(contigs_fasta, ".fasta")

command {
set -ex -o pipefail
# Prefer instance-local scratch space when present.
# NOTE(review): TMPDIR is assigned but not exported — confirm that child
# processes actually see this value.
if [ -d /mnt/tmp ]; then
TMPDIR=/mnt/tmp
fi
DB_DIR=$(mktemp -d)
TAX_DB_DIR=$(mktemp -d)

# Download latest databases from NCBI FTP
lftp -c "mirror --exclude-glob * --include-glob nt.*.tar.gz ftp://ftp.ncbi.nlm.nih.gov/blast/db/ $DB_DIR"
# NOTE(review): pigz -d only gunzips, leaving nt.*.tar files — they do not
# appear to be untarred before use; verify blast_taxonomy accepts this layout.
pigz -d $DB_DIR/*.gz

# Full NCBI taxonomy dump (nodes.dmp/names.dmp) used for LCA assignment.
curl -s ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz | tar -zx -C $TAX_DB_DIR

metagenomics.py blast_taxonomy ${contigs_fasta} \
--taxDb $TAX_DB_DIR \
--ntDb $DB_DIR \
--outBlastn ${contigs_basename}.blastn.m8.gz \
--outBlastnLca ${contigs_basename}.blastn.lca.tsv.gz \
--outBlastnReport ${contigs_basename}.blastn.report
}

output {
File blastn = "${contigs_basename}.blastn.m8.gz"
File blastn_lca = "${contigs_basename}.blastn.lca.tsv.gz"
File blastn_report = "${contigs_basename}.blastn.report"
}
}

# Runs translated BLAST (blastx) of assembled contigs against the NCBI nr
# protein database, emitting raw hits, per-contig LCA assignments, and a report.
task blastx_contigs {
File contigs_fasta

# Output filename prefix: input basename with the ".fasta" suffix stripped.
String contigs_basename = basename(contigs_fasta, ".fasta")

command {
set -ex -o pipefail
# Prefer instance-local scratch space when present.
# NOTE(review): TMPDIR is assigned but not exported — confirm that child
# processes actually see this value.
if [ -d /mnt/tmp ]; then
TMPDIR=/mnt/tmp
fi
DB_DIR=$(mktemp -d)
TAX_DB_DIR=$(mktemp -d)

# Download latest databases from NCBI FTP
lftp -c "mirror --exclude-glob * --include-glob nr.*.tar.gz ftp://ftp.ncbi.nlm.nih.gov/blast/db/ $DB_DIR"
# NOTE(review): pigz -d only gunzips, leaving nr.*.tar files — they do not
# appear to be untarred before use; verify blast_taxonomy accepts this layout.
pigz -d $DB_DIR/*.gz

# BLAST taxonomy ID database (taxdb) unpacked alongside the sequence db.
curl -s ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz | tar -zx -C $DB_DIR

# Full NCBI taxonomy dump (nodes.dmp/names.dmp) used for LCA assignment.
curl -s ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz | tar -zx -C $TAX_DB_DIR

metagenomics.py blast_taxonomy ${contigs_fasta} \
--taxDb $TAX_DB_DIR \
--nrDb $DB_DIR \
--outBlastx ${contigs_basename}.blastx.m8.gz \
--outBlastxLca ${contigs_basename}.blastx.lca.tsv.gz \
--outBlastxReport ${contigs_basename}.blastx.report
}

output {
File blastx = "${contigs_basename}.blastx.m8.gz"
File blastx_lca = "${contigs_basename}.blastx.lca.tsv.gz"
File blastx_report = "${contigs_basename}.blastx.report"
}
}

# Runs RPS-BLAST of assembled contigs against the NCBI Conserved Domain
# Database (CDD) and writes a per-contig domain-model report.
task rpsblast_models {
File contigs_fasta

# Output filename prefix: input basename with the ".fasta" suffix stripped.
String contigs_basename = basename(contigs_fasta, ".fasta")
command {
set -ex -o pipefail
# Prefer instance-local scratch space when present.
# NOTE(review): TMPDIR is assigned but not exported — confirm that child
# processes actually see this value.
if [ -d /mnt/tmp ]; then
TMPDIR=/mnt/tmp
fi
DB_DIR=$(mktemp -d)

# Fetch the prebuilt little-endian CDD profile database.
curl -s ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/Cdd_LE.tar.gz | tar -zx -C $DB_DIR

metagenomics.py rpsblast_models \
$DB_DIR \
${contigs_fasta} \
${contigs_basename}.cdd.report
}

output {
# Fix: the string literal was missing its closing quote, which is a WDL
# syntax error.
File cdd_report = "${contigs_basename}.cdd.report"
}
}
task infernal_rfam {
}
task diamond_contigs {
File contigs_fasta
File reads_unmapped_bam
Expand Down
4 changes: 2 additions & 2 deletions pipes/rules/metagenomics.rules
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ rule krona_import_taxonomy:
mem=32
run:
krona_db_prefix = strip_protocol(config["krona_db"], relative=True)
shell("{config[bin_dir]}/metagenomics.py krona {input.tsv} "+krona_db_prefix+" {output} --noRank")
shell("{config[bin_dir]}/metagenomics.py krona {input.tsv} {krona_db_prefix} {output} --norank")

rule unclassified_trinity:
''' This step runs the Trinity assembler on Kraken unclassified reads.
Expand All @@ -238,7 +238,7 @@ rule unclassified_trinity:
shell:
'''
{config[bin_dir]}/metagenomics.py kraken_unclassified {input.kraken} {input.bam} {params.extracted}
{config[bin_dir]}/assembly.py assemble_trinity {params.extracted} {params.clipDb} {output} --n_reads 0 --outReads {params.subsamp_bam} --threads {params.numThreads} --JVMmemory 14g
{config[bin_dir]}/assembly.py assemble_trinity {params.extracted} {params.clipDb} {output} --n_reads 100000 --outReads {params.subsamp_bam} --threads {params.numThreads} --JVMmemory 14g
'''

rule cluster_contigs:
Expand Down
2 changes: 2 additions & 0 deletions tools/infernal.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

log = logging.getLogger(__name__)

CONDA_VERSION='1.1.2'

class Infernal(tools.Tool):

COMMANDS = [
Expand Down
3 changes: 2 additions & 1 deletion tools/prodigal.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@

log = logging.getLogger(__name__)

TOOL_VERSION = '2.6.3'

class Prodigal(tools.Tool):

def __init__(self, install_methods=None):
if not install_methods:
install_methods = [
tools.CondaPackage('prodigal', version=CONDA_VERSION)
tools.CondaPackage('prodigal', version=TOOL_VERSION)
]
super(Prodigal, self).__init__(install_methods=install_methods)

Expand Down

0 comments on commit 9188175

Please sign in to comment.