Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to use ntLink for mapping step #110

Merged
merged 16 commits into from
Dec 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ tigmint-make tigmint-long draft=myassembly reads=myreads span=auto G=gsize dist=

- `minimap2 map-ont` is used to align long reads from the Oxford Nanopore Technologies (ONT) platform, which is the default input for Tigmint. To use PacBio long reads specify the parameter `longmap=pb`. The former calls `minimap2 -x map-ont` while the latter calls `minimap2 -x map-pb` instead. When using PacBio HiFi long reads, specify the parameter `longmap=hifi`.

Optionally, ntLink (v1.3.6+) can be used to map the long reads to the draft assembly. To use ntLink mappings, specify `mapping=ntLink` to your `tigmint` command.


# Notes

Expand Down
4 changes: 2 additions & 2 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
displayName: Create Anaconda environment
- script: |
source activate tigmint_CI
conda install --yes --quiet --name tigmint_CI -c conda-forge -c bioconda pytest bedtools bwa samtools=1.9 minimap2 pylint compilers
conda install --yes --quiet --name tigmint_CI -c conda-forge -c bioconda pytest bedtools bwa samtools=1.9 minimap2 pylint compilers ntlink
conda install --yes --quiet --name tigmint_CI -c conda-forge -c bioconda --file requirements.txt
displayName: Install Anaconda packages
- script: |
Expand All @@ -29,7 +29,7 @@ jobs:
- script: |
source activate tigmint_CI
cd bin
pylint tigmint-cut tigmint_molecule.py tigmint_molecule_paf.py tigmint_estimate_dist.py
pylint tigmint-cut tigmint_molecule.py tigmint_molecule_paf.py tigmint_estimate_dist.py tigmint-ntlink-map tigmint-filter-paf
cd ../
displayName: Run pylint
- script: |
Expand Down
4 changes: 2 additions & 2 deletions bin/tigmint-cut
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@ def findBreakpoints(bed_name, window_length, contigLengths, num_spanning, bps_qu
checkSpanningMolecules(interval_tree, window_length, contigLengths, contig, num_spanning, bps_queue, attempted_corr_queue, min_length)
interval_tree.clear()
contig = bed.chrom
interval_tree[bed.start:bed.stop] = bed.score
interval_tree[bed.start:bed.stop] = bed.name
else: # Same contig ID, keep reading in the BED file to collect all molecule extents for that contig
interval_tree[bed.start:bed.stop] = bed.score
interval_tree[bed.start:bed.stop] = bed.name
# Ensuring final contig in the bed file is also checked for spanning molecules
if contig != "":
checkSpanningMolecules(interval_tree, window_length, contigLengths, contig, num_spanning, bps_queue, attempted_corr_queue, min_length)
Expand Down
31 changes: 31 additions & 0 deletions bin/tigmint-filter-paf
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
"""
Given a PAF-formatted mapping file, output mapping extents BED file
"""
import argparse

def paf_to_molecule_extents(args: argparse.Namespace) -> None:
"Read through the input PAF file, and output the mapping extents above the length threshold"
with open(args.PAF, 'r') as fin:
for line in fin:
line = line.strip().split("\t")
ctg_name, ctg_start, ctg_end = line[5], int(line[7]), int(line[8])
read_name = line[0]
num_mx = int(line[9])
if ctg_end - ctg_start >= args.m:
print(ctg_name, ctg_start, ctg_end, read_name, num_mx, sep="\t")

def main() -> None:
"Filter input PAF file, output BED file"
parser = argparse.ArgumentParser(description="Given a PAF-formatted mapping file, "
"output molecule extents BED")
parser.add_argument("PAF", help="Input PAF file")
parser.add_argument("-m", help="Minimum size of output extents (bp)", default=2000, type=int)
args = parser.parse_args()

args.PAF = "/dev/stdin" if args.PAF == "-" else args.PAF

paf_to_molecule_extents(args)

if __name__ == "__main__":
main()
33 changes: 24 additions & 9 deletions bin/tigmint-make
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ G=0
# Minimap2 long read map parameter
longmap=ont

# Mapping approach
mapping=minimap2 # Can be minimap2 or ntLink
ntLink_k=32
ntLink_w=100

# Minimum molecule size
minsize=2000

Expand Down Expand Up @@ -127,12 +132,16 @@ endif

# Record run time and memory usage in a file using GNU time.
ifdef time
ifneq ($(shell command -v memusg),)
gtime=command memusg -t -o $@.time
else
ifneq ($(shell command -v gtime),)
gtime=command gtime -v -o $@.time
else
gtime=command time -v -o $@.time
endif
endif
endif

.DELETE_ON_ERROR:
.SECONDARY:
Expand Down Expand Up @@ -179,8 +188,8 @@ tigmint-molecule: $(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist

# Perform the final tigmint-cut step for linked reads
tigmint-cut: \
$(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.fa \
$(draft).tigmint.fa
$(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.fa \
$(draft).tigmint.fa

# Correct misassemblies for long reads using Tigmint
tigmint-long: \
Expand All @@ -195,8 +204,8 @@ tigmint-long-to-linked: $(draft).$(reads).cut$(cut).molecule.size$(minsize).dist

# Perform the final tigmint-cut step for long reads
tigmint-long-cut: \
$(draft).$(reads).cut$(cut).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.fa \
$(draft).cut$(cut).tigmint.fa
$(draft).$(reads).cut$(cut).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.fa \
$(draft).cut$(cut).tigmint.fa

# Calculate assembly metrics after Tigmint
tigmint_metrics: \
Expand Down Expand Up @@ -266,6 +275,14 @@ dist_sample=1000000
$(reads).tigmint-long.params.tsv: $(longreads)
$(gtime) $(bin)/tigmint_estimate_dist.py $< -n $(dist_sample) -o $@

# Set dist options for the molecule extents rule
dist_options=
ifeq ($(dist), auto)
dist_options+=-p $(reads).tigmint-long.params.tsv
else
dist_options+=-d$(dist)
endif

# Create molecule extents BED using cut long reads
$(draft).$(reads).cut$(cut).molecule.size$(minsize).dist$(dist).bed: $(longreads) $(draft).fa $(reads).tigmint-long.params.tsv
ifeq ($(span), auto)
Expand All @@ -274,14 +291,12 @@ ifeq ($G, 0)
endif
endif

ifeq ($(dist), auto)
$(gtime) sh -c '$(bin)/../src/long-to-linked-pe -l $(cut) -m$(minsize) -g$G -s -b $(reads).barcode-multiplicity.tsv --bx -t$t --fasta -f $(reads).tigmint-long.params.tsv $< | \
minimap2 -y -t$t -x map-$(longmap) --secondary=no $(draft).fa - | \
$(bin)/tigmint_molecule_paf.py -q$(mapq) -s$(minsize) -p $(reads).tigmint-long.params.tsv - | sort -k1,1 -k2,2n -k3,3n $(SORT_OPTS) > $@'
ifeq ($(mapping), ntLink)
$(gtime) $(bin)/tigmint-ntlink-map --bed $@ -k$(ntLink_k) -w$(ntLink_w) -t$t --path $(bin) -m$(minsize) $(draft) $<
else
$(gtime) sh -c '$(bin)/../src/long-to-linked-pe -l $(cut) -m$(minsize) -g$G -s -b $(reads).barcode-multiplicity.tsv --bx -t$t --fasta -f $(reads).tigmint-long.params.tsv $< | \
minimap2 -y -t$t -x map-$(longmap) --secondary=no $(draft).fa - | \
$(bin)/tigmint_molecule_paf.py -q$(mapq) -d$(dist) -s$(minsize) - | sort -k1,1 -k2,2n -k3,3n $(SORT_OPTS) > $@'
$(bin)/tigmint_molecule_paf.py -q$(mapq) -s$(minsize) $(dist_options) - | sort -k1,1 -k2,2n -k3,3n $(SORT_OPTS) > $@'
endif

# Create molecule extents TSV
Expand Down
58 changes: 58 additions & 0 deletions bin/tigmint-ntlink-map
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python3
"""
Helper script for running ntLink, followed by creation of the molecule extents file
@author: Lauren Coombe
"""
import argparse
import shlex
import subprocess


def run_ntlink_pair(args):
"Run ntLink pair stage to obtain PAF-formatted mappings"
command = f"ntLink pair target={args.target}.fa reads={args.reads} " \
f"paf=True k={args.k} w={args.w} t={args.t}"
command_shlex = shlex.split(command)
return_code = subprocess.call(command_shlex)
assert return_code == 0

def run_tigmint_filter_paf(args):
"Run tigmint-filter-paf to filter and convert PAF file to BED format"
script_loc = "tigmint-filter-paf"
if args.path:
script_loc = f"{args.path}/{script_loc}"

command_1_shlex = shlex.split(f"{script_loc} -m {args.m} {args.target}.fa.k{args.k}.w{args.w}.z1000.paf")
command_2_shlex = shlex.split(f"sort -k1,1 -k2,2n -k3,3n")

with open(args.bed, "wb") as outfile:
process_1 = subprocess.Popen(command_1_shlex, stdout=subprocess.PIPE)
process_2 = subprocess.Popen(command_2_shlex, stdin=process_1.stdout, stdout=outfile)
process_1.wait()
process_2.wait()

assert process_1.returncode == 0
assert process_2.returncode == 0

def main():
"Run ntLink mapping, then output filtered, sorted BED file with mapping extents"
parser = argparse.ArgumentParser(description="Run ntLink mapping, then output a filtered "
"and sorted BED file with mapping extents")
parser.add_argument("target", help="Draft assembly to scaffold")
parser.add_argument("reads", help="Long reads file")
parser.add_argument("--bed", help="Name for output BED file with mapping extents",
required=True, type=str)
parser.add_argument("-k", help="K-mer size (bp)", required=True, type=int)
parser.add_argument("-w", help="Window size", required=True, type=int)
parser.add_argument("-t", help="Number of threads [4]", default=4, type=int)
parser.add_argument("--path", help="Path to directory with tigmint-fiter-paf executable, "
"if not in PATH", type=str)
parser.add_argument("-m", help="Minimum size for mapping block (bp) [2000]", default=2000, type=int)

args = parser.parse_args()

run_ntlink_pair(args)
run_tigmint_filter_paf(args)

if __name__ == "__main__":
main()
Loading