Skip to content

Commit

Permalink
add header to accessions, add hybrid and short-polish
Browse files Browse the repository at this point in the history
  • Loading branch information
rpetit3 committed Sep 3, 2023
1 parent 1c8d4a9 commit 2dacb6a
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 29 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

## 1.0.7

- Fixed `bactopia-search` not writing a header line to `accessions.txt`
- Added `--hybrid` and `--short-polish` to `bactopia-prepare`

## 1.0.6

- Fixed `bactopia-summary` handling of empty searches
Expand Down
114 changes: 86 additions & 28 deletions bactopia/cli/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
"--pe2-pattern",
"--merge",
"--ont",
"--hybrid",
"--short-polish",
"--recursive",
"--prefix",
],
Expand Down Expand Up @@ -130,33 +132,61 @@ def print_examples():
print(
textwrap.dedent(
"""
# Example '*.fastq.gz' FASTQ files:
bactopia prepare --path fastqs/
sample runtype genome_size species r1 r2 extra
sample01 paired-end 0 UNKNOWN_SPECIES fastqs/sample01_R1.fastq.gz fastqs/sample01_R2.fastq.gz
sample02 single-end 0 UNKNOWN_SPECIES fastqs/sample02.fastq.gz
sample03 paired-end 0 UNKNOWN_SPECIES fastqs/sample03_R1.fastq.gz fastqs/sample03_R2.fastq.gz
# Example '*_001.fastq.gz' FASTQ files:
bactopia-prepare --path fastqs/ --fastq-ext '_001.fastq.gz' | head -n
sample runtype r1 r2 extra
sample01 paired-end /fastqs/sample01_R1_001.fastq.gz /fastqs/sample01_R2_001.fastq.gz
sample02 paired-end /fastqs/sample02_R1_001.fastq.gz /fastqs/sample02_R2_001.fastq.gz
sample03 single-end /fastqs/sample03_001.fastq.gz
bactopia prepare --path fastqs/ --fastq-ext '_001.fastq.gz'
sample runtype genome_size species r1 r2 extra
sample01 paired-end 0 UNKNOWN_SPECIES fastqs/sample01_R1_001.fastq.gz fastqs/sample01_R2_001.fastq.gz
sample02 paired-end 0 UNKNOWN_SPECIES fastqs/sample02_R1_001.fastq.gz fastqs/sample02_R2_001.fastq.gz
sample03 paired-end 0 UNKNOWN_SPECIES fastqs/sample03_R1_001.fastq.gz fastqs/sample03_R2_001.fastq.gz
# Example '*.fq.gz' FASTQ files:
bactopia prepare --path fastqs/ --fastq-ext '.fq.gz'
sample runtype r1 r2 extra
sample01 single-end /home/robert_petit/bactopia/fastqs/sample01.fq.gz
sample02 single-end /home/robert_petit/bactopia/fastqs/sample02.fq.gz
sample03 single-end /home/robert_petit/bactopia/fastqs/sample03.fq.gz
bactopia prepare --path fastqs --fastq-ext '.fq.gz'
sample runtype genome_size species r1 r2 extra
sample01 single-end 0 UNKNOWN_SPECIES fastqs/sample01.fq.gz
sample02 single-end 0 UNKNOWN_SPECIES fastqs/sample02.fq.gz
sample03 single-end 0 UNKNOWN_SPECIES fastqs/sample03.fq.gz
# Example '*.fna.gz' FASTA files:
bactopia-prepare --path assembly/
sample runtype genome_size species r1 r2 extra
sample01 assembly 0 UNKNOWN_SPECIES assembly/sample01.fna.gz
sample02 assembly 0 UNKNOWN_SPECIES assembly/sample02.fna.gz
sample03 assembly 0 UNKNOWN_SPECIES assembly/sample03.fna.gz
# Example "*.fasta.gz" FASTA files:
bactopia-prepare --path fastqs/ --assembly-ext '.fasta.gz'
sample runtype r1 r2 extra
sample01 assembly /home/robert_petit/bactopia/temp/fastas/sample01.fasta.gz
sample02 assembly /home/robert_petit/bactopia/temp/fastas/sample02.fasta.gz
sample03 assembly /home/robert_petit/bactopia/temp/fastas/sample03.fasta.gz
bactopia prepare --path assembly/ --assembly-ext .fasta.gz
sample runtype genome_size species r1 r2 extra
sample01 assembly 0 UNKNOWN_SPECIES assembly/sample01.fasta.gz
sample02 assembly 0 UNKNOWN_SPECIES assembly/sample02.fasta.gz
sample03 assembly 0 UNKNOWN_SPECIES assembly/sample03.fasta.gz
# Example Nanopore FASTQ files:
bactopia prepare --path fastqs/ --ont
sample runtype genome_size species r1 r2 extra
sample01 ont 0 UNKNOWN_SPECIES fastqs/sample01.fastq.gz
sample02 ont 0 UNKNOWN_SPECIES fastqs/sample02.fastq.gz
sample03 ont 0 UNKNOWN_SPECIES fastqs/sample03.fastq.gz
# Example Illumina and Nanopore FASTQ files:
bactopia prepare --path illumina/ --ont --short-polish
sample runtype genome_size species r1 r2 extra
sample01 short_polish 0 UNKNOWN_SPECIES fastqs/sample01_R1.fastq.gz fastqs/sample01_R2.fastq.gz fastqs/sample01.fastq.gz
sample02 ont 0 UNKNOWN_SPECIES fastqs/sample02.fastq.gz
sample03 short_polish 0 UNKNOWN_SPECIES fastqs/sample03_R1.fastq.gz fastqs/sample03_R2.fastq.gz fastqs/sample03.fastq.gz
# Example changing the separator:
bactopia-prepare --path fastqs/ --fastq-separator '.'
sample runtype r1 r2 extra
my_sample01 ont /home/robert_petit/bactopia/temp/fastqs/my_sample01.fastq.gz
my_sample02 ont /home/robert_petit/bactopia/temp/fastqs/my_sample02.fastq.gz
my_sample03 ont /home/robert_petit/bactopia/temp/fastqs/my_sample03.fastq.gz
bactopia prepare --path ext/ --fastq-separator '.'
sample runtype genome_size species r1 r2 extra
sample_01 single-end 0 UNKNOWN_SPECIES fastqs/sample_01.fastq.gz
sample_02 single-end 0 UNKNOWN_SPECIES fastqs/sample_02.fastq.gz
sample_03 single-end 0 UNKNOWN_SPECIES fastqs/sample_03.fastq.gz
# Example metadata file (--metadata):
sample01 Staphylococcus aureus 0
Expand Down Expand Up @@ -226,6 +256,16 @@ def print_examples():
is_flag=True,
help="Single-end reads should be treated as Oxford Nanopore reads",
)
@click.option(
"--hybrid",
is_flag=True,
help="Samples with paired and single-end reads will be set to Illumina-first hybrid assembly (requires --ont)",
)
@click.option(
"--short-polish",
is_flag=True,
help="Samples with paired and single-end reads will be set to Nanopore-first hybrid assembly (requires --ont)",
)
@click.option(
"--merge",
is_flag=True,
Expand All @@ -248,6 +288,8 @@ def prepare(
taxid,
recursive,
ont,
hybrid,
short_polish,
merge,
prefix,
examples,
Expand All @@ -267,6 +309,12 @@ def prepare(
logging.ERROR if silent else logging.DEBUG if verbose else logging.INFO
)

if hybrid and short_polish:
logging.error(
"--hybrid and --short-polish cannot be used together. Please select only one."
)
sys.exit(1)

abspath = Path(path).absolute()
SAMPLES = {}
SPECIES_TAXIDS = {}
Expand Down Expand Up @@ -351,44 +399,48 @@ def prepare(
if len(assembly) > 1:
# Can't have multiple assemblies for the same sample
errors.append(
f'ERROR: "{sample}" cannot have more than two assembly FASTA, please check.'
f'"{sample}" cannot have more than two assembly FASTA, please check.'
)
elif len(assembly) == 1 and (pe_count or len(se_reads)):
# Can't have an assembly and reads for a sample
errors.append(
f'ERROR: "{sample}" cannot have assembly and sequence reads, please check.'
f'"{sample}" cannot have assembly and sequence reads, please check.'
)

if len(r1_reads) != len(r2_reads):
# PE reads must be a pair
errors.append(
f'ERROR: "{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.'
f'"{sample}" must have equal paired-end read sets (R1 has {len(r1_reads)} and R2 has {len(r2_reads)}, please check.'
)
elif pe_count > 2:
# PE reads must be a pair
if merge:
multiple_read_sets = True
else:
errors.append(
f'ERROR: "{sample}" cannot have more than two paired-end FASTQ, please check.'
f'"{sample}" cannot have more than two paired-end FASTQ, please check. Did you mean to use "--merge"?'
)

if ont:
if not pe_count and len(se_reads):
is_single_end = True
elif pe_count and len(se_reads) and not hybrid and not short_polish:
errors.append(
f'"{sample}" cannot have paired and single-end FASTQs, please check. Did you mean to use "--hybrid" or "--short-polish"?'
)
else:
if len(se_reads) > 1:
# Can't have multiple SE reads
if merge:
multiple_read_sets = True
else:
errors.append(
f'ERROR: "{sample}" has more than two single-end FASTQs, please check.'
f'"{sample}" has more than two single-end FASTQs, please check. Did you mean to use "--merge"?'
)
elif pe_count and len(se_reads):
# Can't have SE and PE reads unless long reads
errors.append(
f'ERROR: "{sample}" has paired and single-end FASTQs, please check.'
f'"{sample}" has paired and single-end FASTQs, please check. Did you mean to use "--ont" along with "--hybrid" or "--short-polish"?'
)

if errors:
Expand All @@ -408,7 +460,10 @@ def prepare(
if pe_count:
if multiple_read_sets:
if ont:
runtype = "hybrid-merge-pe"
if hybrid:
runtype = "hybrid-merge-ont"
elif short_polish:
runtype = "short_polish-merge-ont"
else:
runtype = "merge-pe"
r1 = ",".join(sorted(r1_reads))
Expand All @@ -420,7 +475,10 @@ def prepare(

if se_reads:
if ont and not is_single_end:
runtype = "hybrid"
if hybrid:
runtype = "hybrid"
elif short_polish:
runtype = "short_polish"
extra = se_reads[0]
elif ont and is_single_end:
runtype = "ont"
Expand Down
1 change: 1 addition & 0 deletions bactopia/cli/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ def search(

logging.info(f"Writing accessions to {accessions_file}")
with open(accessions_file, "w") as output_fh:
output_fh.write("accession\truntype\tspecies\tgenome_size\n")
for accession in accessions:
output_fh.write(f"{accession}\n")

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "bactopia"
version = "1.0.6"
version = "1.0.7"
description = "A Python package for working with Bactopia"
authors = [
"Robert A. Petit III <robbie.petit@gmail.com>",
Expand Down

0 comments on commit 2dacb6a

Please sign in to comment.