Skip to content

Commit

Permalink
feat: Add phylogenomics entries (#145)
Browse files Browse the repository at this point in the history
  • Loading branch information
jvfe committed Jul 3, 2023
1 parent cd9ef1b commit 1cb6563
Show file tree
Hide file tree
Showing 7 changed files with 284 additions and 42 deletions.
69 changes: 28 additions & 41 deletions bin/check_assembly_samplesheet.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,10 @@
#!/usr/bin/env python

#Input check for post-assembly "Annotation" workflow
# Input check for post-assembly "Annotation" workflow

import os
import sys
import errno
import argparse


def parse_args(args=None):
Description = "Reformat nf-core/arete samplesheet file and check its contents."
Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument("FILE_IN", help="Input samplesheet file.")
parser.add_argument("FILE_OUT", help="Output file.")
return parser.parse_args(args)


def make_dir(path):
if len(path) > 0:
try:
os.makedirs(path)
except OSError as exception:
if exception.errno != errno.EEXIST:
raise exception


def print_error(error, context="Line", context_str=""):
error_str = "ERROR: Please check samplesheet -> {}".format(error)
if context != "" and context_str != "":
error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
error, context.strip(), context_str.strip()
)
print(error_str)
sys.exit(1)
from samplesheet_utils import parse_args, make_dir, print_error


def check_samplesheet(file_in, file_out):
Expand All @@ -48,13 +18,16 @@ def check_samplesheet(file_in, file_out):

sample_mapping_dict = {}
with open(file_in, "r") as fin:

## Check header
MIN_COLS = 2
HEADER = ["sample", "fna_file_path"]
header = [x.strip('"') for x in fin.readline().strip().split(",")]
if header[: len(HEADER)] != HEADER:
print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
print(
"ERROR: Please check samplesheet header -> {} != {}".format(
",".join(header), ",".join(HEADER)
)
)
sys.exit(1)

## Check sample entries
Expand All @@ -71,7 +44,9 @@ def check_samplesheet(file_in, file_out):
num_cols = len([x for x in lspl if x])
if num_cols < MIN_COLS:
print_error(
"Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
"Invalid number of populated columns (minimum = {})!".format(
MIN_COLS
),
"Line",
line,
)
Expand All @@ -89,12 +64,17 @@ def check_samplesheet(file_in, file_out):
if fna_file_path:
if fna_file_path.find(" ") != -1:
print_error("fna file contains spaces!", "Line", line)
if not fna_file_path.endswith(".fna") and not fna_file_path.endswith(".fna.gz") and not fna_file_path.endswith(".fa") and not fna_file_path.endswith(".fa.gz"):
if (
not fna_file_path.endswith(".fna")
and not fna_file_path.endswith(".fna.gz")
and not fna_file_path.endswith(".fa")
and not fna_file_path.endswith(".fa.gz")
):
print_error(
"Assembly files must be one of .fa, .fna, .fa.gz, or .fna.gz.",
"Line",
line,
)
)
"""
## Auto-detect paired-end/single-end
sample_info = [] ## [single_end, fastq_1, fastq_2]
Expand All @@ -110,13 +90,15 @@ def check_samplesheet(file_in, file_out):
## Create sample mapping dictionary = { sample: path }
if sample not in sample_mapping_dict:
sample_mapping_dict[sample] = sample_info
#TODO Come back to this conditional; does it make sense for multiple assemblies to one sample?
# TODO Come back to this conditional; does it make sense for multiple assemblies to one sample?
else:
if sample_info in sample_mapping_dict[sample]:
print_error("Samplesheet contains duplicate rows!", "Line", line)
else:
#sample_mapping_dict[sample].append(sample_info)
print_error("Mutliple files associated to one sample!", "Line", line)
# sample_mapping_dict[sample].append(sample_info)
print_error(
"Mutliple files associated to one sample!", "Line", line
)

## Write validated samplesheet with appropriate columns
if len(sample_mapping_dict) > 0:
Expand All @@ -125,7 +107,12 @@ def check_samplesheet(file_in, file_out):
with open(file_out, "w") as fout:
fout.write(",".join(["sample", "path"]) + "\n")
for sample in sorted(sample_mapping_dict.keys()):
fout.write(",".join(["{}".format(sample), "{}".format(sample_mapping_dict[sample])])+"\n")
fout.write(
",".join(
["{}".format(sample), "{}".format(sample_mapping_dict[sample])]
)
+ "\n"
)
else:
print_error("No entries to process!", "Samplesheet: {}".format(file_in))

Expand Down
123 changes: 123 additions & 0 deletions bin/check_phylo_samplesheet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/usr/bin/env python

# Input check for post-assembly "Phylogenomics" workflow

import os
import sys
from samplesheet_utils import parse_args, make_dir, print_error


def check_samplesheet(file_in, file_out):
"""
This function checks that the samplesheet follows the following structure:
sample,gff_file_path
sample,gff_file_path
"""

sample_mapping_dict = {}
with open(file_in, "r") as fin:
## Check header
MIN_COLS = 2
HEADER = ["sample", "gff_file_path"]
header = [x.strip('"') for x in fin.readline().strip().split(",")]
if header[: len(HEADER)] != HEADER:
print(
"ERROR: Please check samplesheet header -> {} != {}".format(
",".join(header), ",".join(HEADER)
)
)
sys.exit(1)

## Check sample entries
for line in fin:
lspl = [x.strip().strip('"') for x in line.strip().split(",")]

# Check valid number of columns per row
if len(lspl) < len(HEADER):
print_error(
"Invalid number of columns (minimum = {})!".format(len(HEADER)),
"Line",
line,
)
num_cols = len([x for x in lspl if x])
if num_cols < MIN_COLS:
print_error(
"Invalid number of populated columns (minimum = {})!".format(
MIN_COLS
),
"Line",
line,
)

## Check sample name entries
sample, gff_file_path = lspl[: len(HEADER)]
if sample:
if sample.find(" ") != -1:
print_error("Sample entry contains spaces!", "Line", line)
else:
print_error("Sample entry has not been specified!", "Line", line)

## Check assembly gff file extension

if gff_file_path:
if gff_file_path.find(" ") != -1:
print_error("gff file path contains spaces!", "Line", line)
if not gff_file_path.endswith(".gff") and not gff_file_path.endswith(
".gff3"
):
print_error(
"Phylo files must be one of .gff or .gff3.",
"Line",
line,
)
"""
## Auto-detect paired-end/single-end
sample_info = [] ## [single_end, fastq_1, fastq_2]
if sample and fastq_1 and fastq_2: ## Paired-end short reads
sample_info = ["0", fastq_1, fastq_2]
elif sample and fastq_1 and not fastq_2: ## Single-end short reads
sample_info = ["1", fastq_1, fastq_2]
else:
print_error("Invalid combination of columns provided!", "Line", line)
"""
sample_info = gff_file_path

## Create sample mapping dictionary = { sample: path }
if sample not in sample_mapping_dict:
sample_mapping_dict[sample] = sample_info
# TODO Come back to this conditional; does it make sense for multiple assemblies to one sample?
else:
if sample_info in sample_mapping_dict[sample]:
print_error("Samplesheet contains duplicate rows!", "Line", line)
else:
# sample_mapping_dict[sample].append(sample_info)
print_error(
"Mutliple files associated to one sample!", "Line", line
)

## Write validated samplesheet with appropriate columns
if len(sample_mapping_dict) > 0:
out_dir = os.path.dirname(file_out)
make_dir(out_dir)
with open(file_out, "w") as fout:
fout.write(",".join(["sample", "path"]) + "\n")
for sample in sorted(sample_mapping_dict.keys()):
fout.write(
",".join(
["{}".format(sample), "{}".format(sample_mapping_dict[sample])]
)
+ "\n"
)
else:
print_error("No entries to process!", "Samplesheet: {}".format(file_in))


def main(args=None):
args = parse_args(args)
check_samplesheet(args.FILE_IN, args.FILE_OUT)


if __name__ == "__main__":
sys.exit(main())
33 changes: 33 additions & 0 deletions bin/samplesheet_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
import sys
import errno
import argparse


def parse_args(args=None):
Description = "Reformat nf-core/arete samplesheet file and check its contents."
Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument("FILE_IN", help="Input samplesheet file.")
parser.add_argument("FILE_OUT", help="Output file.")
return parser.parse_args(args)


def make_dir(path):
if len(path) > 0:
try:
os.makedirs(path)
except OSError as exception:
if exception.errno != errno.EEXIST:
raise exception


def print_error(error, context="Line", context_str=""):
error_str = "ERROR: Please check samplesheet -> {}".format(error)
if context != "" and context_str != "":
error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
error, context.strip(), context_str.strip()
)
print(error_str)
sys.exit(1)
5 changes: 5 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ WorkflowArete.initialise(workflow, params, log)
include { ARETE } from './workflows/arete'
include { ASSEMBLY } from './workflows/arete'
include { ANNOTATION } from './workflows/arete'
include { PHYLO } from './workflows/arete'
include { QUALITYCHECK } from './workflows/arete'
include { POPPUNK } from './workflows/arete'

Expand All @@ -55,6 +56,10 @@ workflow annotation {
ANNOTATION ()
}

workflow phylogenomics {
PHYLO ()
}

workflow assembly_qc {
QUALITYCHECK()
}
Expand Down
25 changes: 25 additions & 0 deletions modules/local/samplesheet_check.nf
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,28 @@ process ASSEMBLYSHEET_CHECK {
check_assembly_samplesheet.py $assemblysheet assemblysheet.valid.csv
"""
}

process PHYLOSHEET_CHECK {
tag "$phylosheet"
publishDir "${params.outdir}",
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', publish_id:'') }

conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/python:3.8.3"
} else {
container "quay.io/biocontainers/python:3.8.3"
}

input:
path phylosheet

output:
path '*.csv'

script: // This script is bundled with the pipeline, in arete/bin/
"""
check_phylo_samplesheet.py $phylosheet phylosheet.valid.csv
"""
}
39 changes: 39 additions & 0 deletions subworkflows/local/phylo_input_check.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
include { PHYLOSHEET_CHECK } from '../../modules/local/samplesheet_check'

// Input to the phylo work flow is different
// Instead of reads, pass in GFF files
workflow PHYLO_INPUT_CHECK {
take:
samplesheet

main:
PHYLOSHEET_CHECK ( samplesheet )
.splitCsv ( header:true, sep:',' )
.map { get_sample_info_phylo(it) }
.set { genomes }

//Check that no dots "." are in sample ID
genomes
.map { meta, reads -> meta.id }
.subscribe { if ( "$it".contains(".") ) exit 1, "Please review data input, sampleIDs may not contain dots, but \"$it\" does." }

emit:
genomes // channel: [ val(meta), [ reads ] ]
}
// Function to get list of [ meta, [ path ] ]
def get_sample_info_phylo(LinkedHashMap row) {
def meta = [:]
meta.id = row.sample
meta.single_end = true

def array = []
if (!file(row.path).exists()) {
print("***")
print(row)
print("***")
exit 1, "ERROR: Please check input samplesheet -> Sequence file does not exist!\n${row.path}"
}
array = [ meta, file(row.path)]

return array
}
Loading

0 comments on commit 1cb6563

Please sign in to comment.