feat: Add phylogenomics entries (#145)

beiko-lab · Jul 3, 2023 · 1cb6563 · 1cb6563
1 parent cd9ef1b
commit 1cb6563
Show file tree

Hide file tree

Showing 7 changed files with 284 additions and 42 deletions.
diff --git a/bin/check_assembly_samplesheet.py b/bin/check_assembly_samplesheet.py
@@ -1,40 +1,10 @@
 #!/usr/bin/env python
 
-#Input check for post-assembly "Annotation" workflow
+# Input check for post-assembly "Annotation" workflow
 
 import os
 import sys
-import errno
-import argparse
-
-
-def parse_args(args=None):
-    Description = "Reformat nf-core/arete samplesheet file and check its contents."
-    Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
-
-    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
-    parser.add_argument("FILE_IN", help="Input samplesheet file.")
-    parser.add_argument("FILE_OUT", help="Output file.")
-    return parser.parse_args(args)
-
-
-def make_dir(path):
-    if len(path) > 0:
-        try:
-            os.makedirs(path)
-        except OSError as exception:
-            if exception.errno != errno.EEXIST:
-                raise exception
-
-
-def print_error(error, context="Line", context_str=""):
-    error_str = "ERROR: Please check samplesheet -> {}".format(error)
-    if context != "" and context_str != "":
-        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
-            error, context.strip(), context_str.strip()
-        )
-    print(error_str)
-    sys.exit(1)
+from samplesheet_utils import parse_args, make_dir, print_error
 
 
 def check_samplesheet(file_in, file_out):
@@ -48,13 +18,16 @@ def check_samplesheet(file_in, file_out):
 
     sample_mapping_dict = {}
     with open(file_in, "r") as fin:
-
         ## Check header
         MIN_COLS = 2
         HEADER = ["sample", "fna_file_path"]
         header = [x.strip('"') for x in fin.readline().strip().split(",")]
         if header[: len(HEADER)] != HEADER:
-            print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
+            print(
+                "ERROR: Please check samplesheet header -> {} != {}".format(
+                    ",".join(header), ",".join(HEADER)
+                )
+            )
             sys.exit(1)
 
         ## Check sample entries
@@ -71,7 +44,9 @@ def check_samplesheet(file_in, file_out):
             num_cols = len([x for x in lspl if x])
             if num_cols < MIN_COLS:
                 print_error(
-                    "Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
+                    "Invalid number of populated columns (minimum = {})!".format(
+                        MIN_COLS
+                    ),
                     "Line",
                     line,
                 )
@@ -89,12 +64,17 @@ def check_samplesheet(file_in, file_out):
             if fna_file_path:
                 if fna_file_path.find(" ") != -1:
                     print_error("fna file contains spaces!", "Line", line)
-                if not fna_file_path.endswith(".fna") and not fna_file_path.endswith(".fna.gz") and not fna_file_path.endswith(".fa") and not fna_file_path.endswith(".fa.gz"):
+                if (
+                    not fna_file_path.endswith(".fna")
+                    and not fna_file_path.endswith(".fna.gz")
+                    and not fna_file_path.endswith(".fa")
+                    and not fna_file_path.endswith(".fa.gz")
+                ):
                     print_error(
                         "Assembly files must be one of .fa, .fna, .fa.gz, or .fna.gz.",
                         "Line",
                         line,
-                        )
+                    )
             """
             ## Auto-detect paired-end/single-end
             sample_info = []  ## [single_end, fastq_1, fastq_2]
@@ -110,13 +90,15 @@ def check_samplesheet(file_in, file_out):
             ## Create sample mapping dictionary = { sample: path }
             if sample not in sample_mapping_dict:
                 sample_mapping_dict[sample] = sample_info
-            #TODO Come back to this conditional; does it make sense for multiple assemblies to one sample?
+            # TODO Come back to this conditional; does it make sense for multiple assemblies to one sample?
             else:
                 if sample_info in sample_mapping_dict[sample]:
                     print_error("Samplesheet contains duplicate rows!", "Line", line)
                 else:
-                    #sample_mapping_dict[sample].append(sample_info)
-                    print_error("Mutliple files associated to one sample!", "Line", line)
+                    # sample_mapping_dict[sample].append(sample_info)
+                    print_error(
+                        "Mutliple files associated to one sample!", "Line", line
+                    )
 
     ## Write validated samplesheet with appropriate columns
     if len(sample_mapping_dict) > 0:
@@ -125,7 +107,12 @@ def check_samplesheet(file_in, file_out):
         with open(file_out, "w") as fout:
             fout.write(",".join(["sample", "path"]) + "\n")
             for sample in sorted(sample_mapping_dict.keys()):
-                fout.write(",".join(["{}".format(sample), "{}".format(sample_mapping_dict[sample])])+"\n")
+                fout.write(
+                    ",".join(
+                        ["{}".format(sample), "{}".format(sample_mapping_dict[sample])]
+                    )
+                    + "\n"
+                )
     else:
         print_error("No entries to process!", "Samplesheet: {}".format(file_in))
 

diff --git a/bin/check_phylo_samplesheet.py b/bin/check_phylo_samplesheet.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+# Input check for post-assembly "Phylogenomics" workflow
+
+import os
+import sys
+from samplesheet_utils import parse_args, make_dir, print_error
+
+
+def check_samplesheet(file_in, file_out):
+    """
+    This function checks that the samplesheet follows the following structure:
+
+    sample,gff_file_path
+    sample,gff_file_path
+
+    """
+
+    sample_mapping_dict = {}
+    with open(file_in, "r") as fin:
+        ## Check header
+        MIN_COLS = 2
+        HEADER = ["sample", "gff_file_path"]
+        header = [x.strip('"') for x in fin.readline().strip().split(",")]
+        if header[: len(HEADER)] != HEADER:
+            print(
+                "ERROR: Please check samplesheet header -> {} != {}".format(
+                    ",".join(header), ",".join(HEADER)
+                )
+            )
+            sys.exit(1)
+
+        ## Check sample entries
+        for line in fin:
+            lspl = [x.strip().strip('"') for x in line.strip().split(",")]
+
+            # Check valid number of columns per row
+            if len(lspl) < len(HEADER):
+                print_error(
+                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
+                    "Line",
+                    line,
+                )
+            num_cols = len([x for x in lspl if x])
+            if num_cols < MIN_COLS:
+                print_error(
+                    "Invalid number of populated columns (minimum = {})!".format(
+                        MIN_COLS
+                    ),
+                    "Line",
+                    line,
+                )
+
+            ## Check sample name entries
+            sample, gff_file_path = lspl[: len(HEADER)]
+            if sample:
+                if sample.find(" ") != -1:
+                    print_error("Sample entry contains spaces!", "Line", line)
+            else:
+                print_error("Sample entry has not been specified!", "Line", line)
+
+            ## Check assembly gff file extension
+
+            if gff_file_path:
+                if gff_file_path.find(" ") != -1:
+                    print_error("gff file path contains spaces!", "Line", line)
+                if not gff_file_path.endswith(".gff") and not gff_file_path.endswith(
+                    ".gff3"
+                ):
+                    print_error(
+                        "Phylo files must be one of .gff or .gff3.",
+                        "Line",
+                        line,
+                    )
+            """
+            ## Auto-detect paired-end/single-end
+            sample_info = []  ## [single_end, fastq_1, fastq_2]
+            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
+                sample_info = ["0", fastq_1, fastq_2]
+            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
+                sample_info = ["1", fastq_1, fastq_2]
+            else:
+                print_error("Invalid combination of columns provided!", "Line", line)
+            """
+            sample_info = gff_file_path
+
+            ## Create sample mapping dictionary = { sample: path }
+            if sample not in sample_mapping_dict:
+                sample_mapping_dict[sample] = sample_info
+            # TODO Come back to this conditional; does it make sense for multiple assemblies to one sample?
+            else:
+                if sample_info in sample_mapping_dict[sample]:
+                    print_error("Samplesheet contains duplicate rows!", "Line", line)
+                else:
+                    # sample_mapping_dict[sample].append(sample_info)
+                    print_error(
+                        "Mutliple files associated to one sample!", "Line", line
+                    )
+
+    ## Write validated samplesheet with appropriate columns
+    if len(sample_mapping_dict) > 0:
+        out_dir = os.path.dirname(file_out)
+        make_dir(out_dir)
+        with open(file_out, "w") as fout:
+            fout.write(",".join(["sample", "path"]) + "\n")
+            for sample in sorted(sample_mapping_dict.keys()):
+                fout.write(
+                    ",".join(
+                        ["{}".format(sample), "{}".format(sample_mapping_dict[sample])]
+                    )
+                    + "\n"
+                )
+    else:
+        print_error("No entries to process!", "Samplesheet: {}".format(file_in))
+
+
+def main(args=None):
+    args = parse_args(args)
+    check_samplesheet(args.FILE_IN, args.FILE_OUT)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/bin/samplesheet_utils.py b/bin/samplesheet_utils.py
@@ -0,0 +1,33 @@
+import os
+import sys
+import errno
+import argparse
+
+
+def parse_args(args=None):
+    Description = "Reformat nf-core/arete samplesheet file and check its contents."
+    Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("FILE_IN", help="Input samplesheet file.")
+    parser.add_argument("FILE_OUT", help="Output file.")
+    return parser.parse_args(args)
+
+
+def make_dir(path):
+    if len(path) > 0:
+        try:
+            os.makedirs(path)
+        except OSError as exception:
+            if exception.errno != errno.EEXIST:
+                raise exception
+
+
+def print_error(error, context="Line", context_str=""):
+    error_str = "ERROR: Please check samplesheet -> {}".format(error)
+    if context != "" and context_str != "":
+        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
+            error, context.strip(), context_str.strip()
+        )
+    print(error_str)
+    sys.exit(1)
diff --git a/main.nf b/main.nf
@@ -34,6 +34,7 @@ WorkflowArete.initialise(workflow, params, log)
 include { ARETE } from './workflows/arete'
 include { ASSEMBLY } from './workflows/arete'
 include { ANNOTATION } from './workflows/arete'
+include { PHYLO } from './workflows/arete'
 include { QUALITYCHECK } from './workflows/arete'
 include { POPPUNK } from './workflows/arete'
 
@@ -55,6 +56,10 @@ workflow annotation {
     ANNOTATION ()
 }
 
+workflow phylogenomics {
+    PHYLO ()
+}
+
 workflow assembly_qc {
     QUALITYCHECK()
 }

diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf
@@ -54,3 +54,28 @@ process ASSEMBLYSHEET_CHECK {
     check_assembly_samplesheet.py $assemblysheet assemblysheet.valid.csv
     """
 }
+
+process PHYLOSHEET_CHECK {
+    tag "$phylosheet"
+    publishDir "${params.outdir}",
+        mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', publish_id:'') }
+
+    conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
+    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+        container "https://depot.galaxyproject.org/singularity/python:3.8.3"
+    } else {
+        container "quay.io/biocontainers/python:3.8.3"
+    }
+
+    input:
+    path phylosheet
+
+    output:
+    path '*.csv'
+
+    script:  // This script is bundled with the pipeline, in arete/bin/
+    """
+    check_phylo_samplesheet.py $phylosheet phylosheet.valid.csv
+    """
+}
diff --git a/subworkflows/local/phylo_input_check.nf b/subworkflows/local/phylo_input_check.nf
@@ -0,0 +1,39 @@
+include { PHYLOSHEET_CHECK } from '../../modules/local/samplesheet_check'
+
+// Input to the phylo work flow is different
+// Instead of reads, pass in GFF files
+workflow PHYLO_INPUT_CHECK {
+    take:
+    samplesheet
+
+    main:
+    PHYLOSHEET_CHECK ( samplesheet )
+        .splitCsv ( header:true, sep:',' )
+        .map { get_sample_info_phylo(it) }
+        .set { genomes }
+
+    //Check that no dots "." are in sample ID
+    genomes
+        .map { meta, reads -> meta.id }
+        .subscribe { if ( "$it".contains(".") ) exit 1, "Please review data input, sampleIDs may not contain dots, but \"$it\" does." }
+
+    emit:
+    genomes // channel: [ val(meta), [ reads ] ]
+}
+// Function to get list of [ meta, [ path ] ]
+def get_sample_info_phylo(LinkedHashMap row) {
+    def meta = [:]
+    meta.id           = row.sample
+    meta.single_end = true
+
+    def array = []
+    if (!file(row.path).exists()) {
+        print("***")
+        print(row)
+        print("***")
+        exit 1, "ERROR: Please check input samplesheet -> Sequence file does not exist!\n${row.path}"
+    }
+    array = [ meta, file(row.path)]
+
+    return array
+}