Removed default profile functionality #135

Merged 3 commits on Mar 14, 2024
68 changes: 13 additions & 55 deletions sequence_processing_pipeline/Pipeline.py
@@ -271,8 +271,7 @@ def _configure_profile(self):
profile_paths.append(some_path)

# There must be at least one valid profile for the Pipeline to
# continue operation. There must also be a default profile, described
# below.
# continue operation.
if not profile_paths:
raise ValueError(f"'{profile_dir}' doesn't contain profile files")

@@ -293,72 +292,31 @@ def _configure_profile(self):

# the 'profile' attribute must have a dictionary as its value.
# all profiles must contain 'instrument_type' and 'assay_type',
# unless instrument_type == 'default', in which case the
# profile defines the defaults across all instrument-types and
# assay-types.
if 'instrument_type' not in contents['profile']:
raise ValueError("'instrument_type' is not an attribute "
f"in '{profile_path}'.profile")

if 'assay_type' not in contents['profile']:
if contents['profile']['instrument_type'] != 'default':
raise ValueError("'assay_type' is not an attribute "
f"in '{profile_path}'.profile")
raise ValueError("'assay_type' is not an attribute "
f"in '{profile_path}'.profile")

profiles.append(contents)

# The default profile provides 'fall-through' configuration settings
# for all items. This allows the user to not have to redefine settings
# for all items for all instrument and assay combinations.

# the final profile is created by taking the default profile and using
# it as a base. If a profile matching the run-directory's instrument
# and assay types is found, settings from that profile will overwrite
# the base-profile settings as appropriate.
base_profile = None
selected_profile = None

# iterate through all the profiles, searching for a default
# profile and the first profile w/matching instrument and assay types.
# if a matching profile isn't found, that's okay, but if a default
# profile isn't found, then raise an Error.

for profile in profiles:
p_i_type = profile['profile']['instrument_type']
if p_i_type == 'default':
base_profile = profile
else:
p_a_type = profile['profile']['assay_type']
i_type = profile['profile']['instrument_type']
a_type = profile['profile']['assay_type']

# if both items have been found, it's safe to break early.
if base_profile is not None and selected_profile is not None:
break
if i_type == instr_type and a_type == assay_type:
selected_profile = profile
break

if selected_profile is None:
raise ValueError(f"a matching profile ({instr_type}, {assay_type}"
") was not found. Please notify an administrator")

if p_i_type == instr_type and p_a_type == assay_type:
selected_profile = profile

if base_profile is None:
raise ValueError("a 'default' profile was not found")

if selected_profile:
# overwrite the configuration values in the base-profile with those
# in the matching profile as appropriate.
for attribute in selected_profile['profile']['configuration']:
value = selected_profile['profile']['configuration'][attribute]
base_profile['profile']['configuration'][attribute] = value

# overwrite default info w/selected profile (if one was found)
# so that complete profile can be written to working directory
# as a log.
base_profile['profile']['instrument_type'] = instr_type
base_profile['profile']['assay_type'] = assay_type

# load the default first to create a default entry for everything.
# then overwrite the defaults as they appear once you've identified
# the correct (instrument-type, assay-type) pair.
# set this to a new self.config_profile variable and modify the tests
# and code accordingly.
self.config_profile = base_profile
self.config_profile = selected_profile

def _search_for_run_dir(self):
# this method will catch a run directory as well as its products
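For context, the selection logic after this change reduces to a straight match on the run directory's instrument and assay types, with no base profile to fall back on. A minimal sketch, assuming `profiles` is the list of parsed profile dicts built earlier in `_configure_profile` (the standalone helper name here is hypothetical; the real code does this inline):

    # Illustrative sketch only; attribute names and error text follow the diff above.
    def _select_profile(profiles, instr_type, assay_type):
        for profile in profiles:
            p = profile['profile']
            if p['instrument_type'] == instr_type and p['assay_type'] == assay_type:
                # the first exact match on (instrument_type, assay_type) wins
                return profile

        # with the 'default' fall-through removed, no match is now a hard error
        raise ValueError(f"a matching profile ({instr_type}, {assay_type}"
                         ") was not found. Please notify an administrator")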
Changes to a configuration profile JSON file (filename not captured in this view):
@@ -1,17 +1,18 @@
{
"profile": {
"instrument_type": "default",
"instrument_type": "MiSeq",
"assay_type": "TruSeq HT",
"configuration": {
"bcl2fastq": {
"nodes": 1,
"nprocs": 16,
"nodes": 2,
"nprocs": 62,
"queue": "qiita",
"wallclock_time_in_minutes": 216,
"wallclock_time_in_minutes": 1022,
"modules_to_load": [
"bcl2fastq_2.20.0.422"
"bcl2fastq_2.20.0.222"
],
"executable_path": "bcl2fastq",
"per_process_memory_limit": "10gb"
"per_process_memory_limit": "100gb"
},
"bcl-convert": {
"nodes": 1,
@@ -46,10 +47,10 @@
"job_max_array_length": 1000
},
"nu-qc": {
"nodes": 1,
"cpus_per_task": 8,
"nodes": 2,
"cpus_per_task": 32,
"queue": "qiita",
"wallclock_time_in_minutes": 240,
"wallclock_time_in_minutes": 2028,
"minimap2_databases": "/scratch/databases/minimap2",
"modules_to_load": [
"fastp_0.20.1",
@@ -64,27 +65,27 @@
"known_adapters_path": "fastp_known_adapters_formatted.fna",
"bucket_size": 8,
"length_limit": 100,
"cores_per_task": 4
"cores_per_task": 2
},
"seqpro": {
"seqpro_path": "seqpro",
"modules_to_load": []
},
"fastqc": {
"nodes": 1,
"nprocs": 16,
"nodes": 2,
"nprocs": 62,
"queue": "qiita",
"nthreads": 16,
"wallclock_time_in_minutes": 60,
"nthreads": 62,
"wallclock_time_in_minutes": 220,
"modules_to_load": [
"fastqc_0.11.5"
],
"fastqc_executable_path": "fastqc",
"multiqc_executable_path": "multiqc",
"multiqc_config_file_path": "sequence_processing_pipeline/multiqc-bclconvert-config.yaml",
"job_total_memory_limit": "20gb",
"job_pool_size": 30,
"job_max_array_length": 1000
"job_pool_size": 120,
"job_max_array_length": 2000
}
}
}
Changes to another configuration profile JSON file (filename not captured in this view):
@@ -14,6 +14,38 @@
"executable_path": "bcl2fastq",
"per_process_memory_limit": "100gb"
},
"bcl-convert": {
"nodes": 1,
"nprocs": 16,
"queue": "qiita",
"wallclock_time_in_minutes": 216,
"modules_to_load": [
"bclconvert_3.7.5"
],
"executable_path": "bcl-convert",
"per_process_memory_limit": "10gb"
},
"qc": {
"nodes": 1,
"nprocs": 16,
"queue": "qiita",
"wallclock_time_in_minutes": 60,
"minimap2_databases": [
"/databases/minimap2/human-phix-db.mmi"
],
"kraken2_database": "/databases/minimap2/hp_kraken-db.mmi",
"modules_to_load": [
"fastp_0.20.1",
"samtools_1.12",
"minimap2_2.18"
],
"fastp_executable_path": "fastp",
"minimap2_executable_path": "minimap2",
"samtools_executable_path": "samtools",
"job_total_memory_limit": "20gb",
"job_pool_size": 30,
"job_max_array_length": 1000
},
"nu-qc": {
"nodes": 2,
"cpus_per_task": 32,
Changes to another configuration profile JSON file (filename not captured in this view):
@@ -14,6 +14,38 @@
"executable_path": "bcl2fastq",
"per_process_memory_limit": "100gb"
},
"bcl-convert": {
"nodes": 1,
"nprocs": 16,
"queue": "qiita",
"wallclock_time_in_minutes": 216,
"modules_to_load": [
"bclconvert_3.7.5"
],
"executable_path": "bcl-convert",
"per_process_memory_limit": "10gb"
},
"qc": {
"nodes": 1,
"nprocs": 16,
"queue": "qiita",
"wallclock_time_in_minutes": 60,
"minimap2_databases": [
"/databases/minimap2/human-phix-db.mmi"
],
"kraken2_database": "/databases/minimap2/hp_kraken-db.mmi",
"modules_to_load": [
"fastp_0.20.1",
"samtools_1.12",
"minimap2_2.18"
],
"fastp_executable_path": "fastp",
"minimap2_executable_path": "minimap2",
"samtools_executable_path": "samtools",
"job_total_memory_limit": "20gb",
"job_pool_size": 30,
"job_max_array_length": 1000
},
"nu-qc": {
"nodes": 4,
"cpus_per_task": 32,
Changes to another configuration profile JSON file (filename not captured in this view):
@@ -14,6 +14,38 @@
"executable_path": "bcl2fastq",
"per_process_memory_limit": "100gb"
},
"bcl-convert": {
"nodes": 1,
"nprocs": 16,
"queue": "qiita",
"wallclock_time_in_minutes": 216,
"modules_to_load": [
"bclconvert_3.7.5"
],
"executable_path": "bcl-convert",
"per_process_memory_limit": "10gb"
},
"qc": {
"nodes": 1,
"nprocs": 16,
"queue": "qiita",
"wallclock_time_in_minutes": 60,
"minimap2_databases": [
"/databases/minimap2/human-phix-db.mmi"
],
"kraken2_database": "/databases/minimap2/hp_kraken-db.mmi",
"modules_to_load": [
"fastp_0.20.1",
"samtools_1.12",
"minimap2_2.18"
],
"fastp_executable_path": "fastp",
"minimap2_executable_path": "minimap2",
"samtools_executable_path": "samtools",
"job_total_memory_limit": "20gb",
"job_pool_size": 30,
"job_max_array_length": 1000
},
"nu-qc": {
"nodes": 4,
"cpus_per_task": 32,
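Because the 'default' profile no longer supplies fall-through values, each profile file touched above has to carry every job section it needs on its own, which is presumably why the same bcl-convert and qc blocks are added to each file. A hedged sketch of the per-file checks implied by the Pipeline.py diff, written as a standalone function for illustration (the pipeline performs these checks inline while iterating over the profile directory):

    import json

    def load_profile(profile_path):
        """Load one configuration profile and enforce the attributes that are
        now mandatory for every profile, since no 'default' fall-through exists."""
        with open(profile_path) as f:
            contents = json.load(f)

        # error messages mirror those raised in Pipeline._configure_profile
        if 'instrument_type' not in contents['profile']:
            raise ValueError("'instrument_type' is not an attribute "
                             f"in '{profile_path}'.profile")

        if 'assay_type' not in contents['profile']:
            raise ValueError("'assay_type' is not an attribute "
                             f"in '{profile_path}'.profile")

        return contents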
34 changes: 8 additions & 26 deletions sequence_processing_pipeline/tests/test_Pipeline.py
@@ -4,7 +4,7 @@
from sequence_processing_pipeline.Pipeline import Pipeline, InstrumentUtils
import unittest
from os import makedirs, walk
from os.path import abspath, basename, join
from os.path import abspath, basename, join, exists
from functools import partial
import re
from shutil import copy
@@ -90,7 +90,9 @@ def delete_rtacomplete_file(self):

def delete_more_files(self):
for file_path in self.delete_these:
os.remove(file_path)
if exists(file_path):
# if file no longer exists, that's okay.
os.remove(file_path)

def _make_mapping_file(self, output_file_path):
cols = ('sample_name', 'barcode', 'library_construction_protocol',
@@ -309,31 +311,11 @@ def test_creation(self):
self.output_file_path,
self.qiita_id, Pipeline.METAGENOMIC_PTYPE)

# test Error returned when 'assay_type' does not exist in default
# profile. Error should not be returned in this case as default
# shouldn't have an assay_type.
with open(bad_json_file, 'w') as f:
f.write('{ "profile": { "instrument_type": "default", '
'"configuration": { "bcl2fastq": { "nodes": 1, "nprocs": '
'16, "queue": "qiita", "wallclock_time_in_minutes": 216, '
'"modules_to_load": [ "bcl2fastq_2.20.0.422" ], '
'"executable_path": "bcl2fastq", '
'"per_process_memory_limit": "10gb" } } } }')

pipeline = Pipeline(self.good_config_file, self.good_run_id,
self.good_sample_sheet_path, None,
self.output_file_path, self.qiita_id,
Pipeline.METAGENOMIC_PTYPE)

self.assertIsNotNone(pipeline)

# test Error returned when a non-default profile is missing assay_type
bad_json_file = self.path('configuration_profiles', 'bad.json')
self.delete_these.append(bad_json_file)

another_bad_json_file = self.path('configuration_profiles',
'more_bad.json')
self.delete_these.append(another_bad_json_file)

with open(another_bad_json_file, 'w') as f:
with open(bad_json_file, 'w') as f:
f.write('{ "profile": { "instrument_type": "MiSeq", '
'"configuration": { "bcl2fastq": { "nodes": 1, "nprocs": '
'16, "queue": "qiita", "wallclock_time_in_minutes": 216, '
@@ -345,7 +327,7 @@ def test_creation(self):
"attribute in 'sequence_"
"processing_pipeline/tests/"
"data/configuration_profiles/"
"more_bad.json'"):
"bad.json'"):
Pipeline(self.good_config_file,
self.good_run_id,
self.good_sample_sheet_path, None,
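The updated test exercises exactly that failure path: it writes a bad.json profile whose instrument_type is MiSeq but which omits assay_type, then asserts that constructing a Pipeline raises. A trimmed sketch of the pattern as it would sit inside the existing test method, with the fixture attributes from setUp (self.good_config_file, self.qiita_id, etc.) and the shortened JSON body assumed for illustration:

    # Sketch of the updated assertion inside test_creation; fixture values and
    # the abbreviated profile contents are assumptions, not the verbatim test.
    with open(bad_json_file, 'w') as f:
        f.write('{ "profile": { "instrument_type": "MiSeq", '
                '"configuration": { } } }')  # note: no "assay_type" attribute

    with self.assertRaisesRegex(ValueError,
                                "'assay_type' is not an attribute in"):
        Pipeline(self.good_config_file, self.good_run_id,
                 self.good_sample_sheet_path, None,
                 self.output_file_path, self.qiita_id,
                 Pipeline.METAGENOMIC_PTYPE)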