From 6978944656068e4f5cfe046c4cf620908c32ebf0 Mon Sep 17 00:00:00 2001
From: Charles Cowart <ccowart@ucsd.edu>
Date: Thu, 18 Apr 2024 14:46:11 -0700
Subject: [PATCH] Removed legacy atropos-and-bowtie2 setting.

Tests reviewed and adjusted appropriately.
---
 metapool/prep.py                              |  50 +++++------------
 metapool/scripts/seqpro.py                    |  29 +++-------
 metapool/scripts/tests/test_seqpro.py         |  52 +----------------
 .../sample_2_S10_L003_R1_001.fastq.gz         | Bin 71 -> 0 bytes
 .../sample_2_S10_L003_R2_001.fastq.gz         | Bin 71 -> 0 bytes
 metapool/tests/test_prep.py                   |  53 ++++++++----------
 6 files changed, 47 insertions(+), 137 deletions(-)
 delete mode 100644 metapool/tests/data/runs/191103_D32611_0365_G00DHB5YXX/Baz_12345/sample_2_S10_L003_R1_001.fastq.gz
 delete mode 100644 metapool/tests/data/runs/191103_D32611_0365_G00DHB5YXX/Baz_12345/sample_2_S10_L003_R2_001.fastq.gz

diff --git a/metapool/prep.py b/metapool/prep.py
index 1b28c17d..3018ca89 100644
--- a/metapool/prep.py
+++ b/metapool/prep.py
@@ -156,7 +156,7 @@ def remove_qiita_id(project_name):
         return matches[1]
 
 
-def get_run_prefix(run_path, project, sample_id, lane, pipeline):
+def get_run_prefix(run_path, project, sample_id, lane):
     """For a sample find the run prefix
 
     Parameters
@@ -169,9 +169,6 @@ def get_run_prefix(run_path, project, sample_id, lane, pipeline):
         Sample ID (was sample_name). Changed to reflect name used for files.
     lane: str
         Lane number
-    pipeline: str
-        The pipeline used to generate the data. Should be one of
-        `atropos-and-bowtie2` or `fastp-and-minimap2`.
 
     Returns
     -------
@@ -182,32 +179,17 @@ def get_run_prefix(run_path, project, sample_id, lane, pipeline):
     base = os.path.join(run_path, project)
     path = base
 
-    # each pipeline sets up a slightly different directory structure,
-    # importantly fastp-and-minimap2 won't save intermediate files
-    if pipeline == 'atropos-and-bowtie2':
-        qc = os.path.join(base, 'atropos_qc')
-        hf = os.path.join(base, 'filtered_sequences')
-
-        # If both folders exist and have sequence files always prefer the
-        # human-filtered sequences
-        if _exists_and_has_files(qc):
-            path = qc
-            if _exists_and_has_files(hf):
-                path = hf
-    elif pipeline == 'fastp-and-minimap2':
-        qc = os.path.join(base, 'trimmed_sequences')
-        hf = os.path.join(base, 'filtered_sequences')
-
-        if _exists_and_has_files(qc) and _exists_and_has_files(hf):
-            path = hf
-        elif _exists_and_has_files(qc):
-            path = qc
-        elif _exists_and_has_files(hf):
-            path = hf
-        else:
-            path = base
+    qc = os.path.join(base, 'trimmed_sequences')
+    hf = os.path.join(base, 'filtered_sequences')
+
+    if _exists_and_has_files(qc) and _exists_and_has_files(hf):
+        path = hf
+    elif _exists_and_has_files(qc):
+        path = qc
+    elif _exists_and_has_files(hf):
+        path = hf
     else:
-        raise ValueError('Invalid pipeline "%s"' % pipeline)
+        path = base
 
     search_me = '%s_S*_L*%s_R*.fastq.gz' % (sample_id, lane)
 
@@ -453,7 +435,7 @@ def process_sample(sample, prep_columns, run_center, run_date, run_prefix,
 
 
 def preparations_for_run(run_path, sheet, generated_prep_columns,
-                         carried_prep_columns, pipeline='fastp-and-minimap2'):
+                         carried_prep_columns):
     """Given a run's path and sample sheet generates preparation files
 
     Parameters
@@ -469,11 +451,6 @@ def preparations_for_run(run_path, sheet, generated_prep_columns,
     carried_prep_columns: list
         List of required columns for output that are expected in KLSampleSheet.
         Varies w/different versions of KLSampleSheet.
-    pipeline: str, optional
-        Which pipeline generated the data. The important difference is that
-        `atropos-and-bowtie2` saves intermediate files, whereas
-        `fastp-and-minimap2` doesn't. Default is `fastp-and-minimap2`, the
-        latest version of the sequence processing pipeline.
 
     Returns
     -------
@@ -533,8 +510,7 @@ def preparations_for_run(run_path, sheet, generated_prep_columns,
                 else:
                     sample_id = sample.sample_id
 
-                run_prefix = get_run_prefix(run_path, project, sample_id, lane,
-                                            pipeline)
+                run_prefix = get_run_prefix(run_path, project, sample_id, lane)
 
                 # ignore the sample if there's no file
                 if run_prefix is not None:
diff --git a/metapool/scripts/seqpro.py b/metapool/scripts/seqpro.py
index 769d0908..f3cc055d 100755
--- a/metapool/scripts/seqpro.py
+++ b/metapool/scripts/seqpro.py
@@ -16,13 +16,9 @@
 @click.argument('sample_sheet', type=click.Path(exists=True, dir_okay=False,
                                                 file_okay=True))
 @click.argument('output_dir', type=click.Path(writable=True))
-@click.option('--pipeline', help='Which pipeline generated the data',
-              show_default=True, default='fastp-and-minimap2',
-              type=click.Choice(['atropos-and-bowtie2', 'fastp-and-minimap2']))
 @click.option('--verbose', help='list prep-file output paths, study_ids',
               is_flag=True)
-def format_preparation_files(run_dir, sample_sheet, output_dir, pipeline,
-                             verbose):
+def format_preparation_files(run_dir, sample_sheet, output_dir, verbose):
     """Generate the preparation files for the projects in a run
 
     RUN_DIR: should be the directory where the results of running bcl2fastq are
@@ -41,13 +37,9 @@ def format_preparation_files(run_dir, sample_sheet, output_dir, pipeline,
     sample_sheet = load_sample_sheet(sample_sheet)
     df_sheet = sample_sheet_to_dataframe(sample_sheet)
 
-    if pipeline == 'fastp-and-minimap2':
-        stats = run_counts(run_dir, sample_sheet)
-        stats['sample_name'] = \
-            df_sheet.set_index('lane', append=True)['sample_name']
-    else:
-        click.echo('Stats collection is not supported for pipeline '
-                   'atropos-and-bowtie2')
+    stats = run_counts(run_dir, sample_sheet)
+    stats['sample_name'] = \
+        df_sheet.set_index('lane', append=True)['sample_name']
 
     # sample_sheet_to_dataframe() automatically lowercases the column names
     # before returning df_sheet. Hence, sample_sheet.CARRIED_PREP_COLUMNS also
@@ -58,20 +50,17 @@ def format_preparation_files(run_dir, sample_sheet, output_dir, pipeline,
     preps = preparations_for_run(run_dir,
                                  df_sheet,
                                  sample_sheet.GENERATED_PREP_COLUMNS,
-                                 c_prep_columns,
-                                 pipeline=pipeline)
+                                 c_prep_columns)
 
     os.makedirs(output_dir, exist_ok=True)
 
     for (run, project, lane), df in preps.items():
         fp = os.path.join(output_dir, f'{run}.{project}.{lane}.tsv')
 
-        if pipeline == 'fastp-and-minimap2':
-            # stats are indexed by sample name and lane, lane is the first
-            # level index. When merging, make sure to select the lane subset
-            # that we care about, otherwise we'll end up with repeated rows
-            df = df.merge(stats.xs(lane, level=1), how='left',
-                          on='sample_name')
+        # stats are indexed by sample name and lane, lane is the first
+        # level index. When merging, make sure to select the lane subset
+        # that we care about, otherwise we'll end up with repeated rows
+        df = df.merge(stats.xs(lane, level=1), how='left', on='sample_name')
 
         # strip qiita_id from project names in sample_project column
         df['sample_project'] = df['sample_project'].map(
diff --git a/metapool/scripts/tests/test_seqpro.py b/metapool/scripts/tests/test_seqpro.py
index 171fcce5..8d4f6a86 100644
--- a/metapool/scripts/tests/test_seqpro.py
+++ b/metapool/scripts/tests/test_seqpro.py
@@ -36,46 +36,6 @@ def setUp(self):
     def tearDown(self):
         rmtree(self.vf_test_dir, ignore_errors=True)
 
-    def test_atropos_run(self):
-        runner = CliRunner()
-
-        with runner.isolated_filesystem():
-            result = runner.invoke(
-                format_preparation_files,
-                args=[
-                    self.run,
-                    self.sheet,
-                    "./",
-                    "--pipeline",
-                    "atropos-and-bowtie2",
-                ],
-            )
-
-            # assert that expected error message appeared in stdout. we are
-            # not concerned w/warning messages that may also appear.
-            self.assertIn(
-                "Stats collection is not supported for pipeline "
-                "atropos-and-bowtie2",
-                result.output,
-            )
-            self.assertEqual(result.exit_code, 0)
-
-            exp_preps = [
-                "191103_D32611_0365_G00DHB5YXX.Baz_12345.1.tsv",
-                "191103_D32611_0365_G00DHB5YXX.Baz_12345.3.tsv",
-                "191103_D32611_0365_G00DHB5YXX.FooBar_666.3.tsv",
-            ]
-
-            self.assertEqual(sorted(os.listdir("./")), exp_preps)
-
-            for prep, exp_lines in zip(exp_preps, [4, 4, 5]):
-                with open(prep) as f:
-                    self.assertEqual(
-                        len(f.read().split("\n")),
-                        exp_lines,
-                        "Assertion error in %s" % prep,
-                    )
-
     def test_fastp_run(self):
         runner = CliRunner()
 
@@ -85,9 +45,7 @@ def test_fastp_run(self):
                 args=[
                     self.fastp_run,
                     self.fastp_sheet,
-                    "./",
-                    "--pipeline",
-                    "fastp-and-minimap2",
+                    "./"
                 ],
             )
 
@@ -396,9 +354,7 @@ def test_legacy_run(self):
                 args=[
                     self.fastp_run,
                     self.v90_test_sheet,
-                    "./",
-                    "--pipeline",
-                    "fastp-and-minimap2",
+                    "./"
                 ],
             )
 
@@ -442,9 +398,7 @@ def test_fastp_run(self):
                 args=[
                     self.temp_copy,
                     self.fastp_sheet,
-                    "./",
-                    "--pipeline",
-                    "fastp-and-minimap2",
+                    "./"
                 ],
             )
             self.assertEqual(result.output, "")
diff --git a/metapool/tests/data/runs/191103_D32611_0365_G00DHB5YXX/Baz_12345/sample_2_S10_L003_R1_001.fastq.gz b/metapool/tests/data/runs/191103_D32611_0365_G00DHB5YXX/Baz_12345/sample_2_S10_L003_R1_001.fastq.gz
deleted file mode 100644
index ab6ab53e9246e53bb3208da3a25d582a74e46e46..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 71
zcmb2|=HSr0@h6Ufxi~SmAScx@KG@JO-p9beI6lZwFD<dSq>!Ol!%x@bNl^)3;qyz+
bf>@aJ*cWsNPEY1^Vqw@7-E>EVfq?-4O%xX;

diff --git a/metapool/tests/data/runs/191103_D32611_0365_G00DHB5YXX/Baz_12345/sample_2_S10_L003_R2_001.fastq.gz b/metapool/tests/data/runs/191103_D32611_0365_G00DHB5YXX/Baz_12345/sample_2_S10_L003_R2_001.fastq.gz
deleted file mode 100644
index 19ea90a9eca872cd565e89730b29eaa8bb8e0617..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 71
zcmb2|=HRft@h6Ufxi~SmAScx@KG@JO-p9beI6lZoFD<dSq>!Ol!%x@bNl^)3;qyz+
bf>@aJ*cWsNPEY1^Vqw@7-E>EVfq?-4QS}!%

diff --git a/metapool/tests/test_prep.py b/metapool/tests/test_prep.py
index 942ce7f1..7f54b032 100644
--- a/metapool/tests/test_prep.py
+++ b/metapool/tests/test_prep.py
@@ -75,6 +75,9 @@ def _check_run_191103_D32611_0365_G00DHB5YXX(self, obs):
         # make sure the columns are in the same order before comparing
         obs_df = obs_df[exp.columns].copy()
 
+        exp.to_csv('exp')
+        obs_df.to_csv('obs')
+
         pd.testing.assert_frame_equal(obs_df, exp)
 
         data = [['sample.1', 'Eqiiperiment', 'Knight Lab Kapa HP',
@@ -202,8 +205,7 @@ def test_preparations_for_run(self):
         obs = preparations_for_run(self.good_run,
                                    sample_sheet_to_dataframe(sheet),
                                    sheet.GENERATED_PREP_COLUMNS,
-                                   sheet.CARRIED_PREP_COLUMNS,
-                                   pipeline='atropos-and-bowtie2')
+                                   sheet.CARRIED_PREP_COLUMNS)
         self._check_run_191103_D32611_0365_G00DHB5YXX(obs)
 
     def test_preparations_for_run_missing_columns(self):
@@ -217,8 +219,7 @@ def test_preparations_for_run_missing_columns(self):
         with self.assertWarns(UserWarning) as cm:
             obs = preparations_for_run(self.good_run, ss,
                                        sheet.GENERATED_PREP_COLUMNS,
-                                       sheet.CARRIED_PREP_COLUMNS,
-                                       pipeline='atropos-and-bowtie2')
+                                       sheet.CARRIED_PREP_COLUMNS)
 
             self.assertEqual(str(cm.warnings[0].message), "'well_description' "
                                                           "is not present in s"
@@ -287,67 +288,57 @@ def test_remove_qiita_id(self):
 
     def test_get_run_prefix(self):
         # project 1
-        obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_1', '1',
-                             'atropos-and-bowtie2')
+        obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_1', '1')
         self.assertEqual('sample_1_S11_L001', obs)
 
-        obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_1', '3',
-                             'atropos-and-bowtie2')
+        obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_1', '3')
         self.assertEqual('sample_1_S11_L003', obs)
 
-        obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_2', '1',
-                             'atropos-and-bowtie2')
+        obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_2', '1')
         self.assertEqual('sample_2_S10_L001', obs)
 
-        obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_2', '3',
-                             'atropos-and-bowtie2')
-        self.assertIsNone(obs)
-
         # project 2
-        obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_31', '3',
-                             'atropos-and-bowtie2')
+        obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_31', '3')
         self.assertEqual('sample_31_S13_L003', obs)
 
-        obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_32', '3',
-                             'atropos-and-bowtie2')
+        obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_32', '3')
         self.assertEqual('sample_32_S19_L003', obs)
 
-        obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_34', '3',
-                             'atropos-and-bowtie2')
+        obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_34', '3')
         self.assertEqual('sample_34_S33_L003', obs)
 
     def test_get_run_prefix_fastp_minimap(self):
         obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', 'sample1',
-                             '1', 'fastp-and-minimap2')
+                             '1')
         self.assertEqual('sample1_S11_L001', obs)
 
         obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', 'sample1',
-                             '3', 'fastp-and-minimap2')
+                             '3')
         self.assertEqual('sample1_S11_L003', obs)
 
         obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', 'sample2',
-                             '1', 'fastp-and-minimap2')
+                             '1')
         self.assertEqual('sample2_S10_L001', obs)
 
         obs = get_run_prefix(self.good_run_new_version, 'Baz_12345',
-                             'sample44', '3', 'fastp-and-minimap2')
+                             'sample44', '3')
         self.assertIsNone(obs)
 
         obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', 'sample2',
-                             '3', 'fastp-and-minimap2')
+                             '3')
         self.assertIsNone(obs)
 
         # project 2
         obs = get_run_prefix(self.good_run_new_version, 'FooBar_666',
-                             'sample31', '3', 'fastp-and-minimap2')
+                             'sample31', '3')
         self.assertEqual('sample31_S13_L003', obs)
 
         obs = get_run_prefix(self.good_run_new_version, 'FooBar_666',
-                             'sample32', '3', 'fastp-and-minimap2')
+                             'sample32', '3')
         self.assertEqual('sample32_S19_L003', obs)
 
         obs = get_run_prefix(self.good_run_new_version, 'FooBar_666',
-                             'sample34', '3', 'fastp-and-minimap2')
+                             'sample34', '3')
         self.assertIsNone(obs)
 
     def test_get_run_prefix_more_than_forward_and_reverse(self):
@@ -365,15 +356,15 @@ def test_get_run_prefix_more_than_forward_and_reverse(self):
         # project 2
         with self.assertWarnsRegex(Warning, message):
             obs = get_run_prefix(self.OKish_run_new_version, 'FooBar_666',
-                                 'sample31', '3', 'fastp-and-minimap2')
+                                 'sample31', '3')
             self.assertIsNone(obs)
 
         obs = get_run_prefix(self.OKish_run_new_version, 'FooBar_666',
-                             'sample32', '3', 'fastp-and-minimap2')
+                             'sample32', '3')
         self.assertEqual('sample32_S19_L003', obs)
 
         obs = get_run_prefix(self.OKish_run_new_version, 'FooBar_666',
-                             'sample34', '3', 'fastp-and-minimap2')
+                             'sample34', '3')
         self.assertIsNone(obs)
 
     def test_is_non_empty_gz_file(self):