From 6978944656068e4f5cfe046c4cf620908c32ebf0 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Thu, 18 Apr 2024 14:46:11 -0700 Subject: [PATCH] Removed legacy atropos-and-bowtie2 setting. Tests reviewed and adjusted appropriately. --- metapool/prep.py | 50 +++++------------ metapool/scripts/seqpro.py | 29 +++------- metapool/scripts/tests/test_seqpro.py | 52 +---------------- .../sample_2_S10_L003_R1_001.fastq.gz | Bin 71 -> 0 bytes .../sample_2_S10_L003_R2_001.fastq.gz | Bin 71 -> 0 bytes metapool/tests/test_prep.py | 53 ++++++++---------- 6 files changed, 47 insertions(+), 137 deletions(-) delete mode 100644 metapool/tests/data/runs/191103_D32611_0365_G00DHB5YXX/Baz_12345/sample_2_S10_L003_R1_001.fastq.gz delete mode 100644 metapool/tests/data/runs/191103_D32611_0365_G00DHB5YXX/Baz_12345/sample_2_S10_L003_R2_001.fastq.gz diff --git a/metapool/prep.py b/metapool/prep.py index 1b28c17d..3018ca89 100644 --- a/metapool/prep.py +++ b/metapool/prep.py @@ -156,7 +156,7 @@ def remove_qiita_id(project_name): return matches[1] -def get_run_prefix(run_path, project, sample_id, lane, pipeline): +def get_run_prefix(run_path, project, sample_id, lane): """For a sample find the run prefix Parameters @@ -169,9 +169,6 @@ def get_run_prefix(run_path, project, sample_id, lane, pipeline): Sample ID (was sample_name). Changed to reflect name used for files. lane: str Lane number - pipeline: str - The pipeline used to generate the data. Should be one of - `atropos-and-bowtie2` or `fastp-and-minimap2`. Returns ------- @@ -182,32 +179,17 @@ def get_run_prefix(run_path, project, sample_id, lane, pipeline): base = os.path.join(run_path, project) path = base - # each pipeline sets up a slightly different directory structure, - # importantly fastp-and-minimap2 won't save intermediate files - if pipeline == 'atropos-and-bowtie2': - qc = os.path.join(base, 'atropos_qc') - hf = os.path.join(base, 'filtered_sequences') - - # If both folders exist and have sequence files always prefer the - # human-filtered sequences - if _exists_and_has_files(qc): - path = qc - if _exists_and_has_files(hf): - path = hf - elif pipeline == 'fastp-and-minimap2': - qc = os.path.join(base, 'trimmed_sequences') - hf = os.path.join(base, 'filtered_sequences') - - if _exists_and_has_files(qc) and _exists_and_has_files(hf): - path = hf - elif _exists_and_has_files(qc): - path = qc - elif _exists_and_has_files(hf): - path = hf - else: - path = base + qc = os.path.join(base, 'trimmed_sequences') + hf = os.path.join(base, 'filtered_sequences') + + if _exists_and_has_files(qc) and _exists_and_has_files(hf): + path = hf + elif _exists_and_has_files(qc): + path = qc + elif _exists_and_has_files(hf): + path = hf else: - raise ValueError('Invalid pipeline "%s"' % pipeline) + path = base search_me = '%s_S*_L*%s_R*.fastq.gz' % (sample_id, lane) @@ -453,7 +435,7 @@ def process_sample(sample, prep_columns, run_center, run_date, run_prefix, def preparations_for_run(run_path, sheet, generated_prep_columns, - carried_prep_columns, pipeline='fastp-and-minimap2'): + carried_prep_columns): """Given a run's path and sample sheet generates preparation files Parameters @@ -469,11 +451,6 @@ def preparations_for_run(run_path, sheet, generated_prep_columns, carried_prep_columns: list List of required columns for output that are expected in KLSampleSheet. Varies w/different versions of KLSampleSheet. - pipeline: str, optional - Which pipeline generated the data. The important difference is that - `atropos-and-bowtie2` saves intermediate files, whereas - `fastp-and-minimap2` doesn't. Default is `fastp-and-minimap2`, the - latest version of the sequence processing pipeline. Returns ------- @@ -533,8 +510,7 @@ def preparations_for_run(run_path, sheet, generated_prep_columns, else: sample_id = sample.sample_id - run_prefix = get_run_prefix(run_path, project, sample_id, lane, - pipeline) + run_prefix = get_run_prefix(run_path, project, sample_id, lane) # ignore the sample if there's no file if run_prefix is not None: diff --git a/metapool/scripts/seqpro.py b/metapool/scripts/seqpro.py index 769d0908..f3cc055d 100755 --- a/metapool/scripts/seqpro.py +++ b/metapool/scripts/seqpro.py @@ -16,13 +16,9 @@ @click.argument('sample_sheet', type=click.Path(exists=True, dir_okay=False, file_okay=True)) @click.argument('output_dir', type=click.Path(writable=True)) -@click.option('--pipeline', help='Which pipeline generated the data', - show_default=True, default='fastp-and-minimap2', - type=click.Choice(['atropos-and-bowtie2', 'fastp-and-minimap2'])) @click.option('--verbose', help='list prep-file output paths, study_ids', is_flag=True) -def format_preparation_files(run_dir, sample_sheet, output_dir, pipeline, - verbose): +def format_preparation_files(run_dir, sample_sheet, output_dir, verbose): """Generate the preparation files for the projects in a run RUN_DIR: should be the directory where the results of running bcl2fastq are @@ -41,13 +37,9 @@ def format_preparation_files(run_dir, sample_sheet, output_dir, pipeline, sample_sheet = load_sample_sheet(sample_sheet) df_sheet = sample_sheet_to_dataframe(sample_sheet) - if pipeline == 'fastp-and-minimap2': - stats = run_counts(run_dir, sample_sheet) - stats['sample_name'] = \ - df_sheet.set_index('lane', append=True)['sample_name'] - else: - click.echo('Stats collection is not supported for pipeline ' - 'atropos-and-bowtie2') + stats = run_counts(run_dir, sample_sheet) + stats['sample_name'] = \ + df_sheet.set_index('lane', append=True)['sample_name'] # sample_sheet_to_dataframe() automatically lowercases the column names # before returning df_sheet. Hence, sample_sheet.CARRIED_PREP_COLUMNS also @@ -58,20 +50,17 @@ def format_preparation_files(run_dir, sample_sheet, output_dir, pipeline, preps = preparations_for_run(run_dir, df_sheet, sample_sheet.GENERATED_PREP_COLUMNS, - c_prep_columns, - pipeline=pipeline) + c_prep_columns) os.makedirs(output_dir, exist_ok=True) for (run, project, lane), df in preps.items(): fp = os.path.join(output_dir, f'{run}.{project}.{lane}.tsv') - if pipeline == 'fastp-and-minimap2': - # stats are indexed by sample name and lane, lane is the first - # level index. When merging, make sure to select the lane subset - # that we care about, otherwise we'll end up with repeated rows - df = df.merge(stats.xs(lane, level=1), how='left', - on='sample_name') + # stats are indexed by sample name and lane, lane is the first + # level index. When merging, make sure to select the lane subset + # that we care about, otherwise we'll end up with repeated rows + df = df.merge(stats.xs(lane, level=1), how='left', on='sample_name') # strip qiita_id from project names in sample_project column df['sample_project'] = df['sample_project'].map( diff --git a/metapool/scripts/tests/test_seqpro.py b/metapool/scripts/tests/test_seqpro.py index 171fcce5..8d4f6a86 100644 --- a/metapool/scripts/tests/test_seqpro.py +++ b/metapool/scripts/tests/test_seqpro.py @@ -36,46 +36,6 @@ def setUp(self): def tearDown(self): rmtree(self.vf_test_dir, ignore_errors=True) - def test_atropos_run(self): - runner = CliRunner() - - with runner.isolated_filesystem(): - result = runner.invoke( - format_preparation_files, - args=[ - self.run, - self.sheet, - "./", - "--pipeline", - "atropos-and-bowtie2", - ], - ) - - # assert that expected error message appeared in stdout. we are - # not concerned w/warning messages that may also appear. - self.assertIn( - "Stats collection is not supported for pipeline " - "atropos-and-bowtie2", - result.output, - ) - self.assertEqual(result.exit_code, 0) - - exp_preps = [ - "191103_D32611_0365_G00DHB5YXX.Baz_12345.1.tsv", - "191103_D32611_0365_G00DHB5YXX.Baz_12345.3.tsv", - "191103_D32611_0365_G00DHB5YXX.FooBar_666.3.tsv", - ] - - self.assertEqual(sorted(os.listdir("./")), exp_preps) - - for prep, exp_lines in zip(exp_preps, [4, 4, 5]): - with open(prep) as f: - self.assertEqual( - len(f.read().split("\n")), - exp_lines, - "Assertion error in %s" % prep, - ) - def test_fastp_run(self): runner = CliRunner() @@ -85,9 +45,7 @@ def test_fastp_run(self): args=[ self.fastp_run, self.fastp_sheet, - "./", - "--pipeline", - "fastp-and-minimap2", + "./" ], ) @@ -396,9 +354,7 @@ def test_legacy_run(self): args=[ self.fastp_run, self.v90_test_sheet, - "./", - "--pipeline", - "fastp-and-minimap2", + "./" ], ) @@ -442,9 +398,7 @@ def test_fastp_run(self): args=[ self.temp_copy, self.fastp_sheet, - "./", - "--pipeline", - "fastp-and-minimap2", + "./" ], ) self.assertEqual(result.output, "") diff --git a/metapool/tests/data/runs/191103_D32611_0365_G00DHB5YXX/Baz_12345/sample_2_S10_L003_R1_001.fastq.gz b/metapool/tests/data/runs/191103_D32611_0365_G00DHB5YXX/Baz_12345/sample_2_S10_L003_R1_001.fastq.gz deleted file mode 100644 index ab6ab53e9246e53bb3208da3a25d582a74e46e46..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 71 zcmb2|=HSr0@h6Ufxi~SmAScx@KG@JO-p9beI6lZwFD!Ol!%x@bNl^)3;qyz+ bf>@aJ*cWsNPEY1^Vqw@7-E>EVfq?-4O%xX; diff --git a/metapool/tests/data/runs/191103_D32611_0365_G00DHB5YXX/Baz_12345/sample_2_S10_L003_R2_001.fastq.gz b/metapool/tests/data/runs/191103_D32611_0365_G00DHB5YXX/Baz_12345/sample_2_S10_L003_R2_001.fastq.gz deleted file mode 100644 index 19ea90a9eca872cd565e89730b29eaa8bb8e0617..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 71 zcmb2|=HRft@h6Ufxi~SmAScx@KG@JO-p9beI6lZoFD!Ol!%x@bNl^)3;qyz+ bf>@aJ*cWsNPEY1^Vqw@7-E>EVfq?-4QS}!% diff --git a/metapool/tests/test_prep.py b/metapool/tests/test_prep.py index 942ce7f1..7f54b032 100644 --- a/metapool/tests/test_prep.py +++ b/metapool/tests/test_prep.py @@ -75,6 +75,9 @@ def _check_run_191103_D32611_0365_G00DHB5YXX(self, obs): # make sure the columns are in the same order before comparing obs_df = obs_df[exp.columns].copy() + exp.to_csv('exp') + obs_df.to_csv('obs') + pd.testing.assert_frame_equal(obs_df, exp) data = [['sample.1', 'Eqiiperiment', 'Knight Lab Kapa HP', @@ -202,8 +205,7 @@ def test_preparations_for_run(self): obs = preparations_for_run(self.good_run, sample_sheet_to_dataframe(sheet), sheet.GENERATED_PREP_COLUMNS, - sheet.CARRIED_PREP_COLUMNS, - pipeline='atropos-and-bowtie2') + sheet.CARRIED_PREP_COLUMNS) self._check_run_191103_D32611_0365_G00DHB5YXX(obs) def test_preparations_for_run_missing_columns(self): @@ -217,8 +219,7 @@ def test_preparations_for_run_missing_columns(self): with self.assertWarns(UserWarning) as cm: obs = preparations_for_run(self.good_run, ss, sheet.GENERATED_PREP_COLUMNS, - sheet.CARRIED_PREP_COLUMNS, - pipeline='atropos-and-bowtie2') + sheet.CARRIED_PREP_COLUMNS) self.assertEqual(str(cm.warnings[0].message), "'well_description' " "is not present in s" @@ -287,67 +288,57 @@ def test_remove_qiita_id(self): def test_get_run_prefix(self): # project 1 - obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_1', '1', - 'atropos-and-bowtie2') + obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_1', '1') self.assertEqual('sample_1_S11_L001', obs) - obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_1', '3', - 'atropos-and-bowtie2') + obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_1', '3') self.assertEqual('sample_1_S11_L003', obs) - obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_2', '1', - 'atropos-and-bowtie2') + obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_2', '1') self.assertEqual('sample_2_S10_L001', obs) - obs = get_run_prefix(self.good_run, 'Baz_12345', 'sample_2', '3', - 'atropos-and-bowtie2') - self.assertIsNone(obs) - # project 2 - obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_31', '3', - 'atropos-and-bowtie2') + obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_31', '3') self.assertEqual('sample_31_S13_L003', obs) - obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_32', '3', - 'atropos-and-bowtie2') + obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_32', '3') self.assertEqual('sample_32_S19_L003', obs) - obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_34', '3', - 'atropos-and-bowtie2') + obs = get_run_prefix(self.good_run, 'FooBar_666', 'sample_34', '3') self.assertEqual('sample_34_S33_L003', obs) def test_get_run_prefix_fastp_minimap(self): obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', 'sample1', - '1', 'fastp-and-minimap2') + '1') self.assertEqual('sample1_S11_L001', obs) obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', 'sample1', - '3', 'fastp-and-minimap2') + '3') self.assertEqual('sample1_S11_L003', obs) obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', 'sample2', - '1', 'fastp-and-minimap2') + '1') self.assertEqual('sample2_S10_L001', obs) obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', - 'sample44', '3', 'fastp-and-minimap2') + 'sample44', '3') self.assertIsNone(obs) obs = get_run_prefix(self.good_run_new_version, 'Baz_12345', 'sample2', - '3', 'fastp-and-minimap2') + '3') self.assertIsNone(obs) # project 2 obs = get_run_prefix(self.good_run_new_version, 'FooBar_666', - 'sample31', '3', 'fastp-and-minimap2') + 'sample31', '3') self.assertEqual('sample31_S13_L003', obs) obs = get_run_prefix(self.good_run_new_version, 'FooBar_666', - 'sample32', '3', 'fastp-and-minimap2') + 'sample32', '3') self.assertEqual('sample32_S19_L003', obs) obs = get_run_prefix(self.good_run_new_version, 'FooBar_666', - 'sample34', '3', 'fastp-and-minimap2') + 'sample34', '3') self.assertIsNone(obs) def test_get_run_prefix_more_than_forward_and_reverse(self): @@ -365,15 +356,15 @@ def test_get_run_prefix_more_than_forward_and_reverse(self): # project 2 with self.assertWarnsRegex(Warning, message): obs = get_run_prefix(self.OKish_run_new_version, 'FooBar_666', - 'sample31', '3', 'fastp-and-minimap2') + 'sample31', '3') self.assertIsNone(obs) obs = get_run_prefix(self.OKish_run_new_version, 'FooBar_666', - 'sample32', '3', 'fastp-and-minimap2') + 'sample32', '3') self.assertEqual('sample32_S19_L003', obs) obs = get_run_prefix(self.OKish_run_new_version, 'FooBar_666', - 'sample34', '3', 'fastp-and-minimap2') + 'sample34', '3') self.assertIsNone(obs) def test_is_non_empty_gz_file(self):